In [286]:
# Import Dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# Display all columns
pd.set_option('display.max_columns', None)

### Preprocessing data

In [4]:
# Load data into dataframe
df = pd.read_csv("csv_files/beer_info_from_db.csv")

In [323]:
df.sort_values("beer_id")

Unnamed: 0,beer_id,beer_name,beer_style,style_key,brewery,description,abv,ave_rating,min_ibu,max_ibu,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,avg_ibu,condensed_style
0,1,Amber,Altbier,8,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,3.65,25,50,13,32,9,47,74,33,0,33,57,8,111,37.5,Altbier
1,2,Double Bag,Altbier,8,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,3.90,25,50,12,57,18,33,55,16,0,24,35,12,84,37.5,Altbier
2,3,Long Trail Ale,Altbier,8,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,3.58,25,50,14,37,6,42,43,11,0,10,54,4,62,37.5,Altbier
3,4,Doppelsticke,Altbier,8,Uerige Obergärige Hausbrauerei,Notes:,8.5,4.15,25,50,13,55,31,47,101,18,1,49,40,16,119,37.5,Altbier
4,5,Scurry,Altbier,8,Off Color Brewing,Notes:Just cause it's dark and German doesn't ...,5.3,3.67,25,50,21,69,10,63,120,14,0,19,36,15,218,37.5,Altbier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5423,5552,Red Nose Winter Ale,Winter Warmer,17,Natty Greene's Pub & Brewing Co.,Notes:,6.8,3.59,35,50,8,44,24,19,52,21,0,26,21,96,77,42.5,Winter Warmer
5552,5553,Our Special Ale 2019 (Anchor Christmas Ale),Winter Warmer,17,Anchor Brewing Company,Notes:This is the forty-fifth annual Our Speci...,6.9,3.79,35,50,22,73,9,58,87,23,0,48,91,44,104,42.5,Winter Warmer
5553,5554,Fish Tale Winterfish,Winter Warmer,17,Fish Brewing Company / Fishbowl Brewpub,Notes:,7.5,3.76,35,50,11,36,50,70,72,59,0,81,110,18,73,42.5,Winter Warmer
5554,5555,"He'Brew Hanukkah, Chanukah: Pass The Beer",Winter Warmer,17,Shmaltz Brewing Company,Notes:Chanukah Beer pours a rich crystal clear...,8.0,3.61,35,50,6,64,30,57,78,15,1,28,57,23,129,42.5,Winter Warmer


In [405]:
df["sum of factors"] = df.astringency + df.body + df.alcohol + df.bitter + df.sweet + df.sour + df.salty + df.fruits + df.hoppy + df.spices + df.malty

In [409]:
df = df[df["sum of factors"] >= 100]

In [410]:
styles = [
    {"style": "Bock", "substyles":["Bock - Doppelbock", "Bock - Eisbock", "Bock - Maibock", "Bock - Traditional", "Bock - Weizenbock"]},
    {"style": "Brown Ales", "substyles": ["Altbier", "Brown Ale - American", "Brown Ale - Belgian Dark", "Brown Ale - English", "Mild Ale - English Dark"]},
    {"style": "Dark Ales", "substyles": ["Dubbel", "Rye Beer - Roggenbier", "Scottish Ale", "Winter Warmer"]},
    {"style": "Dark Lagers", "substyles": ["Lager - American Amber / Red", "Lager - European Dark", "Lager - Märzen", "Lager - Munich Dunkel", "Lager - Rauchbier", "Lager - Schwarzbier", "Lager - Vienna"]},
    {"style": "Hybrid Beers", "substyles": ["Bière de Champagne / Bière Brut", "Braggot", "California Common / Steam Beer", "Cream Ale"]},
    {"style": "India Pale Ales", "substyles": ["IPA - American", "IPA - Belgian", "IPA - Black / Cascadian Dark Ale", "IPA - Brut", "IPA - English", "IPA - Imperial", "IPA - New England"]},
    {"style": "Pale Ales", "substyles": ["Bitter - English", "Bitter - English Extra Special / Strong Bitter (ESB)", "Blonde Ale - Belgian", "Blonde Ale - American", "Farmhouse Ale - Bière de Garde", "Farmhouse Ale - Saison", "Kölsch", "Mild Ale - English Pale", "Pale Ale - American", "Pale Ale - Belgian", "Pale Ale - English", "Red Ale - American Amber / Red", "Red Ale - Irish"]},
    {"style": "Porters", "substyles": ["Porter - American", "Porter - Baltic", "Porter - English", "Porter - Imperial", "Porter - Robust", "Porter - Smoked"]},
    {"style": "Specialty Beers", "substyles": ["Chile Beer", "Farmhouse Ale - Sahti", "Fruit and Field Beer", "Gruit / Ancient Herbed Ale", "Happoshu", "Herb and Spice Beer", "Kvass", "Lager - Japanese Rice", "Low Alcohol Beer", "Pumpkin Beer", "Rye Beer", "Smoked Beer"]},
    {"style": "Stouts", "substyles":  ["Stout - Sweet / Milk", "Stout - Russian Imperial", "Stout - Oatmeal", "Stout - Irish Dry", "Stout - Foreign / Export", "Stout - English", "Stout - American Imperial", "Stout - American" ]},
    {"style": "Strong Ales", "substyles": ["Wheat Beer - Wheatwine", "Tripel", "Strong Ale - English", "Strong Ale - Belgian Pale", "Strong Ale - Belgian Dark" , "Strong Ale - American", "Scotch Ale / Wee Heavy", "Red Ale - Imperial", "Quadrupel (Quad)", "Old Ale", "Barleywine - English", "Barleywine - American"]},
    {"style": "Wheat Beers", "substyles": ["Wheat Beer - Witbier", "Wheat Beer - Kristallweizen", "Wheat Beer - Hefeweizen", "Wheat Beer - Dunkelweizen", "Wheat Beer - American Pale", "Wheat Beer - American Dark"]},
    {"style": "Wild/Sour ", "substyles": [ "Brett Beer", "Lambic - Faro", "Lambic - Fruit", "Lambic - Gueuze", "Lambic - Traditional", "Sour - Berliner Weisse", "Sour - Flanders Oud Bruin", "Sour - Flanders Red Ale", "Sour - Fruited Kettle Sour", "Sour - Gose", "Wild Ale"]},
    {"style": "Pale Lagers", "substyles": ["Lager - Adjunct", "Lager - American", "Lager - European / Dortmunder Export", "Lager - European Pale", "Lager - European Strong", "Lager - Festbier / Wiesnbier", "Lager - Helles", "Lager - India Pale Lager (IPL)", "Lager - India Pale Lager", "Lager - Kellerbier / Zwickelbier", "Lager - Light", "Lager - Malt Liquor", "Pilsner - Bohemian / Czech", "Pilsner - German", "Pilsner - Imperial", "Lager - Märzen / Oktoberfest"]}
]

In [411]:
def get_big_style(x):
    for item in styles:
        if x in item["substyles"]:
            return item["style"]

In [412]:
df["BA_Big_styles"] = df.beer_style.apply(get_big_style)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [413]:
set(df.BA_Big_styles.to_list())

{'Bock',
 'Brown Ales',
 'Dark Ales',
 'Dark Lagers',
 'Hybrid Beers',
 'India Pale Ales',
 'Pale Ales',
 'Pale Lagers',
 'Porters',
 'Specialty Beers',
 'Stouts',
 'Strong Ales',
 'Wheat Beers',
 'Wild/Sour '}

In [414]:
# Calculate average ibu and add to column
df["avg_ibu"] = (df.min_ibu + df.max_ibu) /2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [415]:
# Filter out beers with abv > 13
df = df[df.abv <= 13]
df = df[df.abv >= 3]

In [416]:
# Get list of unique styles
beer_styles = df.beer_style.to_list()
beer_styles_unique = set(beer_styles)
print(len(beer_styles))
print(len(beer_styles_unique))

4802
111


In [417]:
# Calculate number of items for each style
for item in beer_styles_unique:
    counter = 0
    for x in beer_styles:
        if item == x:
            counter += 1
    print(item, counter)

Porter - English 50
Scottish Ale 49
Lambic - Faro 8
Lager - Malt Liquor 46
Brown Ale - English 50
Stout - English 48
Wheat Beer - Hefeweizen 49
Strong Ale - American 41
Brown Ale - Belgian Dark 43
Fruit and Field Beer 48
Lambic - Gueuze 43
Porter - Imperial 49
Barleywine - American 44
Sour - Flanders Red Ale 48
Wheat Beer - Wheatwine 31
Lager - European / Dortmunder Export 48
Pale Ale - Belgian 50
Stout - American Imperial 42
Lager - Light 40
Blonde Ale - Belgian 49
Pale Ale - English 50
Strong Ale - Belgian Pale 50
Lager - European Pale 49
Cream Ale 50
Lager - European Strong 44
Sour - Flanders Oud Bruin 48
Kölsch 50
Stout - Irish Dry 49
Sour - Berliner Weisse 48
Porter - Smoked 43
Lager - Märzen / Oktoberfest 50
Wheat Beer - Dunkelweizen 46
Bitter - English 50
Stout - Russian Imperial 43
Old Ale 45
Lager - European Dark 48
Sour - Gose 49
Wheat Beer - American Pale 50
Red Ale - Irish 50
Bock - Weizenbock 49
Brett Beer 40
Herb and Spice Beer 49
Red Ale - American Amber / Red 50
Lager -

In [418]:
beer_styles_condensed = ["IPA", "Porter", "Lager", "Pale Ale", "Pilsner", "Stout", "Wheat Beer", "Bock",  
                         "Blonde Ale", "Sour", "Lambic", "Brown Ale", "Barleywine", "Strong Ale", "Farmhouse Ale",
                         "Bitter", "Red Ale"]

In [419]:
# Function to reduce styles

def reduce_styles(style):
    for count, item in enumerate (beer_styles_condensed, start=1):
#         print(item, style)
        if item in style:
            return(item)
        elif count == len(beer_styles_condensed):
            return(style)
        else:
            continue

In [420]:
df["condensed_style"] = df.beer_style.apply(reduce_styles)

In [421]:
df

Unnamed: 0,beer_id,beer_name,beer_style,style_key,brewery,description,abv,ave_rating,min_ibu,max_ibu,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,avg_ibu,condensed_style,BA_Big_styles,sum of factors
0,1,Amber,Altbier,8,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,3.65,25,50,13,32,9,47,74,33,0,33,57,8,111,37.5,Altbier,Brown Ales,417
1,2,Double Bag,Altbier,8,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,3.90,25,50,12,57,18,33,55,16,0,24,35,12,84,37.5,Altbier,Brown Ales,346
2,3,Long Trail Ale,Altbier,8,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,3.58,25,50,14,37,6,42,43,11,0,10,54,4,62,37.5,Altbier,Brown Ales,283
3,4,Doppelsticke,Altbier,8,Uerige Obergärige Hausbrauerei,Notes:,8.5,4.15,25,50,13,55,31,47,101,18,1,49,40,16,119,37.5,Altbier,Brown Ales,490
4,5,Scurry,Altbier,8,Off Color Brewing,Notes:Just cause it's dark and German doesn't ...,5.3,3.67,25,50,21,69,10,63,120,14,0,19,36,15,218,37.5,Altbier,Brown Ales,585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551,5551,The First Snow Ale,Winter Warmer,17,RJ Rockers Brewing Company,Notes:This hearty American pale ale contains a...,6.0,3.63,35,50,15,31,23,16,54,43,0,54,14,140,58,42.5,Winter Warmer,Dark Ales,448
5552,5553,Our Special Ale 2019 (Anchor Christmas Ale),Winter Warmer,17,Anchor Brewing Company,Notes:This is the forty-fifth annual Our Speci...,6.9,3.79,35,50,22,73,9,58,87,23,0,48,91,44,104,42.5,Winter Warmer,Dark Ales,559
5553,5554,Fish Tale Winterfish,Winter Warmer,17,Fish Brewing Company / Fishbowl Brewpub,Notes:,7.5,3.76,35,50,11,36,50,70,72,59,0,81,110,18,73,42.5,Winter Warmer,Dark Ales,580
5554,5555,"He'Brew Hanukkah, Chanukah: Pass The Beer",Winter Warmer,17,Shmaltz Brewing Company,Notes:Chanukah Beer pours a rich crystal clear...,8.0,3.61,35,50,6,64,30,57,78,15,1,28,57,23,129,42.5,Winter Warmer,Dark Ales,488


In [422]:
# Count of Unique styles
# passing "set()" to a list adds only unique values from the list to the set
condensed_styles = set(df.condensed_style.to_list())
len(condensed_styles)

44

In [423]:
condensed_styles

{'Altbier',
 'Barleywine',
 'Bitter',
 'Bière de Champagne / Bière Brut',
 'Blonde Ale',
 'Bock',
 'Braggot',
 'Brett Beer',
 'Brown Ale',
 'California Common / Steam Beer',
 'Chile Beer',
 'Cream Ale',
 'Dubbel',
 'Farmhouse Ale',
 'Fruit and Field Beer',
 'Gruit / Ancient Herbed Ale',
 'Happoshu',
 'Herb and Spice Beer',
 'IPA',
 'Kvass',
 'Kölsch',
 'Lager',
 'Lambic',
 'Mild Ale - English Dark',
 'Mild Ale - English Pale',
 'Old Ale',
 'Pale Ale',
 'Pilsner',
 'Porter',
 'Pumpkin Beer',
 'Quadrupel (Quad)',
 'Red Ale',
 'Rye Beer',
 'Rye Beer - Roggenbier',
 'Scotch Ale / Wee Heavy',
 'Scottish Ale',
 'Smoked Beer',
 'Sour',
 'Stout',
 'Strong Ale',
 'Tripel',
 'Wheat Beer',
 'Wild Ale',
 'Winter Warmer'}

### K-Means

In [424]:
df.columns

Index(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'abv', 'ave_rating', 'min_ibu', 'max_ibu', 'astringency',
       'body', 'alcohol', 'bitter', 'sweet', 'sour', 'salty', 'fruits',
       'hoppy', 'spices', 'malty', 'avg_ibu', 'condensed_style',
       'BA_Big_styles', 'sum of factors'],
      dtype='object')

In [444]:
# Drop columns unnecessary for analysis
new_df = df.drop(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'ave_rating', "min_ibu", "max_ibu", "salty", "alcohol",
        "condensed_style", "avg_ibu", "BA_Big_styles", "sum of factors", "abv"], axis=1)

In [445]:
new_df.head()

Unnamed: 0,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty
0,13,32,47,74,33,33,57,8,111
1,12,57,33,55,16,24,35,12,84
2,14,37,42,43,11,10,54,4,62
3,13,55,47,101,18,49,40,16,119
4,21,69,63,120,14,19,36,15,218


In [446]:
# define standard scaler
scaler = StandardScaler()
# transform data
scaled_df = scaler.fit_transform(new_df)

In [447]:
# Looking for the best K - unscaled
# inertia = []
# k = list(range(1, 15))

# for i in k:
#     km = KMeans(n_clusters=i, random_state=0)
#     km.fit(new_df)
#     inertia.append(km.inertia_)
    
# Looking for the best K - scaled
inertia = []
k = list(range(1, 15))

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(scaled_df)
    inertia.append(km.inertia_)


In [448]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [450]:
# Initializing model with K = 3 
model = KMeans(n_clusters=10, random_state=5)
model

KMeans(n_clusters=10, random_state=5)

In [451]:
# Fitting model
model.fit(new_df)

KMeans(n_clusters=10, random_state=5)

In [452]:
# Get the predictions
predictions = model.predict(new_df)
print(predictions)

[0 0 2 ... 5 0 7]


In [453]:
# Add a new class column to the df
new_df["class"] = model.labels_
new_df.head()

Unnamed: 0,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,class
0,13,32,47,74,33,33,57,8,111,0
1,12,57,33,55,16,24,35,12,84,0
2,14,37,42,43,11,10,54,4,62,2
3,13,55,47,101,18,49,40,16,119,3
4,21,69,63,120,14,19,36,15,218,8


In [454]:
style_df = df[["beer_style", "style_key", "condensed_style", "ave_rating", "BA_Big_styles"]]

In [455]:
combined_df = new_df.join(style_df)

In [456]:
combined_df.sort_values("abv", ascending=False)

KeyError: 'abv'

In [457]:
combined_df.head(50)

Unnamed: 0,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,class,beer_style,style_key,condensed_style,ave_rating,BA_Big_styles
0,13,32,47,74,33,33,57,8,111,0,Altbier,8,Altbier,3.65,Brown Ales
1,12,57,33,55,16,24,35,12,84,0,Altbier,8,Altbier,3.9,Brown Ales
2,14,37,42,43,11,10,54,4,62,2,Altbier,8,Altbier,3.58,Brown Ales
3,13,55,47,101,18,49,40,16,119,3,Altbier,8,Altbier,4.15,Brown Ales
4,21,69,63,120,14,19,36,15,218,8,Altbier,8,Altbier,3.67,Brown Ales
5,25,51,44,45,9,11,51,20,95,0,Altbier,8,Altbier,3.78,Brown Ales
6,22,45,46,62,25,34,60,4,103,0,Altbier,8,Altbier,4.1,Brown Ales
7,28,40,40,58,29,36,54,8,97,0,Altbier,8,Altbier,3.46,Brown Ales
8,18,49,37,73,22,21,37,4,98,0,Altbier,8,Altbier,3.6,Brown Ales
9,25,35,38,39,13,8,60,16,97,0,Altbier,8,Altbier,4.1,Brown Ales


In [458]:
combined_df.groupby(["BA_Big_styles"]).median().sort_values(["class"])

Unnamed: 0_level_0,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,class,style_key,ave_rating
BA_Big_styles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Dark Lagers,15.0,45.0,36.0,52.0,11.0,12.0,34.0,7.0,96.0,0,22.0,3.67
Brown Ales,12.0,52.5,38.0,65.0,13.0,18.0,35.0,8.0,106.5,1,10.0,3.72
Wheat Beers,16.0,36.0,17.0,32.0,40.0,58.5,24.0,25.5,62.0,1,113.0,3.68
Pale Ales,19.5,37.0,39.0,49.0,40.0,42.5,60.0,9.0,64.0,2,46.0,3.75
Pale Lagers,21.0,29.0,33.0,30.0,17.0,18.0,52.0,6.0,58.5,2,59.0,3.55
Bock,12.0,50.0,29.0,80.0,23.0,37.0,28.0,11.0,95.0,3,4.0,3.81
Dark Ales,10.0,47.0,25.5,76.5,24.0,33.0,28.0,26.0,82.0,3,16.0,3.775
Strong Ales,11.0,48.0,28.0,88.0,34.0,57.0,31.0,20.5,85.0,3,102.0,3.99
Wild/Sour,32.0,31.0,7.0,61.0,135.0,89.0,13.0,7.0,25.0,4,122.0,4.07
Hybrid Beers,16.0,42.0,22.0,46.0,23.0,25.0,35.0,8.0,56.0,5,29.0,3.7


In [459]:
bar_df = combined_df.groupby(["class"]).mean().sort_values(["class"])
bar_df.reset_index(inplace=True)
bar_df

Unnamed: 0,class,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,style_key,ave_rating
0,0,13.921769,56.630385,38.814059,63.378685,15.461451,19.793651,36.726757,12.831066,103.921769,47.930839,3.715828
1,1,18.729831,33.863039,18.318949,54.801126,59.311445,67.934334,28.885553,26.789869,44.812383,83.067542,3.783283
2,2,22.995098,33.452614,48.120915,34.714052,27.437908,29.707516,72.29085,8.903595,65.27451,52.148693,3.676454
3,3,12.457627,56.878935,26.401937,120.002421,39.341404,70.871671,27.893462,24.571429,97.452785,67.033898,3.928184
4,4,37.394737,34.087719,11.818713,74.646199,154.98538,105.78655,20.649123,11.842105,32.488304,111.055556,4.092485
5,5,20.473988,51.82948,80.083815,59.112717,42.537572,50.442197,106.040462,13.355491,93.193642,54.00578,3.853757
6,6,25.929889,40.317343,63.372694,57.944649,77.380074,108.376384,87.874539,14.365314,41.542435,49.645756,4.119151
7,7,11.149321,51.113122,25.58371,45.678733,14.090498,30.900452,21.506787,101.049774,71.782805,67.00905,3.751765
8,8,13.006565,102.035011,72.374179,79.40919,13.021882,17.592998,33.549234,22.719912,148.645514,72.857768,4.022801
9,9,12.053793,24.336552,17.873103,27.813793,14.757241,16.626207,23.067586,9.813793,41.582069,58.307586,3.397269


In [460]:
fig = px.bar(bar_df, x="class", y="ave_rating")
fig.show()

In [442]:
bar_df.columns

Index(['class', 'abv', 'astringency', 'body', 'bitter', 'sweet', 'sour',
       'fruits', 'hoppy', 'spices', 'malty', 'style_key', 'ave_rating'],
      dtype='object')

In [461]:
fig = px.bar(bar_df, x="class", y=['astringency', 'body', 'bitter', 'sweet', 'sour',
       'fruits', 'hoppy', 'spices'])
fig.show()

In [292]:

# fig = px.scatter_3d(combined_df, x='abv', y='style_key', z='malty',
#               color='class')
# fig.show()

### KNN

In [373]:
# Drop columns unnecessary for analysis
knn_df = df.drop(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'ave_rating', "min_ibu", "max_ibu", "salty", "alcohol",
        "avg_ibu"], axis=1)

In [374]:
knn_df

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,condensed_style,BA_Big_styles
0,5.3,13,32,47,74,33,33,57,8,111,Altbier,Brown Ales
1,7.2,12,57,33,55,16,24,35,12,84,Altbier,Brown Ales
2,5.0,14,37,42,43,11,10,54,4,62,Altbier,Brown Ales
3,8.5,13,55,47,101,18,49,40,16,119,Altbier,Brown Ales
4,5.3,21,69,63,120,14,19,36,15,218,Altbier,Brown Ales
...,...,...,...,...,...,...,...,...,...,...,...,...
5551,6.0,15,31,16,54,43,54,14,140,58,Winter Warmer,Dark Ales
5552,6.9,22,73,58,87,23,48,91,44,104,Winter Warmer,Dark Ales
5553,7.5,11,36,70,72,59,81,110,18,73,Winter Warmer,Dark Ales
5554,8.0,6,64,57,78,15,28,57,23,129,Winter Warmer,Dark Ales


In [375]:
knn_df.columns

Index(['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour', 'fruits',
       'hoppy', 'spices', 'malty', 'condensed_style', 'BA_Big_styles'],
      dtype='object')

In [376]:
# Dependent value/value we are trying to predict = style
y = df["BA_Big_styles"]
X = df[['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour', 'fruits',
       'hoppy', 'spices', 'malty']]

In [377]:
# Split into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [378]:
# Scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [384]:
# Instatiate KNN class and train model
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [385]:
# View results
y_pred = classifier.predict(X_test)
# print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# output_dict = classification_report(y_test, y_pred, output_dict=True)

                 precision    recall  f1-score   support

           Bock       0.31      0.40      0.35        45
     Brown Ales       0.40      0.31      0.35        59
      Dark Ales       0.46      0.41      0.43        39
    Dark Lagers       0.32      0.31      0.31        62
   Hybrid Beers       0.19      0.15      0.17        33
India Pale Ales       0.68      0.75      0.71        81
      Pale Ales       0.50      0.55      0.53       130
    Pale Lagers       0.66      0.69      0.67       133
        Porters       0.52      0.57      0.54        58
Specialty Beers       0.45      0.40      0.43        94
         Stouts       0.63      0.62      0.62        71
    Strong Ales       0.66      0.69      0.67       112
    Wheat Beers       0.62      0.47      0.54        59
     Wild/Sour        0.85      0.83      0.84        94

       accuracy                           0.56      1070
      macro avg       0.52      0.51      0.51      1070
   weighted avg       0.56   

In [311]:
output_dict

{'Altbier': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5},
 'Barleywine': {'precision': 0.4583333333333333,
  'recall': 0.6470588235294118,
  'f1-score': 0.5365853658536585,
  'support': 17},
 'Bitter': {'precision': 0.6666666666666666,
  'recall': 0.6086956521739131,
  'f1-score': 0.6363636363636365,
  'support': 23},
 'Bière de Champagne / Bière Brut': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 5},
 'Blonde Ale': {'precision': 0.2222222222222222,
  'recall': 0.35294117647058826,
  'f1-score': 0.27272727272727276,
  'support': 17},
 'Bock': {'precision': 0.3125,
  'recall': 0.39215686274509803,
  'f1-score': 0.34782608695652173,
  'support': 51},
 'Braggot': {'precision': 0.5,
  'recall': 0.25,
  'f1-score': 0.3333333333333333,
  'support': 4},
 'Brett Beer': {'precision': 0.25,
  'recall': 0.08333333333333333,
  'f1-score': 0.125,
  'support': 12},
 'Brown Ale': {'precision': 0.2631578947368421,
  'recall': 0.17857142857142858,
  'f1-score'

In [306]:
output_dict["macro avg"]["precision"]

0.36388221679951477

In [380]:
accuracies = []
precisions = []
recalls = []

for n in range(1,31):
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred))
    output_dict = classification_report(y_test, y_pred, output_dict=True)
    accuracies.append(output_dict["accuracy"])
    precisions.append(output_dict["macro avg"]["precision"])
    recalls.append(output_dict["macro avg"]["recall"])

In [381]:
fig = px.scatter(x=range(1,31), y=accuracies)
fig.show()

In [382]:
fig = px.scatter(x=range(1,31), y=precisions)
fig.show()

In [383]:
fig = px.scatter(x=range(1,31), y=recalls)
fig.show()