In [286]:
# Import Dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# Display all columns
pd.set_option('display.max_columns', None)

### Preprocessing data

In [4]:
# Load data into dataframe
df = pd.read_csv("csv_files/beer_info_from_db.csv")

In [184]:
df.sort_values("abv", ascending=False)

Unnamed: 0,beer_id,beer_name,beer_style,style_key,brewery,description,abv,ave_rating,min_ibu,max_ibu,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,avg_ibu,condensed_style
4991,4990,BelzebuthBrasserie Grain d' Orge (Brasserie Je...,Strong Ale - Belgian Pale,105,Brasserie Grain d' Orge (Brasserie Jeanne d'Ar...,Notes:,13.0,3.47,20,40,6,20,67,10,70,28,1,32,14,7,50,30.0,Strong Ale
121,119,Leon,Barleywine - English,98,Cigar City Brewing,Notes:Church on a Hill aged in apple brandy ba...,13.0,4.46,40,60,5,54,64,6,151,36,0,51,4,14,120,50.0,Barleywine
5381,5380,Wood Ya Honey With Nuts,Wheat Beer - Wheatwine,108,Jackie O's Pub & Brewery,Notes:Honey Wheat Wine Style Ale aged in bourb...,13.0,4.20,45,85,6,25,36,3,50,1,1,11,4,5,77,65.0,Wheat Beer
4780,4777,Black Albert,Stout - Russian Imperial,94,De Struise Brouwers,Notes:Originally brewed exclusively for Ebenez...,13.0,4.28,50,90,10,116,38,73,102,27,1,63,39,19,123,70.0,Stout
5396,5395,Maple in the Wood,Wheat Beer - Wheatwine,108,Side Project Brewing,Notes:When we finally opened our production fa...,13.0,4.49,45,85,0,14,21,2,35,5,0,5,2,5,56,65.0,Wheat Beer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5354,5354,Spaten Munich Club-Weissbier,Wheat Beer - Kristallweizen,114,Spaten-Franziskaner-Bräu,Notes:,0.0,3.76,10,15,0,0,0,0,0,0,0,0,0,0,0,12.5,Wheat Beer
3759,3752,Stagecoach Smoked Porter,Porter - Smoked,73,Stagecoach Brewing Company,Notes:,0.0,3.92,20,40,7,71,4,28,37,9,0,6,10,49,68,30.0,Porter
5353,5353,Edelweiss Kristallklar Weissbier,Wheat Beer - Kristallweizen,114,Brau Union Österreich AG,Notes:,0.0,3.20,10,15,7,8,0,2,7,2,0,6,2,3,7,12.5,Wheat Beer
5255,5254,Steingadener Weisse Dunkel,Wheat Beer - Dunkelweizen,112,Aktienbrauerei Kaufbeuren AG,Notes:,0.0,3.32,10,15,27,37,9,9,58,64,4,62,13,29,74,12.5,Wheat Beer


In [6]:
# Calculate average ibu and add to column
df["avg_ibu"] = (df.min_ibu + df.max_ibu) /2

In [241]:
# Filter out beers with abv > 13
df = df[df.abv <= 13]
df = df[df.abv >= 3]

In [242]:
# Get list of unique styles
beer_styles = df.beer_style.to_list()
beer_styles_unique = set(beer_styles)
print(len(beer_styles))
print(len(beer_styles_unique))

5348
111


In [243]:
# Calculate number of items for each style
for item in beer_styles_unique:
    counter = 0
    for x in beer_styles:
        if item == x:
            counter += 1
    print(item, counter)

Porter - English 50
Scottish Ale 50
Lambic - Faro 16
Lager - Malt Liquor 50
Brown Ale - English 50
Stout - English 50
Wheat Beer - Hefeweizen 49
Strong Ale - American 41
Brown Ale - Belgian Dark 48
Fruit and Field Beer 48
Lambic - Gueuze 49
Porter - Imperial 49
Barleywine - American 44
Sour - Flanders Red Ale 49
Wheat Beer - Wheatwine 42
Lager - European / Dortmunder Export 50
Pale Ale - Belgian 50
Stout - American Imperial 42
Lager - Light 47
Blonde Ale - Belgian 50
Pale Ale - English 50
Strong Ale - Belgian Pale 50
Lager - European Pale 49
Cream Ale 50
Lager - European Strong 48
Sour - Flanders Oud Bruin 49
Kölsch 50
Stout - Irish Dry 50
Sour - Berliner Weisse 48
Porter - Smoked 48
Lager - Märzen / Oktoberfest 50
Wheat Beer - Dunkelweizen 49
Bitter - English 50
Stout - Russian Imperial 43
Old Ale 46
Lager - European Dark 49
Sour - Gose 49
Wheat Beer - American Pale 50
Red Ale - Irish 50
Bock - Weizenbock 50
Brett Beer 50
Herb and Spice Beer 50
Red Ale - American Amber / Red 50
Lager 

In [244]:
beer_styles_condensed = ["IPA", "Porter", "Lager", "Pale Ale", "Pilsner", "Stout", "Wheat Beer", "Bock",  
                         "Blonde Ale", "Sour", "Lambic", "Brown Ale", "Barleywine", "Strong Ale", "Farmhouse Ale",
                         "Bitter", "Red Ale"]

In [245]:
# Function to reduce styles

def reduce_styles(style):
    for count, item in enumerate (beer_styles_condensed, start=1):
#         print(item, style)
        if item in style:
            return(item)
        elif count == len(beer_styles_condensed):
            return(style)
        else:
            continue

In [246]:
df["condensed_style"] = df.beer_style.apply(reduce_styles)

In [247]:
df

Unnamed: 0,beer_id,beer_name,beer_style,style_key,brewery,description,abv,ave_rating,min_ibu,max_ibu,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,avg_ibu,condensed_style
0,1,Amber,Altbier,8,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,3.65,25,50,13,32,9,47,74,33,0,33,57,8,111,37.5,Altbier
1,2,Double Bag,Altbier,8,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,3.90,25,50,12,57,18,33,55,16,0,24,35,12,84,37.5,Altbier
2,3,Long Trail Ale,Altbier,8,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,3.58,25,50,14,37,6,42,43,11,0,10,54,4,62,37.5,Altbier
3,4,Doppelsticke,Altbier,8,Uerige Obergärige Hausbrauerei,Notes:,8.5,4.15,25,50,13,55,31,47,101,18,1,49,40,16,119,37.5,Altbier
4,5,Scurry,Altbier,8,Off Color Brewing,Notes:Just cause it's dark and German doesn't ...,5.3,3.67,25,50,21,69,10,63,120,14,0,19,36,15,218,37.5,Altbier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551,5551,The First Snow Ale,Winter Warmer,17,RJ Rockers Brewing Company,Notes:This hearty American pale ale contains a...,6.0,3.63,35,50,15,31,23,16,54,43,0,54,14,140,58,42.5,Winter Warmer
5552,5553,Our Special Ale 2019 (Anchor Christmas Ale),Winter Warmer,17,Anchor Brewing Company,Notes:This is the forty-fifth annual Our Speci...,6.9,3.79,35,50,22,73,9,58,87,23,0,48,91,44,104,42.5,Winter Warmer
5553,5554,Fish Tale Winterfish,Winter Warmer,17,Fish Brewing Company / Fishbowl Brewpub,Notes:,7.5,3.76,35,50,11,36,50,70,72,59,0,81,110,18,73,42.5,Winter Warmer
5554,5555,"He'Brew Hanukkah, Chanukah: Pass The Beer",Winter Warmer,17,Shmaltz Brewing Company,Notes:Chanukah Beer pours a rich crystal clear...,8.0,3.61,35,50,6,64,30,57,78,15,1,28,57,23,129,42.5,Winter Warmer


In [248]:
# Count of Unique styles
# passing "set()" to a list adds only unique values from the list to the set
condensed_styles = set(df.condensed_style.to_list())
len(condensed_styles)

44

In [249]:
condensed_styles

{'Altbier',
 'Barleywine',
 'Bitter',
 'Bière de Champagne / Bière Brut',
 'Blonde Ale',
 'Bock',
 'Braggot',
 'Brett Beer',
 'Brown Ale',
 'California Common / Steam Beer',
 'Chile Beer',
 'Cream Ale',
 'Dubbel',
 'Farmhouse Ale',
 'Fruit and Field Beer',
 'Gruit / Ancient Herbed Ale',
 'Happoshu',
 'Herb and Spice Beer',
 'IPA',
 'Kvass',
 'Kölsch',
 'Lager',
 'Lambic',
 'Mild Ale - English Dark',
 'Mild Ale - English Pale',
 'Old Ale',
 'Pale Ale',
 'Pilsner',
 'Porter',
 'Pumpkin Beer',
 'Quadrupel (Quad)',
 'Red Ale',
 'Rye Beer',
 'Rye Beer - Roggenbier',
 'Scotch Ale / Wee Heavy',
 'Scottish Ale',
 'Smoked Beer',
 'Sour',
 'Stout',
 'Strong Ale',
 'Tripel',
 'Wheat Beer',
 'Wild Ale',
 'Winter Warmer'}

### K-Means

In [250]:
df.columns

Index(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'abv', 'ave_rating', 'min_ibu', 'max_ibu', 'astringency',
       'body', 'alcohol', 'bitter', 'sweet', 'sour', 'salty', 'fruits',
       'hoppy', 'spices', 'malty', 'avg_ibu', 'condensed_style'],
      dtype='object')

In [251]:
# Drop columns unnecessary for analysis
new_df = df.drop(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'ave_rating', "min_ibu", "max_ibu", "salty", "alcohol",
        "condensed_style", "avg_ibu", "malty"], axis=1)

In [252]:
new_df.head()

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices
0,5.3,13,32,47,74,33,33,57,8
1,7.2,12,57,33,55,16,24,35,12
2,5.0,14,37,42,43,11,10,54,4
3,8.5,13,55,47,101,18,49,40,16
4,5.3,21,69,63,120,14,19,36,15


In [253]:
# define standard scaler
scaler = StandardScaler()
# transform data
scaled_df = scaler.fit_transform(new_df)

In [254]:
# Looking for the best K - unscaled
# inertia = []
# k = list(range(1, 15))

# for i in k:
#     km = KMeans(n_clusters=i, random_state=0)
#     km.fit(new_df)
#     inertia.append(km.inertia_)
    
# Looking for the best K - scaled
inertia = []
k = list(range(1, 15))

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(scaled_df)
    inertia.append(km.inertia_)


In [255]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [256]:
# Initializing model with K = 3 
model = KMeans(n_clusters=7, random_state=5)
model

KMeans(n_clusters=7, random_state=5)

In [257]:
# Fitting model
model.fit(new_df)

KMeans(n_clusters=7, random_state=5)

In [258]:
# Get the predictions
predictions = model.predict(new_df)
print(predictions)

[4 4 4 ... 5 6 0]


In [259]:
# Add a new class column to the df
new_df["class"] = model.labels_
new_df.head()

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,class
0,5.3,13,32,47,74,33,33,57,8,4
1,7.2,12,57,33,55,16,24,35,12,4
2,5.0,14,37,42,43,11,10,54,4,4
3,8.5,13,55,47,101,18,49,40,16,3
4,5.3,21,69,63,120,14,19,36,15,6


In [260]:
style_df = df[["beer_style", "style_key", "condensed_style", "ave_rating"]]

In [261]:
combined_df = new_df.join(style_df)

In [262]:
combined_df.sort_values("abv", ascending=False)

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,class,beer_style,style_key,condensed_style,ave_rating
4507,13.0,11,112,80,61,9,17,14,88,6,Stout - American Imperial,89,Stout,4.34
4991,13.0,6,20,10,70,28,32,14,7,4,Strong Ale - Belgian Pale,105,Strong Ale,3.47
801,13.0,8,73,69,79,3,9,7,23,6,Brown Ale - American,9,Brown Ale,4.29
4894,13.0,3,75,31,73,8,87,4,35,3,Strong Ale - American,103,Strong Ale,4.53
4875,13.0,4,105,77,69,8,31,10,91,0,Strong Ale - American,103,Strong Ale,4.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4297,3.0,39,23,7,40,127,66,5,1,2,Sour - Berliner Weisse,122,Sour,3.55
4267,3.0,58,27,5,83,181,124,16,10,2,Sour - Berliner Weisse,122,Sour,4.18
2479,3.0,0,4,2,1,0,1,2,0,1,Lager - Light,62,Lager,1.50
4280,3.0,37,27,6,44,130,55,7,0,2,Sour - Berliner Weisse,122,Sour,3.76


In [263]:
combined_df.head(50)

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,class,beer_style,style_key,condensed_style,ave_rating
0,5.3,13,32,47,74,33,33,57,8,4,Altbier,8,Altbier,3.65
1,7.2,12,57,33,55,16,24,35,12,4,Altbier,8,Altbier,3.9
2,5.0,14,37,42,43,11,10,54,4,4,Altbier,8,Altbier,3.58
3,8.5,13,55,47,101,18,49,40,16,3,Altbier,8,Altbier,4.15
4,5.3,21,69,63,120,14,19,36,15,6,Altbier,8,Altbier,3.67
5,7.2,25,51,44,45,9,11,51,20,4,Altbier,8,Altbier,3.78
6,6.0,22,45,46,62,25,34,60,4,4,Altbier,8,Altbier,4.1
7,5.3,28,40,40,58,29,36,54,8,4,Altbier,8,Altbier,3.46
8,5.0,18,49,37,73,22,21,37,4,4,Altbier,8,Altbier,3.6
9,4.8,25,35,38,39,13,8,60,16,4,Altbier,8,Altbier,4.1


In [264]:
combined_df.groupby(["condensed_style"]).median().sort_values(["class"])

Unnamed: 0_level_0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,class,style_key,ave_rating
condensed_style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Pumpkin Beer,6.9,9.0,37.0,13.5,39.0,13.0,87.0,16.0,80.5,0.0,84.0,3.755
Chile Beer,5.4,6.5,21.5,11.0,16.0,6.0,5.5,9.0,87.0,0.0,75.0,3.615
Smoked Beer,6.45,11.5,37.5,20.5,40.0,11.0,13.0,21.0,81.5,0.0,86.0,3.805
Bière de Champagne / Bière Brut,8.2,3.0,2.0,0.0,4.0,6.0,6.0,2.0,0.0,1.0,27.0,3.95
Mild Ale - English Pale,4.3,3.0,7.0,10.0,8.0,4.0,3.0,13.0,1.0,1.0,47.0,3.56
Braggot,8.5,3.0,7.5,3.0,25.5,7.0,8.5,6.0,6.0,1.0,28.0,3.87
Herb and Spice Beer,6.0,12.0,31.0,19.0,55.0,33.5,46.5,32.5,60.0,1.0,80.0,3.6
Rye Beer - Roggenbier,5.35,1.5,5.0,1.0,5.0,4.0,5.0,1.0,4.0,1.0,15.0,3.765
Kvass,4.4,1.0,0.0,0.0,4.0,14.0,9.0,1.0,2.0,1.0,81.0,4.0
Gruit / Ancient Herbed Ale,5.6,6.0,12.0,7.0,15.0,12.5,11.5,28.0,5.0,1.0,78.0,3.76


In [265]:
bar_df = combined_df.groupby(["class"]).mean().sort_values(["class"])
bar_df.reset_index(inplace=True)
bar_df

Unnamed: 0,class,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,style_key,ave_rating
0,0,6.861122,11.806122,52.207483,25.663265,47.557823,18.353741,35.823129,22.244898,94.438776,69.452381,3.786429
1,1,6.115292,5.839583,10.940625,7.884375,12.135417,7.680208,7.836458,10.005208,5.345833,62.784375,3.541198
2,2,6.377,34.208163,34.895918,16.840816,69.720408,138.087755,105.030612,27.242857,13.912245,102.52449,4.066633
3,3,8.31723,13.493075,50.739612,24.865651,103.576177,41.15374,66.452909,28.473684,23.560942,68.264543,3.859668
4,4,5.86032,17.453723,37.744607,32.987474,45.59151,21.951983,25.196242,44.161447,11.475296,51.089074,3.59904
5,5,6.69471,22.62677,42.705277,69.133848,50.45045,50.126126,62.631918,96.239382,12.787645,52.380952,3.91009
6,6,7.06479,12.889222,94.77994,66.17515,71.239521,12.285928,15.988024,32.832335,17.513473,70.937126,3.958099


In [266]:
fig = px.bar(bar_df, x="class", y="ave_rating")
fig.show()

In [267]:
bar_df.columns

Index(['class', 'abv', 'astringency', 'body', 'bitter', 'sweet', 'sour',
       'fruits', 'hoppy', 'spices', 'style_key', 'ave_rating'],
      dtype='object')

In [268]:
fig = px.bar(bar_df, x="class", y=['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour',
       'fruits', 'hoppy', 'spices'])
fig.show()

In [269]:

fig = px.scatter_3d(combined_df, x='abv', y='style_key', z='malty',
              color='class')
fig.show()

ValueError: Value of 'z' is not the name of a column in 'data_frame'. Expected one of ['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour', 'fruits', 'hoppy', 'spices', 'class', 'beer_style', 'style_key', 'condensed_style', 'ave_rating'] but received: malty

In [116]:
# Plotting the clusters with three features
fig = px.scatter(combined_df, x="class", y="condensed_style", color="fruits", width=800)
# fig.update_layout(legend=dict(x=0,y=1))
fig.show()

### KNN

In [275]:
# Drop columns unnecessary for analysis
knn_df = df.drop(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'ave_rating', "min_ibu", "max_ibu", "salty", "alcohol",
        "avg_ibu"], axis=1)

In [276]:
knn_df

Unnamed: 0,abv,astringency,body,bitter,sweet,sour,fruits,hoppy,spices,malty,condensed_style
0,5.3,13,32,47,74,33,33,57,8,111,Altbier
1,7.2,12,57,33,55,16,24,35,12,84,Altbier
2,5.0,14,37,42,43,11,10,54,4,62,Altbier
3,8.5,13,55,47,101,18,49,40,16,119,Altbier
4,5.3,21,69,63,120,14,19,36,15,218,Altbier
...,...,...,...,...,...,...,...,...,...,...,...
5551,6.0,15,31,16,54,43,54,14,140,58,Winter Warmer
5552,6.9,22,73,58,87,23,48,91,44,104,Winter Warmer
5553,7.5,11,36,70,72,59,81,110,18,73,Winter Warmer
5554,8.0,6,64,57,78,15,28,57,23,129,Winter Warmer


In [277]:
knn_df.columns

Index(['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour', 'fruits',
       'hoppy', 'spices', 'malty', 'condensed_style'],
      dtype='object')

In [279]:
# Dependent value/value we are trying to predict = style
y = df["condensed_style"]
X = df[['abv', 'astringency', 'body', 'bitter', 'sweet', 'sour', 'fruits',
       'hoppy', 'spices', 'malty']]

In [283]:
# Split into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [284]:
# Scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [290]:
# Instatiate KNN class and train model
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [291]:
# View results
y_pred = classifier.predict(X_test)
# print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 1  0  1 ...  0  0  0]
 [ 1 11  0 ...  0  0  0]
 [ 2  0 11 ...  0  0  0]
 ...
 [ 0  4  0 ... 37  1  0]
 [ 0  0  0 ...  0  2  0]
 [ 0  0  0 ...  0  0  4]]
                                 precision    recall  f1-score   support

                        Altbier       0.08      0.20      0.12         5
                     Barleywine       0.33      0.65      0.44        17
                         Bitter       0.48      0.48      0.48        23
Bière de Champagne / Bière Brut       0.09      0.20      0.13         5
                     Blonde Ale       0.19      0.41      0.26        17
                           Bock       0.28      0.39      0.33        51
                        Braggot       0.00      0.00      0.00         4
                     Brett Beer       0.17      0.08      0.11        12
                      Brown Ale       0.37      0.39      0.38        28
 California Common / Steam Beer       0.18      0.29      0.22         7
                     Chile Beer       0.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

