In [91]:
# Import Dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

### Preprocessing data

In [4]:
# Read the beer CSV into a DataFrame; the path is relative to the working directory
df = pd.read_csv(filepath_or_buffer="csv_files/beer_info_from_db.csv")

In [5]:
# Preview the first five rows of the loaded table
df.head(5)

Unnamed: 0,beer_id,beer_name,beer_style,style_key,brewery,description,abv,ave_rating,min_ibu,max_ibu,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty
0,1,Amber,Altbier,8,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,3.65,25,50,13,32,9,47,74,33,0,33,57,8,111
1,2,Double Bag,Altbier,8,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,3.9,25,50,12,57,18,33,55,16,0,24,35,12,84
2,3,Long Trail Ale,Altbier,8,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,3.58,25,50,14,37,6,42,43,11,0,10,54,4,62
3,4,Doppelsticke,Altbier,8,Uerige Obergärige Hausbrauerei,Notes:,8.5,4.15,25,50,13,55,31,47,101,18,1,49,40,16,119
4,5,Scurry,Altbier,8,Off Color Brewing,Notes:Just cause it's dark and German doesn't ...,5.3,3.67,25,50,21,69,10,63,120,14,0,19,36,15,218


In [6]:
# Add avg_ibu: the midpoint between the min_ibu and max_ibu columns
df["avg_ibu"] = (df["min_ibu"] + df["max_ibu"]) / 2

In [11]:
# Collect every row's style label, then de-duplicate to get the distinct styles
beer_styles = df["beer_style"].to_list()
beer_styles_unique = set(beer_styles)
# Row count on the first line, distinct-style count on the second
print(len(beer_styles), len(beer_styles_unique), sep="\n")

5556
112


In [13]:
# Calculate number of items for each style.
# All labels are tallied in a single pass instead of rescanning the whole
# list once per unique style; the printed lines are unchanged.
style_counts = pd.Series(beer_styles, dtype="object").value_counts()
for item in beer_styles_unique:
    # .get falls back to 0 for labels value_counts leaves out (it drops NaN),
    # which is also what the equality-based scan reported for them.
    print(item, style_counts.get(item, 0))

Porter - English 50
Scottish Ale 50
Lambic - Faro 16
Lager - Malt Liquor 50
Brown Ale - English 50
Stout - English 50
Wheat Beer - Hefeweizen 50
Low Alcohol Beer 50
Strong Ale - American 50
Brown Ale - Belgian Dark 50
Fruit and Field Beer 50
Lambic - Gueuze 49
Porter - Imperial 50
Barleywine - American 50
Sour - Flanders Red Ale 49
Wheat Beer - Wheatwine 50
Lager - European / Dortmunder Export 50
Pale Ale - Belgian 50
Stout - American Imperial 50
Lager - Light 50
Blonde Ale - Belgian 50
Pale Ale - English 50
Strong Ale - Belgian Pale 50
Lager - European Pale 49
Cream Ale 50
Lager - European Strong 50
Sour - Flanders Oud Bruin 49
Kölsch 50
Stout - Irish Dry 50
Sour - Berliner Weisse 49
Porter - Smoked 50
Lager - Märzen / Oktoberfest 50
Wheat Beer - Dunkelweizen 50
Bitter - English 50
Stout - Russian Imperial 50
Old Ale 50
Lager - European Dark 50
Sour - Gose 49
Wheat Beer - American Pale 50
Red Ale - Irish 50
Bock - Weizenbock 50
Brett Beer 50
Herb and Spice Beer 50
Red Ale - American A

In [14]:
# Broad style families used to collapse the detailed beer_style labels.
# Order matters: reduce_styles returns the first family found in a label.
beer_styles_condensed = [
    "IPA",
    "Porter",
    "Lager",
    "Pale Ale",
    "Pilsner",
    "Stout",
    "Wheat Beer",
    "Bock",
    "Blonde Ale",
    "Sour",
    "Lambic",
    "Brown Ale",
    "Barleywine",
    "Strong Ale",
    "Farmhouse Ale",
    "Bitter",
    "Red Ale",
]

In [15]:
# Function to reduce styles

def reduce_styles(style, condensed_styles=None):
    """Map a detailed beer style label onto its broad family name.

    Parameters
    ----------
    style : str
        Full style label, e.g. "Stout - Irish Dry".
    condensed_styles : iterable of str, optional
        Family names to look for, in priority order. Defaults to the
        notebook-level ``beer_styles_condensed`` list.

    Returns
    -------
    The first family name that occurs as a substring of ``style``. ``style``
    itself is returned unchanged when no family matches, when the family list
    is empty, or when ``style`` is not a string (for example a missing value).
    """
    if condensed_styles is None:
        condensed_styles = beer_styles_condensed
    # Non-string labels (NaN for a missing style) cannot be substring-matched;
    # pass them through instead of raising TypeError.
    if not isinstance(style, str):
        return style
    for family in condensed_styles:
        if family in style:
            return family
    # Falling out of the loop covers both "no match" and an empty family list,
    # where the previous count-based check returned None.
    return style

In [16]:
# Collapse each detailed beer_style label into its broad family
df["condensed_style"] = df["beer_style"].apply(reduce_styles)

In [17]:
# Number of distinct condensed styles: a set keeps one copy of each label
condensed_styles = set(df["condensed_style"].to_list())
len(condensed_styles)

45

In [18]:
# Display the set of condensed style labels
condensed_styles

{'Altbier',
 'Barleywine',
 'Bitter',
 'Bière de Champagne / Bière Brut',
 'Blonde Ale',
 'Bock',
 'Braggot',
 'Brett Beer',
 'Brown Ale',
 'California Common / Steam Beer',
 'Chile Beer',
 'Cream Ale',
 'Dubbel',
 'Farmhouse Ale',
 'Fruit and Field Beer',
 'Gruit / Ancient Herbed Ale',
 'Happoshu',
 'Herb and Spice Beer',
 'IPA',
 'Kvass',
 'Kölsch',
 'Lager',
 'Lambic',
 'Low Alcohol Beer',
 'Mild Ale - English Dark',
 'Mild Ale - English Pale',
 'Old Ale',
 'Pale Ale',
 'Pilsner',
 'Porter',
 'Pumpkin Beer',
 'Quadrupel (Quad)',
 'Red Ale',
 'Rye Beer',
 'Rye Beer - Roggenbier',
 'Scotch Ale / Wee Heavy',
 'Scottish Ale',
 'Smoked Beer',
 'Sour',
 'Stout',
 'Strong Ale',
 'Tripel',
 'Wheat Beer',
 'Wild Ale',
 'Winter Warmer'}

### K-Means

In [50]:
# List the columns currently in df (including the added avg_ibu and condensed_style)
df.columns

Index(['beer_id', 'beer_name', 'beer_style', 'style_key', 'brewery',
       'description', 'abv', 'ave_rating', 'min_ibu', 'max_ibu', 'astringency',
       'body', 'alcohol', 'bitter', 'sweet', 'sour', 'salty', 'fruits',
       'hoppy', 'spices', 'malty', 'avg_ibu', 'condensed_style'],
      dtype='object')

In [92]:
# Build the clustering frame by removing identifiers, free text, the rating,
# the IBU fields, the condensed label and the salty/bitter/alcohol scores
columns_to_drop = [
    "beer_id", "beer_name", "beer_style", "style_key", "brewery",
    "description", "ave_rating", "min_ibu", "max_ibu", "salty", "bitter",
    "alcohol", "condensed_style", "avg_ibu",
]
new_df = df.drop(columns=columns_to_drop)

In [93]:
# Preview the first five rows of the clustering features
new_df.head(5)

Unnamed: 0,abv,astringency,body,sweet,sour,fruits,hoppy,spices,malty
0,5.3,13,32,74,33,33,57,8,111
1,7.2,12,57,55,16,24,35,12,84
2,5.0,14,37,43,11,10,54,4,62
3,8.5,13,55,101,18,49,40,16,119
4,5.3,21,69,120,14,19,36,15,218


In [101]:
# Standardize the clustering features with StandardScaler
scaler = StandardScaler()
# Learn the scaling parameters from new_df, then apply them to the same frame
scaled_df = scaler.fit(new_df).transform(new_df)

In [102]:
# Looking for the best K on the scaled features: fit K-Means for k = 1..14
# and record each fitted model's inertia.
# (An earlier variant ran the same loop on the unscaled new_df.)
k = list(range(1, 15))
inertia = []

for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(scaled_df)
    inertia.append(km.inertia_)


In [103]:
# Tabulate inertia against k and draw the elbow curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(title="Elbow Curve", x="k", y="inertia", xticks=k)

In [104]:
# Initializing model with K = 10 (fixed random_state so the clustering is repeatable)
model = KMeans(n_clusters=10, random_state=5)
model

KMeans(n_clusters=10, random_state=5)

In [105]:
# Fitting model
# NOTE(review): the elbow search was run on scaled_df, but the model is fit on
# the unscaled new_df — confirm that clustering on raw feature values is intended.
model.fit(new_df)

KMeans(n_clusters=10, random_state=5)

In [106]:
# Get the predictions
# Assigns each row of new_df (the same frame the model was fit on) to a cluster
predictions = model.predict(new_df)
print(predictions)

[6 6 5 ... 0 6 7]


In [107]:
# Add a new class column to the df
# NOTE(review): this writes the labels into new_df in place, and new_df is also
# the input to scaler.fit_transform and model.fit — re-running those cells
# afterwards would treat "class" as a feature unless new_df is rebuilt first.
new_df["class"] = model.labels_
new_df.head()

Unnamed: 0,abv,astringency,body,sweet,sour,fruits,hoppy,spices,malty,class
0,5.3,13,32,74,33,33,57,8,111,6
1,7.2,12,57,55,16,24,35,12,84,6
2,5.0,14,37,43,11,10,54,4,62,5
3,8.5,13,55,101,18,49,40,16,119,6
4,5.3,21,69,120,14,19,36,15,218,3


In [108]:
# Style labels and rating to re-attach to the clustered features
style_columns = ["beer_style", "style_key", "condensed_style", "ave_rating"]
style_df = df[style_columns]

In [109]:
# Index-aligned left join: clustered features plus the style/rating columns
combined_df = new_df.join(style_df, how="left")

In [110]:
# Inspect the first 50 rows of the combined frame
combined_df.iloc[:50]

Unnamed: 0,abv,astringency,body,sweet,sour,fruits,hoppy,spices,malty,class,beer_style,style_key,condensed_style,ave_rating
0,5.3,13,32,74,33,33,57,8,111,6,Altbier,8,Altbier,3.65
1,7.2,12,57,55,16,24,35,12,84,6,Altbier,8,Altbier,3.9
2,5.0,14,37,43,11,10,54,4,62,5,Altbier,8,Altbier,3.58
3,8.5,13,55,101,18,49,40,16,119,6,Altbier,8,Altbier,4.15
4,5.3,21,69,120,14,19,36,15,218,3,Altbier,8,Altbier,3.67
5,7.2,25,51,45,9,11,51,20,95,6,Altbier,8,Altbier,3.78
6,6.0,22,45,62,25,34,60,4,103,6,Altbier,8,Altbier,4.1
7,5.3,28,40,58,29,36,54,8,97,6,Altbier,8,Altbier,3.46
8,5.0,18,49,73,22,21,37,4,98,6,Altbier,8,Altbier,3.6
9,4.8,25,35,39,13,8,60,16,97,6,Altbier,8,Altbier,4.1


In [111]:
# Median of every numeric column per condensed style, ordered by the median
# cluster label. numeric_only=True keeps the text column (beer_style) out of
# the aggregation; pandas 2.x raises TypeError instead of silently dropping it.
(
    combined_df
    .groupby("condensed_style")
    .median(numeric_only=True)
    .sort_values("class")
)

Unnamed: 0_level_0,abv,astringency,body,sweet,sour,fruits,hoppy,spices,malty,class,style_key,ave_rating
condensed_style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Bitter,5.0,15.5,47.5,55.0,30.0,30.0,81.0,9.0,86.0,0.0,40.5,3.73
Rye Beer,6.5,16.5,41.5,46.0,32.5,40.0,69.5,29.0,117.0,0.5,85.0,3.875
Pale Ale,5.45,21.0,36.0,45.0,47.0,53.5,68.5,8.0,54.0,1.0,49.0,3.78
Blonde Ale,5.3,21.0,34.0,40.0,44.0,47.0,48.0,14.0,49.0,1.0,42.5,3.64
Tripel,9.0,18.5,35.5,71.0,51.5,72.0,29.0,35.5,39.0,1.0,107.0,3.945
IPA,7.1,18.0,38.0,46.0,51.0,67.0,86.0,6.0,40.0,1.0,35.0,4.015
Fruit and Field Beer,5.1,13.5,27.0,86.5,50.5,92.0,14.5,4.0,33.0,1.5,77.0,3.52
Red Ale,6.66,14.0,42.0,62.0,28.0,33.0,69.0,6.0,89.0,3.5,52.0,3.79
Pilsner,5.3,27.0,29.0,25.0,21.0,20.0,78.5,9.0,61.0,4.0,65.0,3.715
Mild Ale - English Pale,4.25,3.5,7.0,6.5,3.5,2.0,12.0,1.0,13.0,4.0,47.0,3.575


In [112]:
# Mean of every numeric column per cluster, with "class" kept as a regular column.
# numeric_only=True keeps the text columns (beer_style, condensed_style) out of
# the aggregation; pandas 2.x raises TypeError instead of silently dropping them.
# groupby already orders the groups by key, so no extra sort or in-place
# reset_index is needed.
bar_df = combined_df.groupby("class", as_index=False).mean(numeric_only=True)
bar_df

Unnamed: 0,class,abv,astringency,body,sweet,sour,fruits,hoppy,spices,malty,style_key,ave_rating
0,0,6.223448,22.416537,43.726989,47.204368,38.168487,43.613105,95.556942,11.75819,80.287051,52.839314,3.808128
1,1,6.583111,17.849573,31.8,48.728205,53.702564,63.528205,30.471795,23.887179,42.512821,79.993162,3.769026
2,2,7.173396,27.093284,41.186567,61.33209,80.86194,111.350746,86.809701,16.906716,42.925373,51.88806,4.113955
3,3,8.086048,12.159389,101.303493,85.172489,13.79476,20.637555,30.456332,23.633188,151.875546,71.674672,4.057162
4,4,6.037757,4.224321,7.438017,8.899646,6.98229,6.717828,7.193625,4.64817,12.132231,64.945691,3.587898
5,5,5.514277,16.318627,30.276961,32.714461,14.82598,15.971814,38.870098,7.800245,57.334559,53.338235,3.380098
6,6,6.364224,14.001109,58.578714,64.774945,15.761641,20.370288,38.133038,13.095344,106.396896,50.284922,3.749889
7,7,6.936125,11.154167,53.029167,47.404167,14.1625,30.675,21.816667,98.654167,73.6125,67.554167,3.781292
8,8,9.325481,12.879195,55.812081,120.342282,41.899329,74.024609,28.364653,24.42953,93.111857,70.190157,3.941812
9,9,6.279034,36.855114,33.946023,73.667614,153.741477,104.744318,20.039773,11.775568,32.446023,111.34375,4.084119


In [113]:
# Bar chart of the mean ave_rating of each cluster
fig = px.bar(data_frame=bar_df, x="class", y="ave_rating")
fig.show()

In [115]:
# Bar chart of the mean value of each clustering feature per cluster
fig = px.bar(
    bar_df,
    x="class",
    y=[
        "abv", "astringency", "body", "sweet", "sour",
        "fruits", "hoppy", "spices", "malty",
    ],
)
fig.show()

In [86]:
# Scatter of cluster label against condensed style, colored by the "fruits" score
fig = px.scatter(
    combined_df,
    x="class",
    y="condensed_style",
    color="fruits",
    width=800,
)
fig.show()

### Neural Net