In [294]:
# Import Dependencies
import sqlite3
import csv
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

In [2]:
# Display all columns
pd.set_option('display.max_columns', None)
# Do not display false positive warnings
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Connect to database
connection = sqlite3.connect('SQL/beer.sqlite')

In [46]:
# Read database table to dataframe
df = pd.read_sql_query("SELECT * from all_scraped", connection)

In [47]:
df.drop(columns=["index"], inplace=True)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3518 entries, 0 to 3517
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beer_name       3517 non-null   object 
 1   beer_style      3518 non-null   object 
 2   brewery         3518 non-null   object 
 3   abv             3518 non-null   float64
 4   rating          3518 non-null   object 
 5   astringency     3518 non-null   int64  
 6   body            3518 non-null   int64  
 7   alcohol         3518 non-null   int64  
 8   bitter          3518 non-null   int64  
 9   sweet           3518 non-null   int64  
 10  sour            3518 non-null   int64  
 11  salty           3518 non-null   int64  
 12  fruits          3518 non-null   int64  
 13  hoppy           3518 non-null   int64  
 14  spices          3518 non-null   int64  
 15  malty           3518 non-null   int64  
 16  sum_of_factors  3518 non-null   int64  
 17  BA_Big_styles   3518 non-null   o

In [49]:
df.dropna(subset=["beer_name"], inplace=True)

In [50]:
# Filter out beers with abv > 13
original_length = len(df)
df = df[df.abv <= 13]
df = df[df.abv >= 3]
new_length = len(df)
print(f'Dropped {original_length - new_length} rows')
print(f'New length: {new_length}')

Dropped 82 rows
New length: 3435


In [51]:
# Check for and remove duplicate beers
original_length = len(df)
df["beer_brewary"] = df.beer_name + df.brewery
df.drop_duplicates(subset=['beer_brewary'], inplace=True)
df.drop(columns = ["beer_brewary"], inplace=True)
new_length = len(df)
print(f'Dropped {original_length - new_length} rows')
print(f'New length: {new_length}')

Dropped 0 rows
New length: 3435


In [52]:
# Filter for entries with over X cumulative factors
original_length = len(df)
df = df[df['sum_of_factors'] > 50]
new_length = len(df)
print(f'Dropped {original_length - new_length} rows')
print(f'New length: {new_length}')

Dropped 0 rows
New length: 3435


In [53]:
df.columns

Index(['beer_name', 'beer_style', 'brewery', 'abv', 'rating', 'astringency',
       'body', 'alcohol', 'bitter', 'sweet', 'sour', 'salty', 'fruits',
       'hoppy', 'spices', 'malty', 'sum_of_factors', 'BA_Big_styles'],
      dtype='object')

### K-Means
First k-means clustering

#### Testing Standard Scaler

In [107]:
# Drop columns unnecessary for analysis
new_df = df.drop(['beer_name', 'beer_style', 'brewery','rating', 'sum_of_factors', 'BA_Big_styles'], axis=1)

In [108]:
# define standard scaler
scaler = StandardScaler()
# transform data
scaled_df = scaler.fit_transform(new_df)

In [109]:
# Create Elbow Plot
inertia = []
k = list(range(1, 21))

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(scaled_df)
    inertia.append(km.inertia_)
    
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [110]:
silhouette_avg = []
for num_clusters in range(2, 20):

    # Produce model
    model = KMeans(n_clusters=num_clusters)
    model.fit(scaled_df)
    cluster_labels = model.labels_

    # Get silhoutte score
    silhouette_avg.append(silhouette_score(scaled_df, cluster_labels))
print(f'Silhoutte average: {silhouette_avg}')

# Number of clusters for subclass will be number of clusters with greatest silhoutte score
max_silhouette_avg = max(silhouette_avg)
max_index = silhouette_avg.index(max_silhouette_avg)
k = max_index + 1
print("\n")
print(f'Best K: {k}; Max Silhoutte: {max_silhouette_avg}')

Silhoutte average: [0.19770585092600845, 0.20798784282643829, 0.2302690485016846, 0.2417168762041553, 0.2404942356537703, 0.255823522385476, 0.24125742185719312, 0.22626764865817153, 0.22634200144831187, 0.23153472663868033, 0.2280897799321267, 0.22280356709990623, 0.2182782727626049, 0.21348620588529446, 0.20973262224934655, 0.21190101356179314, 0.2110775778435017, 0.20601095413169634]


Best K: 6; Max Silhoutte: 0.255823522385476


#### Testing Min/Max Scaler

In [111]:
# Drop columns unnecessary for analysis
minmax_df = df.drop(['beer_name', 'beer_style', 'brewery','rating', 'sum_of_factors', 'BA_Big_styles', 'salty'], axis=1)

In [112]:
# define min max scaler
scaler = MinMaxScaler()
# transform data
minmax_scaled_df = scaler.fit_transform(minmax_df)

In [113]:
# Create Elbow Plot
inertia = []
k = list(range(1, 21))

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(minmax_scaled_df)
    inertia.append(km.inertia_)
    
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [114]:
silhouette_avg = []
for num_clusters in range(2, 20):

    # Produce model
    model = KMeans(n_clusters=num_clusters)
    model.fit(minmax_scaled_df)
    cluster_labels = model.labels_

    # Get silhoutte score
    silhouette_avg.append(silhouette_score(minmax_scaled_df, cluster_labels))
print(f'Silhoutte average: {silhouette_avg}')

# Number of clusters for subclass will be number of clusters with greatest silhoutte score
max_silhouette_avg = max(silhouette_avg)
max_index = silhouette_avg.index(max_silhouette_avg)
k = max_index + 1
print("\n")
print(f'Best K: {k}; Max Silhoutte: {max_silhouette_avg}')

Silhoutte average: [0.23684902922946238, 0.2516871766934433, 0.2752099378970983, 0.25626194952454195, 0.2427804276897948, 0.24287447451538074, 0.20808924700188086, 0.21551098731369178, 0.21246402976231243, 0.21554851545269973, 0.2208939597243931, 0.22434273072991506, 0.21749177095413275, 0.2225272546740516, 0.20763015662033954, 0.21930474651208576, 0.2249258051175948, 0.2015995676281441]


Best K: 3; Max Silhoutte: 0.2752099378970983


#### Running Model with Min/Max Scaler

In [115]:
# Initializing model 
clusters = 3
model = KMeans(n_clusters=clusters)

# Fit model
model.fit(minmax_scaled_df)

# Get predictions
predictions = model.predict(minmax_scaled_df)

# Add a new class column to the df
new_df["class"] = model.labels_
new_df.head()

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class
0,6.7,23,144,43,70,189,37,2,61,45,27,23,1
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0


In [116]:
# Select columns from original df to add to newdf
combined_df = new_df.join(df[["beer_style", "rating", "BA_Big_styles", 'beer_name', 'brewery']])

In [117]:
# View dataframe grouped by major style
combined_df.groupby(["BA_Big_styles"]).median().sort_values(["class"])

Unnamed: 0_level_0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class
BA_Big_styles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Bock,7.45,30.0,118.0,52.0,60.5,166.0,48.0,0.0,62.5,58.0,21.5,30.0,0
Stouts,8.1,28.5,217.0,44.5,140.0,135.0,26.0,0.0,24.0,66.0,26.5,28.5,0
Strong Ales,9.15,30.0,111.0,82.0,50.0,180.0,90.0,0.0,111.0,64.0,47.0,30.0,0
Brown Ales,5.6,29.0,122.0,20.0,80.0,135.0,30.0,0.0,28.0,81.0,14.0,29.0,1
Dark Ales,7.0,28.0,107.0,46.5,53.5,147.0,53.0,0.0,61.0,61.5,52.5,28.0,1
Dark Lagers,5.4,37.0,90.0,14.0,72.0,121.0,32.0,1.0,31.0,90.0,18.0,37.0,1
Hybrid Beers,5.5,36.0,107.0,19.0,64.0,98.0,60.0,1.0,52.0,85.0,16.0,36.0,1
Pale Ales,5.5,46.0,83.0,19.0,100.0,107.0,101.0,1.0,100.0,145.5,16.0,46.0,1
Pale Lagers,5.0,48.0,66.0,14.0,71.0,60.0,31.0,2.0,28.0,110.0,10.0,48.0,1
Porters,6.5,27.0,190.0,25.0,121.0,127.0,24.0,0.0,21.0,65.0,22.5,27.0,1


In [184]:
# Create DF for barcharts
bar_df = combined_df.groupby(["class"]).mean().sort_values(["class"])
bar_df.reset_index(inplace=True)
bar_df

Unnamed: 0,class,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty
0,0,9.600788,27.973747,149.377088,87.594272,83.99284,176.9642,68.231504,0.954654,88.315036,68.516706,60.917661,27.973747
1,1,5.494582,38.394068,106.729419,20.62954,85.723366,104.999395,54.02724,1.973971,58.182809,94.599879,32.394068,38.394068
2,2,6.854921,60.359788,90.961905,29.288889,115.750265,132.740741,198.619048,3.226455,194.456085,163.385185,24.862434,60.359788


In [181]:
bar_df["class"] = bar_df["class"].to_string()

In [183]:
bar_df

Unnamed: 0,class,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty
0,0 0\n1 1\n2 2,9.600788,27.973747,149.377088,87.594272,83.99284,176.9642,68.231504,0.954654,88.315036,68.516706,60.917661,27.973747
1,0 0\n1 1\n2 2,5.494582,38.394068,106.729419,20.62954,85.723366,104.999395,54.02724,1.973971,58.182809,94.599879,32.394068,38.394068
2,0 0\n1 1\n2 2,6.854921,60.359788,90.961905,29.288889,115.750265,132.740741,198.619048,3.226455,194.456085,163.385185,24.862434,60.359788


In [185]:
fig = px.bar(bar_df, x="class", y=['abv', 'astringency', 'body', 'alcohol', 'bitter', 'sweet',
       'sour', 'fruits', 'hoppy', 'spices', 'malty'])
fig.show()

### K-Means on Classes

In [120]:
# Create copy of DF to work on

kmeans_df = combined_df.copy()

In [121]:
# Drop columns unnecessary for analysis
df_for_iteration = kmeans_df.drop(['beer_name', 'beer_style', 'brewery',
       'rating', "BA_Big_styles"], axis=1)

In [122]:
df_for_iteration

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class
0,6.7,23,144,43,70,189,37,2,61,45,27,23,1
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513,8.4,100,67,30,8,203,432,0,230,28,17,100,2
3514,6.3,89,73,25,6,117,335,0,142,18,21,89,2
3515,9.5,75,59,32,25,186,390,0,260,13,17,75,2
3516,6.0,101,74,20,8,252,359,4,243,10,6,101,2


In [123]:
try:

    del df_with_subclasses
    
except:
    print("dataframe empty already")
    
created_new_df = False

for k_class in range(0,clusters):
    
    # Filter for class and drop class column
    temp_df = df_for_iteration[df_for_iteration["class"] == k_class]
    temp_df.drop(columns = ["class"], inplace = True)
    
    # Subclass will be in columns after the first iteration
    if "subclass" in temp_df.columns:
        temp_df.drop(columns = ["subclass"], inplace = True)
    
    # define standard scaler
    scaler = StandardScaler()
    # transform data
    scaled_df = scaler.fit_transform(temp_df)
    
    silhouette_avg = []
    for num_clusters in range(2,16):
        
        # Produce model
        model = KMeans(n_clusters=num_clusters)
        model.fit(scaled_df)
        cluster_labels = model.labels_
        
        # Get silhoutte score
        silhouette_avg.append(silhouette_score(scaled_df, cluster_labels))
    
#     print(f'Silhoutte score for class {k_class}: {silhouette_avg}')
    
    # Number of clusters for subclass will be number of clusters with greatest silhoutte score
    max_silhouette_avg = max(silhouette_avg)
    max_index = silhouette_avg.index(max_silhouette_avg)
    k = max_index + 2
    
    # Run model with highest silhoutte score
    model = KMeans(n_clusters=k)
    model.fit(scaled_df)
    cluster_labels = model.labels_
    
    temp_df["class"] = k_class
    temp_df["subclass"] = cluster_labels
    
    print(f'Class: {k_class}; Number of Subclasses: {k}; Max Silhoutte: {max_silhouette_avg}')
        
    if created_new_df == False:
        df_with_subclasses = temp_df.copy()
        created_new_df = True
        
    else:
        df_with_subclasses = df_with_subclasses.append(temp_df)

print("Finished")

Class: 0; Number of Subclasses: 7; Max Silhoutte: 0.2171705999628716
Class: 1; Number of Subclasses: 8; Max Silhoutte: 0.23548653887775786
Class: 2; Number of Subclasses: 2; Max Silhoutte: 0.2630157434057
Finished


In [124]:
# Number of beers per class
df_with_subclasses['class'].value_counts()

1    1652
2     945
0     838
Name: class, dtype: int64

In [125]:
# Number of beers per subclass
df_with_subclasses["class_subclass"] = df_with_subclasses["class"].astype(str) + df_with_subclasses["subclass"].astype(str)
df_with_subclasses['class_subclass'].value_counts()

20    627
15    449
21    318
04    258
11    255
10    231
12    219
13    217
14    189
03    153
01    152
02    122
17     79
05     79
00     72
16     13
06      2
Name: class_subclass, dtype: int64

In [126]:
# Combine results with relevant data
results_df = df_with_subclasses.join(df[["beer_style", "rating", "BA_Big_styles", 'beer_name', 'brewery']])

In [127]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3435 entries, 1 to 3517
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   abv             3435 non-null   float64
 1   astringency     3435 non-null   int64  
 2   body            3435 non-null   int64  
 3   alcohol         3435 non-null   int64  
 4   bitter          3435 non-null   int64  
 5   sweet           3435 non-null   int64  
 6   sour            3435 non-null   int64  
 7   salty           3435 non-null   int64  
 8   fruits          3435 non-null   int64  
 9   hoppy           3435 non-null   int64  
 10  spices          3435 non-null   int64  
 11  malty           3435 non-null   int64  
 12  class           3435 non-null   int64  
 13  subclass        3435 non-null   int32  
 14  class_subclass  3435 non-null   object 
 15  beer_style      3435 non-null   object 
 16  rating          3435 non-null   object 
 17  BA_Big_styles   3435 non-null   o

In [128]:
# Make a lower case beer name column, easier for searching
results_df["beer_name_lower"] = results_df.beer_name.apply(lambda x: x.lower())

In [129]:
results_df.head()

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,class_subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0,4,4,Bock - Doppelbock,3.97,Bock,Troegenator,Tröegs Brewing Company,troegenator
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0,4,4,Bock - Doppelbock,3.92,Bock,Spaten Optimator,Spaten-Franziskaner-Bräu,spaten optimator
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0,4,4,Bock - Doppelbock,3.96,Bock,Salvator,Paulaner Brauerei,salvator
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0,4,4,Bock - Doppelbock,4.21,Bock,Korbinian,Bayerische Staatsbrauerei Weihenstephan,korbinian
6,9.5,14,123,86,47,231,39,0,64,51,18,14,0,4,4,Bock - Doppelbock,3.96,Bock,Samuel Adams Double Bock (Imperial Series),Boston Beer Company (Samuel Adams),samuel adams double bock (imperial series)


### App Test

In [180]:
# Beer input
user_input = input("Input a beer: ")
input_beer = user_input.lower()
# input_beer = "Devil's Milk"
input_beer = input_beer.lower() # can remove when switching to manual input

number_to_recommend = 10

if input_beer not in results_df["beer_name_lower"].to_list():
    print("Hmm, that one isn't in the table, only good beers are in this database.")
    
else:
    print("Woo, that beer is in the table, good job!\n")
    
    duplicates = results_df[results_df.beer_name_lower == input_beer]
    if len(duplicates) > 1:
        duplicate_breweries = duplicates.brewery.values.tolist()
        print(f'There are multiple "{user_input}" beers; what brewery is it from?')
        for i in duplicate_breweries:
            print(i)
        input_brewery = input("Copy/Type the brewery here: ")
        
        while input_brewery not in duplicate_breweries:
            print("\n")
            print("Please input a correct brewery")
            print(f'There are multiple "{user_input}" beers; what brewery is it from?')
            for i in duplicate_breweries:
                print(i)
            input_brewery = input("Copy/Type the brewery here: ")
        
        # Get the input beer class and subclass
        
        beer_class = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["class"].item()
        
        beer_subclass = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["subclass"].item()
        print(f'Beer Class: {beer_class}')
        print(f'Beer Subclass: {beer_subclass}')
        
    else:
    
        # Get the input beer class and subclass
        beer_class = results_df.loc[results_df["beer_name_lower"] == input_beer]["class"].item()
        beer_subclass = results_df.loc[results_df["beer_name_lower"] == input_beer]["subclass"].item()
        input_brewery = results_df.loc[results_df["beer_name_lower"] == input_beer]["brewery"].item()
        print(f'Beer Class: {beer_class}')
        print(f'Beer Subclass: {beer_subclass}')
    
    print('\n')
    
    # Get dataframe of beers in same class and subclass
    similar_df = results_df[(results_df["class"] == beer_class) & (results_df["subclass"] == beer_subclass)]
    similar_df.sort_values("rating", inplace=True, ascending=False)
    recommended_beers = similar_df.beer_name.to_list()
    recommended_brewery = similar_df.brewery.to_list()
    
    # Option 1: Return list of similar beers ranked by are rating
    
    if len(recommended_beers) <= number_to_recommend:
        print("Recommended Beers with similar taste profiles:")
        for x in range(0, number_to_recommend):
            print(f' {recommended_beers[x]} by {recommended_brewery[x]}')
     
    
    
    # Option 2: Calcualte differences from input beer from similar beers, return those with smallest differences
    else: 
        
        in_abv = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["abv"].item()
        in_astringency = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["astringency"].item()
        in_body = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["body"].item()
        in_bitter = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["bitter"].item()
        in_sweet = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["sweet"].item()
        in_sour = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["sour"].item()
        in_fruits = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["fruits"].item()
        in_hoppy = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["hoppy"].item()
        in_spices = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["spices"].item()
        in_malty = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["malty"].item()
        in_salty = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["salty"].item()
        in_alcohol = results_df.loc[
            (results_df["beer_name_lower"] == input_beer) & 
            (results_df["brewery"] == input_brewery)]["alcohol"].item()


        def calc_differences(abv, astringency, body, bitter, sweet, sour, fruits, hoppy, spices, malty, salty, alcohol):
                value = (abv - in_abv) + (astringency - in_astringency) + (body - in_body) + (bitter - in_bitter) + (sweet - in_sweet) + (sour - in_sour) + (fruits - in_fruits) + (hoppy - in_hoppy) + (spices - in_spices) + (malty - in_malty)+ (salty - in_salty) + (alcohol - in_alcohol)
                value = abs(value)
                return value


        if len(similar_df) > 10:
            similar_df["difference"] = similar_df.apply(lambda row: calc_differences(row['abv'],
                        row['astringency'], row['body'], row['bitter'],
                        row['sweet'], row['sour'], row['fruits'], row['hoppy'],
                        row['spices'], row['malty'], row['salty'], row['alcohol']), axis = 1)
            similar_df.sort_values("difference", inplace=True, ascending=True)
            recommended_beers = similar_df.beer_name.to_list()
            recommended_brewery = similar_df.brewery.to_list()
            recommended_difference = similar_df.difference.to_list()

            print("Recommended Beers with similar taste profiles:\n")
            for x in range(0, number_to_recommend + 1): # starting at index 1 bc index 0 should be the input beer
                if recommended_beers[x].lower() != input_beer.lower():
                    print(f'{recommended_beers[x]} by {recommended_brewery[x]}; difference: {recommended_difference[x]}')

Input a beer: Pliny The Elder
Woo, that beer is in the table, good job!

Beer Class: 2
Beer Subclass: 0


Recommended Beers with similar taste profiles:

Northern Hemisphere Harvest Wet Hop IPA by Sierra Nevada Brewing Co.; difference: 0.29999999999999716
Double IPA by AleSmith Brewing Company; difference: 0.5
Citra Pale Ale by Hill Farmstead Brewery; difference: 0.5999999999999943
Nebuchadnezzar by Omnipollo; difference: 1.5
Double Dry Hopped Fort Point Pale Ale by Trillium Brewing Company; difference: 1.5999999999999943
Susan by Hill Farmstead Brewery; difference: 2.200000000000003
Dorado by Ballast Point Brewing Company; difference: 3.0
Knuckle Sandwich by Bootlegger's Brewery; difference: 4.0
Blast! by Brooklyn Brewery; difference: 4.399999999999999
Galactica by Clown Shoes; difference: 5.0


In [288]:
hist_df = results_df.sort_values("rating")
fig = px.histogram(hist_df, x="rating", nbins=15)
fig.show()

In [134]:
results_df.to_csv("bar.csv", index=False)

In [186]:
results_df.head()

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,class_subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0,4,4,Bock - Doppelbock,3.97,Bock,Troegenator,Tröegs Brewing Company,troegenator
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0,4,4,Bock - Doppelbock,3.92,Bock,Spaten Optimator,Spaten-Franziskaner-Bräu,spaten optimator
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0,4,4,Bock - Doppelbock,3.96,Bock,Salvator,Paulaner Brauerei,salvator
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0,4,4,Bock - Doppelbock,4.21,Bock,Korbinian,Bayerische Staatsbrauerei Weihenstephan,korbinian
6,9.5,14,123,86,47,231,39,0,64,51,18,14,0,4,4,Bock - Doppelbock,3.96,Bock,Samuel Adams Double Bock (Imperial Series),Boston Beer Company (Samuel Adams),samuel adams double bock (imperial series)


In [188]:
results_df.groupby("class").count()

Unnamed: 0_level_0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,subclass,class_subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838,838
1,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652,1652
2,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945,945


In [195]:
results_df.head()

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,class_subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0,4,4,Bock - Doppelbock,3.97,Bock,Troegenator,Tröegs Brewing Company,troegenator
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0,4,4,Bock - Doppelbock,3.92,Bock,Spaten Optimator,Spaten-Franziskaner-Bräu,spaten optimator
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0,4,4,Bock - Doppelbock,3.96,Bock,Salvator,Paulaner Brauerei,salvator
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0,4,4,Bock - Doppelbock,4.21,Bock,Korbinian,Bayerische Staatsbrauerei Weihenstephan,korbinian
6,9.5,14,123,86,47,231,39,0,64,51,18,14,0,4,4,Bock - Doppelbock,3.96,Bock,Samuel Adams Double Bock (Imperial Series),Boston Beer Company (Samuel Adams),samuel adams double bock (imperial series)


In [193]:
for_chart = results_df.groupby("class_subclass").count()

In [196]:
for_chart

Unnamed: 0_level_0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower
class_subclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72
1,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152,152
2,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122,122
3,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153
4,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258,258
5,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79
6,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
10,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231,231
11,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
12,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219


In [197]:
for_dict = for_chart["abv"]

In [199]:
test = for_dict.to_dict()

In [200]:
test

{'00': 72,
 '01': 152,
 '02': 122,
 '03': 153,
 '04': 258,
 '05': 79,
 '06': 2,
 '10': 231,
 '11': 255,
 '12': 219,
 '13': 217,
 '14': 189,
 '15': 449,
 '16': 13,
 '17': 79,
 '20': 627,
 '21': 318}

In [201]:
for_figure = results_df.copy()

In [203]:
def get_count(x):
    count = test[x]
    return count

In [204]:
for_figure["beer_count"] = for_figure.class_subclass.apply(lambda x: test[x])

In [205]:
for_figure

Unnamed: 0,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,class_subclass,beer_style,rating,BA_Big_styles,beer_name,brewery,beer_name_lower,beer_count
1,8.2,21,128,59,51,210,48,0,74,52,18,21,0,4,04,Bock - Doppelbock,3.97,Bock,Troegenator,Tröegs Brewing Company,troegenator,258
2,7.6,17,155,51,63,155,37,0,54,54,18,17,0,4,04,Bock - Doppelbock,3.92,Bock,Spaten Optimator,Spaten-Franziskaner-Bräu,spaten optimator,258
3,7.9,21,109,83,41,200,58,0,82,53,25,21,0,4,04,Bock - Doppelbock,3.96,Bock,Salvator,Paulaner Brauerei,salvator,258
4,7.4,30,155,41,62,188,56,1,84,52,21,30,0,4,04,Bock - Doppelbock,4.21,Bock,Korbinian,Bayerische Staatsbrauerei Weihenstephan,korbinian,258
6,9.5,14,123,86,47,231,39,0,64,51,18,14,0,4,04,Bock - Doppelbock,3.96,Bock,Samuel Adams Double Bock (Imperial Series),Boston Beer Company (Samuel Adams),samuel adams double bock (imperial series),258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513,8.4,100,67,30,8,203,432,0,230,28,17,100,2,1,21,Wild Ale,4.39,Wild/Sour,Cuvée De Castleton,Captain Lawrence Brewing Co.,cuvée de castleton,318
3514,6.3,89,73,25,6,117,335,0,142,18,21,89,2,1,21,Wild Ale,4.17,Wild/Sour,The Wild One,Bell's Brewery - Eccentric Café & General Store,the wild one,318
3515,9.5,75,59,32,25,186,390,0,260,13,17,75,2,1,21,Wild Ale,4.21,Wild/Sour,Figaro,Cascade Brewing / Raccoon Lodge & Brewpub,figaro,318
3516,6.0,101,74,20,8,252,359,4,243,10,6,101,2,1,21,Wild Ale,4.18,Wild/Sour,Raspberry,Upland Brewing Company,raspberry,318


In [262]:
df2 = pd.DataFrame(for_dict)

In [263]:
df2.rename(columns = ({"abv":"beer_count"}), inplace=True)

In [264]:
df2.reset_index(inplace=True)

In [265]:
df2["main_class"] = df2.class_subclass.apply(lambda x: x[0])

In [266]:
df2["subclass"] = df2.class_subclass.apply(lambda x: x[1])

In [267]:
df2["total"] = "Total"

In [268]:
df2["main_class"] = df2.main_class.apply(lambda x: "Class: " + x)

In [269]:
df2["subclass"] = df2.subclass.apply(lambda x: "Subclass: " + x)

In [270]:
df2

Unnamed: 0,class_subclass,beer_count,main_class,subclass,total
0,0,72,Class: 0,Subclass: 0,Total
1,1,152,Class: 0,Subclass: 1,Total
2,2,122,Class: 0,Subclass: 2,Total
3,3,153,Class: 0,Subclass: 3,Total
4,4,258,Class: 0,Subclass: 4,Total
5,5,79,Class: 0,Subclass: 5,Total
6,6,2,Class: 0,Subclass: 6,Total
7,10,231,Class: 1,Subclass: 0,Total
8,11,255,Class: 1,Subclass: 1,Total
9,12,219,Class: 1,Subclass: 2,Total


In [272]:
df2 = df2[["main_class", "subclass", "total", "beer_count"]]

In [274]:
df2

Unnamed: 0,main_class,subclass,total,beer_count
0,Class: 0,Subclass: 0,Total,72
1,Class: 0,Subclass: 1,Total,152
2,Class: 0,Subclass: 2,Total,122
3,Class: 0,Subclass: 3,Total,153
4,Class: 0,Subclass: 4,Total,258
5,Class: 0,Subclass: 5,Total,79
6,Class: 0,Subclass: 6,Total,2
7,Class: 1,Subclass: 0,Total,231
8,Class: 1,Subclass: 1,Total,255
9,Class: 1,Subclass: 2,Total,219


In [275]:
df2.to_csv("csv_files/df_for_sunburst.csv", index=False)

In [276]:
fig = px.sunburst(df2, path=['total','main_class', 'subclass'], values='beer_count')
fig.show()

In [300]:
bar_df2 = for_figure.groupby(["class_subclass"]).mean()
# bar_df2.drop(columns=["class"], inplace=True)
bar_df2.reset_index(inplace=True)
bar_df2

Unnamed: 0,class_subclass,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,beer_count
0,0,8.775,22.833333,148.722222,66.916667,68.527778,125.0,32.791667,2.111111,98.069444,44.125,215.680556,22.833333,0.0,0,72.0
1,1,11.002566,20.072368,170.342105,149.092105,67.282895,171.085526,39.973684,0.631579,57.190789,38.348684,57.565789,20.072368,0.0,1,152.0
2,2,9.282459,46.393443,98.934426,75.254098,41.721311,178.327869,136.385246,0.983607,144.139344,62.45082,65.02459,46.393443,0.0,2,122.0
3,3,9.692418,26.732026,234.555556,64.385621,168.686275,156.75817,32.398693,0.777778,35.764706,79.562092,36.496732,26.732026,0.0,3,153.0
4,4,8.93593,25.531008,117.53876,75.275194,48.372093,205.872093,75.984496,0.624031,102.244186,54.248062,43.972868,25.531008,0.0,4,258.0
5,5,10.125316,29.582278,126.392405,93.189873,148.772152,178.468354,92.974684,0.822785,109.746835,184.506329,23.151899,29.582278,0.0,5,79.0
6,6,10.25,36.5,155.5,54.5,46.5,168.5,98.0,43.5,74.0,23.5,39.5,36.5,0.0,6,2.0
7,10,5.573636,37.952381,94.268398,19.069264,129.727273,106.069264,70.575758,1.337662,68.4329,179.627706,20.91342,37.952381,1.0,0,231.0
8,11,6.088667,30.615686,201.235294,21.384314,152.658824,130.494118,24.866667,1.105882,22.384314,76.356863,24.34902,30.615686,1.0,1,255.0
9,12,5.135845,40.748858,80.187215,19.515982,32.753425,99.447489,125.260274,1.305936,143.506849,50.858447,50.616438,40.748858,1.0,2,219.0


In [305]:
bar_df2 = bar_df2[bar_df2["class"] == 0]

In [307]:
bar_df2

Unnamed: 0,class_subclass,abv,astringency,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty,class,subclass,beer_count
0,0,8.775,22.833333,148.722222,66.916667,68.527778,125.0,32.791667,2.111111,98.069444,44.125,215.680556,22.833333,0.0,0,72.0
1,1,11.002566,20.072368,170.342105,149.092105,67.282895,171.085526,39.973684,0.631579,57.190789,38.348684,57.565789,20.072368,0.0,1,152.0
2,2,9.282459,46.393443,98.934426,75.254098,41.721311,178.327869,136.385246,0.983607,144.139344,62.45082,65.02459,46.393443,0.0,2,122.0
3,3,9.692418,26.732026,234.555556,64.385621,168.686275,156.75817,32.398693,0.777778,35.764706,79.562092,36.496732,26.732026,0.0,3,153.0
4,4,8.93593,25.531008,117.53876,75.275194,48.372093,205.872093,75.984496,0.624031,102.244186,54.248062,43.972868,25.531008,0.0,4,258.0
5,5,10.125316,29.582278,126.392405,93.189873,148.772152,178.468354,92.974684,0.822785,109.746835,184.506329,23.151899,29.582278,0.0,5,79.0
6,6,10.25,36.5,155.5,54.5,46.5,168.5,98.0,43.5,74.0,23.5,39.5,36.5,0.0,6,2.0


In [308]:
fig = px.bar(bar_df2, x="class_subclass", y=['abv', 'astringency', 'body', 'alcohol', 'bitter', 'sweet',
       'sour', 'fruits', 'hoppy', 'spices', 'malty'])
fig.show()

NameError: name 'index' is not defined