In [3]:
# import dependencies
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
merged_nb_df = pd.read_csv("merged_nb_df.csv")
pd.set_option('display.max_columns', None)
merged_nb_df.head()

Unnamed: 0,size_l,og,fg,abv,ibu,color,boilsize,boiltime,boilgravity,efficiency,mashthickness,pitchrate,primarytemp,review_taste,beer_abv,sugarscale_Plato,sugarscale_Specific Gravity,brewmethod_All Grain,brewmethod_BIAB,brewmethod_Partial Mash,brewmethod_extract
0,18.93,1.082,1.013,9.1,0.0,4.1,21.58,60.0,0.0,72.0,0.0,0.0,0.0,4.103825,7.6,0.0,1.0,1.0,0.0,0.0,0.0
1,7.57,1.076,1.014,8.1,33.51,3.68,8.71,60.0,1.066,75.0,0.0,0.0,0.0,4.103825,7.6,0.0,1.0,1.0,0.0,0.0,0.0
2,20.82,1.043,1.011,4.14,13.22,17.67,28.39,90.0,0.0,85.0,0.0,1.75,12.22,3.028652,4.4,0.0,1.0,1.0,0.0,0.0,0.0
3,22.71,1.069,1.017,6.75,58.76,40.0,29.34,90.0,1.053,70.0,0.0,0.0,0.0,4.265971,6.4,0.0,1.0,1.0,0.0,0.0,0.0
4,22.71,1.068,1.018,6.53,73.4,40.0,27.44,90.0,1.056,65.0,1.3,1.25,18.89,4.265971,6.4,0.0,1.0,1.0,0.0,0.0,0.0


In [5]:
# What percentage of beers scored 4 or above?
len(merged_nb_df[(merged_nb_df["review_taste"]>=4)]) / len(merged_nb_df) * 100

34.77543538038497

In [6]:
# Since beers with a rating of four or above is the top 34% of beers, let's go with those as "good" beers.
# We need a new category "good beer" with a 1 for good and 0 for bad
merged_nb_df["good_beer"]=0

In [7]:
# for all the "good beers", put a 1 in the good_beer column
merged_nb_df.loc[merged_nb_df["review_taste"].abs()>=4, "good_beer"] = 1

In [8]:
# Drop rows where IBU is listed as above 125
merged_nb_df = merged_nb_df[merged_nb_df['ibu'] <=125]  

# Drop rows where sugarscale = plato because there are only a few and the plato scale specific gravity values are off
merged_nb_df = merged_nb_df[merged_nb_df["sugarscale_Plato"] == 0]

# Drop rows where specific gravity is greater than 5 
# These don't make sense and are potential outliers or typos
merged_nb_df = merged_nb_df[merged_nb_df["og"] <=5]
merged_nb_df = merged_nb_df[merged_nb_df["fg"] <=5]

In [9]:
# What if we drop some sparse columns and try to get better accuracy...
merged_nb_df.drop(columns = [ "size_l", "abv", "boilsize","boiltime", "boilgravity", "efficiency", "mashthickness", "pitchrate","primarytemp","sugarscale_Plato","sugarscale_Specific Gravity"],inplace=True )

In [10]:
beer_class = merged_nb_df.drop(columns = ["review_taste"])

In [11]:
# Split our preprocessed data into our features and target arrays
y = beer_class.good_beer.values
X = beer_class.drop(columns = ["good_beer"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, stratify = y)


In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Gradient Boosting Classifier

In [13]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
gradient_booster = GradientBoostingClassifier(learning_rate=0.1, n_estimators = 200)
gradient_booster.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [15]:
gradient_booster.fit(X_train_scaled, y_train)
predictions = gradient_booster.predict(X_test_scaled)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1717
           1       0.77      0.50      0.60       921

    accuracy                           0.77      2638
   macro avg       0.77      0.71      0.72      2638
weighted avg       0.77      0.77      0.76      2638



In [16]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7717968157695224


In [17]:
# Save the model to use later
import pickle
filename = "final_ML_model.pkl"
pickle.dump(gradient_booster, open(filename, 'wb'))

In [18]:
filename2 = "beer_scaler.pkl"
pickle.dump(scaler, open(filename2, 'wb'))