In [288]:
# import dependencies
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import matplotlib.pyplot as plt

In [289]:
merged_nb_df = pd.read_csv("merged_nb_df.csv")
pd.set_option('display.max_columns', None)
merged_nb_df.head()

Unnamed: 0,size_l,og,fg,abv,ibu,color,boilsize,boiltime,boilgravity,efficiency,mashthickness,pitchrate,primarytemp,review_taste,beer_abv,sugarscale_Plato,sugarscale_Specific Gravity,brewmethod_All Grain,brewmethod_BIAB,brewmethod_Partial Mash,brewmethod_extract
0,18.93,1.082,1.013,9.1,0.0,4.1,21.58,60.0,0.0,72.0,0.0,0.0,0.0,4.103825,7.6,0.0,1.0,1.0,0.0,0.0,0.0
1,7.57,1.076,1.014,8.1,33.51,3.68,8.71,60.0,1.066,75.0,0.0,0.0,0.0,4.103825,7.6,0.0,1.0,1.0,0.0,0.0,0.0
2,20.82,1.043,1.011,4.14,13.22,17.67,28.39,90.0,0.0,85.0,0.0,1.75,12.22,3.028652,4.4,0.0,1.0,1.0,0.0,0.0,0.0
3,22.71,1.069,1.017,6.75,58.76,40.0,29.34,90.0,1.053,70.0,0.0,0.0,0.0,4.265971,6.4,0.0,1.0,1.0,0.0,0.0,0.0
4,22.71,1.068,1.018,6.53,73.4,40.0,27.44,90.0,1.056,65.0,1.3,1.25,18.89,4.265971,6.4,0.0,1.0,1.0,0.0,0.0,0.0


In [290]:
# What percentage of beers scored 4 or above?
len(merged_nb_df[(merged_nb_df["review_taste"]>=4)]) / len(merged_nb_df) * 100

34.77543538038497

In [291]:
# Since beers with a rating of four or above is the top 34% of beers, let's go with those as "good" beers.
# We need a new category "good beer" with a 1 for good and 0 for bad
merged_nb_df["good_beer"]=0

In [292]:
# for all the "good beers", put a 1 in the good_beer column
merged_nb_df.loc[merged_nb_df["review_taste"].abs()>=4, "good_beer"] = 1

In [293]:
# Drop rows where IBU is listed as above 125
merged_nb_df = merged_nb_df[merged_nb_df['ibu'] <=125]  

# Drop rows where sugarscale = plato because those specific and final gravities are off
merged_nb_df = merged_nb_df[merged_nb_df["sugarscale_Plato"] == 0]

# Drop rows where specific gravity is greater than 5 
# These don't make sense and are potential outliers or typos
merged_nb_df = merged_nb_df[merged_nb_df["og"] <=5]
merged_nb_df = merged_nb_df[merged_nb_df["fg"] <=5]




In [294]:
# What if we drop some sparse columns and try to get better accuracy...
merged_nb_df.drop(columns = ["beer_abv", "size_l", "abv", "boilsize","boiltime", "boilgravity", "efficiency", "mashthickness", "pitchrate","primarytemp","sugarscale_Plato","sugarscale_Specific Gravity"])

Unnamed: 0,og,fg,ibu,color,review_taste,brewmethod_All Grain,brewmethod_BIAB,brewmethod_Partial Mash,brewmethod_extract,good_beer
0,1.082,1.013,0.00,4.10,4.103825,1.0,0.0,0.0,0.0,1
1,1.076,1.014,33.51,3.68,4.103825,1.0,0.0,0.0,0.0,1
2,1.043,1.011,13.22,17.67,3.028652,1.0,0.0,0.0,0.0,0
3,1.069,1.017,58.76,40.00,4.265971,1.0,0.0,0.0,0.0,1
4,1.068,1.018,73.40,40.00,4.265971,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
10905,1.050,1.007,40.23,9.40,3.833333,1.0,0.0,0.0,0.0,0
10906,1.081,1.020,105.60,7.04,4.350983,0.0,0.0,0.0,1.0,1
10907,1.054,1.010,0.00,50.00,4.166667,1.0,0.0,0.0,0.0,1
10908,1.056,1.009,56.52,9.76,4.000000,0.0,0.0,1.0,0.0,1


In [295]:
beer_class = merged_nb_df.drop(columns = ["review_taste"])

In [296]:
# Split our preprocessed data into our features and target arrays
y = beer_class.good_beer.values
X = beer_class.drop(columns = ["good_beer"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, stratify = y)

In [297]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Random Forest Classifier

In [298]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.602


In [299]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      0.75      0.32      0.71      0.49      0.25      1717
          1       0.41      0.32      0.75      0.36      0.49      0.23       921

avg / total       0.58      0.60      0.47      0.59      0.49      0.24      2638

