In [38]:
# import dependencies
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [39]:
# import libraries
import psycopg2
import pandas as pd

# get username and password for DB
from config import sql_u
from config import sql_pw
from config import sql_host

In [40]:
# connect to DB
conn = psycopg2.connect(
    host=sql_host,
    port='5432',
    database='postgres',
    user=sql_u,
    password=sql_pw)

cur = conn.cursor()

In [41]:
# run query
sql_query = '''SELECT name, og, fg, ibu, color, brewmethod
               FROM recipes
               WHERE og <=5
               AND fg<=5
               AND ibu <=125;
            '''

cur.execute(sql_query)

In [42]:
# convert query results to DF
recipes_df = pd.DataFrame(cur.fetchall())
# add column names to DF
num_cols = len(cur.description)
col_names = [i[0] for i in cur.description]
recipes_df.columns = col_names
# recipes_df.head()

In [43]:
# run query
sql_query = '''SELECT beer_name, review_taste, beer_abv
               FROM reviews;
            '''

cur.execute(sql_query)

# convert query results to DF
reviews_df = pd.DataFrame(cur.fetchall())
# add column names to DF
num_cols = len(cur.description)
col_names = [i[0] for i in cur.description]
reviews_df.columns = col_names
# reviews_df.head()

In [44]:
# close sql connection
cur.close()
conn.close()

In [45]:
# take out the word 'clone' in the recipes_df beer names
pd.options.mode.chained_assignment = None
remove = ["Clone", "clone", "clone)", "(clone"]
f = lambda x: ' '.join([item for item in x.split() if item not in remove])
recipes_df["name"] = recipes_df["name"].apply(f)

# check that "clone" was removed
# these are the beer names in the recipes table
# recipes_df["name"].sample(15)

In [46]:
# Create a dataframe of just the beer names
recipe_names = recipes_df[["name"]]
review_names = reviews_df[["beer_name"]]

In [47]:
# Do we have any matches?
recipe_names.name.isin(review_names.beer_name).astype(int)

0        1
1        0
2        0
3        0
4        0
        ..
69883    0
69884    1
69885    0
69886    0
69887    0
Name: name, Length: 69888, dtype: int32

In [48]:
matches_df = recipe_names.assign(review_names=recipe_names.name.isin(review_names.beer_name).astype(int))
# matches_df.head()

In [49]:
matches_df = matches_df.loc[matches_df['review_names'] == 1]
# matches_df.head(25)

In [50]:
# Select the name brand beers from the reviews_df
name_brand_reviews = reviews_df[reviews_df["beer_name"].isin(matches_df["name"])]
name_brand_reviews = name_brand_reviews.rename(columns = {"beer_name": "name"})
name_brand_reviews.head()

Unnamed: 0,name,review_taste,beer_abv
42,#14,3.7857142857142856,8.0
64,'Tis The Saison,3.75,6.0
584,4x4,3.8333333333333335,5.0
628,60 Minute IPA,4.118361465623713,6.0
634,60 Shilling Scottish Ale,3.333333333333333,4.9


In [51]:
# Select the name brand beers from the recipes_df
name_brand_recipes = recipes_df[recipes_df["name"].isin(matches_df["name"])]
name_brand_recipes.head()

Unnamed: 0,name,og,fg,ibu,color,brewmethod
0,Vanilla Cream Ale,1.055,1.013,17.65,4.83,All Grain
5,Sierra Nevada Pale Ale,1.055,1.013,40.12,8.0,All Grain
8,Mango Habanero IPA,1.08,1.017,93.02,8.29,All Grain
13,White IPA,1.064,1.017,64.67,3.91,All Grain
21,Brooklyn Sorachi Ace,1.082,1.013,0.0,4.1,All Grain


In [52]:
# merge the two dataframes
left = name_brand_recipes
right = name_brand_reviews

beers = pd.merge(left, right, how = 'inner', on = "name")
beers.head()

Unnamed: 0,name,og,fg,ibu,color,brewmethod,review_taste,beer_abv
0,Vanilla Cream Ale,1.055,1.013,17.65,4.83,All Grain,3.125,4.5
1,Vanilla Cream Ale,1.055,1.013,17.65,4.83,All Grain,2.5,4.4
2,Vanilla Cream Ale,1.05,1.014,25.43,4.42,extract,3.125,4.5
3,Vanilla Cream Ale,1.05,1.014,25.43,4.42,extract,2.5,4.4
4,Vanilla Cream Ale,1.051,1.012,19.14,3.56,All Grain,3.125,4.5


In [53]:
# Change numerical columns from object to float
num_columns = ['og', 'fg', 'beer_abv', 'ibu', 'color', 'review_taste']

for col in num_columns:
    beers[col] = beers[col].astype(float)

In [54]:
# Where beer name and brew method match, let's average the other metrics
beers = beers.groupby(["name", "brewmethod"], as_index = False)["og", "fg", "ibu", "color", "review_taste", "beer_abv"].mean()
beers.head()

  beers = beers.groupby(["name", "brewmethod"], as_index = False)["og", "fg", "ibu", "color", "review_taste", "beer_abv"].mean()


Unnamed: 0,name,brewmethod,og,fg,ibu,color,review_taste,beer_abv
0,#14,All Grain,1.062,1.014,66.61,8.06,3.785714,8.0
1,'Tis The Saison,All Grain,1.048,1.011,25.39,7.69,3.75,6.0
2,4x4,All Grain,1.058,1.016,63.34,32.5,3.833333,5.0
3,60 Minute IPA,extract,1.058,1.011,65.7,3.51,4.118361,6.0
4,60 Shilling Scottish Ale,All Grain,1.063,1.019,30.01,14.59,3.333333,4.9


In [55]:
# Since beers with a rating of four or above is the top 34% of beers, let's go with those as "good" beers.
# We need a new category "good beer" with a 1 for good and 0 for bad
beers["good_beer"]=0

# for all the "good beers", put a 1 in the good_beer column
beers.loc[beers["review_taste"].abs()>=4, "good_beer"] = 1

In [56]:
# Use OneHotEncoder to encode our categorical variables
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_df = pd.DataFrame(enc.fit_transform(beers[["brewmethod"]]))

# Add the encoded variable names to the dataframe
encoded_df.columns = enc.get_feature_names(["brewmethod"])
encoded_df.head()

Unnamed: 0,brewmethod_All Grain,brewmethod_BIAB,brewmethod_Partial Mash,brewmethod_extract
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0


In [57]:
beers.head()

Unnamed: 0,name,brewmethod,og,fg,ibu,color,review_taste,beer_abv,good_beer
0,#14,All Grain,1.062,1.014,66.61,8.06,3.785714,8.0,0
1,'Tis The Saison,All Grain,1.048,1.011,25.39,7.69,3.75,6.0,0
2,4x4,All Grain,1.058,1.016,63.34,32.5,3.833333,5.0,0
3,60 Minute IPA,extract,1.058,1.011,65.7,3.51,4.118361,6.0,1
4,60 Shilling Scottish Ale,All Grain,1.063,1.019,30.01,14.59,3.333333,4.9,0


In [58]:
# Merge one-hot encoded features and drop the originals
beers = beers.merge(encoded_df, left_index = True, right_index=True)
beers = beers.drop(["brewmethod", "review_taste", "name"],1)
beers.head()

  beers = beers.drop(["brewmethod", "review_taste", "name"],1)


Unnamed: 0,og,fg,ibu,color,beer_abv,good_beer,brewmethod_All Grain,brewmethod_BIAB,brewmethod_Partial Mash,brewmethod_extract
0,1.062,1.014,66.61,8.06,8.0,0,1.0,0.0,0.0,0.0
1,1.048,1.011,25.39,7.69,6.0,0,1.0,0.0,0.0,0.0
2,1.058,1.016,63.34,32.5,5.0,0,1.0,0.0,0.0,0.0
3,1.058,1.011,65.7,3.51,6.0,1,0.0,0.0,0.0,1.0
4,1.063,1.019,30.01,14.59,4.9,0,1.0,0.0,0.0,0.0


In [59]:
# Split our preprocessed data into our features and target arrays
y = beers.good_beer.values
X = beers.drop(columns = ["good_beer"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, stratify = y)


In [60]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Gradient Boosting Classifier

In [69]:
# Create the model
gradient_booster = GradientBoostingClassifier(learning_rate=0.01, n_estimators = 300)
gradient_booster.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [70]:
# fit and train the model
gradient_booster.fit(X_train_scaled, y_train)
predictions = gradient_booster.predict(X_test_scaled)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.74      0.95      0.83       486
           1       0.65      0.21      0.32       206

    accuracy                           0.73       692
   macro avg       0.70      0.58      0.57       692
weighted avg       0.71      0.73      0.68       692



In [71]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7312138728323699


In [72]:
# Save the model to use later
import pickle
filename = "final_ML_model.pkl"
pickle.dump(gradient_booster, open(filename, 'wb'))

In [73]:
filename2 = "beer_scaler.pkl"
pickle.dump(scaler, open(filename2, 'wb'))