In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import precision_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


import warnings
warnings.filterwarnings('ignore')

In [3]:
beer = pd.read_csv('beer_data_clean_log.csv')

In [4]:
# removed mixed to balance classes
beer = beer[beer.MainStyle != 'Mixed']

In [5]:
beer.MainStyle.unique()

array(['Ale', 'Lager'], dtype=object)

In [4]:
# split data into train, validate, and test
X = beer.drop('MainStyle', axis=1)
y = beer['MainStyle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Modeling and fine tuning using GridSearchCV

KNN

In [9]:
knn = KNeighborsClassifier()
k_range = {'n_neighbors' : list(range(1, 31))}

knn_grid = GridSearchCV(knn, param_grid=k_range, cv=10, scoring='precision_micro')
knn_grid.fit(X_test, y_test)

print("tuned hpyerparameters:",knn_grid.best_params_)
print("precision :",knn_grid.best_score_)

tuned hpyerparameters: {'n_neighbors': 7}
precision : 0.5615870323249458


Naive Bayes

In [12]:
nb = GaussianNB()
precision = cross_val_score(nb, X_train, y_train, scoring="precision_micro", cv=10).mean()
print(precision)

0.6461284227024515


Logistic Regression

In [13]:
grid={"C": [0.001,0.01,0.1,1,10,100] , "penalty":["l1","l2"]} # l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_grid=GridSearchCV(logreg,grid,cv=10,scoring='precision_micro')
logreg_grid.fit(X_test,y_test)

print("tuned hpyerparameters:",logreg_grid.best_params_)
print("precision :",logreg_grid.best_score_)

tuned hpyerparameters: {'C': 1, 'penalty': 'l1'}
precision : 0.6721326924889266


Random Forest

In [9]:
rf_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'bootstrap': [True, False]
}

# hyperparameters suggested by kaggle

rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, param_grid=rf_grid, cv=5, scoring="precision_micro")
rf_grid.fit(X_train, y_train)

print("tuned hpyerparameters:",rf_grid.best_params_)
print("precision :",rf_grid.best_score_)

KeyboardInterrupt: 

In [8]:
importance = rf_grid.best_estimator_.feature_importances_

NameError: name 'rf_grid' is not defined

SVC

In [None]:
svc_grid = { 'kernel' : ['linear', 'rbf', 'poly'],
            'gamma' : [0.1, 1, 10, 100],
            'C' : [0.1, 1, 10, 100, 1000],
            'degree' : [0, 1, 2, 3, 4, 5, 6]
}

svc = SVC()
svc_grid = GridSearchCV(svc, param_grid=svc_grid, cv=5, scoring="precision_micro")
svc_grid.fit(X_test, y_test)

print("tuned hpyerparameters:",svc_grid.best_params_)
print("precision :",svc_grid.best_score_)

## Scoring the test data using the best model

In [74]:
y_pred = rf_grid.predict(X_test)
precision_score(y_test, y_pred, average="micro")

0.7796626142682123

## Pickle model to use in flask app

In [19]:
import pickle

with open("/Users/demidao/Desktop/METIS/beer_type_categorization/flask_app/rf_predictor.pkl", "wb") as f:
    pickle.dump(rf_fit, f)

## Charts

In [6]:
from bokeh.io import show, output_file
from bokeh.plotting import figure
from bokeh.io import export_svgs
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6

In [65]:
output_file("scores.html")

model = list(scores_df['model'])
scores = list(scores_df['precision score'])

# sorting the bars means sorting the range factors
sorted_scores = sorted(model, key=lambda x: scores[model.index(x)])

colors = ["#718dbf", "#718dbf", "#718dbf", "#e84d60", "#718dbf", "#718dbf"]

p = figure(x_range = sorted_scores, plot_height=700, plot_width=900, title="Average Precision Score Over 10 CV",
           toolbar_location=None, tools="")

p.xaxis.axis_label = 'Model'
p.yaxis.axis_label = 'Preciscion Score'
p.xaxis.axis_label_text_font_size = "15pt"
p.yaxis.axis_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "15pt"
p.yaxis.major_label_text_font_size = "15pt"

p.title.text_font_size = '20pt'

p.vbar(x=model, top=scores, width=0.9, color=colors)

p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)



In [18]:
output_file("rf_importance.html")

features = list(X)
importance = ['0.01', '0.00', '0.03', '0.00','0.06', '0.017','0.230','0.310']

# sorting the bars means sorting the range factors
sorted_importance = sorted(features, key=lambda x: importance[features.index(x)])

colors = ["#718dbf","#718dbf","#718dbf", "#e84d60", "#e84d60", "#e84d60", "#718dbf", "#718dbf"]

p = figure(x_range = sorted_importance, plot_height=700, plot_width=900, title="Importance of Features",
           toolbar_location=None, tools="")

p.xaxis.axis_label = 'Feature'
p.yaxis.axis_label = 'Importance Score'
p.xaxis.axis_label_text_font_size = "15pt"
p.yaxis.axis_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

p.title.text_font_size = '20pt'

p.vbar(x=features, top=importance, width=0.9, color=colors)

p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

In [19]:
list(X)

['ABV',
 'IBU',
 'Color',
 'BoilSize',
 'BoilTime',
 'BoilGravity',
 'Efficiency',
 'ChangeInWort']