In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import metrics

from sklearn.preprocessing import StandardScaler



In [6]:
data = pd.read_csv('/users/vonb/Desktop/31009/project/data.csv')

In [7]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,stars_x,funny_x,cool_x,city,state,postal_code,stars_y,review_count_x,is_open,...,veggies_category,venezuelan_category,vietnamese_category,waffles_category,whiskey_category,wine_category,wineries_category,wings_category,women_category,yogurt_category
0,0,5,0,0,Gilbert,AZ,85234,4.0,13,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
type(data)

pandas.core.frame.DataFrame

In [9]:
data.shape

(29992, 282)

In [10]:
# data.columns.tolist()

In [11]:
X = data.select_dtypes(include=[np.number])    # get rid of alpha category labels- we only want numeric

In [12]:
X.shape

(29992, 270)

In [13]:
y = np.array(X['useful_review'])

In [14]:
X = np.array(X.drop(['useful_review'], 1))

In [15]:
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, train_size=.7, random_state=42)

In [16]:
X_Train.shape

(20994, 269)

In [17]:
scaler = StandardScaler()
scaler.fit(X_Train)
X_Train_Scale = scaler.transform(X_Train)
X_Test_Scale = scaler.transform(X_Test)

In [46]:
#
#   Model 1 - hyperparameters similar to our Assignment 6
#
param_grid = {
'n_estimators':[100,200,300,400],
'learning_rate':[0.2,0.4,0.6,0.8,1,1.2],
'random_state':[42]}

model = AdaBoostClassifier()

In [19]:
model.fit(X_Train_Scale, y_Train)  

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [21]:
grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = 'roc_auc',refit = True)

In [22]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 12:30:07.759059


In [23]:
grid_search.fit(X_Test_Scale, y_Test)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.2, 0.4, 0.6, 0.8, 1, 1.2], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [24]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 12:40:48.588697


In [25]:
print(grid_search.best_params_)

{'learning_rate': 0.2, 'n_estimators': 100, 'random_state': 42}


In [26]:
print(grid_search.best_estimator_)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.2, n_estimators=100, random_state=42)


In [27]:
grid_search_y_pred = grid_search.predict(X_Test_Scale)
grid_search_y_prob = grid_search.predict_proba(X_Test_Scale)


In [28]:
confusion_matrix(y_Test, grid_search_y_pred)

array([[8188,   86],
       [ 377,  347]])

In [29]:
print(classification_report(y_Test,grid_search_y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      8274
           1       0.80      0.48      0.60       724

   micro avg       0.95      0.95      0.95      8998
   macro avg       0.88      0.73      0.79      8998
weighted avg       0.94      0.95      0.94      8998



In [30]:
base_probs = grid_search.predict_proba(X_Test_Scale)[:,1]
print(roc_auc_score(y_Test,base_probs))

0.9411631256535484


In [48]:
#
#  Now let's try the 2nd model using SAMME and not SAMME.R
#
param_grid2 = {
'n_estimators':[100,200,300,400],
'learning_rate':[0.2,0.4,0.6,0.8,1,1.2],
'algorithm':["SAMME"],
'random_state':[42]}

model2 = AdaBoostClassifier()

In [32]:
model2.fit(X_Train_Scale, y_Train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [33]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 12:44:22.631685


In [34]:
grid_search2 = GridSearchCV(model2, param_grid2, cv = 5, scoring = 'roc_auc',refit = True)

In [35]:
grid_search2.fit(X_Test_Scale, y_Test)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.2, 0.4, 0.6, 0.8, 1, 1.2], 'algorithm': ['SAMME'], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [36]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 12:54:36.155749


In [37]:
print(grid_search2.best_params_)

{'algorithm': 'SAMME', 'learning_rate': 0.2, 'n_estimators': 400, 'random_state': 42}


In [38]:
print(grid_search2.best_estimator_)

AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.2,
          n_estimators=400, random_state=42)


In [39]:
grid_search2_y_pred = grid_search2.predict(X_Test_Scale)
grid_search2_y_prob = grid_search2.predict_proba(X_Test_Scale)



In [40]:
confusion_matrix(y_Test, grid_search2_y_pred)

array([[8202,   72],
       [ 398,  326]])

In [41]:
print(classification_report(y_Test,grid_search2_y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      8274
           1       0.82      0.45      0.58       724

   micro avg       0.95      0.95      0.95      8998
   macro avg       0.89      0.72      0.78      8998
weighted avg       0.94      0.95      0.94      8998



In [42]:
base_probs2 = grid_search2.predict_proba(X_Test_Scale)[:,1]
print(roc_auc_score(y_Test,base_probs2))

0.9258650041332964


In [49]:
#
#  Now let's try a 3rd model - with a higher CV

param_grid3 = {
'n_estimators':[100,200,300,400],
'learning_rate':[0.2,0.4,0.6,0.8,1,1.2],
'random_state':[42]}

model3 = AdaBoostClassifier()

In [50]:
model3.fit(X_Train_Scale, y_Train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [54]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 14:38:41.103627


In [55]:
grid_search3 = GridSearchCV(model3, param_grid3, cv = 7, scoring = 'roc_auc',refit = True)

In [56]:
grid_search3.fit(X_Test_Scale, y_Test)

GridSearchCV(cv=7, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.2, 0.4, 0.6, 0.8, 1, 1.2], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [57]:
now = datetime.datetime.now()
print(str(now))

2018-12-04 14:55:05.874242


In [58]:
print(grid_search3.best_params_)

{'learning_rate': 0.2, 'n_estimators': 100, 'random_state': 42}


In [59]:
print(grid_search3.best_estimator_)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.2, n_estimators=100, random_state=42)


In [60]:
grid_search3_y_pred = grid_search3.predict(X_Test_Scale)
grid_search3_y_prob = grid_search3.predict_proba(X_Test_Scale)



In [61]:
confusion_matrix(y_Test, grid_search3_y_pred)

array([[8188,   86],
       [ 377,  347]])

In [62]:
print(classification_report(y_Test,grid_search3_y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      8274
           1       0.80      0.48      0.60       724

   micro avg       0.95      0.95      0.95      8998
   macro avg       0.88      0.73      0.79      8998
weighted avg       0.94      0.95      0.94      8998



In [63]:
base_probs3 = grid_search3.predict_proba(X_Test_Scale)[:,1]
print(roc_auc_score(y_Test,base_probs3))

0.9411631256535484


In [None]:
#
#  Summary  {Best Parameters - AUC/ROC}
#
#  ADA/SAMME         LR = .2    N_Estimators = 400      CV = 5    AUC/ROC    .9258
#  ADA/SAMME.R       LR = .2    N_Estimators = 100      CV = 5    AUC/ROC    .9471
#  ADA/SAMME.R       LR = .2    N_Estimators = 100      CV = 7    AUC/ROC    .9411