In [27]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf, Reader, dump
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBClassifier
import os
import pickle

In [2]:
df1 = pd.read_pickle('./df_final1.pkl')
df2 = pd.read_pickle('./df_final2.pkl')
df=pd.concat([df1,df2])

### Try a few different models

In [3]:
print(df.columns.values)

['abv' 'abv_listed' 'beerID' 'beer_name' 'beer_style' 'brewerID'
 'review_overall' 'reviewer_username' 'testData' 'text_lookup'
 'vader_neg_avgVaderNeg' 'vader_neg_medianVaderNeg' 'vader_neg_maxVaderNeg'
 'vader_neg_minVaderNeg' 'vader_neg_vaderNegRange' 'month_number_of_months'
 'month_most_common_month' 'text_lookup_text_list'
 'abv_listed_num_abv_listed' 'brewerID_num_brewers'
 'weekday_central_most_common_weekday' 'weekday_central_number_of_weekdays'
 'review_overall_avgOverallScore' 'review_overall_maxOverallScore'
 'review_overall_medianOverallScore' 'review_overall_overallScoreRange'
 'review_overall_minOverallScore' 'beerID_beerID_count'
 'review_palate_avgPalateScore' 'review_palate_medianPalateScore'
 'review_palate_minPalateScore' 'review_palate_palateScoreRange'
 'review_palate_maxPalateScore' 'time_from_beginning_TimestampRange'
 'time_from_beginning_medianTimestamp' 'time_from_beginning_avgTimestamp'
 'time_from_beginning_maxTimestamp' 'time_from_beginning_minTimestamp'
 

In [4]:
df.head()

Unnamed: 0,abv,abv_listed,beerID,beer_name,beer_style,brewerID,review_overall,reviewer_username,testData,text_lookup,...,distance_from_july_maxMonthsFromJuly_prodAgg,distance_from_july_medianMonthsFromJuly_prodAgg,reviewer_username_user_count,rate_of_posting_prodAgg,weekend_review_ratio_prodAgg,aromaSVD,appearanaceSVD,palateSVD,tasteSVD,overallSVD
0,4.5,True,5868,St Peters Organic Ale,Bitter,974,14,abemorsten,False,"[5868, abemorsten, bottlecardinal pours light...",...,6,3.0,306,9.78361e-07,0.447712,5.798785,3.250575,3.344836,5.379055,13.530825
1,4.5,True,5868,St Peters Organic Ale,Bitter,974,11,jazz88,False,"[5868, jazz88, bottle medium golden color tan...",...,6,3.0,306,9.78361e-07,0.447712,6.358591,2.913058,2.928688,5.262634,12.470336
2,4.5,True,5868,St Peters Organic Ale,Bitter,974,12,zach8270,False,"[5868, zach8270, bottle 500 ml slightly hazy...",...,6,3.0,306,9.78361e-07,0.447712,5.981086,2.971897,3.070367,5.419182,13.411647
3,4.5,True,5868,St Peters Organic Ale,Bitter,974,14,Angeloregon,False,"[5868, Angeloregon, updated aug 21 2008 malty...",...,6,3.0,306,9.78361e-07,0.447712,6.223292,3.548265,3.273529,5.545698,14.397025
4,4.5,True,5868,St Peters Organic Ale,Bitter,974,11,bierkoning,False,"[5868, bierkoning, orange color head light ho...",...,6,3.0,306,9.78361e-07,0.447712,6.302348,2.869591,2.486358,6.189397,11.354125


In [5]:
df=df.drop(['text_lookup'], axis=1)

In [6]:
df.dtypes

abv                                                float64
abv_listed                                            bool
beerID                                               int64
beer_name                                           object
beer_style                                          object
brewerID                                             int64
review_overall                                       int64
reviewer_username                                   object
testData                                              bool
vader_neg_avgVaderNeg                              float64
vader_neg_medianVaderNeg                           float64
vader_neg_maxVaderNeg                              float64
vader_neg_minVaderNeg                              float64
vader_neg_vaderNegRange                            float64
month_number_of_months                               int64
month_most_common_month                             object
text_lookup_text_list                               obje

In [7]:
df.select_dtypes(include=['object','bool']).copy().dtypes

abv_listed                                       bool
beer_name                                      object
beer_style                                     object
reviewer_username                              object
testData                                         bool
month_most_common_month                        object
text_lookup_text_list                          object
weekday_central_most_common_weekday            object
month_most_common_month_prodAgg                object
text_lookup_text_list_prodAgg                  object
weekday_central_most_common_weekday_prodAgg    object
beer_style_numBeerStyles_prodAgg               object
dtype: object

In [8]:
df['beer_style'].nunique()

89

In [9]:
df['beer_style_numBeerStyles_prodAgg']

0                                 Bitter
1                                 Bitter
2                                 Bitter
3                                 Bitter
4                                 Bitter
5                                 Bitter
6                                 Bitter
7                                 Bitter
8                                 Bitter
9                                 Bitter
10                                Bitter
11                                Bitter
12                                Bitter
13                                Bitter
14                                Bitter
15                                Bitter
16                                Bitter
17                                Bitter
18                                Bitter
19                                Bitter
20                                Bitter
21                                Bitter
22                                Bitter
23                                Bitter
24              

### One hot encoding of non-numeric columns

In [10]:
# For the moment, won't tokenize anything with very high cardinality
df=pd.get_dummies(df, columns=["beer_style", "month_most_common_month",
                                "weekday_central_most_common_weekday",
                               "weekday_central_most_common_weekday_prodAgg",
                               "month_most_common_month_prodAgg"],
               prefix=["beerStyle", "commonMonth", "commonDay", "commonDayProd", "commonMonthProd"])

In [11]:
df.columns.values

array(['abv', 'abv_listed', 'beerID', 'beer_name', 'brewerID',
       'review_overall', 'reviewer_username', 'testData',
       'vader_neg_avgVaderNeg', 'vader_neg_medianVaderNeg',
       'vader_neg_maxVaderNeg', 'vader_neg_minVaderNeg',
       'vader_neg_vaderNegRange', 'month_number_of_months',
       'text_lookup_text_list', 'abv_listed_num_abv_listed',
       'brewerID_num_brewers', 'weekday_central_number_of_weekdays',
       'review_overall_avgOverallScore', 'review_overall_maxOverallScore',
       'review_overall_medianOverallScore',
       'review_overall_overallScoreRange',
       'review_overall_minOverallScore', 'beerID_beerID_count',
       'review_palate_avgPalateScore', 'review_palate_medianPalateScore',
       'review_palate_minPalateScore', 'review_palate_palateScoreRange',
       'review_palate_maxPalateScore',
       'time_from_beginning_TimestampRange',
       'time_from_beginning_medianTimestamp',
       'time_from_beginning_avgTimestamp',
       'time_from_beginnin

Create a list of only numeric columns to fit on 

In [12]:
training_columns=list(df.columns.values)
notTraining=['abv_listed','beerID','beer_name','beer_style','brewerID','reviewer_username','testData',
             'month_most_common_month','month_most_common_month', 'weekday_central_most_common_weekday',
             'beer_style_numBeerStyles_prodAgg','text_lookup_text_list',
             'weekday_central_most_common_weekday_prodAgg','month_most_common_month_prodAgg',
             'text_lookup_text_list_prodAgg','review_overall','benchmarkSVD']
for col in notTraining:
    if col in training_columns:
        training_columns.remove(col)
    
training_columns

['abv',
 'vader_neg_avgVaderNeg',
 'vader_neg_medianVaderNeg',
 'vader_neg_maxVaderNeg',
 'vader_neg_minVaderNeg',
 'vader_neg_vaderNegRange',
 'month_number_of_months',
 'abv_listed_num_abv_listed',
 'brewerID_num_brewers',
 'weekday_central_number_of_weekdays',
 'review_overall_avgOverallScore',
 'review_overall_maxOverallScore',
 'review_overall_medianOverallScore',
 'review_overall_overallScoreRange',
 'review_overall_minOverallScore',
 'beerID_beerID_count',
 'review_palate_avgPalateScore',
 'review_palate_medianPalateScore',
 'review_palate_minPalateScore',
 'review_palate_palateScoreRange',
 'review_palate_maxPalateScore',
 'time_from_beginning_TimestampRange',
 'time_from_beginning_medianTimestamp',
 'time_from_beginning_avgTimestamp',
 'time_from_beginning_maxTimestamp',
 'time_from_beginning_minTimestamp',
 'review_appearance_avgAppearanceScore',
 'review_appearance_minAppearanceScore',
 'review_appearance_maxAppearanceScore',
 'review_appearance_appearanceScoreRange',
 'revi

In [13]:
df[training_columns].select_dtypes(include=['object','bool']).copy().dtypes

Series([], dtype: object)

In [14]:
np.any(np.isnan(df[training_columns]))

False

In [15]:
isInfinite=(~np.isfinite(df[training_columns])).sum()

In [16]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(isInfinite)

abv                                                    0
vader_neg_avgVaderNeg                                  0
vader_neg_medianVaderNeg                               0
vader_neg_maxVaderNeg                                  0
vader_neg_minVaderNeg                                  0
vader_neg_vaderNegRange                                0
month_number_of_months                                 0
abv_listed_num_abv_listed                              0
brewerID_num_brewers                                   0
weekday_central_number_of_weekdays                     0
review_overall_avgOverallScore                         0
review_overall_maxOverallScore                         0
review_overall_medianOverallScore                      0
review_overall_overallScoreRange                       0
review_overall_minOverallScore                         0
beerID_beerID_count                                    0
review_palate_avgPalateScore                           0
review_palate_medianPalateScore

rate_of_posting and rate_of_posting_prodAgg both have inf values. Makes sense. Probably created during division. Would be created for users/products with just one review. Will fill these values with averages.

In [17]:
df=df.replace([np.inf, -np.inf], np.nan)

In [18]:
df['rate_of_posting'].fillna((df['rate_of_posting'].mean()), inplace=True)

In [19]:
df['rate_of_posting_prodAgg'].fillna((df['rate_of_posting_prodAgg'].mean()), inplace=True)

In [20]:
df_train=df[df['testData']==False]
df_test=df[df['testData']==True]

In [21]:
df_train['testData'].count()

1284624

In [22]:
df_test['testData'].count()

204328

Check the standard SVD errors on test set

In [23]:
mean_absolute_error(df_test['review_overall'],df_test['overallSVD'])

1.8875064754361444

In [24]:
mean_squared_error(df_test['review_overall'],df_test['overallSVD'])

6.2011613605819198

In [25]:
r2_score(df_test['review_overall'],df_test['overallSVD'])

0.28759791556665748

In [26]:
df_train[training_columns].dtypes

abv                                       float64
vader_neg_avgVaderNeg                     float64
vader_neg_medianVaderNeg                  float64
vader_neg_maxVaderNeg                     float64
vader_neg_minVaderNeg                     float64
vader_neg_vaderNegRange                   float64
month_number_of_months                      int64
abv_listed_num_abv_listed                   int64
brewerID_num_brewers                        int64
weekday_central_number_of_weekdays          int64
review_overall_avgOverallScore            float64
review_overall_maxOverallScore              int64
review_overall_medianOverallScore         float64
review_overall_overallScoreRange            int64
review_overall_minOverallScore              int64
beerID_beerID_count                         int64
review_palate_avgPalateScore              float64
review_palate_medianPalateScore           float64
review_palate_minPalateScore                int64
review_palate_palateScoreRange              int64


### Linear Regression

In [33]:
lr = LinearRegression()
lr.fit(df_train[training_columns],df_train['review_overall'])


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
y_test_pred = lr.predict(df_test[training_columns])

In [35]:
mean_absolute_error(df_test['review_overall'], y_test_pred)

1.6126863794106983

In [36]:
mean_squared_error(df_test['review_overall'], y_test_pred)

4.7854467968649192

In [38]:
r2_score(df_test['review_overall'], y_test_pred)

0.45023809657623337

In [43]:
set(zip(df_test[training_columns].columns.values, lr.coef_))

{('abv', 0.17311850869584686),
 ('abv_ABVrange', 0.0063794125828851023),
 ('abv_ABVrange_prodAgg', -2.4817849491876043e-06),
 ('abv_avgABV', -0.20655040277435432),
 ('abv_avgABV_prodAgg', -0.019525801156731859),
 ('abv_listed_num_abv_listed', 0.00014032736059571411),
 ('abv_listed_num_abv_listed_prodAgg', 0.00050176462502804497),
 ('abv_maxABV', -0.005414944321216808),
 ('abv_maxABV_prodAgg', -0.04805807064514056),
 ('abv_medianABV', 0.0095511449128196091),
 ('abv_medianABV_prodAgg', -0.048056666266404591),
 ('abv_minABV', -0.0083551759201725417),
 ('abv_minABV_prodAgg', -0.048056769552204059),
 ('appearanaceSVD', -0.26994564520772002),
 ('aromaSVD', 0.19297175285295509),
 ('beerID_beerID_count', -0.00010591628437796319),
 ('beerStyle_ Abbey Dubbel', -0.039132984059353948),
 ('beerStyle_ Abbey Tripel', -0.1053473074285054),
 ('beerStyle_ Abt/Quadrupel', -0.11983838542378011),
 ('beerStyle_ Altbier', -0.045362201203219876),
 ('beerStyle_ Amber Ale', -0.05445260032680288),
 ('beerStyle_ 

 Coefficients for various SVD scores
 
 'overallSVD': 0.55339183350724852
 
 'palateSVD': -0.11139295918218295
 
 'tasteSVD': 0.49439024875230331
 
 'appearanaceSVD': -0.26994564520772002
 
 'aromaSVD': 0.19297175285295509

### Ridge Regression

In [37]:
ridge=Ridge()

params_grid={"alpha": [0.001, 0.01, 0.1, 1.0, 5.0]}

bestRidge=GridSearchCV(ridge, params_grid, verbose=2, n_jobs=3)

bestRidge.fit(df_train[training_columns],df_train['review_overall'])

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.9527167632547888e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.429976819678099e-19
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.0069694067856814e-18


[CV] ...................................... alpha=0.001, total= 2.0min
[CV] alpha=0.01 ......................................................
[CV] ...................................... alpha=0.001, total= 1.9min
[CV] alpha=0.01 ......................................................
[CV] ...................................... alpha=0.001, total= 2.1min
[CV] alpha=0.01 ......................................................


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.1656076103152866e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 4.1759924384527157e-19
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.251610951629228e-18


[CV] ....................................... alpha=0.01, total= 1.2min
[CV] alpha=0.1 .......................................................
[CV] ....................................... alpha=0.01, total= 1.0min
[CV] ....................................... alpha=0.01, total= 1.0min
[CV] alpha=0.1 .......................................................
[CV] alpha=0.1 .......................................................


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.0240883307496387e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.9839897671830235e-20
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.3158446030122033e-18


[CV] ........................................ alpha=0.1, total= 1.1min
[CV] alpha=1.0 .......................................................
[CV] ........................................ alpha=0.1, total= 1.1min
[CV] alpha=1.0 .......................................................
[CV] ........................................ alpha=0.1, total= 1.1min
[CV] alpha=1.0 .......................................................


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.7575220756807998e-19
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.443222375573847e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.618407748559132e-18


[CV] ........................................ alpha=1.0, total= 1.1min
[CV] ........................................ alpha=1.0, total= 1.1min
[CV] alpha=5.0 .......................................................
[CV] alpha=5.0 .......................................................
[CV] ........................................ alpha=1.0, total= 1.2min
[CV] alpha=5.0 .......................................................


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.6968231795932277e-19
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.650530021061748e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.514214503754671e-18


[CV] ........................................ alpha=5.0, total=  45.4s
[CV] ........................................ alpha=5.0, total=  46.7s
[CV] ........................................ alpha=5.0, total=  46.5s


[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:  7.5min finished
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.567991863181092e-18


GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'alpha': [0.001, 0.01, 0.1, 1.0, 5.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [63]:
y_test_predRidge = bestRidge.predict(df_test[training_columns])

In [64]:
mean_absolute_error(df_test['review_overall'], y_test_predRidge)

1.6132234497743072

In [66]:
mean_squared_error(df_test['review_overall'], y_test_predRidge)

4.7864027681386689

In [67]:
r2_score(df_test['review_overall'], y_test_predRidge)

0.45012827264350896

In [44]:
set(zip(df_test[training_columns].columns.values, bestRidge.best_estimator_.coef_))


{('abv', 0.0019417583100838571),
 ('abv_ABVrange', 0.0054040581278564917),
 ('abv_ABVrange_prodAgg', 0.0),
 ('abv_avgABV', -0.19367100524315911),
 ('abv_avgABV_prodAgg', 0.0019417584233917394),
 ('abv_listed_num_abv_listed', 0.00014855368592019545),
 ('abv_listed_num_abv_listed_prodAgg', 0.00052223895265111181),
 ('abv_maxABV', -0.004558932397691509),
 ('abv_maxABV_prodAgg', 0.0019417584215964535),
 ('abv_medianABV', -0.0015222938257510735),
 ('abv_medianABV_prodAgg', 0.0019417584241683181),
 ('abv_minABV', -0.0099629886035691156),
 ('abv_minABV_prodAgg', 0.0019417584332378898),
 ('appearanaceSVD', -0.26864348659193171),
 ('aromaSVD', 0.19085392558811454),
 ('beerID_beerID_count', -0.00011718378306825872),
 ('beerStyle_ Abbey Dubbel', -0.021986769821188537),
 ('beerStyle_ Abbey Tripel', -0.054053715471054475),
 ('beerStyle_ Abt/Quadrupel', -0.10614893156119257),
 ('beerStyle_ Altbier', -0.0077699164844163265),
 ('beerStyle_ Amber Ale', -0.047258445648065112),
 ('beerStyle_ American Dar

### Random Forest

In [28]:
rf=RandomForestRegressor()

param_grid = {"max_depth": [3, 5, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10]}

bestRF=GridSearchCV(rf, param_grid, verbose=2, n_jobs=3)

bestRF.fit(df_train[training_columns],df_train['review_overall'])

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3 
[CV] min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3 
[CV] min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3 
[CV]  min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3, total=  38.8s
[CV] min_samples_leaf=1, min_samples_split=3, max_features=1, max_depth=3 
[CV]  min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3, total=  35.5s
[CV] min_samples_leaf=1, min_samples_split=3, max_features=1, max_depth=3 
[CV]  min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=3, total=  42.7s
[CV] min_samples_leaf=1, min_samples_split=3, max_features=1, max_depth=3 
[CV]  min_samples_leaf=1, min_samples_split=3, max_features=1, max_depth=3, total=  27.7s
[CV] min_samples_leaf=1, min_samples_split=10, max_features=1, max_depth=3 
[CV]  min_samples_leaf=1, min_samples_split=3, max_f

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  8.8min


[CV] min_samples_leaf=3, min_samples_split=2, max_features=3, max_depth=3 
[CV]  min_samples_leaf=3, min_samples_split=2, max_features=3, max_depth=3, total=  33.2s
[CV] min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3 
[CV]  min_samples_leaf=3, min_samples_split=2, max_features=3, max_depth=3, total=  40.0s
[CV] min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3 
[CV]  min_samples_leaf=3, min_samples_split=2, max_features=3, max_depth=3, total=  44.4s
[CV] min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3 
[CV]  min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3, total=  39.2s
[CV] min_samples_leaf=3, min_samples_split=10, max_features=3, max_depth=3 
[CV]  min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3, total=  23.8s
[CV]  min_samples_leaf=3, min_samples_split=3, max_features=3, max_depth=3, total=  22.2s
[CV] min_samples_leaf=3, min_samples_split=10, max_features=3, max_depth=3 
[CV] min

[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 44.4min


[CV]  min_samples_leaf=10, min_samples_split=3, max_features=10, max_depth=5, total=  57.1s
[CV] min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5 
[CV]  min_samples_leaf=10, min_samples_split=3, max_features=10, max_depth=5, total=  50.3s
[CV] min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5 
[CV]  min_samples_leaf=10, min_samples_split=3, max_features=10, max_depth=5, total=  52.6s
[CV] min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5 
[CV]  min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5, total=  47.4s
[CV] min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=None 
[CV]  min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5, total=  40.1s
[CV] min_samples_leaf=1, min_samples_split=2, max_features=1, max_depth=None 
[CV]  min_samples_leaf=10, min_samples_split=10, max_features=10, max_depth=5, total=  40.9s
[CV] min_samples_leaf=1, min_samples_split=2, max_featu

[Parallel(n_jobs=3)]: Done 243 out of 243 | elapsed: 84.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'max_features': [1, 3, 10], 'max_depth': [3, 5, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [29]:
pickle.dump(bestRF, open('./rf_model.pkl', 'wb'))

In [30]:
y_test_predRF = bestRF.predict(df_test[training_columns])

In [34]:
mean_absolute_error(df_test['review_overall'], y_test_predRF)

1.5212611967947476

In [35]:
mean_squared_error(df_test['review_overall'], y_test_predRF)

4.360343037270562

In [36]:
r2_score(df_test['review_overall'], y_test_predRF)

0.49907488485278473

### XGBoost

Took about 45 minutes and still hadn't finished running a single run. 4 runs can be done at a time but that needs there are 60 fits in total to do just to fit the first 2 hyperparameters. I'm going to leave xgboost for the moment.

In [72]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5, verbose=2)
gsearch1.fit(df_train[training_columns],df_train['review_overall'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] min_child_weight=1, max_depth=3 .................................
[CV] min_child_weight=1, max_depth=3 .................................
[CV] min_child_weight=1, max_depth=3 .................................
[CV] min_child_weight=1, max_depth=3 .................................


KeyboardInterrupt: 