#### Notebook Summary
This notebook contains the code for generating the final training and inferences of our model. It includes some hyperparameter tuning via grid search for the random forest model that we wound up settling on. 

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics

from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('../../data/yelp_training/yelp_zip_processed_features.csv', sep='\t')
df.head()

Unnamed: 0,user_id,business_id,rating,num_word,num_noun,num_verb,num_adj,num_adv,num_personal_pronoun,avg_word_len,...,user_content_similarity,positive_reviews,negative_reviews,positive_review_ratio,negative_review_ratio,avg_business_sentiment,avg_business_rating,total_business_reviews,lemma,label
0,5044,0,1.0,35,9,2,5,2,1,5.0,...,0.0,0.0,1.0,0.0,1.0,0.141963,3.613636,88,drink bad hot chocolate water latte burn taste...,-1
1,5045,0,1.0,241,54,31,16,15,7,5.561905,...,0.0,0.0,1.0,0.0,1.0,0.141963,3.613636,88,bad experience casual coffee light fare place ...,-1
2,5046,0,3.0,49,11,3,9,2,1,5.130435,...,0.04074681,1.0,0.0,0.25,0.0,0.141963,3.613636,88,locate site old spruce st video mild cofee goo...,-1
3,5047,0,5.0,216,56,33,19,14,5,5.941176,...,4.440892e-16,1.0,0.0,1.0,0.0,0.141963,3.613636,88,enjoy coffee breakfast twice toast recent visi...,-1
4,5048,0,5.0,146,31,23,15,9,8,5.507246,...,0.0,1.0,0.0,1.0,0.0,0.141963,3.613636,88,love toast food choice fantastic love serve br...,-1


In [3]:
numeric_feature_columns = ['rating', 'lexical_diversity', 'sentiment', 'emotiveness_ratio',
                           'num_negative_words', 'num_clauses',
                           'previous_user_reviews', 'avg_user_sentiment',
                           'total_user_reviews', 'user_content_similarity', 'positive_reviews',
                           'negative_reviews', 'negative_review_ratio', 'avg_business_sentiment',
                           'avg_business_rating', 'total_business_reviews']
text_feature_columns = ['lemma']
id_columns = ['user_id', 'business_id']

In [4]:
null_lemma = df['lemma'].isna()
filtered_df = df[~null_lemma]
filtered_df.shape

(608463, 31)

In [7]:
# SVC
X_train, X_test, y_train, y_test = train_test_split(filtered_df[numeric_feature_columns + text_feature_columns],
                                                    filtered_df['label'].values,
                                                    test_size=.3,
                                                    random_state=24)
resampler = SMOTE(random_state=24, k_neighbors=3)
summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, numeric_feature_columns),
                                      ('tfidf', tfidf_pipeline, 'lemma')])

svm = LinearSVC()

svm_pipeline = Pipeline([('preprocessing', column_processor),
                         ('upsampler', resampler),
                         ('svc', svm)])


print(X_train.shape)
print(X_test.shape)

svm_pipeline.fit(X_train, y_train)
svm_preds = svm_pipeline.predict(X_test)

print(metrics.classification_report(y_test, svm_preds))
metrics.confusion_matrix(y_test, svm_preds)

(425924, 17)
(182539, 17)




              precision    recall  f1-score   support

          -1       0.28      0.63      0.38     24077
           1       0.93      0.75      0.83    158462

    accuracy                           0.73    182539
   macro avg       0.60      0.69      0.61    182539
weighted avg       0.84      0.73      0.77    182539



array([[ 15138,   8939],
       [ 39595, 118867]], dtype=int64)

In [5]:
# RF
X_train, X_test, y_train, y_test = train_test_split(filtered_df[numeric_feature_columns + text_feature_columns],
                                                    filtered_df['label'].values,
                                                    test_size=.3,
                                                    random_state=24)
resampler = SMOTE(random_state=24, k_neighbors=3)
summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, numeric_feature_columns),
                                      ('tfidf', tfidf_pipeline, 'lemma')])

rf = RandomForestClassifier(n_estimators=150, min_samples_leaf=1, max_samples=.5, max_depth=20)

rf_pipeline = Pipeline([('preprocessing', column_processor),
                         ('upsampler', resampler),
                         ('rf', rf)])


print(X_train.shape)
print(X_test.shape)

rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)

print(metrics.classification_report(y_test, rf_preds))
metrics.confusion_matrix(y_test, rf_preds)

(425924, 17)
(182539, 17)
              precision    recall  f1-score   support

          -1       0.31      0.61      0.41     24077
           1       0.93      0.80      0.86    158462

    accuracy                           0.77    182539
   macro avg       0.62      0.70      0.63    182539
weighted avg       0.85      0.77      0.80    182539



array([[ 14609,   9468],
       [ 32439, 126023]], dtype=int64)

In [7]:
import joblib

joblib.dump(rf_pipeline, 'rf_final_model.pkl')

['rf_final_model.pkl']

In [21]:
pd.DataFrame({'labels': rf_preds}).labels.value_counts()/rf_preds.shape[0]

 1    0.731871
-1    0.268129
Name: labels, dtype: float64

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [5, 10, 15, 20],
          'min_samples_leaf': [1, 5, .01],
          'n_estimators': [100, 150]}

rf = RandomForestClassifier(max_samples=.5)
gs_rf = GridSearchCV(estimator=rf, param_grid=params, n_jobs=8)

In [11]:
rf_pipeline[2].cv_results_

{'mean_fit_time': array([ 55.74729972,  81.7218997 ,  54.69330006,  81.87800169,
         28.98100147,  40.86236019, 117.88440075, 173.28810005,
        110.08240056, 169.20730124,  28.55430064,  43.83959928,
        215.84879227, 322.27450013, 188.35630016, 287.63340006,
         27.95050001,  42.56709957, 379.90850062, 537.38480072,
        260.70720048, 338.35539994,  26.02040195,  36.30100102]),
 'std_fit_time': array([ 0.91172944,  1.87029312,  1.24191298,  2.4994744 ,  1.27264193,
         1.36705373,  1.21610049,  4.38253972,  1.98068291,  4.19380972,
         1.62069638,  0.56915327,  1.82042726,  4.13854198,  1.83901993,
         3.24925451,  1.53346758,  2.02647757,  6.42811265, 27.09273694,
        14.592427  , 22.41697156,  1.36941765,  2.43445648]),
 'mean_score_time': array([4.92740059, 6.71830225, 4.79919968, 7.237885  , 4.65209823,
        6.56549993, 4.70179958, 6.76030059, 4.50979886, 7.03919811,
        4.22009997, 6.2218996 , 4.90879998, 7.572399  , 5.0721015 ,
    

In [12]:
from sklearn.model_selection import GridSearchCV

# Random Forest
X_train, X_test, y_train, y_test = train_test_split(filtered_df[numeric_feature_columns + text_feature_columns],
                                                    filtered_df['label'].values,
                                                    test_size=.3,
                                                    random_state=24)
resampler = SMOTE(random_state=24, k_neighbors=3)
summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, numeric_feature_columns),
                                      ('tfidf', tfidf_pipeline, 'lemma')])

params = {'max_depth': [15, 20],
          'min_samples_leaf': [1, 3],
          'n_estimators': [100, 150]}

rf = RandomForestClassifier(max_samples=.5)
gs_rf = GridSearchCV(estimator=rf, param_grid=params, scoring='f1_macro', n_jobs=8)

rf_pipeline = Pipeline([('preprocessing', column_processor),
                         ('upsampler', resampler),
                         ('classifier', gs_rf)])


print(X_train.shape)
print(X_test.shape)

rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)

print(metrics.classification_report(y_test, rf_preds))
metrics.confusion_matrix(y_test, rf_preds)

(425924, 19)
(182539, 19)
              precision    recall  f1-score   support

          -1       0.31      0.62      0.42     24077
           1       0.93      0.80      0.86    158462

    accuracy                           0.77    182539
   macro avg       0.62      0.71      0.64    182539
weighted avg       0.85      0.77      0.80    182539



array([[ 14899,   9178],
       [ 32414, 126048]], dtype=int64)

In [13]:
rf_pipeline[2].cv_results_

{'mean_fit_time': array([205.0421979 , 287.88299932, 168.17559829, 283.4786983 ,
        395.14619951, 594.30439925, 321.23929925, 402.842098  ]),
 'std_fit_time': array([ 2.1034299 , 11.12232475,  1.48808441, 11.94609093,  6.29080794,
         4.9794179 ,  7.85330146, 29.63495146]),
 'mean_score_time': array([4.70269961, 6.44900603, 4.44300098, 7.47850037, 5.48090148,
        8.33510137, 5.3878005 , 6.80299959]),
 'std_score_time': array([0.40887997, 0.3938286 , 0.26314798, 0.38967014, 0.351193  ,
        0.33285412, 0.39357054, 0.5549497 ]),
 'param_max_depth': masked_array(data=[15, 15, 15, 15, 20, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[1, 1, 3, 3, 1, 1, 3, 3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 15

In [15]:
from sklearn.model_selection import GridSearchCV

# Random Forest
X_train, X_test, y_train, y_test = train_test_split(filtered_df[numeric_feature_columns + text_feature_columns],
                                                    filtered_df['label'].values,
                                                    test_size=.3,
                                                    random_state=24)
resampler = SMOTE(random_state=24, k_neighbors=3)
summary_pipeline = Pipeline([('scaler', StandardScaler())])
tfidf_pipeline = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1)))])
column_processor = ColumnTransformer([('summary', summary_pipeline, numeric_feature_columns),
                                      ('tfidf', tfidf_pipeline, 'lemma')])

params = {'max_depth': [20, 30]}

rf = RandomForestClassifier(n_estimators=150, min_samples_leaf=1, max_samples=.5)
gs_rf = GridSearchCV(estimator=rf, param_grid=params, scoring='f1', n_jobs=5)

rf_pipeline = Pipeline([('preprocessing', column_processor),
                         ('upsampler', resampler),
                         ('classifier', gs_rf)])


print(X_train.shape)
print(X_test.shape)

rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)

print(metrics.classification_report(y_test, rf_preds))
metrics.confusion_matrix(y_test, rf_preds)

(425924, 19)
(182539, 19)
              precision    recall  f1-score   support

          -1       0.32      0.59      0.42     24077
           1       0.93      0.81      0.87    158462

    accuracy                           0.78    182539
   macro avg       0.63      0.70      0.64    182539
weighted avg       0.85      0.78      0.81    182539



array([[ 14282,   9795],
       [ 29703, 128759]], dtype=int64)

In [16]:
rf_pipeline[2].cv_results_

{'mean_fit_time': array([380.4331017 , 892.77549839]),
 'std_fit_time': array([ 2.23868168, 13.83918291]),
 'mean_score_time': array([7.07489963, 7.57670135]),
 'std_score_time': array([0.35750201, 0.72704709]),
 'param_max_depth': masked_array(data=[20, 30],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 20}, {'max_depth': 30}],
 'split0_test_score': array([0.75673856, 0.77343559]),
 'split1_test_score': array([0.82378117, 0.84274131]),
 'split2_test_score': array([0.81364059, 0.83041676]),
 'split3_test_score': array([0.81493317, 0.84867518]),
 'split4_test_score': array([0.81002915, 0.84084648]),
 'mean_test_score': array([0.80382453, 0.82722306]),
 'std_test_score': array([0.02397336, 0.02753143]),
 'rank_test_score': array([2, 1])}

The RFC has clearly outperformed the SVC, improving on every metric primarily by reducing the number of false negatives. With this in mind, it's time to generate new inferences on our true data set

In [11]:
rf_pipeline

In [12]:
all_df = pd.read_csv('../../data/yelp_all/filtered_preprocessed_dataset.tsv', sep='\t')
null_lemma = all_df['lemma'].isna()
filtered_df = all_df[~null_lemma]
filtered_df.head()

Unnamed: 0,review_id,user_id,business_id,rating,num_word,num_noun,num_verb,num_adj,num_adv,num_personal_pronoun,...,total_user_reviews,user_content_similarity,positive_reviews,negative_reviews,positive_review_ratio,negative_review_ratio,avg_business_sentiment,avg_business_rating,total_business_reviews,lemma
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,97,12,17,10,12,10,...,28,0.054206,19.0,3.0,0.678571,0.107143,0.08927,3.068571,175,decide eat aware go hour begin end try multipl...
1,jHmqmoEI-78BGHFJaDKlhQ,5EGs7LX3Z8ZogvOOgNLsnA,XQfwVwDr-v0ZS3_CbbE5Xw,2,23,5,3,5,0,2,...,16,0.025516,6.0,10.0,0.375,0.625,0.08927,3.068571,175,excellent food slow slow slow staff need train...
2,vwIXZHod-jQmGFvx0wCqSg,q3Kv3wFOwu1Rd2I6T_VWOQ,XQfwVwDr-v0ZS3_CbbE5Xw,5,62,17,11,2,3,2,...,6,0.023194,5.0,1.0,0.833333,0.166667,0.08927,3.068571,175,read ton mixed review fiancé decide try turnin...
3,SP32nOhRm-KRAjYMPgf_MQ,wmqsehbFirZPlAluJUakeQ,XQfwVwDr-v0ZS3_CbbE5Xw,3,79,23,6,9,3,1,...,375,0.106651,182.0,58.0,0.485333,0.154667,0.08927,3.068571,175,stop lunch sit afternoon luckily seat fairly q...
4,fvu5n5shkAJDbQjulKNuqw,NMLvjdY7IOdtfU0TepvUuA,XQfwVwDr-v0ZS3_CbbE5Xw,2,251,60,38,22,12,8,...,357,0.023849,180.0,59.0,0.504202,0.165266,0.08927,3.068571,175,want love restaurant interior decorate food lo...


In [13]:
all_predictions = rf_pipeline.predict(filtered_df[numeric_feature_columns + text_feature_columns])

In [16]:
predicted_reviews = pd.DataFrame({'review_id': filtered_df['review_id'], 'review_label': all_predictions})
predicted_reviews.review_label.value_counts()


 1    5481720
-1    1508496
Name: review_label, dtype: int64

In [17]:
predicted_reviews.review_label.value_counts() / predicted_reviews.shape[0]

 1    0.784199
-1    0.215801
Name: review_label, dtype: float64

In [22]:
predicted_reviews.to_csv('../../data/yelp_all/reviews_with_predicted_label_final_rf.csv', sep=',', index=False)