In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes, tree
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import roc_curve, auc

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/model_ready.csv')

In [4]:
df.drop(['spotlight_True'], axis=1, inplace=True)
df.rename(columns={'main_category_Comics': 'Comics',
                   'main_category_Crafts': 'Crafts', 
                   'main_category_Dance': 'Dance', 
                   'main_category_Design': 'Design',
                   'main_category_Fashion': 'Fashion', 
                   'main_category_Film & Video': 'Film_Video',
                   'main_category_Food': 'Food', 
                   'main_category_Games': 'Games', 
                   'main_category_Journalism': 'Journalism',
                   'main_category_Music': 'Music', 
                   'main_category_Photography': 'Photography',
                   'main_category_Publishing': 'Publishing', 
                   'main_category_Technology': 'Technology',
                   'main_category_Theater': 'Theater'}, inplace=True)
df.columns

Index(['success', 'goal_usd', 'country_US', 'cam_duration', 'prep_duration',
       'desc_length', 'staff_pick_True', 'Comics', 'Crafts', 'Dance', 'Design',
       'Fashion', 'Film_Video', 'Food', 'Games', 'Journalism', 'Music',
       'Photography', 'Publishing', 'Technology', 'Theater'],
      dtype='object')

In [6]:
X, y = df.drop(['success'], axis=1), df.success
X_mid, X_test, y_mid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_mid, y_mid, test_size=0.2, random_state=42)

In [7]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_val_sc = scaler.transform(X_val)

## Random Forest + Gridsearch CV

In [8]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

rf_ac = rf.score(X_val, y_val)
rf_precision = precision_score(y_val, rf_pred)
rf_recall = recall_score(y_val, rf_pred)
rf_f1 = f1_score(y_val, rf_pred)

rf_y_score = rf.predict_proba(X_val)[:, 1]
rf_fpr, rf_tpr, auc_thresholds = roc_curve(y_val, rf_y_score)
roc_auc_rf = auc(rf_fpr, rf_tpr)

print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.69      0.62      0.65     12303
           1       0.75      0.81      0.78     17963

    accuracy                           0.73     30266
   macro avg       0.72      0.71      0.72     30266
weighted avg       0.73      0.73      0.73     30266



In [9]:
print('Random Forest validation metrics: \n Accuracy: %.4f \n Precision: %.4f \n Recall: %.4f \n F1: %.4f \n ROC: %.4f' %
        (rf_ac, 
         rf_precision, 
         rf_recall,
         rf_f1,
         roc_auc_rf)
     )

Random Forest validation metrics: 
 Accuracy: 0.7322 
 Precision: 0.7550 
 Recall: 0.8124 
 F1: 0.7827 
 ROC: 0.7942


In [10]:
confusion_matrix(y_val, rf_pred)

array([[ 7567,  4736],
       [ 3369, 14594]])

In [7]:
# random forest grid search to minimize FP - improve precision
n_estimators = [10,100,200,300,400,500,600,1000]
criterion = ['gini', 'entropy']
param_grid = dict(n_estimators = n_estimators, criterion = criterion)
rf_grid_search = RandomForestClassifier()

In [8]:
rf_grid = GridSearchCV(rf_grid_search, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
rf_grid.fit(X_train, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [11]:
rf_grid_preds = rf_grid.predict(X_val)
rf_best_params = rf_grid.best_params_
rf_best_estimator = rf_grid.best_estimator_
rf_best_cm = confusion_matrix(y_val,rf_grid_preds)
rf_best_cr = classification_report(y_val,rf_grid_preds)
print(rf_best_params, rf_best_estimator, rf_best_cm, rf_best_cr)

{'criterion': 'entropy', 'n_estimators': 600} RandomForestClassifier(criterion='entropy', n_estimators=600) [[ 7560  4743]
 [ 3260 14703]]               precision    recall  f1-score   support

           0       0.70      0.61      0.65     12303
           1       0.76      0.82      0.79     17963

    accuracy                           0.74     30266
   macro avg       0.73      0.72      0.72     30266
weighted avg       0.73      0.74      0.73     30266



In [10]:
rf_y_score = rf_grid.predict_proba(X_val)[:, 1]
rf_grid_fpr, rf_grid_tpr, auc_thresholds = roc_curve(y_val, rf_y_score)
roc_auc_rf_grid = auc(rf_grid_fpr, rf_grid_tpr)
print(roc_auc_rf_grid)

0.7958826507415839


## Logistic Regression + GridsearchCV

In [11]:
lr = LogisticRegression()
lr.fit(X_train_sc, y_train)
lr_pred = lr.predict(X_val_sc)
lr_confusion = confusion_matrix(y_val, lr_pred)

lr_ac = lr.score(X_val_sc, y_val)
lr_precision = precision_score(y_val, lr_pred)
lr_recall = recall_score(y_val, lr_pred)
lr_f1 = f1_score(y_val, lr_pred)

print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.69      0.53      0.60     12303
           1       0.72      0.84      0.77     17963

    accuracy                           0.71     30266
   macro avg       0.70      0.68      0.69     30266
weighted avg       0.71      0.71      0.70     30266



In [12]:
print('Logistic Regression validation metrics: \n Accuracy: %.4f \n Precision: %.4f \n Recall: %.4f \n F1: %.4f' %
        (lr_ac, 
         lr_precision, 
         lr_recall,
         lr_f1)
     )

Logistic Regression validation metrics: 
 Accuracy: 0.7112 
 Precision: 0.7218 
 Recall: 0.8354 
 F1: 0.7745


In [8]:
# logistic regression grid search to minimize FP - improve precision
penalty = ['l1', 'l2']
C = [0.001,0.01,0.1,1,10,100,1000]
param_grid = dict(C=C, penalty=penalty)
lr_grid_search = LogisticRegression()
lr_grid = GridSearchCV(lr_grid_search, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1, refit = True)
lr_grid.fit(X_train_sc, y_train)
lr_grid_preds = lr_grid.predict(X_val_sc)

lr_grid_best_params = lr_grid.best_params_
lr_grid_best_estimator = lr_grid.best_estimator_
lr_grid_best_cm = confusion_matrix(y_val,lr_grid_preds)
lr_grid_best_cr = classification_report(y_val,lr_grid_preds)
print(lr_grid_best_params, lr_grid_best_estimator, lr_grid_best_cm, lr_grid_best_cr)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


        nan 0.77385393        nan 0.77485187        nan 0.77507374
        nan 0.7750438 ]


{'C': 100, 'penalty': 'l2'} LogisticRegression(C=100) [[ 6502  5801]
 [ 2920 15043]]               precision    recall  f1-score   support

           0       0.69      0.53      0.60     12303
           1       0.72      0.84      0.78     17963

    accuracy                           0.71     30266
   macro avg       0.71      0.68      0.69     30266
weighted avg       0.71      0.71      0.70     30266



In [9]:
lr_grid_y_score = lr_grid.predict_proba(X_val)[:, 1]
lr_grid_fpr, lr_grid_tpr, auc_thresholds = roc_curve(y_val, lr_grid_y_score)
roc_auc_lr_grid = auc(lr_grid_fpr, lr_grid_tpr)
print(roc_auc_lr_grid )

0.5029882041570825


In [13]:
# test comparison

# Logistic Regression
scaler = StandardScaler()
X_mid_sc = scaler.fit_transform(X_mid)
X_test_sc = scaler.transform(X_test)

lr = LogisticRegression(C=100)
lr.fit(X_mid_sc, y_mid)
lr_pred = lr.predict(X_test_sc)

lr_ac = lr.score(X_test_sc, y_test)
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)

lr_y_score = lr.predict_proba(X_test)[:, 1]
lr_fpr, lr_tpr, auc_thresholds = roc_curve(y_test, lr_y_score)
roc_auc_lr = auc(lr_fpr, lr_tpr)

# Random Forst

rf = RandomForestClassifier(criterion='entropy', n_estimators=600)
rf.fit(X_mid, y_mid)
rf_pred = rf.predict(X_test)

rf_ac = rf.score(X_test, y_test)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)

rf_y_score = rf.predict_proba(X_test)[:, 1]
rf_fpr, rf_tpr, auc_thresholds = roc_curve(y_test, rf_y_score)
roc_auc_rf = auc(rf_fpr, rf_tpr)

In [15]:
print('Logistic Regression test score: \n Accuracy: %.4f \n Precision: %.4f \n Recall: %.4f \n F1: %.4f \n ROC: %.4f' %
        (lr_ac, 
         lr_precision, 
         lr_recall,
         lr_f1,
         roc_auc_lr)
     )

print('Random Forest test score: \n Accuracy: %.4f \n Precision: %.4f \n Recall: %.4f \n F1: %.4f \n ROC: %.4f' %
        (rf_ac, 
         rf_precision, 
         rf_recall,
         rf_f1,
         roc_auc_rf)
     )

Logistic Regression test score: 
 Accuracy: 0.7101 
 Precision: 0.7175 
 Recall: 0.8423 
 F1: 0.7749 
 ROC: 0.5029
Random Forest test score: 
 Accuracy: 0.7276 
 Precision: 0.7499 
 Recall: 0.8105 
 F1: 0.7790 
 ROC: 0.7890
