## Naive Baise ##

Naive Baise modeling notebook

In [1]:
# for preprocessing/eda models
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# balancing
from imblearn.over_sampling import SMOTE

# accuracy metrics and data split models
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', 500) # display max 500 rows
pd.set_option('display.max_rows', 500)



In [2]:
# read in data 
all_data = pd.read_csv('../all_model_data.csv', index_col = 0)
all_data.head()

Unnamed: 0,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Administrative_Duration_Scaled,Informational_Duration_Scaled,ProductRelated_Duration_Scaled,BounceRates_Scaled,ExitRates_Scaled,PageValues_Scaled,ExitRatesImpute_Scaled,totalFracAdmin_Scaled,totalFracInfo_Scaled,totalFracProd_Scaled,BounceExitAvg_Scaled,BounceExitW1_Scaled,BounceExitW2_Scaled,BounceExitW3_Scaled,BounceExitW4_Scaled,BouncePageRatio_Scaled,ExitPageRatio_Scaled,InfoPageRatio_Scaled,ProdRelPageRatio_Scaled,InfoBounceRatio_Scaled,AdminBounceRatio_Scaled,ProdRelBounceRatio_Scaled,InfoExitRatio_Scaled,ProdRelExitRatio_Scaled,Administrative_Duration_Scaled_Norm,Informational_Duration_Scaled_Norm,ProductRelated_Duration_Scaled_Norm,BounceRates_Scaled_Norm,ExitRates_Scaled_Norm,PageValues_Scaled_Norm,ExitRatesImpute_Scaled_Norm,totalFracAdmin_Scaled_Norm,totalFracInfo_Scaled_Norm,totalFracProd_Scaled_Norm,BounceExitAvg_Scaled_Norm,BounceExitW1_Scaled_Norm,BounceExitW2_Scaled_Norm,BounceExitW3_Scaled_Norm,BounceExitW4_Scaled_Norm,BouncePageRatio_Scaled_Norm,ExitPageRatio_Scaled_Norm,InfoPageRatio_Scaled_Norm,ProdRelPageRatio_Scaled_Norm,InfoBounceRatio_Scaled_Norm,AdminBounceRatio_Scaled_Norm,ProdRelBounceRatio_Scaled_Norm,InfoExitRatio_Scaled_Norm,ProdRelExitRatio_Scaled_Norm,VisitorType_bin_1,VisitorType_bin_2,VisitorType_bin_3,Month_bin_1,Month_bin_2,Month_bin_3,Month_bin_4,SpecialDay_0.0,SpecialDay_0.2,SpecialDay_0.4,SpecialDay_0.6,SpecialDay_0.8,SpecialDay_1.0,Browser_Bin_1,Browser_Bin_2,Browser_Bin_3,TrafficType_Bin_1,TrafficType_Bin_2,TrafficType_Bin_3,Region_Bin_1,Region_Bin_2,Region_Bin_3,OperatingSystems_Bin_1,OperatingSystems_Bin_2,OperatingSystems_Bin_3,Informational_Duration_Scaled_Bin,PageValues_Scaled_Bin,totalFracInfo_Scaled_Bin,BouncePageRatio_Scaled_Bin,ExitPageRatio_Scaled_Bin,InfoPageRatio_Scaled_Bin,ProdRelPageRatio_Scaled_Bin,InfoBounceRatio_Scaled_Bin,InfoExitRatio_Scaled_Bin,totalFracProd_Bin,Administrative_Duration_Norm,Informational_Duration_Norm,ProductRelated_Duration_Norm,BounceRates_Norm,ExitRates_Norm,PageValues_Norm,ExitRatesImpute_Norm,totalFracAdmin_Norm,totalFracInfo_Norm,totalFracProd_Norm,BounceExitAvg_Norm,BounceExitW1_Norm,BounceExitW2_Norm,BounceExitW3_Norm,BounceExitW4_Norm,BouncePageRatio_Norm,ExitPageRatio_Norm,InfoPageRatio_Norm,ProdRelPageRatio_Norm,InfoBounceRatio_Norm,AdminBounceRatio_Norm,ProdRelBounceRatio_Norm,InfoExitRatio_Norm,ProdRelExitRatio_Norm,Administrative_Duration_Norm_Scaled,Informational_Duration_Norm_Scaled,ProductRelated_Duration_Norm_Scaled,BounceRates_Norm_Scaled,ExitRates_Norm_Scaled,PageValues_Norm_Scaled,ExitRatesImpute_Norm_Scaled,totalFracAdmin_Norm_Scaled,totalFracInfo_Norm_Scaled,totalFracProd_Norm_Scaled,BounceExitAvg_Norm_Scaled,BounceExitW1_Norm_Scaled,BounceExitW2_Norm_Scaled,BounceExitW3_Norm_Scaled,BounceExitW4_Norm_Scaled,BouncePageRatio_Norm_Scaled,ExitPageRatio_Norm_Scaled,InfoPageRatio_Norm_Scaled,ProdRelPageRatio_Norm_Scaled,InfoBounceRatio_Norm_Scaled,AdminBounceRatio_Norm_Scaled,ProdRelBounceRatio_Norm_Scaled,InfoExitRatio_Norm_Scaled,ProdRelExitRatio_Norm_Scaled
0,1,1,1,1,False,False,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339602,0.19895,0.0,0.196854,0.0,0.0,1.0,0.388586,0.376399,0.364063,0.160423,0.16993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,0.0,0.0,0.0,0.164848,0.045231,0.0,0.044788,0.0,0.0,1.0,0.186358,0.180863,0.17536,0.191778,0.197112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.996659,-0.492101,-2.096783,1.733188,1.982547,-0.531818,1.982622,-1.005365,-0.515133,0.757905,1.76066,1.784062,1.80832,1.738774,1.718535,-0.366273,-0.496257,-0.298863,-0.532522,-0.39044,-1.029711,-0.986837,-0.491352,-2.077588
1,2,2,1,2,False,False,0.0,0.0,0.001,0.0,0.5,0.0,0.499561,0.0,0.0,1.0,0.25,0.2,0.15,0.3,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5e-05,0.0,0.0,0.031306,0.0,0.177272,0.0,0.175783,0.0,0.0,1.0,0.342421,0.320879,0.294237,0.130272,0.142518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006663,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,0.0,0.0,6.990788,0.0,0.040278,0.0,0.039977,0.0,0.0,1.0,0.162123,0.151301,0.137884,0.171354,0.179548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.184085,-0.996659,-0.492101,-1.074189,-0.974179,1.569866,-0.531818,1.57393,-1.005365,-0.515133,0.757905,1.171289,1.064098,0.903147,1.245637,1.299009,-0.366273,-0.496257,-0.298863,-0.532522,-0.39044,-1.029711,-0.986837,-0.491352,-1.190272
2,4,1,9,3,False,False,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339602,0.19895,0.0,0.196854,0.0,0.0,1.0,0.388586,0.376399,0.364063,0.160423,0.16993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,0.0,0.0,0.0,0.164848,0.045231,0.0,0.044788,0.0,0.0,1.0,0.186358,0.180863,0.17536,0.191778,0.197112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.996659,-0.492101,-2.096783,1.733188,1.982547,-0.531818,1.982622,-1.005365,-0.515133,0.757905,1.76066,1.784062,1.80832,1.738774,1.718535,-0.366273,-0.496257,-0.298863,-0.532522,-0.39044,-1.029711,-0.986837,-0.491352,-2.077588
3,3,2,2,4,False,False,0.0,0.0,4.2e-05,0.25,0.7,0.0,0.699736,0.0,0.0,1.0,0.475,0.43,0.385,0.52,0.565,0.0,0.0,0.0,0.0,0.0,0.0,1.705531e-07,0.0,1e-06,0.0,0.0,0.006454,0.314382,0.190387,0.0,0.188576,0.0,0.0,1.0,0.375055,0.362113,0.348861,0.150307,0.16007,0.0,0.0,0.0,0.0,0.0,0.0,0.142554,0.0,0.001151,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,0.0,0.0,1.513204,0.150599,0.043425,0.0,0.043045,0.0,0.0,1.0,0.179783,0.173713,0.167491,0.1857,0.19149,0.0,0.0,0.0,0.0,0.0,0.0,1.383607,0.0,3.968822,-0.996659,-0.492101,-1.875436,1.499177,1.832073,-0.531818,1.834505,-1.005365,-0.515133,0.757905,1.600768,1.609932,1.618258,1.592022,1.58424,-0.366273,-0.496257,-0.298863,-0.532522,-0.39044,-1.029711,0.62373,-0.491352,-1.788555
4,3,3,1,4,True,False,0.0,0.0,0.009809,0.1,0.25,0.0,0.249341,0.0,0.0,1.0,0.175,0.16,0.145,0.19,0.205,0.0,0.0,0.0,0.0,0.0,0.0,0.0001003332,0.0,0.000873,0.0,0.0,0.089834,0.254789,0.136293,0.0,0.135385,0.0,0.0,1.0,0.315545,0.303794,0.291588,0.106852,0.114152,0.0,0.0,0.0,0.0,0.0,0.0,0.314969,0.0,0.028814,True,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,0.0,0.0,14.72749,0.118607,0.030107,0.0,0.029966,0.0,0.0,1.0,0.147922,0.14232,0.136505,0.153329,0.158568,0.0,0.0,0.0,0.0,0.0,0.0,1.698486,0.0,24.686052,-0.996659,-0.492101,0.057515,0.97376,0.72246,-0.531818,0.723407,-1.005365,-0.515133,0.757905,0.825942,0.845382,0.869842,0.81042,0.797889,-0.366273,-0.496257,-0.298863,-0.532522,-0.39044,-1.029711,0.990261,-0.491352,-0.279806


In [3]:
# select X and y 
features = all_data.drop('Revenue', axis =1) #features
target = all_data['Revenue'] #target
print(all_data.shape)
print(features.shape)
print(target.shape)

(12330, 137)
(12330, 136)
(12330,)


In [4]:
X = features[['ProdRelPageRatio_Scaled_Bin','totalFracAdmin_Scaled','Administrative_Duration_Scaled'
             ,'BounceRates_Norm_Scaled', 'ExitRates_Scaled','SpecialDay_1.0']]
y = target

In [5]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []

#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):

    #Create a Gaussian Classifier
    gnb = GaussianNB()

    # create training and testing vars
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, shuffle=True)
        
    # this is the formula after you split the dataset
    sm = SMOTE(random_state=123, sampling_strategy = 'minority')
    x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    
    #Train the model using the training sets
    gnb.fit(x_train_res, y_train_res)
    
    #Predict the response for test dataset
    pred_y = gnb.predict(X_test)

    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=123, shuffle=True)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(gnb, x_train_res, y_train_res, cv=kfold, scoring=scoring)

    #calculate AUC
    clf_roc_auc = roc_auc_score(y_test, pred_y)
    
    #calculate average f1-score and AUC
    f1_score_lst.append(precision_recall_fscore_support(y_test, pred_y, average='weighted')[2])
    auc_lst.append(clf_roc_auc)

# display average AUC and F1 score
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))
    
# Is our model still predicting just one class?
print('Model is predicting ',np.unique( pred_y ),'class' )

# Print accuracy score
print('Accuracy of classifier on test set: {:.3f}'.format(gnb.score(X_test, y_test)))
    
# Display 10-fold cross validation average accuracy
print("10-fold cross validation average accuracy of clf_0: %.3f" % (results.mean()))
    
# calculate cunfusion matrix
confusion_matrix_y = confusion_matrix(y_test, pred_y)
print('Confusion Matrix for Classfier:')
print(confusion_matrix_y)

print('Classification Report for Classfier:')
print(classification_report(y_test, pred_y))



F1 0.8461; AUC 0.8188 
Model is predicting  [False  True] class
Accuracy of classifier on test set: 0.832
10-fold cross validation average accuracy of clf_0: 0.827
Confusion Matrix for Classfier:
[[2589  501]
 [ 122  487]]
Classification Report for Classfier:
              precision    recall  f1-score   support

       False       0.95      0.84      0.89      3090
        True       0.49      0.80      0.61       609

    accuracy                           0.83      3699
   macro avg       0.72      0.82      0.75      3699
weighted avg       0.88      0.83      0.85      3699

