# Importing Data and Packages #

In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from precision_recall_cutoff import precision_recall_cutoff

from tqdm import tqdm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, recall_score
from sklearn.feature_selection import RFE, RFECV

pd.set_option('display.max_columns', 50)

store_final = pd.read_csv('store_final.csv')
store_final.head()

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Income_imp,Education_Basic,Education_Graduation,Education_Master,Education_PhD,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Widow,Age,Child,MntTotal,Recency_u50,TotalPurchses,Enr_year,Enr_quarter,Enr_month,Log_Income,Interaction_1,Interaction_2,Interaction_3,Interaction_4,Interaction_5,Interaction_6,Tree_1,Tree_2,Tree_3,Response
0,1970,84835.0,0,0,0,189,104,379,111,189,218,1,4,4,6,1,0,0,0,1,0,0,1,0,0,0,53,0,1190,1,14,2014,2,6,11.348463,259420,4760,0,872,0,0,0,0,0,1
1,1961,57091.0,0,0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,0,1,0,0,0,0,1,0,62,0,577,1,17,2014,2,6,10.952402,21349,1731,0,111,0,0,0,1,0,1
2,1958,67267.0,0,1,0,134,11,59,15,2,30,1,3,2,5,2,0,0,0,1,0,0,0,1,0,0,65,1,251,1,10,2014,2,5,11.116425,7530,502,0,60,0,0,0,1,0,0
3,1967,32474.0,1,1,0,10,0,1,0,0,0,1,1,0,2,7,0,0,0,1,0,0,0,1,0,0,56,1,11,1,3,2014,4,11,10.388195,0,0,0,0,0,0,0,1,0,0
4,1989,21474.0,1,0,0,6,16,24,11,0,34,2,3,1,2,7,0,0,0,1,0,0,0,0,1,0,34,1,91,1,6,2014,3,8,9.974598,3094,91,0,34,0,0,0,1,0,1


In [5]:
#defining input (top 8) and target (Response)
x=store_final[['Recency', 'MntTotal', 'Interaction_2', 'Enr_year', 'MntWines', 'Interaction_4', 'MntMeatProducts', 'NumWebVisitsMonth']]
y=store_final['Response']

#store model results
dt_results = []
rf_results = []
ada_results = []
gb_results = []
xgb_results = []

for i in tqdm(range(0,100)):
    
    #splitting the data
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)
    
    #top 6 variables for RF model
    x_train_6 = x_train.drop(columns = ['MntMeatProducts','NumWebVisitsMonth'])
    x_test_6 = x_test.drop(columns = ['MntMeatProducts','NumWebVisitsMonth'])
    
    ######### Decision Tree ######### 
    
    #building the model with best hyperparameters
    dt_md = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 5, max_depth = 7).fit(x_train, y_train)

    #predicting on test
    dt_pred = dt_md.predict_proba(x_test)[:,1]

    #changing likelihood to labels
    dt_label = precision_recall_cutoff(y_test, dt_pred)
    
    dt_results.append(recall_score(y_test, dt_label))
    
    ######### Random Forest ######### 
    
    #building the model with best hyperparameters
    rf_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 10, n_estimators = 300).fit(x_train,y_train)

    #predicting on test
    rf_pred = rf_md.predict_proba(x_test)[:,1]

    #changing likelihood to labels
    rf_label = precision_recall_cutoff(y_test, rf_pred)
    
    rf_results.append(recall_score(y_test, rf_label))
    
    ######### AdaBoost ######### 
    
    #building the model with best hyperparameters
    ada_md = AdaBoostClassifier(estimator = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 7, max_depth = 3),
                            n_estimators = 300, learning_rate = 0.1).fit(x_train, y_train)

    #predicting on test
    ada_pred = ada_md.predict_proba(x_test)[:,1]

    #changing likelihood to labels
    ada_label = precision_recall_cutoff(y_test, ada_pred)
    
    ada_results.append(recall_score(y_test, ada_label))
    
    ######### Gradient Boosting ######### 
    
    #building the model with best hyperparameters
    gb_md = GradientBoostingClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 10, 
                                       learning_rate = 0.1, n_estimators = 100).fit(x_train, y_train)

    #predicting on test
    gb_pred = gb_md.predict_proba(x_test)[:,1]

    #changing likelihood to labels
    gb_label = precision_recall_cutoff(y_test, gb_pred)

    gb_results.append(recall_score(y_test, gb_label))
    
    ######### XGBoost ######### 
    
    #building the model with best hyperparameters
    xgb_md = XGBClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 7, gamma = 0.1, 
                           min_child_weight = 5, subsample = 0.8, colsample_bytree = 1).fit(x_train, y_train)

    #predicting on test
    xgb_pred = xgb_md.predict_proba(x_test)[:,1]

    #changing likelihood to labels
    xgb_label = precision_recall_cutoff(y_test, xgb_pred)
    
    xgb_results.append(recall_score(y_test, xgb_label))
    
print('dt',np.mean(dt_results))
print('rf',np.mean(rf_results))
print('ada',np.mean(ada_results))
print('gb',np.mean(gb_results))
print('xgb',np.mean(xgb_results))

100%|██████████| 100/100 [08:33<00:00,  5.13s/it]

dt 0.9061197455030164
rf 0.9015352269447995
ada 0.9740804716164407
gb 0.9766730565747344
xgb 0.9750494702560087



