In [96]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import model_selection

#Stop warnings
# Silence warnings through the standard filter mechanism instead of replacing
# `warnings.warn` with a no-op: the real function stays intact for any library
# that relies on it, and the suppression can be undone later with
# `warnings.resetwarnings()`.
import warnings
warnings.filterwarnings("ignore")

In [35]:
# Load the raw table. It is never modified afterwards, so this cell can be
# re-run without side effects from a previous run.
raw_data = pd.read_csv('Clothing_Store.csv')

# One-hot encode VALPHON (drop_first=True drops the first level) and swap the
# encoded column(s) in for the original one.
dummies = pd.get_dummies(raw_data[['VALPHON']], drop_first=True)
data = raw_data.drop('VALPHON', axis=1).join(dummies)

# 60/40 train/test split; the fixed seed makes the sampled rows reproducible.
final_train = data.sample(frac=0.6, random_state=450411920)
# drop() returns a new frame with every row that was not sampled, so the
# pop() below does not write into a slice of `data`.
final_test = data.drop(final_train.index)

# pop() removes the response column: final_train/final_test keep only the
# predictors, while y_train/y_test hold the response.
y_train = final_train.pop('RESP')
y_test = final_test.pop('RESP')

# Sanity check: the two parts must partition the data exactly.
assert len(final_train) + len(final_test) == len(data)


### Going to try out some models here

For more information about the ensemble methods used below, see the scikit-learn documentation: http://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting

In [99]:
# Accumulators filled in by the model cells that follow:
#   pred   - test-set predictions, one array per model
#   method - display name of each model
#   model  - the estimator objects themselves
pred, method, model = [], [], []

In [100]:
#Adaptive Boosting
# random_state pins the estimator's internal randomness so that a fresh
# "Restart & Run All" reproduces the same fitted model and predictions.
regr = AdaBoostClassifier(learning_rate=1, n_estimators=350, random_state=450411920)
model.append(regr)
# fit() returns the estimator itself, so `regr` and the object stored in
# `model` are the same fitted instance.
regr = regr.fit(final_train, y_train)
adapred = regr.predict(final_test)
pred.append(adapred)
method.append('AdaptiveBoost')

In [101]:
#Random Forest
# Trees are grown without a depth limit (max_depth=None) and nodes may be split
# down to 2 samples. random_state fixes the randomness used while building the
# forest so the predictions are reproducible between runs.
RandomFor = RandomForestClassifier(max_depth=None, min_samples_split=2,
                                   random_state=450411920)
model.append(RandomFor)
RandomFor = RandomFor.fit(final_train, y_train)
randomforpred = RandomFor.predict(final_test)
pred.append(randomforpred)
method.append('Random Forest')

In [102]:
# Show the Random Forest predictions for the test set (long arrays are displayed abbreviated)
randomforpred

array([0, 0, 0, ..., 0, 0, 0])

In [103]:
#Extremely Random Forest
# Same tree settings as the random forest cell (no depth limit, split down to
# 2 samples). random_state fixes the randomised tree construction so the
# predictions are reproducible between runs.
Extratree = ExtraTreesClassifier(max_depth=None, min_samples_split=2,
                                 random_state=450411920)
model.append(Extratree)
Extratree = Extratree.fit(final_train, y_train)
predFinalExtRandomForest = Extratree.predict(final_test)
pred.append(predFinalExtRandomForest)
method.append('Extreme Random Forest')

In [104]:
# Show the Extra Trees predictions for the test set (long arrays are displayed abbreviated)
predFinalExtRandomForest

array([0, 0, 1, ..., 0, 0, 0])

In [105]:
#Gradient Boosting Classifier
# 100 boosting stages of depth-1 trees with a learning rate of 1.0; seeded for
# reproducibility.
GradBoost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
model.append(GradBoost)
# fit() hands back the estimator itself, so fitting and predicting can be chained.
predGradBoost = GradBoost.fit(final_train, y_train).predict(final_test)
pred.append(predGradBoost)
method.append('Gradient Boost')

In [106]:
# Show the Gradient Boosting predictions for the test set (long arrays are displayed abbreviated)
predGradBoost

array([0, 0, 0, ..., 0, 0, 0])

In [107]:
#Naive Bayes
# Gaussian Naive Bayes with default settings.
NaiveB = GaussianNB()
model.append(NaiveB)
# fit() hands back the estimator itself, so fitting and predicting can be chained.
predNaiveB = NaiveB.fit(final_train, y_train).predict(final_test)
pred.append(predNaiveB)
method.append('Naive Bayes')

In [108]:
# Show the Naive Bayes predictions for the test set (long arrays are displayed abbreviated)
predNaiveB

array([0, 0, 0, ..., 1, 1, 1])

In [109]:
#Logistic Regression
# Seeded logistic regression on the raw (unscaled) predictors.
# NOTE(review): the saved confusion matrix shows this model predicting class 0
# for every test row; possibly a scaling/convergence issue - worth confirming.
LogReg = LogisticRegression(random_state=450411920)
model.append(LogReg)
# fit() hands back the estimator itself, so fitting and predicting can be chained.
predLogReg = LogReg.fit(final_train, y_train).predict(final_test)
pred.append(predLogReg)
method.append('Logit Reg')

In [110]:
# Show the logistic regression predictions for the test set (long arrays are displayed abbreviated)
predLogReg

array([0, 0, 0, ..., 0, 0, 0])

We can evaluate how well each of our models performs using a confusion matrix.
https://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/

In [123]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Print one confusion matrix and three summary ratios per fitted model.
for prediction, name in zip(pred, method):
    print('{} confusion matrix is:'.format(name))
    # Rows are the true classes, columns the predicted classes (sorted label order).
    matrix = confusion_matrix(y_test, prediction)
    print(matrix)
    # The first two ratios are column-wise: of all rows predicted as a class, the
    # share that truly belongs to it. A model that never predicts a class has an
    # all-zero column; that case is reported as nan instead of dividing by zero.
    # NOTE(review): index 0 is the smallest RESP label - confirm that this is the
    # class meant by 'Yes' in the messages below.
    print('Accuracy of predicting Yes is %.3f' % _safe_ratio(matrix[0][0], matrix[:, 0].sum()))
    print('Accuracy of predicting No is %.3f' % _safe_ratio(matrix[1][1], matrix[:, 1].sum()))
    # Overall accuracy = correct predictions (diagonal) / all test observations.
    print('Overall accuracy is %.3f' % _safe_ratio(matrix[0][0] + matrix[1][1], matrix.sum()))

AdaptiveBoost confusion matrix is:
[[6928  319]
 [ 982  467]]
Accuracy of predicting Yes is 0.876
Accuracy of predicting No is 0.594
Overall accuracy is 0.850
Random Forest confusion matrix is:
[[7008  239]
 [1098  351]]
Accuracy of predicting Yes is 0.865
Accuracy of predicting No is 0.595
Overall accuracy is 0.846
Extreme Random Forest confusion matrix is:
[[7009  238]
 [1131  318]]
Accuracy of predicting Yes is 0.861
Accuracy of predicting No is 0.572
Overall accuracy is 0.843
Gradient Boost confusion matrix is:
[[6970  277]
 [1019  430]]
Accuracy of predicting Yes is 0.872
Accuracy of predicting No is 0.608
Overall accuracy is 0.851
Naive Bayes confusion matrix is:
[[7202   45]
 [1436   13]]
Accuracy of predicting Yes is 0.834
Accuracy of predicting No is 0.224
Overall accuracy is 0.830
Logit Reg confusion matrix is:
[[7247    0]
 [1449    0]]
Accuracy of predicting Yes is 0.833
Accuracy of predicting No is nan
Overall accuracy is 0.833


  


We can see that Gradient Boosting had the highest overall accuracy (0.851), narrowly ahead of Adaptive Boosting (0.850).

#### To interpret a confusion matrix: columns 1 and 2 correspond to the model predicting Yes or No, and the rows correspond to whether the observation was actually Yes or No. (scikit-learn orders the classes by sorted label value, so the first row and column belong to the smallest value of `RESP`.) The [0,0] entry is the number of observations we predicted to be Yes that really were Yes, so it tells us how many Yes observations we predicted correctly. Likewise, the [1,1] entry is how many No's we predicted correctly. The bottom-left [1,0] entry counts observations we predicted as Yes that were actually No, and the [0,1] entry is the reverse. In short, the diagonal entries of the matrix count our correct predictions, and the off-diagonal entries count our mistakes.

### We now have predictions from our models, so we can combine them by taking a majority vote on each observation. Every model in the ensemble gets one vote of equal importance.

In [124]:
# Hard (majority) voting over three of the ensembles above; each gets one vote.
base_estimators = [
    ('AdaBoost', regr),
    ('ExtraTrees', Extratree),
    ('GradientBoost', GradBoost),
]
ensem = VotingClassifier(estimators=base_estimators, voting='hard')

In [125]:
# Fit the voting ensemble on the training predictors, then predict the test set.
# fit() returns the estimator itself, so the two steps can be chained.
ensempred1 = ensem.fit(final_train, y_train).predict(final_test)

In [126]:
# Show the voting ensemble's predictions for the test set (long arrays are displayed abbreviated)
ensempred1

array([0, 0, 0, ..., 0, 0, 0])

In [128]:
# Confusion matrix for the voting ensemble: rows are the true classes, columns
# the predicted classes (sorted label order).
matrix = confusion_matrix(y_test, ensempred1)
print(matrix)

# Column totals = number of test rows predicted as each class. A zero total
# (class never predicted) is reported as nan instead of dividing by zero.
predicted_totals = matrix.sum(axis=0)
total_observations = matrix.sum()
not_available = float('nan')

yes_ratio = matrix[0][0] / predicted_totals[0] if predicted_totals[0] else not_available
no_ratio = matrix[1][1] / predicted_totals[1] if predicted_totals[1] else not_available
# Overall accuracy = correct predictions (diagonal) / all test observations.
overall = (matrix[0][0] + matrix[1][1]) / total_observations if total_observations else not_available

# NOTE(review): index 0 is the smallest RESP label - confirm that this is the
# class meant by 'Yes' in the messages below.
print('Accuracy of predicting Yes is %.3f' % yes_ratio)
print('Accuracy of predicting No is %.3f' % no_ratio)
print('Overall accuracy is %.3f' % overall)
    

[[7003  244]
 [1026  423]]
Accuracy of predicting Yes is 0.872
Accuracy of predicting No is 0.634
Overall accuracy is 0.854


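We see here that taking a majority vote on each prediction slightly improves overall accuracy: 0.854, compared with 0.851 for the best single model (Gradient Boosting).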