In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import numpy as np
import re
import os
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pylab
import matplotlib as mpl
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Use the fully-qualified option name: the bare 'max_columns' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_columns' and raises OptionError there.
pd.set_option('display.max_columns', 1000)
# Plot styling: apply matplotlib's 'ggplot' style sheet first, then seaborn's 'white' style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size (width, height) for every figure created after this cell.
pylab.rcParams['figure.figsize'] = 12,8

In [None]:
# The name suggests this toggles saving of processed data, but it is not read by any
# visible cell — NOTE(review): confirm it is still needed.
SAVE_PROCESSED_DATA = True

In [None]:
# All locations are resolved relative to the directory the notebook is run from.
scriptDir = os.getcwd()

# Relative folders for the input dataset and for generated output.
relPath = r"../Customer Segmentation/"
relPathOutput = r"../output/"

# Absolute-style paths built from the working directory and the relative folders.
sampleFilePath = os.path.join(scriptDir, relPath, 'whole_dataset.csv')
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [None]:
# Read the whole dataset and discard the 'companyid' column in a single expression.
df = pd.read_csv(
    sampleFilePath,
    sep=',',
    header=0,
    quotechar='"',
    encoding='latin1',
).drop(columns=['companyid'])

# Prepare Data For Consumption

In [None]:
# Name of the column to predict (kept as a one-element list).
Target = ['rebate(%)']

# Model input columns, grouped by the prefix they share; the order is unchanged.
Features = [
    # hash-encoded columns
    'HashEncode_0', 'HashEncode_1', 'HashEncode_2', 'HashEncode_3',
    'HashEncode_4', 'HashEncode_5', 'HashEncode_6', 'HashEncode_7',
    # trip flag and scaled numeric columns
    'Round_trip',
    'Scaled_gross_log',
    'Scaled_Distance_log',
    'Scaled_Total Flight Time (Dec)',
    # country indicator columns
    'Country_France', 'Country_Germany', 'Country_Italy', 'Country_Monaco',
    'Country_Other', 'Country_Sweden', 'Country_Switzerland', 'Country_Turkey',
    'Country_United Kingdom', 'Country_United States',
    # product indicator columns
    'Product_CJ - Adhoc', 'Product_Seat Sales', 'Product_Tour Ops',
    # passenger-range columns
    'passengersRange_1', 'passengersRange_2', 'passengersRange_3', 'passengersRange_4',
    # class and category columns
    'Class', 'categ_aircraft',
    'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'categ_5',
]

In [None]:
# Hold out 33% of the rows as the test set; random_state pins the shuffle.
# Target is a one-element list, so df[Target] (and therefore y_train / y_test) is a
# single-column DataFrame rather than a Series.
X_train, X_test, y_train, y_test = train_test_split(df[Features], df[Target], test_size = 0.33, random_state = 0)

# Ensembling & Stacking models

## base-line classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import pydotplus

In [None]:
# Baseline: one decision tree using the entropy split criterion, fitted on the training split.
model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)
# Predictions for the held-out test rows.
y_test_hat = model.predict(X_test)   

# Share of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_test, y_test_hat)
print('Accuracy_score: %.2f%%' % (accuracy * 100))

In [None]:
# dot_data = tree.export_graphviz(model, out_file=None, feature_names=Features, class_names=Target,
#                                 filled=True, rounded=True, special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data)
# graph.write_pdf('airpartner.pdf')

### Feature extraction by using random forest
- split the train dataset into trainset and validation set

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Split the training data again: 80% stays for fitting, the rest becomes a validation set.
# val_data / val_y are not referenced by any later visible cell.
train_data, val_data, train_y, val_y = train_test_split(X_train, y_train, train_size=0.8, random_state=21)

In [None]:
# Seed numpy's global RNG; the forest below is built without an explicit random_state.
np.random.seed(21)

# 50-tree random forest fitted on the training subset (named rf_reg, but it is a classifier).
rf_reg = RandomForestClassifier(n_estimators = 50, verbose=1)
rf_reg.fit(train_data, train_y)

# Turns a (feature, importance) pair into a two-element list, i.e. one table row.
combine_lists = lambda item: [item[0], item[1]]

# One row per feature, ordered from most to least important.
feature_imp = pd.DataFrame(
    [combine_lists(pair) for pair in zip(train_data.columns, rf_reg.feature_importances_)],
    columns=['Feature', 'Importance'],
)
feature_imp = feature_imp.sort_values(by='Importance', ascending=False)

In [None]:
# Show the ten highest-importance features (feature_imp is sorted in descending order).
feature_imp.head(10)

### Keep all features (including constructed ones) whose importance is greater than 0.001

In [None]:
# Names of the features whose random-forest importance exceeds 0.001.
# NOTE(review): filter_feature is not used by any later visible cell; the models below
# are trained on every column in Features.
filter_feature = feature_imp[feature_imp['Importance'] > 0.001]['Feature'].tolist()

### Visualize the basic prediction

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier

In [None]:
# 10-fold stratified splitter shared by cross_val_score, every GridSearchCV and the
# learning curves in the cells below.
kfold = StratifiedKFold(n_splits=10)

In [None]:
# Seed shared by every candidate model that accepts a random_state.
random_state = 2

# Candidate models, in the order the results table in the next cell expects.
classifiers = [
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                       random_state=random_state, learning_rate=0.1),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    SVC(random_state=random_state),
    XGBClassifier(),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state = random_state),
    LinearDiscriminantAnalysis(),
    BaggingClassifier(),
]
# classifiers.append(GaussianProcessClassifier())

In [None]:
# Accuracy of every candidate model over the shared folds: one score array per model.
cv_results = [
    cross_val_score(classifier, X_train, y = y_train, scoring = "accuracy", cv = kfold, n_jobs=4)
    for classifier in classifiers
]

# Mean and spread of the fold scores for each model.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# Display names, listed in the same order as the `classifiers` list.
algorithm_names = [
    "DecisionTree", "AdaBoost", "RandomForest", "ExtraTrees", "GradientBoosting", "SVC",
    "XGboost", "MultipleLayerPerceptron", "KNeighboors",
    "LogisticRegression", "LinearDiscriminantAnalysis", 'Bagging',
]

cv_res = pd.DataFrame({"CrossValMeans": cv_means,
                       "CrossValerrors": cv_std,
                       "Algorithm": algorithm_names})

In [None]:
# Horizontal bar chart of the mean cross-validated accuracy per algorithm.
g = sns.barplot(x = "CrossValMeans", y = "Algorithm", data = cv_res, color='m')
plt.title('Machine Learning Algorithm Accuracy Score \n')
# CrossValMeans holds raw accuracy fractions in [0, 1], so the axis must not be labelled "%".
plt.xlabel('Mean cross-validated accuracy (0-1)')
plt.ylabel('Algorithm')
plt.show()

## Other classifiers

### Hyperparameter tuning for the best models
- Bagging, AdaBoost, RandomForest and ExtraTrees scored above 70 in the experiment, so I chose those classifiers for the ensemble modeling.

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Empty results table; the tuning cells below append one row per model with
# `evaluation.loc[r] = [...]`, in this column order.
# NOTE(review): the "R-squared" columns are filled from `classifier.score(...)`, which is
# presumably accuracy for these classifiers — confirm the column labels are intended.
evaluation = pd.DataFrame({'Model': [],
                           'Parameters':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '10-Fold Cross Validation':[]})

In [None]:
def adjustedR2(r2,n,k):
    """Return ``r2`` lowered by the penalty ``(k - 1) / (n - k) * (1 - r2)``.

    Args:
        r2: The score to adjust.
        n: Sample count (callers in this notebook pass the number of rows).
        k: Parameter count (callers in this notebook pass ``len(Features)``).

    Returns:
        The penalised score. No guard is applied for ``n == k``.
    """
    penalty_ratio = (k - 1) / (n - k)
    shortfall = 1 - r2
    return r2 - penalty_ratio * shortfall

In [None]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """Return the mean absolute percentage error between two equally long sequences.

    Both inputs are converted to flat float arrays first, so plain lists, Series and
    single-column frames are accepted and values are compared by position (never by
    pandas index label).

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like), same number of elements as ``y_true``.

    Returns:
        ``mean(|(y_true - y_pred) / y_true|) * 100``. Entries where ``y_true`` is zero
        divide by zero and make the result inf or nan.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((actual - predicted) / actual)) * 100

#### AdaBoostClassifier hyperparameter tuning

In [None]:
# Decision tree used as the base learner wrapped by AdaBoost.
DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC, random_state=7)

# Grid over the wrapped tree's settings (keys prefixed "base_estimator__") and AdaBoost's own.
# NOTE(review): the "base_estimator__" prefix and the "SAMME.R" option depend on the installed
# scikit-learn version — confirm they are still accepted.
ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[3,4],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.3,1.5]}

# Exhaustive search scored by accuracy over the shared `kfold` splitter.
gs_adaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

# Fitted on the train_data / train_y subset, not on the full X_train / y_train.
gs_adaDTC.fit(train_data, train_y)

ada_best = gs_adaDTC.best_estimator_
# Notebook-level `pred` now holds this search's test predictions; each later tuning cell
# overwrites it.
pred = gs_adaDTC.predict(X_test)

In [None]:
def score_list(classifier):
    """Summarise a fitted classifier on the notebook's train and test splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``Features``. Test predictions are taken from ``classifier`` itself, so the result
    no longer depends on whatever the global ``pred`` happens to hold.

    Args:
        classifier: A fitted estimator (or fitted GridSearchCV) exposing ``predict``
            and ``score``.

    Returns:
        Tuple ``(rmse, train_score, adjusted_train_score, test_score,
        adjusted_test_score)``, each rounded to three decimals. The scores are
        whatever ``classifier.score`` returns.
    """
    def _round3(value):
        # Same rounding as before: format to three decimals, then back to float.
        return float(format(value, '.3f'))

    test_pred = classifier.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, test_pred))

    # Score each split once and reuse the value for the adjusted variant.
    train_score = classifier.score(X_train, y_train)
    test_score = classifier.score(X_test, y_test)
    n_features = len(Features)

    rmsecm = _round3(rmse)
    rtrcm = _round3(train_score)
    artrcm = _round3(adjustedR2(train_score, X_train.shape[0], n_features))
    rtecm = _round3(test_score)
    artecm = _round3(adjustedR2(test_score, X_test.shape[0], n_features))
    return rmsecm, rtrcm, artrcm, rtecm,artecm

# Score the tuned AdaBoost search and append the result as the next row of `evaluation`.
rmsecm, rtrcm, artrcm, rtecm,artecm = score_list(gs_adaDTC)
# The current row count is the label of the next free row.
r = evaluation.shape[0]
evaluation.loc[r] = ['AdaBoostClassifier',ada_best,rmsecm,rtrcm,artrcm,rtecm,artecm,gs_adaDTC.best_score_]

#### ExtraTrees Classifier hyperparameter tuning

In [None]:
# Tune an ExtraTreesClassifier with an exhaustive grid search.
ExtC = ExtraTreesClassifier()

# Candidate values per parameter; max_depth and bootstrap are held at a single value.
ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["entropy", "gini"]}


# Scored by accuracy over the shared `kfold` splitter.
gs_ExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

# Fitted on the train_data / train_y subset, not on the full X_train / y_train.
gs_ExtC.fit(train_data, train_y)

ExtC_best = gs_ExtC.best_estimator_
# Overwrites the notebook-level `pred` with this search's test predictions.
pred = gs_ExtC.predict(X_test)

In [None]:
# Score the tuned ExtraTrees model. This cell passes the best estimator itself, whereas
# the other tuning cells pass the GridSearchCV object.
rmsecm, rtrcm, artrcm, rtecm,artecm = score_list(ExtC_best)

# Append as the next row of the evaluation table.
r = evaluation.shape[0]
evaluation.loc[r] = ['ExtraTreesClassifier', ExtC_best, rmsecm, rtrcm, artrcm, rtecm, artecm, gs_ExtC.best_score_]

#### RandomForest Classifier hyperparameter tuning

In [None]:
# Tune a RandomForestClassifier with an exhaustive grid search.
RFC = RandomForestClassifier()

# Candidate values per parameter; max_depth, bootstrap and criterion are held at a single value.
rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


# Scored by accuracy over the shared `kfold` splitter.
gs_RFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

# Fitted on the full training split (the AdaBoost and ExtraTrees searches use train_data instead).
gs_RFC.fit(X_train,y_train)

RFC_best = gs_RFC.best_estimator_
# Overwrites the notebook-level `pred` with this search's test predictions.
pred = gs_RFC.predict(X_test)

In [None]:
# Score the tuned RandomForest search and append the result as the next row of `evaluation`.
rmsecm, rtrcm, artrcm, rtecm,artecm = score_list(gs_RFC)

# The current row count is the label of the next free row.
r = evaluation.shape[0]
evaluation.loc[r] = ['RandomForestClassifier', RFC_best, rmsecm, rtrcm, artrcm, rtecm, artecm, gs_RFC.best_score_]

#### GradientBoosting Classifier hyperparameter tuning

In [None]:
# Tune a GradientBoostingClassifier with an exhaustive grid search.
GBC = GradientBoostingClassifier()
# NOTE(review): whether the loss name "deviance" is accepted depends on the installed
# scikit-learn version — confirm.
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] 
              }

# Scored by accuracy over the shared `kfold` splitter.
gs_GBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

# Fitted on the full training split.
gs_GBC.fit(X_train,y_train)

GBC_best = gs_GBC.best_estimator_
# Overwrites the notebook-level `pred` with this search's test predictions.
pred = gs_GBC.predict(X_test)

In [None]:
# Score the tuned GradientBoosting search and append the result as the next row of `evaluation`.
rmsecm, rtrcm, artrcm, rtecm,artecm = score_list(gs_GBC)

# The current row count is the label of the next free row.
r = evaluation.shape[0]
evaluation.loc[r] = ['GradientBoostingClassifier', GBC_best, rmsecm, rtrcm, artrcm, rtecm, artecm, gs_GBC.best_score_]

#### BaggingClassifier hyperparameter tuning

In [None]:
# Tune a BaggingClassifier with an exhaustive grid search.
BC = BaggingClassifier()

# bootstrap / warm_start must be real booleans: the strings "False" and "True" are both
# truthy (so the "False" setting was never actually tried) and are rejected outright by
# scikit-learn versions that validate parameter types.
# NOTE(review): integer max_samples values are absolute sample counts per base estimator —
# confirm that drawing only 2, 3 or 10 rows is intended rather than fractions.
bc_param_grid = {"n_estimators": [10, 30, 50, 100],
              "max_features": [1, 3, 10],
              "max_samples": [2, 3, 10],
              "bootstrap": [False, True],
              "warm_start": [False, True]}

# Scored by accuracy over the shared `kfold` splitter.
gs_BC = GridSearchCV(BC, param_grid = bc_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

# Fitted on the full training split.
gs_BC.fit(X_train, y_train)

BC_best = gs_BC.best_estimator_
# Overwrites the notebook-level `pred` with this search's test predictions.
pred = gs_BC.predict(X_test)

In [None]:
# Score the tuned Bagging search and append the result as the next row of `evaluation`.
rmsecm, rtrcm, artrcm, rtecm,artecm = score_list(gs_BC)

# The current row count is the label of the next free row.
r = evaluation.shape[0]
# Record the tuned Bagging estimator in the 'Parameters' column (this row previously
# stored GBC_best, the gradient-boosting model, by copy-paste mistake).
evaluation.loc[r] = ['BaggingClassifier', BC_best, rmsecm, rtrcm, artrcm, rtecm, artecm, gs_BC.best_score_]

In [None]:
# Display the comparison table: one row per tuned model appended by the cells above.
evaluation

### Plot Learning curves
- Learning curves are a good way to see the overfitting effect on the training set and the effect of the training size on the accuracy

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score curves for ``estimator``.

    Opens a new figure, runs ``learning_curve`` and draws, for each of the two score
    sets, the mean as a marked line and a shaded band of one standard deviation on
    either side.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Figure title.
        X, y: Data handed to ``learning_curve``.
        ylim: Optional ``(low, high)`` pair applied to the y axis.
        cv, n_jobs, train_sizes: Forwarded unchanged to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as the current one.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) per curve; training curve first.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# One learning-curve figure per tuned model, all drawn on the full training split with the
# shared `kfold` splitter. (Title typo "mearning" corrected to "learning".)
g = plot_learning_curve(gs_RFC.best_estimator_,"RF learning curves",X_train,y_train,cv=kfold)
g = plot_learning_curve(gs_ExtC.best_estimator_,"ExtraTrees learning curves",X_train,y_train,cv=kfold)
# g = plot_learning_curve(gs_SVMC.best_estimator_,"SVC learning curves",X_train,y_train,cv=kfold)
g = plot_learning_curve(gs_adaDTC.best_estimator_,"AdaBoost learning curves",X_train,y_train,cv=kfold)
g = plot_learning_curve(gs_GBC.best_estimator_,"GradientBoosting learning curves",X_train,y_train,cv=kfold)

### Plot Confusion Matrix
- plot the normalized and unnormalized confusion matrix

In [None]:
import itertools
from sklearn.metrics import confusion_matrix, recall_score, classification_report

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D confusion matrix array.
        classes: Tick labels, used on both axes.
        normalize: When true, each row is divided by its row sum before printing
            and plotting, and cells are annotated with two decimals.
        title: Plot title.
        cmap: Colour map handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    print("Normalized confusion matrix" if normalize else 'Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the largest value get white text, the rest black.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_colour = "white" if cm[row, col] > half_max else "black"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix.
# `confusion_matrix` is imported directly from sklearn.metrics; the bare name `metrics` is
# never imported in this notebook and raised NameError.
# NOTE(review): `pred` is whatever the most recent tuning cell assigned — confirm which
# model this matrix is meant to describe.
cnf_matrix = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)

# Per-class recall. The previous expression cm[1,1] / (cm[1,0] + cm[1,1]) is the binary
# formula and ignores the remaining columns of a multi-class matrix.
print('Recall metric in the testing dataset: ', recall_score(y_test, pred, average=None))
class_names = ['rebate_0', 'rebate_1','rebate_2','rebate_3','rebate_4']
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, 
                      title='Normalized confusion matrix')

### Graph ML version of DecisionTree

In [None]:
import graphviz 

In [None]:
# Export the fitted baseline decision tree (`model`) to Graphviz DOT format.
# `DTC` cannot be used here: it is only the unfitted template wrapped by AdaBoost, and
# export_graphviz requires a fitted tree.
dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names = Features, class_names = True,
                                filled = True, rounded = True)
# Graphviz source object; evaluate `graph` in a cell or call graph.render() to view it.
graph = graphviz.Source(dot_data) 

### Feature importance of tree based classifiers
- In order to see the most informative features for the rebate prediction, I display the feature importances of the 4 tree-based classifiers.

In [None]:
# 2x2 grid with one feature-importance bar chart per tree-based classifier.
nrows = ncols = 2
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,15))
# tight_layout must come after the figure is created: it was previously called before
# `fig` existed and raised NameError on a fresh kernel.
fig.tight_layout()
plt.subplots_adjust(wspace =0.45, hspace =0.1)

names_classifiers = [("AdaBoosting", ada_best),("ExtraTrees",ExtC_best),("RandomForest",RFC_best),("GradientBoosting",GBC_best)]

nclassifier = 0
for row in range(nrows):
    for col in range(ncols):

        name, classifier = names_classifiers[nclassifier]
        importances = classifier.feature_importances_
        # Positions of the (at most 40) largest importances, largest first.
        indices = np.argsort(importances)[::-1][:40]
        g = sns.barplot(y=X_train.columns[indices], x=importances[indices], orient='h', ax=axes[row][col])
        g.set_xlabel("Relative importance",fontsize=12)
        g.set_title(name + " feature importance")
        nclassifier += 1

In [None]:
# Test-set predictions of each model, one named Series per classifier.
test_Survived_RFC = pd.Series(RFC_best.predict(X_test), name="RFC")
test_Survived_ExtC = pd.Series(ExtC_best.predict(X_test), name="ExtC")
# `DTC_best` is not defined anywhere in the notebook (NameError); the fitted baseline
# decision tree `model` supplies the "DTC" column instead.
# NOTE(review): swap in a tuned tree here if one is added later.
test_Survived_DTC = pd.Series(model.predict(X_test), name="DTC")
test_Survived_AdaC = pd.Series(ada_best.predict(X_test), name="Ada")
test_Survived_GBC = pd.Series(GBC_best.predict(X_test), name="GBC")

# Concatenate all classifier results
ensemble_results = pd.concat([test_Survived_RFC,test_Survived_ExtC,test_Survived_AdaC,test_Survived_GBC, test_Survived_DTC],axis=1)

# Pairwise correlation between the models' predictions.
g= sns.heatmap(ensemble_results.corr(),annot=True,linewidth=1)

## Ensemble modeling
- I chose a voting classifier to combine the predictions coming from the 5 classifiers.
- I passed the argument 'soft' to the voting parameter so that the probability of each vote is taken into account.

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble over the four tuned models plus a decision tree.
# `DTC_best` is not defined anywhere in the notebook (NameError); the baseline decision
# tree `model` is used as the 'dtc' member instead.
# NOTE(review): swap in a tuned tree here if one is added later.
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('dtc', model), ('adac', ada_best), ('gbc', GBC_best)],
                           voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, y_train)

### 6.3 Prediction and Submit results

In [None]:
# Give the predictions the same index as y_test. train_test_split shuffles the rows, so
# y_test keeps the original row labels; a default 0..n-1 index would make the concat below
# align on unrelated labels and fill both columns with NaN.
# The prediction column gets its own name so the CSV does not contain two identically
# named "rebate(%)" columns.
test_rebate = pd.Series(votingC.predict(X_test), index=y_test.index, name="predicted_rebate(%)")

# Actual and predicted rebate side by side, one row per test sample.
results = pd.concat([y_test, test_rebate],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)