In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# model selection
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# metrics
from sklearn.metrics import precision_score, classification_report, confusion_matrix, f1_score, recall_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold

# smote
from imblearn.over_sampling import SMOTE

# data
from sklearn.model_selection import train_test_split

# ignoring warnings
import warnings
warnings.filterwarnings('ignore')


# Data Import and Split


In [24]:
Train = pd.read_csv(r'../data/Train_year.csv', index_col=0)
y_Train = pd.read_csv(r'../data/y_Train_year.csv', index_col=0)

Test = pd.read_csv(r'../data/Test_year.csv', index_col=0)
y_Test = pd.read_csv(r'../data/y_Test_year.csv', index_col=0)


In [25]:
features = ['ADR', 'Adults', 'ArrivalDateWeekNumber', 'BookingChanges',
       'LeadTime',
        'StaysInWeekNights',
       'PreviousCancellationRate',
       'TotalOfSpecialRequests','x2_avg_booker',
       'x2_good_booker', 'x2_low_booker', 'x2_no_booker', 'x2_super_booker',
       'x4_Low_Season']

# Functions to Test Models


In [26]:
def metrics(X_train, X_val, y_train, pred_train, y_val, pred_val, model):
    print('___________________________________________________________________________________________________________')
    print('                                                     TRAIN                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_train, pred_train))
    print(confusion_matrix(y_train, pred_train))
    print("Score: " + str(model.score(X_train, y_train)))
    print("F1 Score: " + str(f1_score(y_train, pred_train)))

    print('___________________________________________________________________________________________________________')
    print('                                                VALIDATION                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))
    print("Score: " + str(model.score(X_val, y_val)))
    print("F1 Score: " + str(f1_score(y_val, pred_val)))


In [27]:
def avg_score(model, data_to_slice, y_to_slice, columns_to_use, smote=True):
    # apply kfold
    skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
    # create lists to store the results from the different models
    score_train = []
    score_test = []
    f1_list = []
    precision_list = []
    recall_list = []
    tn_avg = 0
    fp_avg = 0
    fn_avg = 0
    tp_avg = 0
    count = 0
    for train_index, test_index in skf.split(data_to_slice[columns_to_use], y_to_slice):
        # get the indexes of the observations assigned for each partition
        X_train, X_val = data_to_slice[columns_to_use].iloc[train_index], data_to_slice[columns_to_use].iloc[test_index]
        y_train, y_val = y_to_slice.iloc[train_index], y_to_slice.iloc[test_index]

        # SMOTE É AQUI
        if smote:

            smote = SMOTE(random_state=11)
            X_train, y_train = smote.fit_resample(X_train, y_train)

        # applies the model
        model_fit = model.fit(X_train, y_train)
        # predicts training
        y_pred_train = model_fit.predict(X_train)
        # predicts validation
        y_pred_val = model_fit.predict(X_val)
        # prints metric results

        #metrics(X_train, X_val, y_train, y_pred_train, y_val, y_pred_val, model)
        tn, fp, fn, tp = confusion_matrix(y_val, y_pred_val).ravel()
        count += 1
        tn_avg += tn
        fp_avg += fp
        fn_avg += fn
        tp_avg += tp

        value_train = model.score(X_train, y_train)
        # check the mean accuracy for the test
        value_test = model.score(X_val, y_val)
        f1_score_val = f1_score(y_val, y_pred_val)
        precision_val = precision_score(y_val, y_pred_val)
        recall_val = recall_score(y_val, y_pred_val)
        # append the accuracies, the time and the number of iterations in the corresponding list
        score_train.append(value_train)
        score_test.append(value_test)
        f1_list.append(f1_score_val)
        precision_list.append(precision_val)
        recall_list.append(recall_val)

    avg_train = round(np.mean(score_train), 3)
    avg_test = round(np.mean(score_test), 3)
    std_train = round(np.std(score_train), 2)
    std_test = round(np.std(score_test), 2)
    avg_f1 = round(np.mean(f1_list), 3)
    std_f1 = round(np.std(f1_list), 2)
    avg_precision = round(np.mean(precision_list), 3)
    std_precision = round(np.std(precision_list), 2)
    avg_recall = round(np.mean(recall_list), 3)
    std_recall = round(np.mean(recall_list), 2)

    tn_avg = tn_avg / count
    fp_avg = fp_avg / count
    fn_avg = fn_avg / count
    tp_avg = tp_avg / count
    #print(confusion_matrix(y_val, y_pred_val))
    print(str(tp_avg) + ' , ' + str(fp_avg) +
          '\n' + str(fn_avg) + ' , ' + str(tn_avg))
    return str(avg_train) + '+/-' + str(std_train),\
        str(avg_test) + '+/-' + str(std_test),\
        'F1 SCORE : ' + str(avg_f1) + '+/-' + str(std_f1), \
        'Precision : ' + str(avg_precision) + '+/-' + str(std_precision), \
        'RECALL :' + str(avg_recall) + '+/-' + str(std_recall)


In [38]:
def Test_models(model, Train, y_Train, Test, y_Test):
    model_fit = model.fit(Train, y_Train)
    y_pred_test = model_fit.predict(Test)
    a = 0
    ones = 0
    for i, x in enumerate(y_Test.IsCanceled):
        #print(i, y_pred_test[i],x)
        if y_pred_test[i] == x:
            a += 1
            if x == 1:
                ones += 1

    print(classification_report(y_Test, y_pred_test))
    print(confusion_matrix(y_Test, y_pred_test))
    print(a)
    print(ones)

In [29]:
# from matplotlib.legend_handler import HandlerLine2D
# n_estimators = np.arange(1, 50)
# train_results = []
# test_results = []
# diff=[]

# for estimator in n_estimators:
#     rf = RandomForestClassifier(n_estimators=estimator, criterion= 'gini')
#     rf.fit(Train, y_Train)
#     train_pred = rf.predict(Train)
#     train_score = f1_score(y_Train,train_pred)
#     train_results.append(train_score)

#     test_pred = rf.predict(Test)
#     val_score = f1_score(y_Test,test_pred)
#     test_results.append(val_score)
#     diff.append(val_score-train_score)

# line1, = plt.plot(n_estimators, train_results, 'b', label='Train F1')
# line2, = plt.plot(n_estimators, test_results, 'r', label='Test F1')
# plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
# plt.ylabel('F1 Score')
# plt.xlabel('n_estimators')
# plt.show()


In [30]:
# df_estimator=pd.DataFrame([pd.Series(n_estimators,name='n_estimators',dtype=int),pd.Series(train_results,name='Train'),pd.Series(test_results,name='Val'),pd.Series(diff,name='Diff')]).T
# df_estimator.sort_values(by='Val',ascending=False)


# Random Forest

In [9]:
rf_gini = RandomForestClassifier(criterion='gini', n_estimators=25)
rf_entropy = RandomForestClassifier(criterion='entropy', n_estimators=25)


In [10]:
avg_score(rf_gini, Train, y_Train, Train.columns, True)


1119.96 , 530.24
715.24 , 4254.96


('0.993+/-0.0',
 '0.812+/-0.0',
 'F1 SCORE : 0.643+/-0.01',
 'Precision : 0.679+/-0.01',
 'RECALL :0.61+/-0.61')

In [11]:
avg_score(rf_entropy, Train, y_Train, Train.columns, True)


1118.96 , 523.92
716.24 , 4261.28


('0.993+/-0.0',
 '0.813+/-0.0',
 'F1 SCORE : 0.643+/-0.01',
 'Precision : 0.681+/-0.01',
 'RECALL :0.61+/-0.61')

In [12]:
prediction = rf_entropy.predict(Test)


In [13]:
a = 0
for i, x in enumerate(y_Test.IsCanceled):
    if prediction[i] == x:
        a += 1
a


14174

Entropy looks like the best choice although both criterias are very similar.


In [14]:
# # forest params
# param_grid = {
#     'max_depth' : range(12,20),
#     'n_estimators' : range(25,27),
#     'min_impurity_decrease' : np.linspace(0.0000001,0.01,100),
#     'min_samples_leaf': range(110,140,10),
#     'min_samples_split':range(50,100,10)
# }

# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf_entropy, param_grid = param_grid, scoring = 'f1')

# grid_search.fit(Train, y_Train)
# grid_search.best_params_, grid_search.best_score_


In [31]:
rf_model = RandomForestClassifier(criterion='entropy', max_depth=15, min_impurity_decrease=0.00010110909090909092,
                                  min_samples_leaf=120, min_samples_split=50, n_estimators=130)
avg_score(rf_model, Train, y_Train, Train.columns, True)


1135.28 , 644.4
699.92 , 4140.8


('0.793+/-0.0',
 '0.797+/-0.0',
 'F1 SCORE : 0.628+/-0.01',
 'Precision : 0.638+/-0.01',
 'RECALL :0.619+/-0.62')

In [32]:
rf_model2 = RandomForestClassifier(
    n_estimators=130, max_depth=15, random_state=10)
avg_score(rf_model2, Train, y_Train, Train.columns, False)


935.92 , 261.88
899.28 , 4523.32


('0.853+/-0.0',
 '0.825+/-0.0',
 'F1 SCORE : 0.617+/-0.01',
 'Precision : 0.781+/-0.01',
 'RECALL :0.51+/-0.51')

In [33]:
Test_models(rf_model, Train, y_Train, Test, y_Test)

              precision    recall  f1-score   support

           0       0.74      0.90      0.81     13183
           1       0.66      0.38      0.48      6826

    accuracy                           0.72     20009
   macro avg       0.70      0.64      0.65     20009
weighted avg       0.71      0.72      0.70     20009

[[11871  1312]
 [ 4244  2582]]
14453
2582


In [37]:
Test_models(rf_model2, Train, y_Train, Test, y_Test)

              precision    recall  f1-score   support

           0       0.75      0.89      0.81     13183
           1       0.66      0.41      0.51      6826

    accuracy                           0.73     20009
   macro avg       0.70      0.65      0.66     20009
weighted avg       0.72      0.73      0.71     20009

[[11741  1442]
 [ 3998  2828]]
14569
2828
0.6622950819672131
0.4142982713155582
0.5097332372025954


# XGBoost 

In [20]:
model = XGBClassifier(random_state = 10 , n_estimators = 130)

In [21]:
# forest params
param_grid = {
    'max_depth' : range(3,8,1),
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'f1')

grid_search.fit(Train, y_Train)
grid_search.best_params_, grid_search.best_score_




({'max_depth': 3}, 0.12671877512707025)

In [39]:
model = XGBClassifier(verbosity = 0, n_estimators = 163, max_depth = 5, random_state = 10, learning_rate = 0.2)
# avg_score(model, Train, y_Train, Train.columns, True)

In [40]:
Test_models(model, Train, y_Train, Test, y_Test)

              precision    recall  f1-score   support

           0       0.76      0.86      0.80     13183
           1       0.63      0.47      0.54      6826

    accuracy                           0.72     20009
   macro avg       0.69      0.66      0.67     20009
weighted avg       0.71      0.72      0.71     20009

[[11280  1903]
 [ 3627  3199]]
14479
3199


# Gaussian Bayes

In [17]:
GaussianNB = GaussianNB(var_smoothing= 0.003511191734215131)

In [19]:
parameter_space = {
    'var_smoothing': np.logspace(0,-9, num=100)
    
}

grid_search = GridSearchCV(GaussianNB, parameter_space, scoring = 'f1')

grid_search.fit(Train[features], y_Train)
grid_search.best_params_

{'var_smoothing': 0.003511191734215131}

In [22]:
avg_score(GaussianNB, Train, y_Train, Train.columns, True)

1453.44 , 2252.4
381.76 , 2532.8


('0.668+/-0.01',
 '0.602+/-0.01',
 'F1 SCORE : 0.525+/-0.01',
 'Precision : 0.392+/-0.01',
 'RECALL :0.792+/-0.79')

In [20]:
Test_models(GaussianNB, Train, y_Train, Test, y_Test)

              precision    recall  f1-score   support

           0       0.80      0.59      0.68     13183
           1       0.47      0.71      0.57      6826

    accuracy                           0.63     20009
   macro avg       0.63      0.65      0.62     20009
weighted avg       0.69      0.63      0.64     20009

[[7780 5403]
 [1989 4837]]
12617
4837
