In [9]:
import pandas as pd
import numpy as np 

# models 
from sklearn.linear_model import LogisticRegression

#metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score, precision_score
from sklearn.model_selection import RepeatedStratifiedKFold

#smote 
from imblearn.over_sampling import SMOTE

# data
from sklearn.model_selection import train_test_split

#ignoring warnings
import warnings
warnings.filterwarnings('ignore')

#GridSearch
from sklearn.model_selection import GridSearchCV

#KNN
from sklearn.neighbors import KNeighborsClassifier

#SVM
from sklearn.svm import SVC

# Data Import and Split 

In [43]:
Train = pd.read_csv(r'../data/Train_year.csv', index_col=0)
y_Train = pd.read_csv(r'../data/y_Train_year.csv', index_col= 0)

Test = pd.read_csv(r'../data/Test_year.csv', index_col= 0)
y_Test = pd.read_csv(r'../data/y_Test_year.csv', index_col= 0)

In [31]:
features = ['ADR', 'LeadTime','StaysInWeekNights', 'TotalOfSpecialRequests',
        'BookingChanges', 'PreviousBookingsNotCanceled', 'RequiredCarParkingSpaces', 'PreviousCancellations',
        'x0_BB', 'x0_SC', 'x1_A', 'x1_B', 'x1_D',
       'x1_E', 'x1_F', 'x1_G', 'x2_avg_booker', 'x2_good_booker',
       'x2_low_booker', 'x2_no_booker', 'x2_super_booker', 'x3_Autumn',
       'x3_Spring', 'x3_Summer', 'x4_Low_Season']

In [13]:
# creating train/val and test dataframes 
X_train_val, X_test, y_train_val, y_test = train_test_split(df_to_models[features], 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=15, 
                                                    shuffle=True, 
                                                    stratify=y
                                                   )

# Functions to Test Models 

In [14]:
def metrics(X_train, X_val, y_train, pred_train , y_val, pred_val, model):
    print('___________________________________________________________________________________________________________')
    print('                                                     TRAIN                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_train, pred_train))
    print(confusion_matrix(y_train, pred_train))
    print("Score: "+ str(model.score(X_train, y_train)))
    print("F1 Score: "+ str(f1_score(y_train, pred_train)))
    print("Precision: "+ str(precision_score(y_train, pred_train)))
    

    print('___________________________________________________________________________________________________________')
    print('                                                VALIDATION                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))
    print("Score: "+ str(model.score(X_val, y_val)))
    print("F1 Score: "+ str(f1_score(y_val, pred_val)))
    print("Precision: "+ str(precision_score(y_val, pred_val)))

In [47]:
def avg_score(model, data_to_slice, y_to_slice, columns_to_use, smote = True):
    # apply kfold
    skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
    # create lists to store the results from the different models 
    score_train = []
    score_test = []
    f1_list = []
    precision_list =[]
    tn_avg = 0
    fp_avg = 0
    fn_avg = 0
    tp_avg = 0
    count = 0
    for train_index, test_index in skf.split(data_to_slice[columns_to_use],y_to_slice):
        # get the indexes of the observations assigned for each partition
        X_train, X_val = data_to_slice[columns_to_use].iloc[train_index], data_to_slice[columns_to_use].iloc[test_index]
        y_train, y_val = y_to_slice.iloc[train_index], y.iloc[test_index]
        
        # SMOTE É AQUI 
        if smote:
             
            smote = SMOTE(random_state = 11)
            X_train, y_train = smote.fit_resample(X_train, y_train)

        # applies the model 
        model_fit = model.fit(X_train, y_train)
        # predicts training 
        y_pred_train =  model_fit.predict(X_train)
        #predicts validation 
        y_pred_val = model_fit.predict(X_val)
        # prints metric results 
        
        #metrics(X_train, X_val, y_train, y_pred_train, y_val, y_pred_val, model)
        tn, fp, fn, tp = confusion_matrix(y_val, y_pred_val).ravel()
        count += 1
        tn_avg += tn
        fp_avg += fp
        fn_avg += fn
        tp_avg += tp

        
        value_train = model.score(X_train, y_train)
        # check the mean accuracy for the test
        value_test = model.score(X_val,y_val)
        f1_score_val = f1_score(y_val, y_pred_val)
        precision_val = precision_score(y_val, y_pred_val)
        # append the accuracies, the time and the number of iterations in the corresponding list
        score_train.append(value_train)
        score_test.append(value_test)
        f1_list.append(f1_score_val)
        precision_list.append(precision_val)
  
    avg_train = round(np.mean(score_train),3)
    avg_test = round(np.mean(score_test),3)
    std_train = round(np.std(score_train),2)
    std_test = round(np.std(score_test),2)
    avg_f1 = round(np.mean(f1_list),3)
    std_f1 = round(np.std(f1_list),2)
    avg_precision = round(np.mean(precision_list),3)
    std_precision = round(np.std(precision_list),2)

    tn_avg = tn_avg / count
    fp_avg = fp_avg / count
    fn_avg = fn_avg / count
    tp_avg = tp_avg / count
    #print(confusion_matrix(y_val, y_pred_val))
    print(str(tp_avg)+ ' , ' + str(tn_avg) + '\n' + str(fp_avg) + ' , ' +  str(fn_avg))
    return str(avg_train) + '+/-' + str(std_train),\
            str(avg_test) + '+/-' + str(std_test) , str(avg_f1) + '+/-' + str(std_f1), avg_f1,\
                ' Precision: ' + str(avg_precision) + '+/-' + str(std_precision), avg_precision

## Logistic Regression

In [48]:
LogReg = LogisticRegression(random_state=11)

avg_score(LogReg, Train[features], y_Train,Train[features].columns, smote = True)


1555.92 , 2398.44
1025.56 , 1640.48


('0.717+/-0.0',
 '0.597+/-0.01',
 '0.539+/-0.01',
 0.539,
 ' Precision: 0.603+/-0.0',
 0.603)

In [20]:
param_grid = {
    'penalty': ['l1','l2','elasticnet','none'],
    'dual':[True, False],
    'C':[0.05, 0.2, 0.5, 1, 2],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50,100,200,500],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    
}
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = LogReg, param_grid = param_grid, scoring = 'precision')

grid_search.fit(X_train_val, y_train_val)
grid_search.best_params_

{'C': 0.05,
 'dual': False,
 'max_iter': 50,
 'multi_class': 'multinomial',
 'penalty': 'l1',
 'solver': 'saga'}

In [49]:
LogReg = LogisticRegression(random_state=11,C= 0.05,dual= False,max_iter= 50,
 multi_class= 'multinomial',penalty= 'l1', solver= 'saga' )

avg_score(LogReg, Train[features], y_Train,Train[features].columns, smote = True)


1556.36 , 2403.44
1020.56 , 1640.04


('0.717+/-0.0',
 '0.598+/-0.01',
 '0.539+/-0.0',
 0.539,
 ' Precision: 0.604+/-0.01',
 0.604)

## KNN Classifier

In [24]:
modelKNN = KNeighborsClassifier()

avg_score(modelKNN, X_train_val, y_train_val,X_train_val.columns, smote = True)

1242.12 , 3239.32
2059.88 , 1956.28


('0.855+/-0.0',
 '0.527+/-0.01',
 '0.382+/-0.01',
 0.382,
 ' Precision: 0.376+/-0.01',
 0.376)

In [25]:
param_grid = {
    'n_neighbors': [5,10,50,100],
    'weights':['uniform', 'distance'],
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10,30,50]
}
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = modelKNN, param_grid = param_grid, scoring = 'precision')

grid_search.fit(X_train_val, y_train_val)
grid_search.best_params_

{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 50, 'weights': 'uniform'}

In [27]:
modelKNN = KNeighborsClassifier(n_neighbors=50, weights='uniform',algorithm='auto',
leaf_size= 10)

avg_score(modelKNN, X_train_val, y_train_val,X_train_val.columns, smote = True)

1280.88 , 3187.56
2111.64 , 1917.52


('0.747+/-0.0',
 '0.526+/-0.0',
 '0.389+/-0.01',
 0.389,
 ' Precision: 0.378+/-0.01',
 0.378)

In [29]:
df_to_models.columns

Index(['ADR', 'Adults', 'ArrivalDateWeekNumber', 'BookingChanges', 'Children',
       'DaysInWaitingList', 'LeadTime', 'PreviousBookingsNotCanceled',
       'RequiredCarParkingSpaces', 'PreviousCancellations',
       'PreviousCancellationRate', 'StaysInWeekendNights', 'StaysInWeekNights',
       'TotalOfSpecialRequests', 'x0_BB', 'x0_SC', 'x1_A', 'x1_B', 'x1_D',
       'x1_E', 'x1_F', 'x1_G', 'x2_avg_booker', 'x2_good_booker',
       'x2_low_booker', 'x2_no_booker', 'x2_super_booker', 'x3_Autumn',
       'x3_Spring', 'x3_Summer', 'x4_Low_Season'],
      dtype='object')