# Imports


In [48]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, recall_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_score

## Reading the file

In [13]:
df=pd.read_csv('hotel_bookings_cleaned_Kbest.csv')

In [14]:
df.head()

Unnamed: 0,deposit_type_No Deposit,deposit_type_Non Refund,lead_time,adr,home_country,previous_cancellations,total_of_special_requests,required_car_parking_spaces,distribution_channel_TA/TO,market_segment_Groups,...,reserved_room_type_D,babies,customer_type_Contract,market_segment_Complementary,market_segment_Corporate,reserved_room_type_L,meal_FB,deposit_type_Refundable,reserved_room_type_B,is_canceled
0,1,0,342,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,737,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,7,75.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,13,75.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,14,98.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## What will we cover in this notebook ?
- train test split
- logistic regression
- random forest
- xgboost classifier
- hyperparameters tuning
- make predictions
- stacking

## Splitting the Data into a training and a testing set

In [16]:
#train test split
X=df.drop('is_canceled',axis=1)
y=df.is_canceled
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=3)


## Test 3 models with Cross validation without hyperparameters tuning

### First let's find out what metrics will we use for our classification problem

In this classification task, we want to reduce the number of false negative, that is to say the number of bookings that we will predict canceled but that will not be. It is also important to reduce the number of false positive, but focusing on false negative is more important since if the hotel choose to allocate another booking to the room, it will be a problem if the first person that have booked the room come (2 families for only one room). It may be expensive for the hotel to deal with it.

#### Therefore, we will use recall, which is a good metric when we want to focus on reducing the number of false negative.

### Second, let's define a function Recall_cv with Cross Validation Strategy

In [28]:
def recall_cv(model):
    recall= np.mean(cross_val_score(model, X_train, y_train, scoring="recall", cv = 5))
    return recall

Before diving directly into hyperparameters tuning, I like to see how various different models perform with default parameters. I tried the following models using 5 fold cross validation to get a baseline. With a validation set basline, we can see how much tuning improves each of the models. 

In [31]:
#Logistic Regression
lr=LogisticRegression(max_iter=3000)
lr.fit(X_train,y_train)
score_lr=recall_cv(lr)
print(f"\nLogistic Regression score: {score_lr}\n")


Logistic Regression score: 0.6121672165997871



In [27]:
#Random Forest
rf=RandomForestClassifier(n_jobs=-1)
rf.fit(X_train,y_train)
score_rf=recall_cv(rf)
print(f"\nRandom Forest score: {score_rf}\n")


Random Forest score: 0.7897479251410138



In [29]:
#XGBoost classifier
xg=XGBClassifier(n_jobs=-1)
xg.fit(X_train,y_train)
score_xg=recall_cv(xg)
print(f"\nXGBoost Classifier score: {score_xg}\n")


XGBoost Classifier score: 0.7692611049699671



In [33]:
scores=pd.DataFrame(index=['Logistic_Reg','Random_Forest','XGboost'],data=[score_lr,score_rf,score_xg],columns=['score']).sort_values('score',ascending=False)
scores

Unnamed: 0,score
Random_Forest,0.789748
XGboost,0.769261
Logistic_Reg,0.612167


## Hyperparameters tuning

In [36]:
# Logistic regression

parameters = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

gs_lr=GridSearchCV(lr,parameters,scoring='recall',n_jobs=-1,cv=3)
gs_lr.fit(X_train,y_train)
gs_lr.best_estimator_

LogisticRegression(C=3792.690190732246, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr=LogisticRegression(C=3792.690190732246, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
lr.fit(X_train,y_train)

In [38]:
# Random Forest

parameters={'n_estimators':(100,200,300,400),
           'max_features':['log2','sqrt',1,0.5],
           'min_samples_split':[2,3],
           'min_samples_leaf':[1,3,5,10,25]}

gs_rf=RandomizedSearchCV(rf,parameters,scoring='recall',n_jobs=-1,n_iter=10,cv=3)
gs_rf.fit(X_train,y_train)
gs_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [41]:
rf=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [40]:
# XGBoost Classifier

parameters={ "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
             "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
             "min_child_weight" : [ 1, 3, 5, 7 ],
             "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
            "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

gs_xg=RandomizedSearchCV(xg,parameters,scoring='recall',n_jobs=-1,n_iter=10,cv=3)
gs_xg.fit(X_train,y_train)
gs_xg.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=12,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [43]:
xg=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=12,
              min_child_weight=7, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=12,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Make predictions

In [44]:
#Make predictions
preds_lr=lr.predict(X_test)
preds_rf=rf.predict(X_test)
preds_xg=xg.predict(X_test)

In [46]:
#Calculate the score of each model
scores_lr=recall_score(preds_lr,y_test)
scores_rf=recall_score(preds_rf,y_test)
scores_xg=recall_score(preds_xg,y_test)

In [47]:
#put the scores in a DataFrame
scores=pd.DataFrame({'scores':[scores_lr,scores_rf,scores_xg]})
scores= scores.rename(index={0:'Logistic_Reg',1:'Random_Forest',2:'XGBoost'}).sort_values('scores',ascending=False)
scores

Unnamed: 0,scores
Random_Forest,0.859945
XGBoost,0.847029
Logistic_Reg,0.801322


## Stacking

In [63]:
voting_clf_hard = VotingClassifier(estimators = [('Logistic_Reg',lr),('rf',rf),('XGBoost',xg)], voting = 'hard') 
voting_clf_soft = VotingClassifier(estimators = [('Logistic_Reg',lr),('rf',rf),('XGBoost',xg)], voting = 'soft') 

In [56]:
voting_clf_hard.fit(X_train,y_train)
preds_voting_hard=voting_clf_hard.predict(X_test)
scores_voting_hard=recall_score(preds_voting_hard,y_test)

In [64]:
#Hyperparameter tuning for the voting classifier.
#in a soft voting classifier you can weight some models more than others.

parameters = {'weights' : [[1,1,1],[1,2,1],[1,1,2],[2,1,1],[2,2,1],[1,2,2],[2,1,2]]}

gs_voting=GridSearchCV(voting_clf_soft,parameters,scoring='recall',n_jobs=-1,cv=3)
gs_voting.fit(X_train,y_train)
gs_voting.best_estimator_

VotingClassifier(estimators=[('Logistic_Reg',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=3000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_wei...
                                        

In [65]:
#Make predictions
voting_clf_soft = VotingClassifier(estimators = [('Logistic_Reg',lr),('rf',rf),('XGBoost',xg)], voting = 'soft',weights=[1, 2, 2]) 
voting_clf_soft.fit(X_train,y_train)
preds_voting_soft=voting_clf_soft.predict(X_test)
scores_voting_soft=recall_score(preds_voting_soft,y_test)

In [72]:
# Scoring with accuracy
scores_lr_acc=accuracy_score(preds_lr,y_test)
scores_rf_acc=accuracy_score(preds_rf,y_test)
scores_xg_acc=accuracy_score(preds_xg,y_test)
scores_voting_hard_acc=accuracy_score(preds_voting_hard,y_test)
scores_voting_soft_acc=accuracy_score(preds_voting_soft,y_test)

In [73]:
final_scores=pd.DataFrame({'scores_recall':[scores_lr,scores_rf,scores_xg,scores_voting_hard,scores_voting_soft],'scores_accuracy':[scores_lr_acc,scores_rf_acc,scores_xg_acc,scores_voting_hard_acc,scores_voting_soft_acc]})
final_scores= final_scores.rename(index={0:'Logistic_Reg',1:'Random_Forest',2:'XGBoost',3:'Voting Classifier hard',4:'Voting Classifier soft'}).sort_values('scores_recall',ascending=False)
final_scores

Unnamed: 0,scores_recall,scores_accuracy
Voting Classifier hard,0.866019,0.874558
Voting Classifier soft,0.865064,0.877628
Random_Forest,0.859945,0.878932
XGBoost,0.847029,0.870984
Logistic_Reg,0.801322,0.799916


## Confusion Matrix

In [74]:
print (confusion_matrix(preds_voting_hard,y_test))

[[13842  1907]
 [ 1076  6955]]
