In [1]:
import patoolib # to extract the files
import pandas as pd # for data analysis and manipulation
import numpy as np # for scientific computing
import plotly.express as px # for visualization

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier # for random forest and feature selection
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.pipeline import Pipeline # for pipeline
from sklearn.preprocessing import * # for preprocessing
from sklearn.metrics import make_scorer, balanced_accuracy_score, f1_score,  precision_score, recall_score # for evaluation
from sklearn.model_selection import RandomizedSearchCV # for hyperparameter tuning
from datetime import datetime # date manipulation

In [2]:
# .rar extraction

#patoolib.extract_archive("raw/cancellation_prediction.rar", outdir="raw/")

# Reading dataset

In [3]:
df = pd.read_csv("raw/cancellation_prediction.csv")

# Data cleaning

### Droping duplicated rows

In [4]:
len(df)

119390

In [5]:
df = df.drop_duplicates()

In [6]:
len(df)

86144

### Checking NaN values

In [7]:
nan_df = pd.DataFrame()
values = []


for column in df.columns:
    nan_df[column] = [len(df[column].dropna())/len(df)]

nan_df = nan_df.T
    
fig = px.bar(nan_df, x=nan_df.index, y=0)
fig.show()

### Treating NaN values

In [8]:
df = df.drop(["id_person_booking"], axis=1)
df = df[df["country"].notna()]
df = df[df["num_children"].notna()]

In [9]:
nan_df = pd.DataFrame()
values = []


for column in df.columns:
    nan_df[column] = [len(df[column].dropna())/len(df)]

nan_df = nan_df.T
    
fig = px.bar(nan_df, x=nan_df.index, y=0)
fig.show()

# Mapping categorical variables

In [10]:
months_dict = dict((v,k) for k,v in zip(range(1, 13), sorted(df["month_arrival_date"].unique(), key=lambda m: datetime.strptime(m, "%B"))))
df['month_arrival_date'] = df['month_arrival_date'].map(months_dict)

df["breakfast"] = df["breakfast"]*1

rooms_dict = dict((v,k) for k,v in zip(range(len(df["reserved_room"].unique())), np.sort(df["reserved_room"].unique())))
df['reserved_room'] = df['reserved_room'].map(rooms_dict)

type_dict = dict((v,k) for k,v in zip(range(len(df["type"].unique())), np.sort(df["type"].unique())))
df['type'] = df['type'].map(type_dict)

deposit_dict = dict((v,k) for k,v in zip(range(len(df["deposit_policy"].unique())), np.sort(df["deposit_policy"].unique())))
df["deposit_policy"] = df["deposit_policy"].map(deposit_dict)

countries_dict = dict((v,k) for k,v in zip(range(len(df["country"].unique())), df["country"].unique()))
df["country"] = df["country"].map(countries_dict)

travel_agency_dict = dict((v,k) for k,v in zip(range(len(df["id_travel_agency_booking"].unique())), df["id_travel_agency_booking"].unique()))
df["id_travel_agency_booking"] = df["id_travel_agency_booking"].map(travel_agency_dict)

# Data description

In [11]:
# Tendency
mean = pd.DataFrame(df.mean()).round(2).T
median = pd.DataFrame(df.median()).round(2).T

# Dispersion
std = pd.DataFrame(df.std()).round(2).T
v_max = pd.DataFrame(df.max()).round(2).T
v_min = pd.DataFrame(df.min()).round(2).T

x = lambda x: x.max() - x.min()
v_range = pd.DataFrame(df.apply(x)).T

x = lambda x: x.skew().round(2)
skew = pd.DataFrame(df.apply(x)).T

x = lambda x: x.kurt().round(2)
kurt = pd.DataFrame(df.apply(x)).T

data_description = pd.concat([mean, median, std, v_max, v_min, v_range, skew, kurt]).T.reset_index()
data_description.columns = ["column", "mean", "median", "std", "max", "min", "range", "skew", "kurt"]
data_description

Unnamed: 0,column,mean,median,std,max,min,range,skew,kurt
0,type,0.61,1.0,0.49,1.0,0.0,1.0,-0.47,-1.78
1,cancellation,0.28,0.0,0.45,1.0,0.0,1.0,1.0,-1.0
2,days_between_booking_arrival,79.54,49.0,85.55,737.0,0.0,737.0,1.43,2.15
3,year_arrival_date,2016.22,2016.0,0.68,2017.0,2015.0,2.0,-0.31,-0.87
4,month_arrival_date,6.47,7.0,3.09,12.0,1.0,11.0,0.01,-0.96
5,week_number_arrival_date,26.8,27.0,13.65,53.0,1.0,52.0,0.03,-0.94
6,day_of_month_arrival_date,15.81,16.0,8.84,31.0,1.0,30.0,0.0,-1.2
7,num_weekend_nights,1.01,1.0,1.03,16.0,0.0,16.0,1.31,6.21
8,num_workweek_nights,2.64,2.0,2.05,41.0,0.0,41.0,2.5,17.64
9,num_adults,1.88,2.0,0.63,55.0,0.0,55.0,20.22,1370.26


### Filtering data by kurtosis

In [12]:
data_description[(data_description["kurt"] > 3) | (data_description["kurt"] < -3)]

Unnamed: 0,column,mean,median,std,max,min,range,skew,kurt
7,num_weekend_nights,1.01,1.0,1.03,16.0,0.0,16.0,1.31,6.21
8,num_workweek_nights,2.64,2.0,2.05,41.0,0.0,41.0,2.5,17.64
9,num_adults,1.88,2.0,0.63,55.0,0.0,55.0,20.22,1370.26
10,num_children,0.14,0.0,0.46,10.0,0.0,10.0,3.43,12.7
11,num_babies,0.01,0.0,0.11,10.0,0.0,10.0,21.03,1193.14
13,country,8.57,3.0,15.6,176.0,0.0,176.0,4.05,22.59
16,repeated_guest,0.04,0.0,0.2,1.0,0.0,1.0,4.72,20.25
17,num_previous_cancellations,0.03,0.0,0.37,26.0,0.0,26.0,34.22,1709.02
18,num_previous_stays,0.18,0.0,1.73,72.0,0.0,72.0,20.74,591.23
20,changes_between_booking_arrival,0.27,0.0,0.73,21.0,0.0,21.0,5.56,67.69


# Data Exploration

In [13]:
guest_city = df[df['cancellation'] == 0]['country'].value_counts().reset_index()
guest_city.columns = ['Country', 'n_clients']
inv_countries_dict = {v: k for k, v in countries_dict.items()}
guest_city["Country"] = guest_city["Country"].map(inv_countries_dict)
guest_city.sort_values(by="n_clients", ascending=False).iloc[0:10]

Unnamed: 0,Country,n_clients
0,PRT,17315
1,GBR,8338
2,FRA,7000
3,ESP,5314
4,DEU,4232
5,IRL,2332
6,ITA,1952
7,BEL,1660
8,NLD,1549
9,USA,1404


In [14]:
categories_values = list()
categories = sorted(df["total_of_special_requests"].unique())

for i in sorted(df["total_of_special_requests"].unique()):
    percentage = df.groupby("total_of_special_requests")["cancellation"].sum()[i] / len(df[df["total_of_special_requests"] == i])
    categories_values.append(percentage)

fig = px.bar(x=categories, y=categories_values, title="Cancellation Rate vs Total Special Requests", color=categories)
fig.show()

In [15]:
categories_values = list()
categories = sorted(df["market_segment"].unique())

for i in sorted(df["market_segment"].unique()):
    percentage = df.groupby("market_segment")["cancellation"].sum()[i] / len(df[df["market_segment"] == i])
    categories_values.append(percentage)

fig = px.bar(x=categories, y=categories_values, title="Cancellation Rate vs Market Segment", color=categories)
fig.show()

In [16]:
fig = px.histogram(df.groupby("days_between_booking_arrival")["cancellation"].sum(), nbins=10, title="Cancellation Rate vs Days between booking arrival")
fig.show()

In [17]:
categories_values = list()

for i in range(len(deposit_dict)):
    percentage = df.groupby("deposit_policy")["cancellation"].sum()[i] / len(df[df["deposit_policy"] == i])
    categories_values.append(percentage)

categories = ['No Deposit','Non Refund','Refundable']

fig = px.bar(x=categories, y=categories_values, title="Percentage of Cancellations by Deposit Policy", color=categories)
fig.show()

In [18]:
rate_previous1 = df[df["num_previous_cancellations"] == 0]["cancellation"].sum() / len(df[df["num_previous_cancellations"] == 0])
rate_previous2 = df[df["num_previous_cancellations"] > 0]["cancellation"].sum() / len(df[df["num_previous_cancellations"] > 0])
categories = ["0", ">0"]

fig = px.bar(x=categories, y=[rate_previous1, rate_previous2], title="Cancellation Rate vs Previous cancellation", color=categories)
fig.show()


# Feature Selection

### Extra Tree Classifier for Feature Selection

In [19]:
y = df["cancellation"]
X = df.drop("cancellation", axis = 1)

In [20]:
# Building the model
extra_tree_forest = ExtraTreesClassifier(n_estimators = 5, criterion ='entropy', max_features = 2)
# ver pra que serve os parametros
extra_tree_forest.fit(X, y)

ExtraTreesClassifier(criterion='entropy', max_features=2, n_estimators=5)

In [21]:
feature_importance = extra_tree_forest.feature_importances_

In [22]:
feature_importance_normalized = np.std([tree.feature_importances_ for tree in 
                                        extra_tree_forest.estimators_],
                                        axis = 0)

In [23]:
fig = px.bar(x=X.columns, y=feature_importance_normalized, color=X.columns)
fig.show()

In [24]:
ETC_results = pd.Series(feature_importance_normalized, index=X.columns).sort_values(ascending=False)
ETC_results

total_of_special_requests          0.014026
num_workweek_nights                0.006527
market_segment                     0.006189
id_travel_agency_booking           0.005216
type                               0.004777
month_arrival_date                 0.004743
distribution_channel               0.004611
days_between_booking_arrival       0.004567
required_car_parking_spaces        0.004249
year_arrival_date                  0.004090
breakfast                          0.003657
customer_type                      0.002999
week_number_arrival_date           0.002777
day_of_month_arrival_date          0.002534
avg_price                          0.002457
num_weekend_nights                 0.002236
country                            0.002119
changes_between_booking_arrival    0.001760
repeated_guest                     0.001743
deposit_policy                     0.001548
num_children                       0.001499
reserved_room                      0.001444
num_previous_cancellations      

# Model

### Data scaling

In [25]:
X = df.drop('cancellation',axis=1)
y = df["cancellation"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

### Entire dataset

In [26]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5, n_estimators=100, oob_score=True)
classifier_rf.fit(X_train, y_train)
classifier_rf.score(X_test, y_test)

0.7479811417635251

### Searching for Hyperparams

In [27]:
pipe = Pipeline([("rf", RandomForestClassifier())])

In [28]:

rf_hyperparams = {"rf__n_estimators": [100,150,200], # number of trees used in random forest, very high values could lead to overfitting
                 "rf__max_depth": [5, 10,15], # max depth of each tree, if the depth is too low, the accuracy is poor
                 "rf__criterion": ["gini", "entropy"], # to check whether impurity or information gain is the best way to split
                 "rf__min_samples_leaf": [3,5,10], # minimum samples beyond which a node cannot be split, higher values imply more generality
                 "rf__max_features": ["sqrt", "log2"], # to check what is the best way limit the number of features to each tree
                 "rf__bootstrap": ["True", "False"]} # to check whether bagging and aggregating results in a better model 

In [29]:
f1_wtd = make_scorer(f1_score, average="weighted")
pr_wtd = make_scorer(precision_score, average="weighted")
bal_acc_score = make_scorer(balanced_accuracy_score)
recall_wtd = make_scorer(recall_score, average = "weighted")

scoring_dict = {"bal_acc_score": bal_acc_score,
                "f1_wtd": f1_wtd,
                "pr_wtd": pr_wtd,
                "recall_wtd": recall_wtd}

In [30]:
rscv = RandomizedSearchCV(estimator=pipe,
                          param_distributions=rf_hyperparams,
                          n_iter = 10,
                          scoring = scoring_dict,
                          refit = "pr_wtd",
                          n_jobs = -1,
                          cv = 5,
                          random_state=42)

### Using best features from feature selection

In [31]:
X = df[ETC_results.index[0:5]]
y = df["cancellation"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

result = rscv.fit(X_train, y_train)

In [32]:
print(result.best_params_)
print(result.best_score_)

{'rf__n_estimators': 100, 'rf__min_samples_leaf': 5, 'rf__max_features': 'sqrt', 'rf__max_depth': 15, 'rf__criterion': 'entropy', 'rf__bootstrap': 'False'}
0.7449505725745316


In [33]:
pipe = result.best_estimator_
model = pipe.fit(X_train, y_train)

In [34]:
model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('rf',
                 RandomForestClassifier(bootstrap='False', criterion='entropy',
                                        max_depth=15, max_features='sqrt',
                                        min_samples_leaf=5))])>

In [35]:
y_pred = model.predict(X_test)

In [36]:
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred): .4f}")
print(f"Precision score: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall score: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Balanced accuracy score:  0.6441
Precision score: 0.7458
Recall score: 0.7648
F1 score: 0.7444


### Using entire dataset

In [37]:
X = df.drop('cancellation',axis=1)
y = df["cancellation"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

result = rscv.fit(X_train, y_train)

In [38]:
print(result.best_params_)
print(result.best_score_)

{'rf__n_estimators': 100, 'rf__min_samples_leaf': 5, 'rf__max_features': 'sqrt', 'rf__max_depth': 15, 'rf__criterion': 'entropy', 'rf__bootstrap': 'False'}
0.8320372873981411


In [39]:
pipe = result.best_estimator_
model = pipe.fit(X_train, y_train)

In [40]:
y_pred = model.predict(X_test)

In [41]:
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred): .4f}")
print(f"Precision score: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall score: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Balanced accuracy score:  0.7619
Precision score: 0.8304
Recall score: 0.8364
F1 score: 0.8299


### Spliting by time

In [42]:
X_train = df[(df["year_arrival_date"] == 2015) | (df["year_arrival_date"] == 2016)].drop("cancellation", axis=1)
y_train = df[(df["year_arrival_date"] == 2015) | (df["year_arrival_date"] == 2016)]["cancellation"]

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

result = rscv.fit(X_train, y_train)

In [43]:
print(result.best_params_)
print(result.best_score_)

{'rf__n_estimators': 100, 'rf__min_samples_leaf': 10, 'rf__max_features': 'log2', 'rf__max_depth': 5, 'rf__criterion': 'gini', 'rf__bootstrap': 'True'}
0.7020509039828106


In [44]:
pipe = result.best_estimator_
model = pipe.fit(X_train, y_train)

In [45]:
X_test = df[(df["year_arrival_date"] == 2017)].drop("cancellation", axis=1)
y_test = df[(df["year_arrival_date"] == 2017)]["cancellation"]
y_pred = model.predict(X_test)

In [46]:
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred): .4f}")
print(f"Precision score: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall score: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Balanced accuracy score:  0.5000
Precision score: 0.4631
Recall score: 0.6805
F1 score: 0.5511



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Manually selecting features

In [47]:
X = df[["market_segment", "country", "market_segment", "num_previous_cancellations", "num_previous_stays", "deposit_policy", "total_of_special_requests"]]
y = df["cancellation"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

result = rscv.fit(X_train, y_train)

In [48]:
print(result.best_params_)
print(result.best_score_)

{'rf__n_estimators': 100, 'rf__min_samples_leaf': 10, 'rf__max_features': 'log2', 'rf__max_depth': 5, 'rf__criterion': 'entropy', 'rf__bootstrap': 'False'}
0.8038984250889587


In [49]:
pipe = result.best_estimator_
model = pipe.fit(X_train, y_train)

In [50]:
y_pred = model.predict(X_test)

In [51]:
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred): .4f}")
print(f"Precision score: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall score: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Balanced accuracy score:  0.5462
Precision score: 0.7636
Recall score: 0.7470
F1 score: 0.6665
