In [159]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [160]:
# load data
df = pd.read_csv('../../data/H2.csv')

1. Split train and test data

In [161]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.shape, test_df.shape

((63464, 31), (15866, 31))

2. Handle Missing Values

In [162]:
# Handle missing values
train_df['Children'].fillna(0, inplace=True)  # Fill missing 'Children' with 0
train_df['Country'].fillna('Unknown', inplace=True)  # Fill missing 'Country' with 'Unknown'

# Drop rows where adults, babies and children are zero at the same time
train_df = train_df[(train_df['Adults'] != 0) | (train_df['Babies'] != 0) | (train_df['Children'] != 0)] 

3. Remove Outliers

In [163]:
# Get only numerical data
train_df_num = train_df.select_dtypes(include=[np.number])
# Get only Categorical data
train_df_cat = train_df.select_dtypes(include=['object'])

train_df_num.shape, train_df_cat.shape

((63335, 18), (63335, 13))

In [164]:
train_df_num.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests
12826,1,71,2016,25,14,0,3,1,0.0,0,0,0,0,0,0,120.0,0,0
36957,0,0,2015,35,28,0,2,2,0.0,0,0,0,0,0,0,106.0,0,1
46297,0,0,2016,14,31,0,1,2,1.0,0,0,0,0,0,0,138.0,0,0
12562,1,27,2016,24,9,0,1,2,0.0,0,0,0,0,0,0,120.0,0,0
37239,0,256,2015,38,14,1,2,1,0.0,0,0,0,0,0,0,64.33,0,0


In [165]:
from sklearn.ensemble import IsolationForest

# Predict Outliers
isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(train_df_num)

# Exclude outliers
train_df = train_df.iloc[outlier_pred == 1]

train_df.shape

(58419, 31)

4. Feature Selection and Creation

In [166]:
# Columns to drop from the dataset
columns_to_drop = [
    'IsCanceled', 
    'ReservationStatus', 
    'ReservationStatusDate', 
    'ArrivalDateYear', 
    'ArrivalDateMonth', 
    'ArrivalDateWeekNumber', 
    'ArrivalDateDayOfMonth', 
    'Company'
]

# Drop the specified columns
train_df = train_df.drop(columns=columns_to_drop)

In [167]:
# TotalStay: Total number of nights the customer stayed
train_df["TotalStay"] = train_df["StaysInWeekendNights"] + train_df["StaysInWeekNights"]

# SpendingPerPerson: Customer's spending capacity per person
train_df["SpendingPerPerson"] = train_df["ADR"] / (train_df["Adults"] + train_df["Children"] + train_df["Babies"])
train_df["SpendingPerPerson"].fillna(0, inplace=True)  # Handle division by zero


# GroupType: Structure of customer groups
def determine_group_type(row):
    if row["Adults"] == 1 and (row["Children"] + row["Babies"] == 0):
        return "Single"
    elif row["Adults"] == 2 and (row["Children"] + row["Babies"] == 0):
        return "Couple"
    elif (row["Children"] + row["Babies"]) > 0:
        return "Family"
    elif row["Adults"] > 2:
        return "Big_Group"
    else:
        return "Other"


train_df["GroupType"] = train_df.apply(determine_group_type, axis=1)


# BookingUrgency: Categorize bookings by lead time
def determine_booking_urgency(lead_time):
    if lead_time > 60:
        return "Early_Booking"
    elif 30 <= lead_time <= 60:
        return "Medium_Term_Booking"
    else:
        return "Last_Minute_Booking"


train_df["BookingUrgency"] = train_df["LeadTime"].apply(determine_booking_urgency)

# HighDemand: Categorize customers based on special requests
train_df["HighDemand"] = train_df["TotalOfSpecialRequests"].apply(
    lambda x: "High" if x >= 2 else "Low"
)

# LoyaltyScore: Numerical representation of customer loyalty
train_df["LoyaltyScore"] = train_df["IsRepeatedGuest"] * (train_df["PreviousBookingsNotCanceled"] + 1)

# ChildRatio: Ratio of children to total group size
train_df["ChildRatio"] = (train_df["Children"] + train_df["Babies"]) / (
    train_df["Adults"] + train_df["Children"] + train_df["Babies"]
)
train_df["ChildRatio"].fillna(0, inplace=True)  # Handle division by zero

# Occupancy: Density and occupancy rate of rooms
train_df["Occupancy"] = (train_df["Adults"] + train_df["Children"] + train_df["Babies"]) / train_df["TotalStay"]
train_df["Occupancy"].fillna(0, inplace=True)  # Handle division by zero


# WaitingTimeCategory: Categorize customers based on waiting time
def determine_waiting_time_category(days):
    if days < 7:
        return "Short"
    elif 7 <= days <= 30:
        return "Medium"
    else:
        return "Long"


train_df["WaitingTimeCategory"] = train_df["DaysInWaitingList"].apply(
    determine_waiting_time_category
)

# CancellationRisk: Risk score based on previous cancellations and lead time
train_df["CancellationRisk"] = train_df["PreviousCancellations"] * train_df["LeadTime"]

In [168]:
train_df.shape, train_df.columns

((58419, 33),
 Index(['LeadTime', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults',
        'Children', 'Babies', 'Meal', 'Country', 'MarketSegment',
        'DistributionChannel', 'IsRepeatedGuest', 'PreviousCancellations',
        'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
        'BookingChanges', 'DepositType', 'Agent', 'DaysInWaitingList',
        'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
        'TotalOfSpecialRequests', 'TotalStay', 'SpendingPerPerson', 'GroupType',
        'BookingUrgency', 'HighDemand', 'LoyaltyScore', 'ChildRatio',
        'Occupancy', 'WaitingTimeCategory', 'CancellationRisk'],
       dtype='object'))

Since the `Country` Column, which contains country codes, contains too many categories, let's reduce it to regions.

In [169]:
# Define a dictionary to map country codes to regions
country_to_region = {
    "PRT": "Europe",
    "GBR": "Europe",
    "ESP": "Europe",
    "IRL": "Europe",
    "NLD": "Europe",
    "RUS": "Europe",
    "FRA": "Europe",
    "DEU": "Europe",
    "ITA": "Europe",
    "BEL": "Europe",
    "CHE": "Europe",
    "USA": "North America",
    "CAN": "North America",
    "MEX": "North America",
    "CN": "Asia",
    "BRA": "South America",
    "ARG": "South America",
    "AUS": "Oceania",
    "Unknown": "Unknown",
}

# Map the country codes to regions
train_df["Region"] = train_df["Country"].map(country_to_region)

# Drop the Country column
train_df.drop(columns=["Country"], inplace=True)

train_df["Region"].value_counts()

Region
Europe           48283
South America     1443
North America     1251
Asia               431
Oceania            253
Unknown             13
Name: count, dtype: int64

The categorical variable `Agent` has a lot of unique values ​​and most of them are numbers. There is only one string named 'NULL'. And some reason there is blank spaces before values.
- There is blanks in data, first we will remove these.
- We can change the 'NULL' value of this variable to 0 and
- convert it to int to make it easier to deal with.

In [170]:
# Remove blank spaces from the 'Agent' column
train_df['Agent'] = train_df['Agent'].str.strip()

# Replace 'NULL' values in the 'Agent' column with 0
train_df['Agent'].replace('NULL', 0, inplace=True)

# Convert the 'Agent' column to integer
train_df['Agent'] = train_df['Agent'].astype(int)

5. Encoding and Scaling

In [171]:
# Get only numerical data
train_df_num = train_df.select_dtypes(include=[np.number])
# Get only Categorical data
train_df_cat = train_df.select_dtypes(include=['object'])

train_df_num.shape, train_df_cat.shape

((58419, 21), (58419, 12))

In [172]:
# Numerical columns with unique values 
train_df_num.nunique().sort_values()

IsRepeatedGuest                   2
PreviousCancellations             2
RequiredCarParkingSpaces          2
Babies                            3
Children                          4
LoyaltyScore                      4
Adults                            5
TotalOfSpecialRequests            6
ChildRatio                        7
StaysInWeekendNights              7
BookingChanges                   13
StaysInWeekNights                13
TotalStay                        17
PreviousBookingsNotCanceled      17
Occupancy                        33
DaysInWaitingList               110
Agent                           213
CancellationRisk                216
LeadTime                        445
ADR                            4248
SpendingPerPerson              5220
dtype: int64

In [173]:
train_df_cat.nunique().sort_values()

HighDemand             2
DepositType            3
BookingUrgency         3
WaitingTimeCategory    3
CustomerType           4
Meal                   4
GroupType              4
DistributionChannel    5
Region                 6
ReservedRoomType       7
AssignedRoomType       8
MarketSegment          8
dtype: int64

**One-Hot Encode**<br>

***Numeric columns that behave like Categorical***
- `IsRepeatedGuest`, 
- `PreviousCancellations`, 
- `RequiredCarParkingSpaces`, 
- `Babies`, 
- `Children`, 
- `LoyaltyScore`, 
- `Adults`, 
- `TotalOfSpecialRequests`, 
- `ChildRatio`, 
- `StaysInWeekendNights`<br>
***Categorical Columns***
- `HighDemand`
- `DepositType`
- `BookingUrgency`
- `WaitingTimeCategory`
- `CustomerType`
- `Meal`
- `GroupType`
- `DistributionChannel`
- `Region`
- `ReservedRoomType`
- `AssignedRoomType`
- `MarketSegment`

***StandarScaler***
- `BookingChanges`
- `StaysInWeekNights`
- `TotalStay`
- `PreviousBookingsNotCanceled`
- `Occupancy`
- `DaysInWaitingList`
- `Agent`
- `CancellationRisk`
- `LeadTime`
- `ADR`
- `SpendingPerPerson`

In [174]:
from sklearn.preprocessing import OneHotEncoder

# One Hot Encoding - Categorical Variables
cat_encoder = OneHotEncoder()
train_cat_1hot = cat_encoder.fit_transform(train_df_cat)
train_cat_1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 701028 stored elements and shape (58419, 58)>

In [177]:
train_cat_1hot_dense = train_cat_1hot.toarray()
encoded_cat_df = pd.DataFrame(train_cat_1hot_dense, 
                          columns=cat_encoder.get_feature_names_out(train_df_cat.columns))

encoded_cat_df

Unnamed: 0,Meal_BB,Meal_FB,Meal_HB,Meal_SC,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Offline TA/TO,MarketSegment_Online TA,MarketSegment_Undefined,DistributionChannel_Corporate,DistributionChannel_Direct,DistributionChannel_GDS,DistributionChannel_TA/TO,DistributionChannel_Undefined,ReservedRoomType_A,ReservedRoomType_B,ReservedRoomType_C,ReservedRoomType_D,ReservedRoomType_E,ReservedRoomType_F,ReservedRoomType_G,AssignedRoomType_A,AssignedRoomType_B,AssignedRoomType_C,AssignedRoomType_D,AssignedRoomType_E,AssignedRoomType_F,AssignedRoomType_G,AssignedRoomType_K,DepositType_No Deposit,DepositType_Non Refund,DepositType_Refundable,CustomerType_Contract,CustomerType_Group,CustomerType_Transient,CustomerType_Transient-Party,GroupType_Big_Group,GroupType_Couple,GroupType_Family,GroupType_Single,BookingUrgency_Early_Booking,BookingUrgency_Last_Minute_Booking,BookingUrgency_Medium_Term_Booking,HighDemand_High,HighDemand_Low,WaitingTimeCategory_Long,WaitingTimeCategory_Medium,WaitingTimeCategory_Short,Region_Asia,Region_Europe,Region_North America,Region_Oceania,Region_South America,Region_Unknown,Region_nan
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58414,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58415,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58416,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
58417,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [178]:
# One Hot Encoding - Numeric columns that behave like Categorical
encode_numeric = ['IsRepeatedGuest', 'PreviousCancellations', 'RequiredCarParkingSpaces', 'Babies',
                  'Children', 'LoyaltyScore', 'Adults', 'TotalOfSpecialRequests', 'ChildRatio', 'StaysInWeekendNights']

num_encoder = OneHotEncoder()

train_num_1hot = num_encoder.fit_transform(train_df_num[encode_numeric])
train_num_1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 584190 stored elements and shape (58419, 42)>

In [184]:
train_num_1hot_dense = train_num_1hot.toarray()
encoded_num_df = pd.DataFrame(
    train_num_1hot_dense, columns=num_encoder.get_feature_names_out(train_df_num[encode_numeric].columns)
)

encoded_num_df

Unnamed: 0,IsRepeatedGuest_0,IsRepeatedGuest_1,PreviousCancellations_0,PreviousCancellations_1,RequiredCarParkingSpaces_0,RequiredCarParkingSpaces_1,Babies_0,Babies_1,Babies_2,Children_0.0,Children_1.0,Children_2.0,Children_3.0,LoyaltyScore_0,LoyaltyScore_1,LoyaltyScore_2,LoyaltyScore_3,Adults_0,Adults_1,Adults_2,Adults_3,Adults_4,TotalOfSpecialRequests_0,TotalOfSpecialRequests_1,TotalOfSpecialRequests_2,TotalOfSpecialRequests_3,TotalOfSpecialRequests_4,TotalOfSpecialRequests_5,ChildRatio_0.0,ChildRatio_0.25,ChildRatio_0.3333333333333333,ChildRatio_0.5,ChildRatio_0.6,ChildRatio_0.6666666666666666,ChildRatio_1.0,StaysInWeekendNights_0,StaysInWeekendNights_1,StaysInWeekendNights_2,StaysInWeekendNights_3,StaysInWeekendNights_4,StaysInWeekendNights_5,StaysInWeekendNights_6
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58414,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
58415,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
58416,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
58417,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


This is the end of the encoding process, when we apply StandardScale to the remaining numeric columns, the data is ready for ML.