In [21]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score

In [23]:
# load data
df = pd.read_csv('../../data/H2.csv')

In [24]:
# Handle missing values
df["Children"].fillna(0, inplace=True)  # Fill missing 'Children' with 0
df["Country"].fillna("Unknown", inplace=True)  # Fill missing 'Country' with 'Unknown'

# Drop rows where adults, babies and children are zero at the same time
df = df[(df["Adults"] != 0) | (df["Babies"] != 0) | (df["Children"] != 0)]

In [25]:
# Drop Duplicate rows
print(f"Total Number of Rows: {len(df)}")
print(f"Total Number of Rows without Duplicates: {len(df.drop_duplicates())}")

df = df.drop_duplicates()

Total Number of Rows: 79163
Total Number of Rows without Duplicates: 53274


In [26]:
# Remove Outliers
# Get only numerical data
df_num = df.select_dtypes(include=[np.number])
# Get only Categorical data
df_cat = df.select_dtypes(include=["object"])

df_num_out = df_num[["LeadTime", "ADR"]]

# Outlier Detection Fonksion
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 2.5 * IQR
    upper_bound = Q3 + 2.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]

# Remove Outliers
def remove_outliers(data, outliers):
  return data.drop(outliers.index)

for col in df_num_out:
  outliers = detect_outliers_iqr(df, col)
  df = remove_outliers(df, outliers)
  print(f"{col}: {len(outliers)} outliers removed.")

df.shape

LeadTime: 351 outliers removed.
ADR: 254 outliers removed.


(52669, 31)

In [27]:
df.columns, df.shape

(Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
        'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
        'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
        'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
        'IsRepeatedGuest', 'PreviousCancellations',
        'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
        'BookingChanges', 'DepositType', 'Agent', 'Company',
        'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
        'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate'],
       dtype='object'),
 (52669, 31))

In [28]:
# Remove columns NOT useful for the model
df = df.drop(
    [
        "ReservationStatus",
        "AssignedRoomType",
        "ArrivalDateYear",
        "AssignedRoomType",
        "BookingChanges",
        "ArrivalDateYear",
        "ArrivalDateWeekNumber",
        "ArrivalDateDayOfMonth",
        "DaysInWaitingList",
        "Country",
    ],
    axis=1,
)

In [29]:
df.columns, df.shape

(Index(['IsCanceled', 'LeadTime', 'ArrivalDateMonth', 'StaysInWeekendNights',
        'StaysInWeekNights', 'Adults', 'Children', 'Babies', 'Meal',
        'MarketSegment', 'DistributionChannel', 'IsRepeatedGuest',
        'PreviousCancellations', 'PreviousBookingsNotCanceled',
        'ReservedRoomType', 'DepositType', 'Agent', 'Company', 'CustomerType',
        'ADR', 'RequiredCarParkingSpaces', 'TotalOfSpecialRequests',
        'ReservationStatusDate'],
       dtype='object'),
 (52669, 23))

In [30]:
# Create Features from 'ReservationStatusDate' column
df['ReservationStatusDate'] = pd.to_datetime(df['ReservationStatusDate'])

df['Year'] = df['ReservationStatusDate'].dt.year
df['Month'] = df['ReservationStatusDate'].dt.month
df['Day'] = df['ReservationStatusDate'].dt.day

df.drop(['ReservationStatusDate'] , axis = 1, inplace = True)

In [31]:
# Correct the wrong type in 'Agent' and 'Company' columns

# Remove blank spaces
remove_spaces = lambda col: df[col].str.strip()

# Replace 'NULL' values with 0
replace_null = lambda col: df[col].replace("NULL", 0)

# Convert the column to integer
convert_to_int = lambda col: df[col].astype(int)


# Correct the 'Agent' column
df['Agent'] = remove_spaces('Agent')
df['Agent'] = replace_null('Agent')
df['Agent'] = convert_to_int('Agent')

# Correct the 'Company' column
df['Company'] = remove_spaces('Company')
df['Company'] = replace_null('Company')
df['Company'] = convert_to_int('Company')

df['Agent'].value_counts(), df['Company'].value_counts()

(Agent
 9      28624
 0       5514
 7       3287
 14      3272
 28      1488
        ...  
 388        1
 480        1
 464        1
 476        1
 449        1
 Name: count, Length: 223, dtype: int64,
 Company
 0      49908
 40       845
 45       237
 153      205
 219      131
        ...  
 481        1
 494        1
 491        1
 421        1
 497        1
 Name: count, Length: 203, dtype: int64)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52669 entries, 0 to 79329
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   IsCanceled                   52669 non-null  int64  
 1   LeadTime                     52669 non-null  int64  
 2   ArrivalDateMonth             52669 non-null  object 
 3   StaysInWeekendNights         52669 non-null  int64  
 4   StaysInWeekNights            52669 non-null  int64  
 5   Adults                       52669 non-null  int64  
 6   Children                     52669 non-null  float64
 7   Babies                       52669 non-null  int64  
 8   Meal                         52669 non-null  object 
 9   MarketSegment                52669 non-null  object 
 10  DistributionChannel          52669 non-null  object 
 11  IsRepeatedGuest              52669 non-null  int64  
 12  PreviousCancellations        52669 non-null  int64  
 13  PreviousBookingsNotCa

In [33]:
# Remove rare categories
def remove_rare_categories(df, categorical_cols, min_count=10):
    """
    In categorical columns, it deletes categories that are below a certain threshold value.
        Args:
            df: Pandas DataFrame
            categorical_cols: List of categorical columns
            min_count: Minimum number of observations for a category to remain
    """

    for col in categorical_cols:
        value_counts = df[col].value_counts()
        common_categories = value_counts[value_counts >= min_count].index
        df = df[df[col].isin(common_categories)]
    return df


df = remove_rare_categories(
    df, df.select_dtypes(include=["object"]).columns, min_count=10
)

In [34]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# Split data to features and target
X = df.drop("IsCanceled", axis=1)
y = df["IsCanceled"]

# Stratified splitting into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

In [35]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((42132, 24), (10533, 24), (42132,), (10533,))

In [36]:
# Categoric and numeric features from X_train
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=[np.number]).columns

# Convert Columns withOneHotEncoder and StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [37]:
# Fit the preprocessor only on the training data
preprocessor.fit(X_train) 

# Transform both train and test data using the fitted preprocessor
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test) 

In [38]:
# SMOTE
smote = SMOTE(sampling_strategy='auto')
X_train_res, y_train_res = smote.fit_resample(X_train_transformed, y_train)

print(f"Original dataset shape: {X_train.shape}")
print(f"Resampled dataset shape: {X_train_res.shape}")

Original dataset shape: (42132, 24)
Resampled dataset shape: (59036, 58)


**Logistic Regression**

In [39]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_res, y_train_res)

In [40]:
# Predict the labels
y_pred = lr_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.714706161587392

**Random Forest Classifier**

In [41]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)

In [42]:
# Predict the labels
y_pred = rf_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9068641412702934

**Gradient Boosting Classifier**

In [44]:
from sklearn.ensemble import GradientBoostingClassifier


gbc_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42
)

# Model eğitimi
gbc_model.fit(X_train_res, y_train_res)

In [45]:
# Predict the labels
y_pred = gbc_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8464824836228995

**SVC**

In [46]:
from sklearn.svm import SVC

svm_model = SVC()

svm_model.fit(X_train_res, y_train_res)

In [47]:
# Predict the labels
y_pred = svm_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8625272951675685

**KNeighborsClassifier**

In [49]:
from sklearn.neighbors import KNeighborsClassifier

KNeighborsC_model = KNeighborsClassifier()

KNeighborsC_model.fit(X_train_res, y_train_res)

In [50]:
# Predict the labels
y_pred = KNeighborsC_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7682521598784772

**LGBMClassifier**

In [61]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier()

lgb_model.fit(X_train_res, y_train_res)

[LightGBM] [Info] Number of positive: 29518, number of negative: 29518
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9639
[LightGBM] [Info] Number of data points in the train set: 59036, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [62]:
# Predict the labels
y_pred = lgb_model.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9174973891578847

In [67]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_res, y_train_res)

In [68]:
# Predict the labels
y_pred = gnb.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.4433684610272477

**Ada Boost**

In [69]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier()

clf.fit(X_train_res, y_train_res)

In [70]:
# Predict the labels
y_pred = clf.predict(X_test_transformed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7672078230323744

---