In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.utils import resample

In [4]:
# Read the bookings CSV, decoding it as ISO-8859-1, then preview the first rows.
df = pd.read_csv(
    "customer_booking.csv",
    encoding="ISO-8859-1",
)
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(50000, 14)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Collect the object-dtype (string) columns of the training split.
object_columns = train_set.select_dtypes(object)
# `flight_day` is encoded as an ordinal feature, so take it out of the nominal group.
flight_day_column = object_columns.pop("flight_day")
ordinal_variables = [flight_day_column.name]
# The columns that remain are the nominal ones that get one-hot encoded.
nomial_variables = object_columns.columns

In [8]:
# Category order used when ordinal-encoding `flight_day` (Sunday first).
day_order = [
    "Sun",
    "Mon",
    "Tue",
    "Wed",
    "Thu",
    "Fri",
    "Sat",
]

In [9]:
# Map each weekday label to its position in `day_order`.
ordinal_encoder = OrdinalEncoder(categories=[day_order])

# Fit on the training split only, then reuse the fitted encoder for the test split.
train_day_codes = ordinal_encoder.fit_transform(train_set[ordinal_variables])
test_day_codes = ordinal_encoder.transform(test_set[ordinal_variables])

# Overwrite the text column with its numeric codes in both splits.
train_set[ordinal_variables] = train_day_codes
test_set[ordinal_variables] = test_day_codes

In [10]:
# One-hot encode the nominal columns as a dense array.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the current name first and fall back for older installs.
# NOTE: with drop="first", a category unseen during fit is encoded as all zeros,
# i.e. the same row pattern as the dropped reference category.
try:
    nominal_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)
except TypeError:
    nominal_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse=False)

# Learn the categories from the training split only, then apply them to the test split.
encoded_train = nominal_encoder.fit_transform(train_set[nomial_variables])
encoded_test = nominal_encoder.transform(test_set[nomial_variables])



In [11]:
# Column labels produced by the fitted encoder for its one-hot output.
encoded_feature_names = nominal_encoder.get_feature_names_out(nomial_variables)

# Wrap the encoded arrays in labelled frames (both get a fresh 0..n-1 index).
train_df_encoded = pd.DataFrame(encoded_train, columns=encoded_feature_names)
test_df_encoded = pd.DataFrame(encoded_test, columns=encoded_feature_names)

In [12]:
# Remove the raw nominal columns, then renumber rows from 0 so the frames can be
# joined side by side with the encoded frames.
train_df = train_set.drop(columns=nomial_variables)
train_df = train_df.reset_index(drop=True)

test_df = test_set.drop(columns=nomial_variables)
test_df = test_df.reset_index(drop=True)

In [13]:
# Append the one-hot columns to the remaining features, matching rows by index.
train_df = pd.concat(
    [train_df, train_df_encoded],
    axis="columns",
)
test_df = pd.concat(
    [test_df, test_df_encoded],
    axis="columns",
)

In [14]:
# Dimensions of the encoded training frame (the target column is still included).
train_df.shape

(40000, 886)

In [15]:
# Dimensions of the encoded test frame; the column count should match the training frame.
test_df.shape

(10000, 886)

# Baseline Model

In [16]:
# Separate the `booking_complete` target from the training predictors.
y = train_df["booking_complete"].copy()
X = train_df.drop(columns="booking_complete")

In [17]:
# Separate the `booking_complete` target from the test predictors.
y_test = test_df["booking_complete"].copy()
X_test = test_df.drop(columns="booking_complete")

In [18]:
# Baseline: random forest with class weights balanced against class frequency.
# `fit` returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
).fit(X, y)

# Predict labels for the held-out rows.
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1 plus overall accuracy.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8520
           1       0.52      0.13      0.20      1480

    accuracy                           0.85     10000
   macro avg       0.69      0.55      0.56     10000
weighted avg       0.81      0.85      0.81     10000



# Modeling with an oversampled dataset

In [19]:
# Select all rows where 'booking_complete' is 0, representing the majority class
majority_df = df[df["booking_complete"] == 0]

# Select all rows where 'booking_complete' is 1, representing the minority class
mijority_df = df[df["booking_complete"] == 1]  # Note: 'mijority_df' is a typo, should be 'minority_df'

# Resample the minority class to match the number of samples in the majority class
# This is done to balance the dataset by oversampling the minority class
resample_df = resample(mijority_df, n_samples=len(majority_df), random_state=42)

# Concatenate the resampled minority class with the majority class to create a balanced dataset
new_df = pd.concat([resample_df, majority_df], axis=0, ignore_index=True)


In [20]:
# Splitting the dataset into training and testing sets
train_set, test_set = train_test_split(new_df, test_size=0.2, random_state=42)


In [21]:
# Gather the object-dtype (string) columns of the training split
nomial_variables = train_set.select_dtypes(include=object)

# 'flight_day' is encoded as an ordinal feature, so take it out of the nominal
# frame and remember its name in 'ordinal_variables'
flight_day_column = nomial_variables.pop("flight_day")
ordinal_variables = [flight_day_column.name]


In [22]:
# From here on only the labels of the remaining nominal columns are needed
nomial_variables = nomial_variables.columns

# Weekday categories in the order used for ordinal encoding (Sunday first)
day_order = [
    "Sun",
    "Mon",
    "Tue",
    "Wed",
    "Thu",
    "Fri",
    "Sat",
]

# Encoder that maps each 'flight_day' label to its position in 'day_order'
ordinal_encoder = OrdinalEncoder(categories=[day_order])


In [23]:
# Apply ordinal encoding on 'flight_day' in the train set
train_set[ordinal_variables] = ordinal_encoder.fit_transform(train_set[ordinal_variables])

# Apply the same (already fitted) transformation on the test set
test_set[ordinal_variables] = ordinal_encoder.transform(test_set[ordinal_variables])

# Initialize OneHotEncoder to encode nominal variables as a dense array.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the current name first and fall back for older installs.
# NOTE: with drop="first", a category unseen during fit is encoded as all zeros,
# i.e. the same row pattern as the dropped reference category.
try:
    nominal_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)
except TypeError:
    nominal_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse=False)


In [24]:
# Learn the categories from the train set and encode it in one step
encoded_train = nominal_encoder.fit_transform(train_set[nomial_variables])

# Encode the test set with the categories learned from the train set
encoded_test = nominal_encoder.transform(test_set[nomial_variables])

# Column labels produced by the fitted encoder, shared by both frames
encoded_feature_names = nominal_encoder.get_feature_names_out(nomial_variables)

# Wrap the encoded arrays in labelled DataFrames
train_df_encoded = pd.DataFrame(encoded_train, columns=encoded_feature_names)
test_df_encoded = pd.DataFrame(encoded_test, columns=encoded_feature_names)




In [25]:
# Remove the raw nominal columns, then renumber rows from 0 so the frames can be
# joined side by side with the encoded frames
train_df = train_set.drop(columns=nomial_variables)
train_df = train_df.reset_index(drop=True)

test_df = test_set.drop(columns=nomial_variables)
test_df = test_df.reset_index(drop=True)


In [26]:
# Concatenate the encoded nominal variables with the remaining features in the
# train and test sets (rows are matched by their freshly reset index).
# A pasted duplicate of the oversampling code that used to trail this cell was
# removed: it only rebuilt frames that were already defined and never used afterwards.
train_df = pd.concat([train_df, train_df_encoded], axis=1)
test_df = pd.concat([test_df, test_df_encoded], axis=1)


In [27]:
# Separate the 'booking_complete' target (y) from the predictors (X) for the train set
y = train_df["booking_complete"].copy()
X = train_df.drop(columns="booking_complete")

# Same separation for the test set
y_test = test_df["booking_complete"].copy()
X_test = test_df.drop(columns="booking_complete")


In [28]:
# Random forest with balanced class weights and a fixed seed for reproducibility.
# `fit` returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
).fit(X, y)

# Predict the target variable for the test set
y_pred = rf_model.predict(X_test)

# Per-class precision, recall and F1-score plus overall accuracy
oversampled_report = classification_report(y_test, y_pred)
print(oversampled_report)


              precision    recall  f1-score   support

           0       0.99      0.92      0.96      8496
           1       0.93      0.99      0.96      8513

    accuracy                           0.96     17009
   macro avg       0.96      0.96      0.96     17009
weighted avg       0.96      0.96      0.96     17009

