In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [3]:
# Load the hotel-reservation records from a CSV file given by a relative path
# (resolved against the notebook's working directory).
df = pd.read_csv("Hotel Reservations.csv")


In [4]:
# Display the loaded DataFrame for a quick visual check of columns and values.
df

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


In [4]:
# Inspect the 'booking_status' column on its own.
df['booking_status']

0        Not_Canceled
1        Not_Canceled
2            Canceled
3            Canceled
4            Canceled
             ...     
36270    Not_Canceled
36271        Canceled
36272    Not_Canceled
36273        Canceled
36274    Not_Canceled
Name: booking_status, Length: 36275, dtype: object

In [5]:
# Columns treated as numeric quantities. Every column is listed exactly once,
# covering both night-count columns (weekend nights and week nights).
numericalFeatures = ['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
                     'no_of_week_nights', 'lead_time', 'arrival_date',
                     'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
                     'avg_price_per_room']

# Columns that are label-encoded to integer codes. The list also contains
# 'booking_status', so that column is encoded together with the others.
categoricalFeatures = [
    'type_of_meal_plan',
    'required_car_parking_space',
    'no_of_special_requests',
    'room_type_reserved',
    'arrival_year',
    'arrival_month',
    'market_segment_type',
    'repeated_guest',
    'booking_status',
]

In [6]:
from sklearn.preprocessing import LabelEncoder

def MinMax(X, features=None):
  """Rescale the selected columns of ``X`` to the [0, 1] range, in place.

  Parameters
  ----------
  X : pandas.DataFrame
      Frame whose columns are rescaled. The frame is modified in place.
  features : iterable of str, optional
      Names of the columns to rescale. When omitted, the module-level
      ``numericalFeatures`` list is looked up at call time, so re-running the
      cell that defines it takes effect without redefining this function.

  Returns
  -------
  pandas.DataFrame
      The same ``X`` object that was passed in.

  Notes
  -----
  Minimum and maximum are taken with the pandas reductions, which skip
  missing values. A column whose range is exactly zero is mapped to 0.0
  instead of being divided by zero; missing entries stay missing.
  """
  if features is None:
    features = numericalFeatures
  for col in features:
    low = X[col].min()
    span = X[col].max() - low
    # Dividing by 1 for a zero-range column yields 0.0 everywhere instead of NaN.
    X[col] = (X[col] - low) / (span if span != 0 else 1)
  return X

def StandardScaling(X, features=None):
  """Standardize the selected columns of ``X`` to zero mean and unit variance, in place.

  Parameters
  ----------
  X : pandas.DataFrame
      Frame whose columns are standardized. The frame is modified in place.
  features : iterable of str, optional
      Names of the columns to standardize. When omitted, the module-level
      ``numericalFeatures`` list is looked up at call time, so re-running the
      cell that defines it takes effect without redefining this function.

  Returns
  -------
  pandas.DataFrame
      The same ``X`` object that was passed in.

  Notes
  -----
  The population standard deviation (``ddof=0``) is used. A column whose
  standard deviation is exactly zero is only centered (giving 0.0) instead of
  being divided by zero.
  """
  if features is None:
    features = numericalFeatures
  for col in features:
    center = X[col].mean()
    spread = X[col].std(ddof=0)
    # Dividing by 1 for a zero-spread column yields 0.0 everywhere instead of NaN.
    X[col] = (X[col] - center) / (spread if spread != 0 else 1)
  return X

def LabelEncoding(X, features=categoricalFeatures):
  """Replace each listed column of ``X`` with integer codes from ``LabelEncoder``.

  Parameters
  ----------
  X : pandas.DataFrame
      Frame whose columns are encoded. The frame is modified in place.
  features : iterable of str
      Names of the columns to encode; defaults to ``categoricalFeatures``.

  Returns
  -------
  pandas.DataFrame
      The same ``X`` object that was passed in.

  Notes
  -----
  One encoder object is re-fitted for every column and is not returned, so
  the mapping from original labels to codes is not kept after the call.
  """
  encoder = LabelEncoder()
  for column in features:
    codes = encoder.fit_transform(X[column])
    X[column] = codes
  return X

In [7]:
# Label-encode the default column list of LabelEncoding and rebind df to the result.
df = LabelEncoding(df) 

In [8]:
# Display the frame again to check the encoded columns.
df

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,0,0,0,224,0,9,2,3,0,0,0,65.00,0,1
1,INN00002,2,0,2,3,3,0,0,5,1,10,6,4,0,0,0,106.68,1,1
2,INN00003,1,0,2,1,0,0,0,1,1,1,28,4,0,0,0,60.00,0,0
3,INN00004,2,0,0,2,0,0,0,211,1,4,20,4,0,0,0,100.00,0,0
4,INN00005,2,0,1,1,3,0,0,48,1,3,11,4,0,0,0,94.50,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,0,0,3,85,1,7,3,4,0,0,0,167.80,1,1
36271,INN36272,2,0,1,3,0,0,0,228,1,9,17,4,0,0,0,90.95,2,0
36272,INN36273,2,0,2,6,0,0,0,148,1,6,1,4,0,0,0,98.39,2,1
36273,INN36274,2,0,0,3,3,0,0,63,1,3,21,4,0,0,0,94.50,0,0


In [9]:
# Model inputs: every column except the booking identifier and the target.
var_columns = [c for c in df.columns if c not in ['Booking_ID','booking_status']]

X = df.loc[:,var_columns]
y = df.loc[:,'booking_status']
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
# Only the last expression of a cell is displayed, so the shapes are printed
# explicitly instead of being evaluated and silently discarded.
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
X_train

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
29580,2,0,1,3,0,0,0,200,1,7,29,4,0,0,0,90.95,0
6325,2,0,1,3,0,0,0,79,1,2,24,4,0,0,0,90.95,1
33899,2,0,1,4,0,0,3,78,1,3,6,4,0,0,0,99.45,1
720,2,0,2,0,0,0,0,61,0,9,4,3,0,0,0,91.00,0
22120,2,1,0,4,0,0,0,201,1,10,1,4,0,0,0,82.28,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,2,0,2,2,0,0,0,43,0,11,26,3,0,0,0,55.00,2
6265,1,0,0,2,1,0,0,102,0,9,16,4,0,0,0,80.00,0
11284,2,0,0,2,0,0,0,5,1,4,24,2,0,0,0,110.00,1
860,2,0,0,3,0,0,0,213,1,5,7,4,0,0,0,130.00,0


In [19]:
# Gradient-boosting classifier with early stopping: n_estimators is an upper
# bound, and training stops once the score on the internally held-out
# validation_fraction has not improved for n_iter_no_change iterations.
# subsample < 1, max_features='log2' and the internal hold-out split all draw
# random numbers, so random_state is fixed to make the fit repeatable
# (same seed as the train/validation split).
model_gbm = GradientBoostingClassifier(n_estimators=5000,
                                       learning_rate=0.05,
                                       max_depth=3,
                                       subsample=0.5,
                                       validation_fraction=0.1,
                                       n_iter_no_change=20,
                                       max_features='log2',
                                       random_state=42,
                                       verbose=1)

In [20]:
# Fit the model on the training split; X_valid / y_valid are not used here.
model_gbm.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2517           0.0038            1.48m
         2           1.2547           0.0064            1.37m
         3           1.2402           0.0125            1.25m
         4           1.2222           0.0153           56.27s
         5           1.2094           0.0163            1.27m
         6           1.2018           0.0051            1.28m
         7           1.1968           0.0102            1.28m
         8           1.1635           0.0270            1.28m
         9           1.1483           0.0170            1.28m
        10           1.1446           0.0080            1.28m
        20           1.0324           0.0109            1.28m
        30           0.9634           0.0076            1.24m
        40           0.9162           0.0025            1.30m
        50           0.8834           0.0013            1.37m
        60           0.8446           0.0023            1.40m
       

In [24]:
# Length of the fitted estimators_ collection, i.e. how many boosting stages were kept.
len(model_gbm.estimators_)

1683

In [14]:
# Evaluate on the validation split; a classifier's score() reports mean accuracy.
acc = model_gbm.score(X_valid,y_valid)
# Print the score as a percentage.
print(f'Score {acc*100}')

Score 87.89855738307452


In [26]:
# Compute the scores in this cell so it runs on a fresh kernel; the names
# y_train_pred / y_valid_pred are not defined anywhere else in the notebook.
# ROC AUC ranks by score, so the predicted probability of class 1 is used.
# NOTE(review): assumes the run that produced the recorded AUC values also
# used probabilities rather than hard labels; confirm.
y_train_pred = model_gbm.predict_proba(X_train)[:, 1]
y_valid_pred = model_gbm.predict_proba(X_valid)[:, 1]
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_valid, y_valid_pred)))

AUC Train: 0.9496
AUC Valid: 0.9387


In [27]:
import pickle

# No training happens in this cell; it only serializes the model_gbm object.

# Save the model to a pickle file (relative path, written in binary mode)
with open('GRADIENT_BOOSTING_MODEL.pkl', 'wb') as f:
    pickle.dump(model_gbm, f)


In [1]:
# Reload the model from the pickle file and rebind model_gbm to it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
import pickle
with open('GRADIENT_BOOSTING_MODEL.pkl', 'rb') as f:
    model_gbm = pickle.load(f)
