# Load and Process Data

In [1]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
df_hotel = pd.read_csv("hotel_bookings.csv")

In [3]:
import numpy as np
# Create a new boolean feature considering if it's a company reservation or not
df_hotel['is_company'] = np.where(df_hotel['company'].isna(), False, True)
# Drop company column as we don't need it anymore and we can get rid from nan values
df = df_hotel.drop(['company'], axis=1)

In [4]:
# Create a new boolean feature considering if it's a reservation made by an agent or not
df_hotel['is_agent'] = np.where(df_hotel['agent'].isna(), False, True)

In [5]:
# Correct datetime
df_hotel[['reservation_status_date']] = df_hotel[['reservation_status_date']].astype('datetime64[ns]')

# Correct all categorical (except our target 'is_canceled') features including booleans (we can't use bool type or target variable as categorical to fit the model)
df_hotel[["hotel", "meal", "country", 
      "market_segment", "distribution_channel", 
      "reserved_room_type", "assigned_room_type", 
      "deposit_type", "customer_type", 
      "reservation_status"]] = df_hotel[["hotel", "meal", "country", "market_segment", 
                                       "distribution_channel", "reserved_room_type", "assigned_room_type", 
                                       "deposit_type", "customer_type", 
                                       "reservation_status",]].astype('category')

df_hotel['is_repeated_guest'] = df_hotel['is_repeated_guest'].astype(bool)



# Fill missing values from agent column and transform it into categorical column
from statistics import mode
df_hotel[["agent"]] = df_hotel[["agent"]].astype(pd.Int32Dtype())
df_hotel_list = df_hotel[["agent"]].values.tolist()
def flatten(t):
    return [item for sublist in t for item in sublist]
flat_hotel = flatten(df_hotel_list)
flat_mode = mode(flat_hotel)
df_hotel[["agent"]] = df_hotel[["agent"]].fillna(flat_mode)
df_hotel[["agent"]] = df_hotel[["agent"]].astype('int64')

# Fill na's from float column with the median
df_hotel['children'] = df_hotel['children'].fillna(df_hotel['children'].median()).astype('int64')

# Drop nan's rows from columns with inexpressive nan's count
df_hotel = df_hotel.dropna(subset = ['country', 'children'], axis = 0)

In [6]:
df_hotel = df_hotel.drop(['arrival_date_month'], axis=1)

In [7]:
df = df_hotel[['hotel', 'lead_time', 'arrival_date_year', 
                    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 
                    'babies', 'meal','country', 'market_segment', 'distribution_channel',
                    'is_repeated_guest', 'previous_cancellations',
                    'previous_bookings_not_canceled', 'reserved_room_type',
                    'assigned_room_type', 'booking_changes', 'deposit_type',
                    'days_in_waiting_list', 'customer_type', 'adr',
                    'required_car_parking_spaces', 'total_of_special_requests',
                    'is_company','is_agent', 'is_canceled']]

df = df.dropna(how='any',axis=0) 

df['is_canceled'] = df['is_canceled'].astype(float)

#df = pd.get_dummies(df)

In [8]:
# Split the features and labels
X = df.loc[:, df.columns != 'is_canceled'] #.astype(float)
X = pd.get_dummies(X).astype(float)

In [9]:
# #standardizing the input feature
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X = sc.fit_transform(X)

#le = preprocessing.LabelEncoder()

# Split the features and labels
# X = df.loc[:, df.columns != 'is_canceled']

# Change the data into one-hot encoding (for features) and change label to 0-1
# X = pd.get_dummies(X)
# y = le.fit_transform(y)

In [10]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import OneHotEncoder
#encode class values as integers
y = df.is_canceled
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# Run Model

## Binary Classification: Baseline

In [11]:
# Binary Classification: Baseline
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
#encoded_Y = np.asarray(encoded_Y).astype('float32')
# baseline model
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(243, input_dim=243, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# evaluate model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=1000, verbose=2)
kfold = StratifiedKFold(n_splits=3, shuffle=True)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)

2022-04-09 18:14:00.503401: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-09 18:14:00.509393: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-04-09 18:14:00.510090: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-04-09 18:14:00.511269: I tensorflow/core/platform/cpu_f

Epoch 1/100
80/80 - 2s - loss: 2.4309 - accuracy: 0.6346 - 2s/epoch - 21ms/step
Epoch 2/100
80/80 - 0s - loss: 0.4709 - accuracy: 0.7837 - 437ms/epoch - 5ms/step
Epoch 3/100
80/80 - 0s - loss: 0.4880 - accuracy: 0.7654 - 410ms/epoch - 5ms/step
Epoch 4/100
80/80 - 0s - loss: 0.4797 - accuracy: 0.7742 - 406ms/epoch - 5ms/step
Epoch 5/100
80/80 - 0s - loss: 0.4465 - accuracy: 0.7841 - 430ms/epoch - 5ms/step
Epoch 6/100
80/80 - 0s - loss: 0.4469 - accuracy: 0.7908 - 436ms/epoch - 5ms/step
Epoch 7/100
80/80 - 0s - loss: 0.4449 - accuracy: 0.7877 - 438ms/epoch - 5ms/step
Epoch 8/100
80/80 - 0s - loss: 0.5049 - accuracy: 0.7679 - 430ms/epoch - 5ms/step
Epoch 9/100
80/80 - 0s - loss: 0.4462 - accuracy: 0.7933 - 412ms/epoch - 5ms/step
Epoch 10/100
80/80 - 0s - loss: 0.4234 - accuracy: 0.8013 - 444ms/epoch - 6ms/step
Epoch 11/100
80/80 - 0s - loss: 0.4746 - accuracy: 0.7795 - 449ms/epoch - 6ms/step
Epoch 12/100
80/80 - 0s - loss: 0.4340 - accuracy: 0.7974 - 424ms/epoch - 5ms/step
Epoch 13/100
80

In [13]:
import joblib
joblib.dump(KerasClassifier, 'baseline.pkl')

['baseline.pkl']

In [14]:
baseline = print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 82.65% (0.68%)


## Binary Classification: Standardized

In [15]:
# Binary Classification: Standardized
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# baseline model
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(243, input_dim=243, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# evaluate baseline model with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=1000, verbose=2)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=3, shuffle=True)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))



Epoch 1/100
80/80 - 1s - loss: 0.4536 - accuracy: 0.7806 - 751ms/epoch - 9ms/step
Epoch 2/100
80/80 - 0s - loss: 0.3603 - accuracy: 0.8285 - 416ms/epoch - 5ms/step
Epoch 3/100
80/80 - 0s - loss: 0.3431 - accuracy: 0.8394 - 415ms/epoch - 5ms/step
Epoch 4/100
80/80 - 0s - loss: 0.3340 - accuracy: 0.8439 - 411ms/epoch - 5ms/step
Epoch 5/100
80/80 - 0s - loss: 0.3284 - accuracy: 0.8472 - 397ms/epoch - 5ms/step
Epoch 6/100
80/80 - 0s - loss: 0.3230 - accuracy: 0.8497 - 409ms/epoch - 5ms/step
Epoch 7/100
80/80 - 0s - loss: 0.3194 - accuracy: 0.8509 - 370ms/epoch - 5ms/step
Epoch 8/100
80/80 - 0s - loss: 0.3149 - accuracy: 0.8532 - 376ms/epoch - 5ms/step
Epoch 9/100
80/80 - 0s - loss: 0.3128 - accuracy: 0.8549 - 373ms/epoch - 5ms/step
Epoch 10/100
80/80 - 0s - loss: 0.3097 - accuracy: 0.8565 - 368ms/epoch - 5ms/step
Epoch 11/100
80/80 - 0s - loss: 0.3066 - accuracy: 0.8583 - 369ms/epoch - 5ms/step
Epoch 12/100
80/80 - 0s - loss: 0.3050 - accuracy: 0.8590 - 368ms/epoch - 5ms/step
Epoch 13/100


In [16]:
import joblib
joblib.dump(pipeline, 'standardized.pkl')

['standardized.pkl']

In [17]:
standardized = print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Standardized: 86.00% (0.12%)


## Binary Classification: Standardized Small

In [18]:
# Binary Classification: Standardized Small
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# smaller model
def create_smaller():
    # create model
    model = Sequential()
    model.add(Dense(243, input_dim=243, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_smaller, epochs=100, batch_size=1000, verbose=2)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=3, shuffle=True)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)



Epoch 1/100
80/80 - 1s - loss: 0.4340 - accuracy: 0.7941 - 743ms/epoch - 9ms/step
Epoch 2/100
80/80 - 0s - loss: 0.3549 - accuracy: 0.8314 - 391ms/epoch - 5ms/step
Epoch 3/100
80/80 - 0s - loss: 0.3389 - accuracy: 0.8408 - 409ms/epoch - 5ms/step
Epoch 4/100
80/80 - 0s - loss: 0.3303 - accuracy: 0.8449 - 401ms/epoch - 5ms/step
Epoch 5/100
80/80 - 0s - loss: 0.3240 - accuracy: 0.8481 - 410ms/epoch - 5ms/step
Epoch 6/100
80/80 - 0s - loss: 0.3190 - accuracy: 0.8515 - 390ms/epoch - 5ms/step
Epoch 7/100
80/80 - 0s - loss: 0.3149 - accuracy: 0.8533 - 396ms/epoch - 5ms/step
Epoch 8/100
80/80 - 0s - loss: 0.3120 - accuracy: 0.8546 - 407ms/epoch - 5ms/step
Epoch 9/100
80/80 - 0s - loss: 0.3087 - accuracy: 0.8563 - 400ms/epoch - 5ms/step
Epoch 10/100
80/80 - 0s - loss: 0.3065 - accuracy: 0.8581 - 389ms/epoch - 5ms/step
Epoch 11/100
80/80 - 0s - loss: 0.3029 - accuracy: 0.8595 - 387ms/epoch - 5ms/step
Epoch 12/100
80/80 - 0s - loss: 0.3002 - accuracy: 0.8609 - 390ms/epoch - 5ms/step
Epoch 13/100


In [19]:
import joblib
joblib.dump(pipeline, 'smaller.pkl')

['smaller.pkl']

In [20]:
smaller = print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 85.97% (0.11%)


## Binary Classification: Standardized Larger

In [21]:
# Binary Classification: Standardized Larger
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# larger model
def create_larger():
    # create model
    model = Sequential()
    model.add(Dense(243, input_dim=243, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_larger, epochs=1000, batch_size=1000, verbose=2)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=3, shuffle=True)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)



Epoch 1/1000
80/80 - 1s - loss: 0.4405 - accuracy: 0.7878 - 852ms/epoch - 11ms/step
Epoch 2/1000
80/80 - 0s - loss: 0.3499 - accuracy: 0.8329 - 490ms/epoch - 6ms/step
Epoch 3/1000
80/80 - 0s - loss: 0.3341 - accuracy: 0.8422 - 491ms/epoch - 6ms/step
Epoch 4/1000
80/80 - 0s - loss: 0.3234 - accuracy: 0.8482 - 493ms/epoch - 6ms/step
Epoch 5/1000
80/80 - 0s - loss: 0.3169 - accuracy: 0.8509 - 474ms/epoch - 6ms/step
Epoch 6/1000
80/80 - 0s - loss: 0.3101 - accuracy: 0.8547 - 460ms/epoch - 6ms/step
Epoch 7/1000
80/80 - 0s - loss: 0.3056 - accuracy: 0.8572 - 426ms/epoch - 5ms/step
Epoch 8/1000
80/80 - 0s - loss: 0.3017 - accuracy: 0.8586 - 437ms/epoch - 5ms/step
Epoch 9/1000
80/80 - 0s - loss: 0.2975 - accuracy: 0.8606 - 455ms/epoch - 6ms/step
Epoch 10/1000
80/80 - 0s - loss: 0.2965 - accuracy: 0.8614 - 447ms/epoch - 6ms/step
Epoch 11/1000
80/80 - 0s - loss: 0.2931 - accuracy: 0.8628 - 432ms/epoch - 5ms/step
Epoch 12/1000
80/80 - 0s - loss: 0.2910 - accuracy: 0.8637 - 440ms/epoch - 6ms/step


In [22]:
import joblib
joblib.dump(pipeline, 'larger.pkl')

['larger.pkl']

In [23]:
larger = print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 84.90% (0.19%)


## Predict

In [24]:
pipe = joblib.load('model.pkl')

In [25]:
# New data to predict
df_hotel = pd.read_csv('hotel_bookings.csv')
import numpy as np
# Create a new boolean feature considering if it's a company reservation or not
df_hotel['is_company'] = np.where(df_hotel['company'].isna(), False, True)
# Drop company column as we don't need it anymore and we can get rid from nan values
df = df_hotel.drop(['company'], axis=1)
# Create a new boolean feature considering if it's a reservation made by an agent or not
df_hotel['is_agent'] = np.where(df_hotel['agent'].isna(), False, True)
# Correct datetime
df_hotel[['reservation_status_date']] = df_hotel[['reservation_status_date']].astype('datetime64[ns]')

# Correct all categorical (except our target 'is_canceled') features including booleans (we can't use bool type or target variable as categorical to fit the model)
df_hotel[["hotel", "meal", "country", 
      "market_segment", "distribution_channel", 
      "reserved_room_type", "assigned_room_type", 
      "deposit_type", "customer_type", 
      "reservation_status"]] = df_hotel[["hotel", "meal", "country", "market_segment", 
                                       "distribution_channel", "reserved_room_type", "assigned_room_type", 
                                       "deposit_type", "customer_type", 
                                       "reservation_status",]].astype('category')

df_hotel['is_repeated_guest'] = df_hotel['is_repeated_guest'].astype(bool)



# Fill missing values from agent column and transform it into categorical column
from statistics import mode
df_hotel[["agent"]] = df_hotel[["agent"]].astype(pd.Int32Dtype())
df_hotel_list = df_hotel[["agent"]].values.tolist()
def flatten(t):
    return [item for sublist in t for item in sublist]
flat_hotel = flatten(df_hotel_list)
flat_mode = mode(flat_hotel)
df_hotel[["agent"]] = df_hotel[["agent"]].fillna(flat_mode)
df_hotel[["agent"]] = df_hotel[["agent"]].astype('int64')

# Fill na's from float column with the median
df_hotel['children'] = df_hotel['children'].fillna(df_hotel['children'].median()).astype('int64')

# Drop nan's rows from columns with inexpressive nan's count
df_hotel = df_hotel.dropna(subset = ['country', 'children'], axis = 0)
df_hotel = df_hotel.drop(['arrival_date_month'], axis=1)
df = df_hotel[['hotel', 'lead_time', 'arrival_date_year', 
                    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 
                    'babies', 'meal','country', 'market_segment', 'distribution_channel',
                    'is_repeated_guest', 'previous_cancellations',
                    'previous_bookings_not_canceled', 'reserved_room_type',
                    'assigned_room_type', 'booking_changes', 'deposit_type',
                    'days_in_waiting_list', 'customer_type', 'adr',
                    'required_car_parking_spaces', 'total_of_special_requests',
                    'is_company','is_agent', 'is_canceled']]

df = df.dropna(how='any',axis=0) 

df['is_canceled'] = df['is_canceled'].astype(float)

#df = pd.get_dummies(df)

In [26]:
# Split the features and labels
# standardizing the input feature
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
X = df.loc[:, df.columns != 'is_canceled'] #.astype(float)
X = pd.get_dummies(X).astype(float)

In [27]:
# standardizing the input feature
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

le = preprocessing.LabelEncoder()

Split the features and labels
X = df.loc[:, df.columns != 'is_canceled']

# Change the data into one-hot encoding (for features) and change label to 0-1
X = pd.get_dummies(X)
y = le.fit_transform(y)

SyntaxError: invalid syntax (4064625610.py, line 8)

In [None]:
pred_cols = list(X.values)
# apply the whole pipeline to data
pred = pd.Series(pipe.predict(X=X[pred_cols]))

In [None]:
y_pred=pipe.predict(encoded_Y)
y_pred =(y_pred>0.5)

In [None]:
# New data to predict
pred_cols = list(pr.is_canceled.values)

In [None]:
pred = pd.Series(pipe.predict(X=pr['is_canceled'].values))

In [None]:
sc = StandardScaler()
sc.fit(pred_cols)
x = sc.transform(pred_cols)


y_pred=pipe.predict(x)
y_pred =(y_pred>0.5)

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


In [None]:
# Convert into Numpy array
samples_to_predict = np.array(samples_to_predict)

## Example X

In [None]:
#standardizing the input feature
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense

In [None]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal', input_dim=8))
#Second  Hidden Layer
classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [None]:
#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [None]:
import tensorflow as tf
X_train = Input(shape=input_shape)
# #Fitting the data to the training dataset
# #X_train = tf.expand_dims(X_train, axis=-1)
# X_train = X_train.reshape(None, 8)
# #y_train = tf.expand_dims(y_train, axis=-1)
# y_train = y_train.reshape(None, 8)
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

In [None]:
eval_model=classifier.evaluate(X_train, y_train)
eval_model