### In Progress

# Preprocessing

In [1]:
# Import our dependencies.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import tensorflow as tf

In [2]:
# Import and read the cleaned data.
hotel_df=pd.read_csv("../Resources/hotel_reservations.csv")
print(hotel_df.shape)

print(hotel_df.columns)
hotel_df.head()

(119390, 32)
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# Determine information in each column.
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
# Determine if we have duplication in the dataset.
print(hotel_df.duplicated().sum())

# Eliminate duplication in the dataset.
hotel_df1=hotel_df.drop_duplicates()
print(hotel_df1.shape)

31994
(87396, 32)


In [5]:
# Determine missing values.
null_values=hotel_df1.isnull().sum()
print(null_values)

hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_year                     0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              4
babies                                0
meal                                  0
country                             452
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                             12193
company                           82137


In [6]:
# Drop agent and company columns. 
hotel_df1_clean=hotel_df1.drop(["company","agent"],axis=1)

# Drop the null rows.
hotel_df1_clean=hotel_df1_clean.dropna()

In [7]:
# Drop rows with not sufficinet information (Adult=0 and children=0)
hotel_df1_clean=hotel_df1_clean.drop(hotel_df1_clean[(hotel_df1_clean.adults==0) & (hotel_df1_clean.children==0)].index)
hotel_df1_clean.shape

(86779, 30)

In [8]:
# Split reservation_status_date to day, month and year.
hotel_df1_clean[["year", "month", "day"]] = hotel_df1_clean["reservation_status_date"].str.split("-", expand = True)
hotel_df1_clean=hotel_df1_clean.drop(["reservation_status_date"],axis=1)
hotel_df1_clean.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,year,month,day
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,0,Transient,0.0,0,0,Check-Out,2015,7,1
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,0,Transient,0.0,0,0,Check-Out,2015,7,1
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0,Transient,75.0,0,0,Check-Out,2015,7,2
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,0,Transient,75.0,0,0,Check-Out,2015,7,2
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,0,Transient,98.0,0,1,Check-Out,2015,7,3


In [9]:
# Separate the categorical variables from others.
hotel_df1_category=hotel_df1_clean.dtypes[hotel_df1_clean.dtypes=='object'].index.tolist()
#hotel_df1_clean_cat=[c for c in hotel_df1_clean.columns if hotel_df1_clean[c].dtype=='object']

hotel_df1_other=hotel_df1_clean.dtypes[hotel_df1_clean.dtypes!='object'].index.tolist()
#hotel_df1_clean_other=[n for n in hotel_df1_clean.columns if n not in hotel_df1_clean_cat]

print(f"Categorical variables:\n{hotel_df1_category}")
print("----------------------------------------------------------------------")
print(f"Other variables:\n{hotel_df1_other}")

Categorical variables:
['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'year', 'month', 'day']
----------------------------------------------------------------------
Other variables:
['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [10]:
# Replace null values by undefined in categorical variables and by zero in other variables
hotel_df1_clean[hotel_df1_category].fillna("Undefined", inplace = True)
hotel_df1_clean[hotel_df1_other].fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [11]:
# Determine the number of unique values in categorical columns.
print(hotel_df1_clean[hotel_df1_category].nunique())

hotel                     2
arrival_date_month       12
meal                      5
country                 177
market_segment            7
distribution_channel      5
reserved_room_type        9
assigned_room_type       11
deposit_type              3
customer_type             4
reservation_status        3
year                      4
month                    12
day                      31
dtype: int64


In [12]:
# Determine the number of unique values in other columns.
print(hotel_df1_clean[hotel_df1_other].nunique())

is_canceled                          2
lead_time                          479
arrival_date_year                    3
arrival_date_week_number            53
arrival_date_day_of_month           31
stays_in_weekend_nights             15
stays_in_week_nights                31
adults                              14
children                             5
babies                               5
is_repeated_guest                    2
previous_cancellations              15
previous_bookings_not_canceled      73
booking_changes                     19
days_in_waiting_list               127
adr                               8857
required_car_parking_spaces          5
total_of_special_requests            6
dtype: int64


In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df=pd.DataFrame(enc.fit_transform(hotel_df1_clean[hotel_df1_category]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(hotel_df1_category)
encode_df.head()

# # Create a OrdinalEncoder instance
# oe = OrdinalEncoder()

# # Fit and transform the OrdinalEncoder using the categorical variable list
# encode_df=pd.DataFrame(oe.fit_transform(hotel_df1_clean[hotel_df1_category]))
                      
# # Add the encoded variable names to the dataframe
# encode_df.columns=hotel_df1_category
# encode_df.head()

Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# # Merge encoded features and drop the originals.
# hotel_df_merge=hotel_df1_clean.merge(encode_df,left_index=True,right_index=True)
# hotel_df_merge=hotel_df_merge.drop(hotel_df1_category,1)

hotel_df_merge=hotel_df1_clean[hotel_df1_other].merge(encode_df,left_index=True,right_index=True)
hotel_df_merge.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,0,342,2015,27,1,0,0,2,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,737,2015,27,1,0,0,2,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,7,2015,27,1,0,1,1,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,13,2015,27,1,0,1,1,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,14,2015,27,1,0,2,2,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Normalize the nominal columns.
X_num=hotel_df1_clean[hotel_df1_other].values

# define min max scaler
scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(X_num)
X_scaled_num=pd.DataFrame(scaled)

horel_df_merge=X_scaled_num.merge(encode_df,left_index=True,right_index=True)
horel_df_merge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,0.0,0.464043,0.0,0.500000,0.000000,0.000,0.000,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.000000,0.0,0.500000,0.000000,0.000,0.000,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.009498,0.0,0.500000,0.000000,0.000,0.025,0.018182,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.017639,0.0,0.500000,0.000000,0.000,0.025,0.018182,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.018996,0.0,0.500000,0.000000,0.000,0.050,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86774,0.0,0.031208,1.0,0.653846,0.966667,0.125,0.125,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86775,0.0,0.138399,1.0,0.653846,1.000000,0.125,0.125,0.054545,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86776,0.0,0.046133,1.0,0.653846,1.000000,0.125,0.125,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86777,0.0,0.147897,1.0,0.653846,1.000000,0.125,0.125,0.036364,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Split data into train and test
y=hotel_df_merge['is_canceled'].values
X=hotel_df_merge.drop(['is_canceled'],1).values

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# from sklearn.ensemble import GradientBoostingClassifier
# # report which features were selected by RFE
# from sklearn.datasets import make_classification
# from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeClassifier
# rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=20)
# # fit RFE
# rfe.fit(X, y)

# ind=[]
# for i in range(X.shape[1]):
#     if rfe.support_[i]==True:
#         ind.append(i)
#     print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
# hotel_df_merge_new=hotel_df_merge.iloc[:,ind]
# X_new=hotel_df_merge_new.drop(['is_canceled'],1).values

In [18]:
# Split the data into a training and testing dataset
# X_train, X_test, y_train, y_test=train_test_split(X_new,y,random_state=78)
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=78)

In [19]:
print(X_train.shape)
print(X_test.shape)

(44772, 302)
(14925, 302)


## Neural Network Model

In [20]:
# Cross Validation 
def model_structure():
    
    nn = tf.keras.models.Sequential()

    number_input_features=len(X_train[0])
    hidden_nodes_layer1 =  100
    hidden_nodes_layer2 = 50
    # hidden_nodes_layer3 = 20


    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,
                             input_dim=number_input_features,
                             activation='relu'))

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2,
                             activation='relu'))

    # Third hidden layer
    # nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3,
    #                          activation='relu'))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    
    # Check the structure of the model
    return nn
    
# estimator = KerasClassifier(build_fn=model_structure, nb_epoch=100, batch_size=100, verbose=0)
# kfold = KFold(n_splits=10)
# results = cross_val_score(estimator, X_train , y_train, cv=kfold)
# print(f"Mean: {results.mean()}, STD: {results.std()}")

In [21]:
# Train the model
model=model_structure()
fit_model=model.fit(X_train,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [22]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

467/467 - 0s - loss: 0.1574 - accuracy: 0.9367
Loss: 0.15735936164855957, Accuracy: 0.9366834163665771
