In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [7]:
df = pd.read_csv("../../data/H2_categorical.csv")

In [8]:
df.shape

(78547, 37)

In [9]:
df.head()

Unnamed: 0,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,ArrivalDate,ReservationDate,ChangedRoom,TotalStay,CheckOutDate,StayChanges,ReservationMonth
0,6.0,2015,July,27,1,0,2,1.0,0.0,0.0,HB,National,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,other_Agent,no_Company,0.0,Transient,0.0,0,0,Check-Out,2015-07-03,2015-07-01,2015-06-25,0,2.0,2015-07-03,No Changes,June
1,88.0,2015,July,27,1,0,4,2.0,0.0,0.0,BB,National,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,other_Agent,no_Company,0.0,Transient,76.5,0,1,Canceled,2015-07-01,2015-07-01,2015-04-04,0,4.0,2015-07-05,No Changes,April
2,65.0,2015,July,27,1,0,4,1.0,0.0,0.0,BB,National,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,other_Agent,no_Company,0.0,Transient,68.0,0,1,Canceled,2015-04-30,2015-07-01,2015-04-27,0,4.0,2015-07-05,No Changes,April
3,92.0,2015,July,27,1,2,4,2.0,0.0,0.0,BB,National,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,other_Agent,no_Company,0.0,Transient,76.5,0,2+,Canceled,2015-06-23,2015-07-01,2015-03-31,0,6.0,2015-07-07,No Changes,March
4,100.0,2015,July,27,2,0,2,2.0,0.0,0.0,BB,National,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,other_Agent,no_Company,0.0,Transient,76.5,0,1,Canceled,2015-04-02,2015-07-02,2015-03-24,0,2.0,2015-07-04,No Changes,March


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78547 entries, 0 to 78546
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   LeadTime                     78547 non-null  float64
 1   ArrivalDateYear              78547 non-null  int64  
 2   ArrivalDateMonth             78547 non-null  object 
 3   ArrivalDateWeekNumber        78547 non-null  int64  
 4   ArrivalDateDayOfMonth        78547 non-null  int64  
 5   StaysInWeekendNights         78547 non-null  object 
 6   StaysInWeekNights            78547 non-null  object 
 7   Adults                       78547 non-null  float64
 8   Children                     78547 non-null  float64
 9   Babies                       78547 non-null  float64
 10  Meal                         78547 non-null  object 
 11  Country                      78547 non-null  object 
 12  MarketSegment                78547 non-null  object 
 13  DistributionChan

In [11]:
# Converts all non float variables in categorical
cat_cols = df.select_dtypes(exclude='float64').columns
df[cat_cols] = df[cat_cols].astype('category')

# Convert variables to categorical
df['Adults'] = df['Adults'].astype('category')
df['Children'] = df['Children'].astype('category')
df['Babies'] = df['Babies'].astype('category')

In [12]:
# Creates variable exclusion list for modeling
to_drop = ['ReservationDate', 'ReservationStatusDate', 'CheckOutDate', 
           'ArrivalDate', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 
           'ArrivalDateYear', 'AssignedRoomType', 'DaysInWaitingList', 'TotalStay']

# Removes features
df.drop(to_drop, axis=1, inplace=True)

In [13]:
# Selects stratifier variable and 
X = df.drop(['ArrivalDateMonth'], axis=1)
y = df['ArrivalDateMonth']

# Creates a generator for a stratified random shuffled sample of 20% of observations 
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split = sss.split(X, y)

# Selects sample dataset from split generator
for train_index, test_index in split:
    sample_df = df.iloc[test_index]

In [14]:
# Shows the resulting sample size
sample_df.shape

(15710, 27)

#### Categorical to cardinal transformation

In [15]:
# Get dummy variables for each categorical variable
encoded_df = pd.get_dummies(sample_df, drop_first=True)

# Check number of total variables
encoded_df.shape

(15710, 85)

#### Feature scaling

In [16]:
# Initialized the scaler
scaler = StandardScaler()
# Fit anf transform
scaler.fit(encoded_df)
scaled = scaler.transform(encoded_df)

In [17]:
# Creates dataframe with the scaled dataset
df_scaled = pd.DataFrame(scaled, columns=encoded_df.columns)

In [18]:
# Shows a sample of the final dataset
df_scaled.sample(10)

Unnamed: 0,LeadTime,ADR,ArrivalDateMonth_August,ArrivalDateMonth_December,ArrivalDateMonth_February,ArrivalDateMonth_January,ArrivalDateMonth_July,ArrivalDateMonth_June,ArrivalDateMonth_March,ArrivalDateMonth_May,ArrivalDateMonth_November,ArrivalDateMonth_October,ArrivalDateMonth_September,StaysInWeekendNights_1,StaysInWeekendNights_2,StaysInWeekendNights_3+,StaysInWeekNights_1,StaysInWeekNights_2,StaysInWeekNights_3,StaysInWeekNights_4,StaysInWeekNights_5,StaysInWeekNights_6+,Adults_1.0,Adults_2.0,Adults_3.0,Adults_4.0,Children_1.0,Babies_1.0,Meal_FB,Meal_HB,Meal_SC,Country_Germany,Country_National,Country_Other_Africa,Country_Other_Americas,Country_Other_Asia,Country_Other_Europe,Country_Other_Oceania,Country_Spain,Country_United_Kingdom,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Offline TA/TO,MarketSegment_Online TA,DistributionChannel_Direct,DistributionChannel_GDS,DistributionChannel_TA/TO,IsRepeatedGuest_1,PreviousCancellations_1,PreviousCancellations_0,PreviousCancellations_1.1,PreviousCancellations_2+,PreviousBookingsNotCanceled_0,PreviousBookingsNotCanceled_1-2,PreviousBookingsNotCanceled_3+,ReservedRoomType_D,ReservedRoomType_Other,BookingChanges_1,BookingChanges_2+,DepositType_Non Refund,Agent_other_Agent,Company_other_Company,CustomerType_Group,CustomerType_Transient,CustomerType_Transient-Party,RequiredCarParkingSpaces_1,TotalOfSpecialRequests_1,TotalOfSpecialRequests_2+,ReservationStatus_Check-Out,ReservationStatus_No-Show,ChangedRoom_1,StayChanges_No Changes,ReservationMonth_August,ReservationMonth_December,ReservationMonth_February,ReservationMonth_January,ReservationMonth_July,ReservationMonth_June,ReservationMonth_March,ReservationMonth_May,ReservationMonth_November,ReservationMonth_October,ReservationMonth_September
5984,-0.455489,-0.774227,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,3.363319,-0.339896,-0.241094,-0.325737,-0.320692,-0.608782,1.768529,-0.093448,-0.599948,-0.715258,-0.50606,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,-0.468541,1.935516,-0.969652,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,-0.736522,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,-0.60456,-0.37964,0.839651,-0.104592,3.167381,0.007979,-0.269804,-0.299097,-0.356219,2.510693,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,-0.266942
14582,1.09911,-0.095214,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.608782,-0.565442,-0.093448,-0.599948,-0.715258,1.976051,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,2.087173,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,-0.468541,-0.516658,1.031298,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,-0.736522,-0.258637,-0.049241,-1.139206,-0.088832,-0.112979,-0.413931,3.981987,3.157637,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,-0.60456,-0.37964,-1.190971,-0.104592,-0.315718,0.007979,3.706391,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,-0.266942
9055,-0.871253,0.567762,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,3.069965,-0.320692,1.642623,-0.565442,-0.093448,-0.599948,-0.715258,1.976051,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,3.525245,-0.468541,-0.516658,-0.969652,3.301744,-0.052997,-2.566263,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,3.157637,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,1.654094,-0.37964,0.839651,-0.104592,-0.315718,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,3.74613
874,-0.744716,0.389796,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,3.004251,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.608782,-0.565442,-0.093448,1.666811,-0.715258,-0.50606,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,1.244622,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,-0.468541,1.935516,-0.969652,-0.30287,-0.052997,-2.566263,-0.156377,-0.007979,-0.736522,-0.258637,-0.049241,-1.139206,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,-1.711281,1.876743,-0.156591,-0.60456,-0.37964,-1.190971,-0.104592,-0.315718,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,3.876933,-0.29631,-0.341757,-0.266942
10504,1.117186,-0.246537,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,2.942075,-0.241094,-0.325737,-0.320692,-0.608782,-0.565442,-0.093448,-0.599948,-0.715258,-0.50606,3.343396,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,3.356167,-0.803457,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,3.525245,-0.468541,-0.516658,-0.969652,3.301744,-0.052997,-2.566263,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,1.654094,-0.37964,0.839651,-0.104592,-0.315718,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,2.926051,-0.266942
14959,-0.88933,-0.179282,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,3.118261,-0.608782,-0.565442,-0.093448,-0.599948,1.398096,-0.50606,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,2.087173,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,2.134283,-0.516658,-0.969652,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,-1.711281,1.876743,-0.156591,-0.60456,-0.37964,0.839651,-0.104592,3.167381,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,3.74613
14756,-0.988752,-0.256883,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,3.004251,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,1.642623,-0.565442,-0.093448,-0.599948,-0.715258,-0.50606,-0.299097,-0.196492,-0.114415,1.980712,-1.669541,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,3.977453,-0.263643,-0.090634,4.996693,-0.283668,-0.468541,-0.516658,-0.969652,-0.30287,-0.052997,-2.566263,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,-2.900487,4.48188,-0.062434,0.584358,-0.532838,-0.156591,-0.60456,-0.37964,0.839651,-0.104592,-0.315718,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,4.641062,-0.294144,-0.257936,-0.29631,-0.341757,-0.266942
3104,1.505836,0.294863,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,3.118261,-0.608782,-0.565442,-0.093448,-0.599948,-0.715258,1.976051,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,3.356167,-0.803457,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,-0.468541,1.935516,-0.969652,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,-1.711281,1.876743,-0.156591,-0.60456,-0.37964,0.839651,-0.104592,-0.315718,0.007979,-0.269804,3.343396,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,-0.266942
925,-0.844138,-0.731029,-0.357239,4.258544,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.608782,1.768529,-0.093448,-0.599948,-0.715258,-0.50606,3.343396,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,1.244622,-0.108262,-0.229031,-0.193148,-0.479117,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,2.134283,-0.516658,-0.969652,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,-0.736522,-0.258637,-0.049241,-1.139206,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,2.280818,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,-0.60456,-0.37964,-1.190971,-0.104592,-0.315718,0.007979,-0.269804,3.343396,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,-0.257936,-0.29631,-0.341757,-0.266942
11413,-0.00357,0.638121,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,3.118261,-0.608782,-0.565442,-0.093448,-0.599948,-0.715258,1.976051,-0.299097,-0.196492,-0.114415,-0.504869,0.598967,-0.247675,-0.015959,-0.258216,-0.069722,-0.023942,-0.298465,-0.392376,-0.297959,-0.803457,-0.108262,-0.229031,-0.193148,2.087173,-0.071543,-0.251417,-0.263643,-0.090634,-0.200132,-0.283668,-0.468541,-0.516658,1.031298,-0.30287,-0.052997,0.389672,-0.156377,-0.007979,1.357733,-0.258637,-0.049241,0.877804,-0.088832,-0.112979,-0.413931,-0.251131,-0.316693,-0.191903,-0.438439,0.34477,-0.223121,-0.062434,0.584358,-0.532838,-0.156591,1.654094,-0.37964,0.839651,-0.104592,-0.315718,0.007979,-0.269804,-0.299097,-0.356219,-0.398296,-0.31523,-0.215468,-0.294144,3.876933,-0.29631,-0.341757,-0.266942


In [19]:
# Save DataFrame to CSV
df_scaled.to_csv("../../data/H2_categorical_encoded_scaled.csv", index=False)
sample_df.to_csv("../../data/H2_categorical_sample.csv", index=False)