# 3. Preprocessing

1. Import packages and dataset
2. Sample selection
3. Categorical to cardinal transformation
4. Feature scaling

---
## 3.1. Import packages and dataset

In [1]:
# Import the required packages

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import pre-processed dataset and load it into a dataframe

# Create path variable
original_file = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/interim/category_transformation.csv'

# Load csv in dataframe
dfo = pd.read_csv(original_file, index_col=0)

In [3]:
# Check # of columns and rows imported
dfo.shape

(78547, 37)

In [4]:
# Print info
dfo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78547 entries, 0 to 79329
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   LeadTime                     78547 non-null  float64
 1   ArrivalDateYear              78547 non-null  int64  
 2   ArrivalDateMonth             78547 non-null  object 
 3   ArrivalDateWeekNumber        78547 non-null  int64  
 4   ArrivalDateDayOfMonth        78547 non-null  int64  
 5   StaysInWeekendNights         78547 non-null  object 
 6   StaysInWeekNights            78547 non-null  object 
 7   Adults                       78547 non-null  float64
 8   Children                     78547 non-null  float64
 9   Babies                       78547 non-null  float64
 10  Meal                         78547 non-null  object 
 11  Country                      78547 non-null  object 
 12  MarketSegment                78547 non-null  object 
 13  DistributionChan

In [5]:
# Converts all non float variables in categorical
cat_cols = dfo.select_dtypes(exclude='float64').columns
dfo[cat_cols] = dfo[cat_cols].astype('category')

# Convert variables to categorical
dfo['Adults'] = dfo['Adults'].astype('category')
dfo['Children'] = dfo['Children'].astype('category')
dfo['Babies'] = dfo['Babies'].astype('category')

---
## 3.2. Sample selection

In this section we will take a reduced sample of the whole dataset for computation feasibility purposes. There are ~80,000 observations which make very time-consuming to run different tests with different clustering algorithms. Therefore, our approach will be to select 20% of the rows in the dataset stratifying them by the variable _ArrivalDateMonth_ to get the most representative sample as possible.

Similarly, there are 37 variables in the original dataset, which some of them are not relevant for the customer segmentation analysis. Those irrelevant variables are related with hotel processes rather than customer choices.

In [6]:
# Creates variable exclusion list for modeling
to_drop = ['ReservationDate', 'ReservationStatusDate', 'CheckOutDate', 
           'ArrivalDate', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 
           'ArrivalDateYear', 'AssignedRoomType', 'DaysInWaitingList', 'TotalStay']

# Removes features
dfo.drop(to_drop, axis=1, inplace=True)

In [7]:
# Selects stratifier variable and 
X = dfo.drop(['ArrivalDateMonth'], axis=1)
y = dfo['ArrivalDateMonth']

# Creates a generator for a stratified random shuffled sample of 20% of observations 
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=45)
split = sss.split(X, y)

# Selects sample dataset from split generator
for train_index, test_index in split:
    sample_df = dfo.iloc[test_index]

In [8]:
# Shows the resulting sample size
sample_df.shape

(15710, 27)

---
## 3.3. Categorical to cardinal transformation

In present section, we will encode all categories with numeric variables to prepare the dataset for the scaling step. We will assign a new variable for each category but one to avoid colinearity. 

In [9]:
# Get dummy variables for each categorical variable
encoded_df = pd.get_dummies(sample_df, drop_first=True)

# Check number of total variables
encoded_df.shape

(15710, 91)

We can see that we end up with 91 variables in a very sparse dataset

---
## 3.4. Feature scaling

Finally, we will scale the whole dataset so all variables have a mean of 0 and a variance of 1

In [10]:
# Initialized the scaler
scaler = StandardScaler()
# Fit anf transform
scaler.fit(encoded_df)
scaled = scaler.transform(encoded_df)

In [11]:
# Creates dataframe with the scaled dataset
df_scaled = pd.DataFrame(scaled, columns=encoded_df.columns)

In [12]:
# Shows a sample of the final dataset
df_scaled.sample(10)

Unnamed: 0,LeadTime,ADR,ArrivalDateMonth_August,ArrivalDateMonth_December,ArrivalDateMonth_February,ArrivalDateMonth_January,ArrivalDateMonth_July,ArrivalDateMonth_June,ArrivalDateMonth_March,ArrivalDateMonth_May,...,ReservationMonth_December,ReservationMonth_February,ReservationMonth_January,ReservationMonth_July,ReservationMonth_June,ReservationMonth_March,ReservationMonth_May,ReservationMonth_November,ReservationMonth_October,ReservationMonth_September
5116,0.409272,0.753965,2.799245,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,2.78601,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
10414,0.246477,-0.496747,-0.357239,-0.234822,-0.258917,-0.222963,2.978975,-0.332862,-0.297325,-0.339896,...,-0.301365,2.78601,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
15652,0.798174,-0.394669,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,2.78601,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
160,-0.829785,1.110983,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,3.732755
5282,-0.576547,0.484734,2.799245,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,-0.358936,-0.397545,3.239185,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
14583,2.100541,-1.109216,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,2.94006,-0.267899
8551,0.680599,-0.67028,-0.357239,4.258544,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,-0.358936,-0.397545,-0.30872,4.686986,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
404,0.174123,0.547002,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,3.004251,-0.297325,-0.339896,...,-0.301365,-0.358936,2.515437,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
8271,0.183167,-0.139474,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,3.318238,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
15169,-0.241911,-1.259781,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.301365,-0.358936,-0.397545,3.239185,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899


---
The resulting dataframe is loaded in a csv file for further steps in a different notebook.

In [13]:
# Writing df dataframe to csv

path1 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/preprocessed.csv'
path2 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/non_preprocessed.csv'

df_scaled.to_csv(path1)
sample_df.to_csv(path2)