# 3. Preprocessing

1. Importing packages and dataset
2. Categorical to cardinal transformation
3. Feature scaling

## 3.1. Importing packages and dataset

In [1]:
# Import the required packages

import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the interim dataset produced by the previous step into a dataframe

# Location of the interim CSV file
# NOTE(review): absolute, machine-specific path; the notebook only runs where this exists.
original_file = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/interim/category_transformation.csv'

# The first CSV column holds the row labels, so it becomes the dataframe index
dfo = pd.read_csv(filepath_or_buffer=original_file, index_col=0)

In [3]:
# Inspect the size of the loaded dataframe as (rows, columns)
dfo.shape

(79120, 37)

In [4]:
# Summarize the index range plus each column's non-null count and dtype
dfo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79120 entries, 0 to 79329
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   LeadTime                     79120 non-null  float64
 1   ArrivalDateYear              79120 non-null  int64  
 2   ArrivalDateMonth             79120 non-null  object 
 3   ArrivalDateWeekNumber        79120 non-null  int64  
 4   ArrivalDateDayOfMonth        79120 non-null  int64  
 5   StaysInWeekendNights         79120 non-null  object 
 6   StaysInWeekNights            79120 non-null  object 
 7   Adults                       79120 non-null  float64
 8   Children                     79120 non-null  float64
 9   Babies                       79120 non-null  float64
 10  Meal                         79120 non-null  object 
 11  Country                      79120 non-null  object 
 12  MarketSegment                79120 non-null  object 
 13  DistributionChan

In [5]:
# Features excluded from modeling
to_drop = ['ReservationDate', 'ReservationStatusDate', 'CheckOutDate', 
           'ArrivalDate', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 
           'ArrivalDateYear', 'AssignedRoomType']

# Remove the excluded features.
# Reassigning (instead of inplace=True) and skipping labels that are already
# absent keeps this cell re-runnable: executing it a second time no longer
# raises KeyError for columns that were dropped on the first run.
dfo = dfo.drop(columns=to_drop, errors='ignore')

In [6]:
# Cast every column whose dtype is not float64 to categorical
cat_cols = dfo.select_dtypes(exclude=['float64']).columns
dfo[cat_cols] = dfo[cat_cols].astype('category')

# Adults, Children and Babies are stored as float64, so the step above skips
# them; cast them to categorical explicitly, one column at a time
for count_col in ('Adults', 'Children', 'Babies'):
    dfo[count_col] = dfo[count_col].astype('category')

## 3.2. Categorical to cardinal transformation

In [7]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each variable so a k-level category yields k-1 indicator columns
df = pd.get_dummies(data=dfo, drop_first=True)

# Display the (rows, columns) of the encoded dataframe
df.shape

(79120, 95)

## 3.3. Feature scaling

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation and standardize the data in
# a single call; `scaler` keeps the fitted parameters for later reuse.
# NOTE(review): the scaler is fit on the whole frame, indicator columns
# included - confirm this is intended if a train/test split happens later.
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

In [9]:
# Wrap the scaled array back into a labelled dataframe.
# The scaler returns a plain array, so both the column names and the row
# labels must be restored from df. The source index is not contiguous
# (79120 rows labelled 0 to 79329); without index=df.index the rows would be
# renumbered 0..n-1 and would no longer line up with dfo.
df_scaled = pd.DataFrame(scaled, columns=df.columns, index=df.index)

In [10]:
# Preview ten random rows; the fixed seed makes the displayed sample
# identical on every run of the notebook
df_scaled.sample(10, random_state=42)

Unnamed: 0,LeadTime,DaysInWaitingList,ADR,TotalStay,ArrivalDateMonth_August,ArrivalDateMonth_December,ArrivalDateMonth_February,ArrivalDateMonth_January,ArrivalDateMonth_July,ArrivalDateMonth_June,...,ReservationMonth_December,ReservationMonth_February,ReservationMonth_January,ReservationMonth_July,ReservationMonth_June,ReservationMonth_March,ReservationMonth_May,ReservationMonth_November,ReservationMonth_October,ReservationMonth_September
1866,0.073174,-0.154717,-1.190002,0.561835,2.797925,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
21571,0.019107,-0.154717,-0.435796,0.561835,-0.357408,4.269882,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,3.740012
46378,-0.665753,-0.154717,1.146757,-1.081871,-0.357408,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,2.793007,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
44232,-0.990161,-0.154717,-0.215926,-1.081871,-0.357408,-0.234199,3.870896,-0.222401,-0.336856,-0.332701,...,-0.298173,2.793007,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
2558,-0.854991,-0.154717,-0.011396,0.561835,-0.357408,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
42387,-0.061995,-0.154717,0.301791,0.561835,-0.357408,4.269882,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,3.740012
53311,-0.990161,-0.154717,2.5708,-0.533969,-0.357408,-0.234199,-0.258338,-0.222401,2.968623,-0.332701,...,-0.298173,-0.358037,-0.395907,3.21031,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
60693,-0.107052,-0.154717,-0.521443,0.561835,-0.357408,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,3.21031,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379
17308,-0.827957,-0.154717,0.602195,-1.081871,-0.357408,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,3.740012
11555,-0.620697,-0.154717,-0.420457,-1.081871,-0.357408,-0.234199,-0.258338,-0.222401,-0.336856,-0.332701,...,-0.298173,-0.358037,-0.395907,-0.311496,-0.213524,-0.294489,-0.264792,-0.295981,-0.335879,-0.267379


---
The scaled dataframe and the unscaled (pre-encoding) dataframe are each written to a CSV file for use in a later notebook.

In [11]:
# Export the results to CSV:
#   path1 <- df_scaled (one-hot encoded and standardized)
#   path2 <- dfo (excluded features removed, not encoded or scaled)
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.

path1 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/preprocessed.csv'
path2 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/non_preprocessed.csv'

for frame, destination in ((df_scaled, path1), (dfo, path2)):
    frame.to_csv(destination)