In [61]:
# Import Libraries
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

#### 1. Load Data

In [62]:
# load data
df = pd.read_csv('../../data/H2.csv')

#### 2. Handle Missing Values

In [63]:
# Check for missing values
null_columns = df.columns[df.isnull().any()]
null_counts = df[null_columns].isnull().sum()
null_counts

Children     4
Country     24
dtype: int64

In [64]:
# Drop rows with null values in 'Children' and 'Country' columns
df = df.dropna(subset=['Children', 'Country'])

In [65]:
# Drop rows where adults, babies and children are zero at the same time
df = df[(df["Adults"] != 0) | (df["Babies"] != 0) | (df["Children"] != 0)]

In [66]:
# Check if any missing values left in the dataset
df.isnull().sum().any()

np.False_

#### 3. Remove Duplicates

In [67]:
# Check for Duplicate rows
print(f"Total Number of Duplicate Rows: {len(df) - len(df.drop_duplicates())}")

Total Number of Duplicate Rows: 25876


In [68]:
# Remove Duplicate Rows
df = df.drop_duplicates()

#### 4. Edit some of the Columns

In [69]:
# Create function for splitting Data
def split_columns(df):
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns
    numerical_columns = df.select_dtypes(include=[np.number]).columns
    
    return categorical_columns, numerical_columns

In [70]:
# Split dataset as categorical and numerical types
categorical_columns, numerical_columns = split_columns(df)

In [71]:
# Remove leading and trailing spaces from all categorical columns, if any
for col in categorical_columns:
    df[col] = df[col].str.strip()

In [72]:
# Convert 'Children' column from float to int
df['Children'] = df['Children'].astype(int)

In [73]:
# Correct the wrong type in 'Agent' and 'Company' columns
# Replace 'NULL' values with 0
replace_null = lambda col: df[col].replace("NULL", 0)
# Convert the column to integer
convert_to_int = lambda col: df[col].astype(int)

# Correct the 'Agent' column
df["Agent"] = replace_null("Agent")
df["Agent"] = convert_to_int("Agent")

# Correct the 'Company' column
df["Company"] = replace_null("Company")
df["Company"] = convert_to_int("Company")

#### 5. Remove Unnecessary Columns

In [74]:
# Drop the Country column
df.drop(columns=["Country"], inplace=True)

#### 6. Some Feature Extraction

In [75]:
# Create Features from 'ReservationStatusDate' column
df['ReservationStatusDate'] = pd.to_datetime(df['ReservationStatusDate'])

df['ReservationStatusYear'] = df['ReservationStatusDate'].dt.year
df['ReservationStatusMonth'] = df['ReservationStatusDate'].dt.month
df['ReservationStatusDay'] = df['ReservationStatusDate'].dt.day

df.drop(['ReservationStatusDate'] , axis = 1, inplace = True)

#### 7. Remove Rare Categories

In [76]:
# Remove rare categories
def remove_rare_categories(df, categorical_cols, min_count=10):
    """
    In categorical columns, it deletes categories that are below a certain threshold value.
        Args:
            df: Pandas DataFrame
            categorical_cols: List of categorical columns
            min_count: Minimum number of observations for a category to remain
    """

    for col in categorical_cols:
        value_counts = df[col].value_counts()
        common_categories = value_counts[value_counts >= min_count].index
        df = df[df[col].isin(common_categories)]
    return df


cat_col, _ = split_columns(df)

df = remove_rare_categories(df, cat_col, min_count=10)

#### 8. Feature Transformation

In [77]:
# Check how many categories categorical features have, we wil encode accordingly
categorical_columns, numerical_columns = split_columns(df)

# Print the categorical columns and their unique value counts
for col in categorical_columns:
    print(f"{col}: {df[col].nunique()} unique values")

ArrivalDateMonth: 12 unique values
Meal: 4 unique values
MarketSegment: 7 unique values
DistributionChannel: 4 unique values
ReservedRoomType: 7 unique values
AssignedRoomType: 8 unique values
DepositType: 3 unique values
CustomerType: 4 unique values
ReservationStatus: 3 unique values


In [78]:
# Map the months to their corresponding numerical values
month_mapping = {
    "January": 0,
    "February": 1,
    "March": 2,
    "April": 3,
    "May": 4,
    "June": 5,
    "July": 6,
    "August": 7,
    "September": 8,
    "October": 9,
    "November": 10,
    "December": 11,
}

# Apply the mapping to the 'ArrivalDateMonth' column
df["ArrivalDateMonth"] = df["ArrivalDateMonth"].map(month_mapping)

In [79]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to be encoded
categorical_cols_to_encode = [
    "Meal",
    "MarketSegment",
    "DistributionChannel",
    "ReservedRoomType",
    "AssignedRoomType",
    "DepositType",
    "CustomerType",
    "ReservationStatus",
]

# Initialize the LabelEncoder
le = LabelEncoder()

from sklearn.preprocessing import LabelEncoder

# Create a dictionary to store the mappings
label_mappings = {}

# Encode each categorical column and get the mapping of encoded values to original categories
for col in categorical_cols_to_encode:
    # fit LabelEncoder with the original data
    le.fit(df[col])

    # Transform the original data to get the encoded values
    encoded_values = le.transform(df[col])

    # Assign new encoded values to column
    df[col] = encoded_values

    # Get the original categories using inverse_transform
    original_categories = le.inverse_transform(encoded_values)

    # Create a mapping dictionary for the column
    label_mappings[col] = dict(zip(encoded_values, original_categories))


label_mappings

{'Meal': {np.int64(2): 'HB',
  np.int64(0): 'BB',
  np.int64(3): 'SC',
  np.int64(1): 'FB'},
 'MarketSegment': {np.int64(5): 'Offline TA/TO',
  np.int64(6): 'Online TA',
  np.int64(4): 'Groups',
  np.int64(1): 'Complementary',
  np.int64(3): 'Direct',
  np.int64(2): 'Corporate',
  np.int64(0): 'Aviation'},
 'DistributionChannel': {np.int64(3): 'TA/TO',
  np.int64(1): 'Direct',
  np.int64(0): 'Corporate',
  np.int64(2): 'GDS'},
 'ReservedRoomType': {np.int64(0): 'A',
  np.int64(1): 'B',
  np.int64(3): 'D',
  np.int64(5): 'F',
  np.int64(4): 'E',
  np.int64(6): 'G',
  np.int64(2): 'C'},
 'AssignedRoomType': {np.int64(0): 'A',
  np.int64(1): 'B',
  np.int64(5): 'F',
  np.int64(3): 'D',
  np.int64(6): 'G',
  np.int64(4): 'E',
  np.int64(7): 'K',
  np.int64(2): 'C'},
 'DepositType': {np.int64(0): 'No Deposit',
  np.int64(1): 'Non Refund',
  np.int64(2): 'Refundable'},
 'CustomerType': {np.int64(2): 'Transient',
  np.int64(3): 'Transient-Party',
  np.int64(0): 'Contract',
  np.int64(1): 'Gro

In [80]:
# Check for final state of data
df.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2
