In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Directory on the mounted Drive that holds the hotel-bookings CSV files.
# The trailing slash matters: file names are appended to it with plain `+`.
path = '/content/drive/MyDrive/Colab Notebooks/data/'

In [4]:
# Read the train / validation / test CSV files from `path` in one pass;
# the generator yields the frames in the same order they are unpacked.
X_train, X_val, X_test = (
    pd.read_csv(path + csv_name, sep=",")
    for csv_name in (
        'hotel_bookings_X_train.csv',
        'hotel_bookings_X_val.csv',
        'hotel_bookings_X_test.csv',
    )
)

**Log transform right-skewed variables**

In [5]:
# Replace each listed column with log(1 + x), in place, on all three splits.
# NOTE(review): log1p yields NaN for values below -1 -- confirm these columns
# are non-negative at this point.
# NOTE(review): the frames are modified in place, so re-running this cell
# applies the transform a second time.
for frame in (X_train, X_val, X_test):
    for col in ('adr', 'lead_time', 'days_in_waiting_list'):
        frame[col] = np.log1p(frame[col])

**Scale variables**

In [6]:
# Column-name prefixes of the features that must NOT be standardised
# (presumably one-hot / binary indicator columns, going by the variable name).
ohe_bin_prefixes = ['customer_type',
                    'deposit_type',
                    'distribution_channel',
                    'hotel',
                    'market_segment',
                    'meal',
                    'continent',
                    'is_reserved_room_type',
                    'is_repeated_guest',
                    'agent',
                    'company',]
# str.startswith accepts a tuple, so a single call tests every prefix.
ohe_bin_columns = [col for col in X_train.columns if col.startswith(tuple(ohe_bin_prefixes))]
# Every remaining column is standardised.
numerical_cols = X_train.drop(columns=ohe_bin_columns).columns

processor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("onehot", 'passthrough', ohe_bin_columns),
    ],
    remainder='passthrough',
    # Emit the original column names instead of "<transformer>__<column>".
    # Stripping the prefix afterwards with split('__')[-1] would truncate any
    # source column whose own name contains a double underscore.
    verbose_feature_names_out=False,
)
processor.set_output(transform="pandas")

# Fit the scaler on the training split only, then apply the same fitted
# transformation to all three splits. Output column order is: scaled
# numerical columns first, then the passed-through columns.
processor.fit(X_train)
X_train = processor.transform(X_train)
X_val = processor.transform(X_val)
X_test = processor.transform(X_test)

**Save transformed data**

In [7]:
# Write each transformed split back to `path` without the index column.
# NOTE(review): these are the same file names the notebook loads its inputs
# from, so the source CSVs are overwritten and a second full run would
# transform already-transformed data. Consider distinct output names if no
# downstream consumer depends on these.
for frame, csv_name in (
    (X_train, 'hotel_bookings_X_train.csv'),
    (X_val, 'hotel_bookings_X_val.csv'),
    (X_test, 'hotel_bookings_X_test.csv'),
):
    frame.to_csv(path + csv_name, index=False)