# Hotel Facilito – Desarrollo

![](./images/header.png)

Estábamos creando un modelo para predecir si un cliente cancelará su reserva de hotel (y calcular la probabilidad de que lo haga). El resultado es el código que tenemos a continuación.

> Este notebook es la primera versión, con código "sucio", tal como lo dejamos en la sesión anterior.

In [None]:
import pandas as pd

In [None]:
# Load the training bookings dataset from the local data directory.
hotel_bookings = pd.read_csv(filepath_or_buffer="data/hotel_bookings_training.csv")

## Elimina columnas innecesarias / peligrosas

In [None]:
# Strip the customers' personal information from the dataset.
hotel_bookings = hotel_bookings.drop(
    columns=['name', 'email', 'phone-number', 'credit_card'],
)

# Drop the reservation-status columns to avoid data leakage.
hotel_bookings = hotel_bookings.drop(
    columns=['reservation_status', 'reservation_status_date'],
)

In [None]:
# Cast every object-dtype column to str so the columns hold plain strings.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan' -- confirm that is the intended handling.
object_columns = hotel_bookings.select_dtypes(include='object').columns
hotel_bookings[object_columns] = hotel_bookings[object_columns].astype(str)

## Separa columnas y divide el dataset

In [None]:
# Features: every column except the target.
hotel_data = hotel_bookings.drop(columns=['is_canceled'])
# Target: an independent copy of the cancellation column.
is_canceled = hotel_bookings['is_canceled'].copy()

In [None]:
from sklearn.model_selection import train_test_split

# Partition the data into train / test / validation sets.
#   train_proportion: share of ALL rows used for training.
#   test_proportion:  share of the REMAINING rows used for test; whatever is
#                     left after that becomes the validation set.
train_proportion = 0.60
test_proportion = 0.5

original_count = len(hotel_bookings)
training_size = int(original_count * train_proportion)
# Derive the test size from the rows left over after the training split (not
# from training_size), so test and validation divide the remainder according
# to test_proportion (60/20/20 with the values above).
test_size = int((original_count - training_size) * test_proportion)

# NOTE(review): no random_state is passed, so the partitions differ on every run.
# Step 1: carve out the training rows; step 2: split the rest into test/validation.
train_x, rest_x, train_y, rest_y = train_test_split(hotel_data, is_canceled, train_size=training_size)
test_x, validate_x, test_y, validate_y = train_test_split(rest_x, rest_y, train_size=test_size)

## Creación del pipeline de featurización y entrenamiento

In [None]:
from sklearn.preprocessing import OneHotEncoder, Binarizer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

In [None]:
# One-hot encoder

# Categorical columns that get expanded into one indicator column per category.
columns_to_encode = [
    "hotel",
    "meal",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type",
    "customer_type",
]

# Dense output; categories unseen during fit are ignored at transform time.
internal_one_hot_encoding = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Apply the encoder to the selected columns only.
one_hot_encoding = ColumnTransformer(
    transformers=[
        ("one_hot_encode", internal_one_hot_encoding, columns_to_encode),
    ]
)

In [None]:
# Binarizer

# Count-like columns that are thresholded with Binarizer's default settings.
columns_to_binarize = [
    "total_of_special_requests",
    "required_car_parking_spaces",
    "booking_changes",
    "previous_bookings_not_canceled",
    "previous_cancellations",
]

internal_binarizer = Binarizer()
internal_encoder_binarizer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Step 1: binarize only the selected columns.
binarizer = ColumnTransformer(
    transformers=[
        ("binarizer", internal_binarizer, columns_to_binarize),
    ]
)

# Step 2: one-hot encode the binarized output.
one_hot_binarized = Pipeline(
    steps=[
        ("binarizer", binarizer),
        ("one_hot_encoder", internal_encoder_binarizer),
    ]
)

In [None]:
# Scaler

# Only the "adr" column is scaled, using RobustScaler with default settings.
columns_to_scale = ["adr"]
internal_scaler = RobustScaler()

scaler = ColumnTransformer(
    transformers=[
        ("scaler", internal_scaler, columns_to_scale),
    ]
)

In [None]:
# Passthrough columns

# Columns forwarded to the model unchanged.
pass_columns = ["stays_in_week_nights", "stays_in_weekend_nights"]

passthrough = ColumnTransformer(
    transformers=[
        ("pass_columns", "passthrough", pass_columns),
    ]
)

In [None]:
# Concatenate the outputs of the four column transformers side by side and
# expose the result as a single-step pipeline.
feature_engineering_pipeline = Pipeline(
    steps=[
        (
            "features",
            FeatureUnion(
                transformer_list=[
                    ("categories", one_hot_encoding),
                    ("binaries", one_hot_binarized),
                    ("scaled", scaler),
                    ("passthrough", passthrough),
                ]
            ),
        ),
    ]
)

In [None]:
# Machine learning model
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier built from 100 trees.
model = RandomForestClassifier(
    n_estimators=100,
)

In [None]:
# Full pipeline

In [None]:
# End-to-end pipeline: feature engineering first, then the classifier.
final_pipeline = Pipeline(
    steps=[
        ("feature_engineering", feature_engineering_pipeline),
        ("model", model),
    ]
)

## Model training

In [None]:
# Fit the feature transformers and the classifier on the training partition.
final_pipeline.fit(X=train_x, y=train_y)

## Model validation

In [None]:
from sklearn.metrics import accuracy_score, recall_score

In [None]:
# Predict on the training data (to gauge overfitting) and on the validation data.
validate_pred_y = final_pipeline.predict(X=validate_x)
train_pred_y = final_pipeline.predict(X=train_x)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but recall is not: with the arguments swapped, recall_score
# actually returns precision. Ground truth therefore goes first.
train_accuracy = accuracy_score(train_y, train_pred_y)
train_recall = recall_score(train_y, train_pred_y)

validate_accuracy = accuracy_score(validate_y, validate_pred_y)
validate_recall = recall_score(validate_y, validate_pred_y)

In [None]:
# Report each metric on its own line as "<label> <value>".
for metric_label, metric_value in (
    ('Train accuracy', train_accuracy),
    ('Train recall', train_recall),
    ('Validate accuracy', validate_accuracy),
    ('Validate recall', validate_recall),
):
    print(metric_label, metric_value)