In [23]:
import time
import uuid
from datetime import datetime

import pandas as pd
from loguru import logger
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler

In [24]:
class FeatureEngineeringProcessor:
    """Build a modelling feature table from a raw bookings DataFrame.

    The processor one-hot encodes a fixed set of categorical columns, compresses
    a fixed set of numeric columns (mean imputation -> standard scaling -> PCA),
    runs the combined matrix through a variance filter plus robust scaling, and
    finally tags every row with a generated ``booking_id`` and two timestamps.

    NOTE(review): every transformer is fitted on ``raw_data`` inside the
    instance. Two processors built from different splits (e.g. train and test)
    therefore learn independent encodings/scalings and may emit different
    column sets. Sharing fitted transformers would need an interface change.
    """

    # Numeric columns fed to the impute -> scale -> PCA pipeline.
    NUMERIC_COLUMNS = ("lead_time", "adults", "children", "babies", "adr")
    # Categorical columns that are one-hot encoded, one encoder per column.
    CATEGORICAL_COLUMNS = ("hotel", "market_segment", "reserved_room_type")

    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        """Store the inputs; no processing happens until ``run()`` is called.

        Args:
            raw_data: Source DataFrame; must contain the columns listed in
                ``NUMERIC_COLUMNS`` and ``CATEGORICAL_COLUMNS``.
            pipeline_name: Label used only in log messages.
        """
        # Original, unmodified input frame.
        self.raw_data = raw_data
        # Human-readable pipeline label (logging only).
        self.pipeline_name = pipeline_name
        # Result of ``run()``; stays None until the pipeline has been executed.
        self.feature_table = None

    def impute_scale(self, n_components: int = 2) -> pd.DataFrame:
        """Impute, standardise and PCA-compress the numeric columns.

        Args:
            n_components: Forwarded unchanged to ``PCA(n_components=...)``.

        Returns:
            A DataFrame with a default index and one column per produced
            component, named ``great_feature1``, ``great_feature2``, ...
        """
        numeric_pipeline = Pipeline(
            steps=[
                # Fill missing values with the column mean.
                ("imputer_mean", SimpleImputer(strategy="mean")),
                # Standardise before PCA so no column dominates by scale.
                ("std_scaling", StandardScaler()),
                # Project onto the requested number of principal components.
                ("pca", PCA(n_components=n_components)),
            ]
        )
        components = numeric_pipeline.fit_transform(
            self.raw_data[list(self.NUMERIC_COLUMNS)]
        )
        # Derive the names from the actual output width: hard-coding two names
        # broke every call with n_components != 2.
        component_names = [
            f"great_feature{position}"
            for position in range(1, components.shape[1] + 1)
        ]
        return pd.DataFrame(components, columns=component_names)

    def encode_categoricals(self) -> pd.DataFrame:
        """One-hot encode each categorical column with its own encoder.

        Returns:
            A DataFrame with a default index whose columns are named
            ``<column>_<category>``, concatenated in ``CATEGORICAL_COLUMNS``
            order.
        """
        encoded_frames = []
        for var in self.CATEGORICAL_COLUMNS:
            logger.info(f"Codificando con OHE {var}")
            encoder = OneHotEncoder()
            # fit_transform returns a sparse matrix; densify it for pandas.
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            # categories_[0] holds the categories learned for this single column.
            column_names = [f"{var}_{category}" for category in encoder.categories_[0]]
            encoded_frames.append(pd.DataFrame(encoded, columns=column_names))
        return pd.concat(encoded_frames, axis=1)

    def run(self) -> pd.DataFrame:
        """Execute the full pipeline and store the result in ``feature_table``.

        Returns:
            The feature table: the selected and robust-scaled feature columns
            plus ``booking_id`` (random UUID4 string), ``event_timestamp`` and
            ``created`` (both taken from ``datetime.now()``).
        """
        logger.info(f"Inicializando pipeline {self.pipeline_name}")

        categorical = self.encode_categoricals()
        numerics = self.impute_scale()
        # Both frames carry a default RangeIndex, so they align row by row.
        modeling_dataset = pd.concat([categorical, numerics], axis=1)

        selection_pipeline = Pipeline(
            steps=[
                # Drop features whose variance does not exceed the default threshold.
                ("feature_selection", VarianceThreshold()),
                # Scale the surviving features with RobustScaler.
                ("scaling_robust", RobustScaler()),
            ]
        )
        transformed = selection_pipeline.fit_transform(modeling_dataset)
        # Label the output with the columns that survived selection; reusing all
        # input names raised a shape mismatch as soon as one column was dropped.
        kept_mask = selection_pipeline.named_steps["feature_selection"].get_support()
        feature_table = pd.DataFrame(
            transformed,
            columns=modeling_dataset.columns[kept_mask],
        )

        row_count = feature_table.shape[0]
        # One random identifier per row.
        feature_table["booking_id"] = [str(uuid.uuid4()) for _ in range(row_count)]
        feature_table["event_timestamp"] = [datetime.now() for _ in range(row_count)]
        # Presumably keeps ``created`` strictly later than ``event_timestamp``
        # -- TODO confirm whether the one-second gap is required downstream.
        time.sleep(1)
        feature_table["created"] = [datetime.now() for _ in range(row_count)]

        self.feature_table = feature_table
        return self.feature_table

    def write_feature_table(self, filepath: str) -> None:
        """Persist the feature table as ``<filepath>.parquet`` and ``<filepath>.csv``.

        Args:
            filepath: Destination path without extension.

        Raises:
            RuntimeError: If ``run()`` has not been called yet or the resulting
                table is empty. ``RuntimeError`` is an ``Exception`` subclass,
                so callers catching ``Exception`` keep working.
        """
        # Guard against None first: calling ``.empty`` on None used to surface
        # as an AttributeError instead of the intended message.
        if self.feature_table is None or self.feature_table.empty:
            raise RuntimeError("La feature table no ha sido creada. Ejecutar el comando .run()")

        logger.info(f"Escribiendo feature table en {filepath}")
        self.feature_table.to_parquet(f"{filepath}.parquet", index=False)
        self.feature_table.to_csv(f"{filepath}.csv", index=False)

In [25]:
# Source: Kaggle "Hotel Booking Demand" dataset -> https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand/data
raw_data = pd.read_csv(filepath_or_buffer="../data/raw/hotel_bookings.csv")

# Train Test Split

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
train_raw_data, test_raw_data = train_test_split(raw_data, test_size=0.2, random_state=42)

In [27]:
# Frequency of each reserved room type in the training split.
train_raw_data.loc[:, "reserved_room_type"].value_counts()

reserved_room_type
A    68690
D    15475
E     5246
F     2326
G     1664
B      891
C      732
H      473
P       10
L        5
Name: count, dtype: int64

# Feature Engineering on the Training Data

In [28]:
# Build the feature table from the training split ...
train_processor = FeatureEngineeringProcessor(train_raw_data, "train_pipeline")
train_processor.run()

# ... and persist it to disk (the processor writes both a .parquet and a .csv file).
train_processor.write_feature_table("../data/processed/bookings_feature_table")

[32m2025-07-18 12:59:08.860[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m56[0m - [1mInicializando pipeline train_pipeline[0m
[32m2025-07-18 12:59:08.861[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-18 12:59:08.879[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-18 12:59:08.897[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE reserved_room_type[0m
[32m2025-07-18 12:59:10.600[0m | [1mINFO    [0m | [36m__main__[0m:[36mwrite_feature_table[0m:[36m96[0m - [1mEscribiendo feature table en ../data/processed/bookings_feature_table[0m


In [29]:
# Build the feature table for the held-out split; the DataFrame returned by run()
# is the last expression, so it is rendered as the cell output.
# NOTE(review): run() fits its encoders, scalers and PCA on the data it is given,
# so this table is produced by transformers fitted on the test rows, not the
# training rows -- confirm this is intended.
test_processor = FeatureEngineeringProcessor(test_raw_data, "test_pipeline")
test_processor.run()

[32m2025-07-18 12:59:12.407[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m56[0m - [1mInicializando pipeline test_pipeline[0m
[32m2025-07-18 12:59:12.408[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-18 12:59:12.415[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-18 12:59:12.422[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m39[0m - [1mCodificando con OHE reserved_room_type[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,...,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,great_feature1,great_feature2,booking_id,event_timestamp,created
0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,...,1.0,0.0,0.0,0.0,0.0,-0.391759,0.985978,10caa1a7-3d3d-479c-9a32-ed66adfe8615,2025-07-18 12:59:12.547413,2025-07-18 12:59:13.569891
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.188518,0.201427,5f505f0c-e44e-4997-ac75-df05c109ef1a,2025-07-18 12:59:12.547417,2025-07-18 12:59:13.569901
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.239553,0.745687,af4a0b55-193c-49e9-80d0-2079e10f7682,2025-07-18 12:59:12.547417,2025-07-18 12:59:13.569903
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.513294,-1.230322,b816b6fc-0565-4cf1-a378-5c778a8f2c64,2025-07-18 12:59:12.547417,2025-07-18 12:59:13.569904
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.264629,0.099065,2d2bf53e-1b60-4859-a65c-c7d1719023c7,2025-07-18 12:59:12.547418,2025-07-18 12:59:13.569906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.719946,0.378556,5a575944-37a7-4a10-82a8-cc9a82b912d3,2025-07-18 12:59:12.554769,2025-07-18 12:59:13.604112
23874,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,-0.616730,0.672916,a6f20ad9-ff1d-4593-8e31-a7cebb17bbcf,2025-07-18 12:59:12.554769,2025-07-18 12:59:13.604113
23875,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.101622,-0.947801,cd15efbb-6609-4c75-9928-269aa5fb6e65,2025-07-18 12:59:12.554769,2025-07-18 12:59:13.604114
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.161923,0.354046,17e441e7-a23a-4b0d-8448-d8b11c325f1a,2025-07-18 12:59:12.554770,2025-07-18 12:59:13.604115


In [33]:
# Reload the persisted training feature table and summarise its schema.
df_parquet = pd.read_parquet(path="../data/processed/bookings_feature_table.parquet")
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95512 entries, 0 to 95511
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   hotel_City Hotel              95512 non-null  float64       
 1   hotel_Resort Hotel            95512 non-null  float64       
 2   market_segment_Aviation       95512 non-null  float64       
 3   market_segment_Complementary  95512 non-null  float64       
 4   market_segment_Corporate      95512 non-null  float64       
 5   market_segment_Direct         95512 non-null  float64       
 6   market_segment_Groups         95512 non-null  float64       
 7   market_segment_Offline TA/TO  95512 non-null  float64       
 8   market_segment_Online TA      95512 non-null  float64       
 9   market_segment_Undefined      95512 non-null  float64       
 10  reserved_room_type_A          95512 non-null  float64       
 11  reserved_room_type_B        

In [37]:
# Preview the identifier, the two timestamps and the PCA-derived features.
df_parquet.loc[
    :,
    ["booking_id", "event_timestamp", "created", "great_feature1", "great_feature2"],
].head()

Unnamed: 0,booking_id,event_timestamp,created,great_feature1,great_feature2
0,f1ccaa85-7747-4dc3-be4d-5f229a2e07a4,2025-07-18 12:59:09.409866,2025-07-18 12:59:10.502401,2.169662,0.890985
1,1466b87e-16c3-484e-b122-c0de8f375e0e,2025-07-18 12:59:09.409871,2025-07-18 12:59:10.502405,2.309743,-0.817679
2,a00c7815-5045-498a-a213-b4e7b6bab60f,2025-07-18 12:59:09.409872,2025-07-18 12:59:10.502406,5.391121,-1.74961
3,f82773fb-7e89-4d8e-a1eb-0eff9dca30cb,2025-07-18 12:59:09.409872,2025-07-18 12:59:10.502406,-0.049387,1.275659
4,fce2b600-6d77-4618-960b-0de9feddadb6,2025-07-18 12:59:09.409873,2025-07-18 12:59:10.502407,-0.62645,-0.132513
