In [None]:
# Run Initial Preparation of Data
%run 1_prep_data.ipynb

In [60]:
# Import Libraries
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [61]:
# Check What's the data look
df.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2


#### 1. Feature Engineer Pipeline

In [62]:
# Define custom transformations as a class
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Ensure a copy to avoid modifying the original dataframe
        df = X.copy()
        
        # TotalStay
        df["TotalStay"] = df["StaysInWeekendNights"] + df["StaysInWeekNights"]

        # SpendingPerPerson
        with np.errstate(divide='ignore', invalid='ignore'):  # Ignore warnings during division
            df["SpendingPerPerson"] = df["ADR"] / (df["Adults"] + df["Children"] + df["Babies"])
        df["SpendingPerPerson"].replace([np.inf, -np.inf], 0, inplace=True)  # Replace inf values

        # GroupType
        def determine_group_type(row):
            if row["Adults"] == 1 and (row["Children"] + row["Babies"] == 0):
                return "Single"
            elif row["Adults"] == 2 and (row["Children"] + row["Babies"] == 0):
                return "Couple"
            elif (row["Children"] + row["Babies"]) > 0:
                return "Family"
            elif row["Adults"] > 2:
                return "Big_Group"
            else:
                return "Other"
        
        df["GroupType"] = df.apply(determine_group_type, axis=1)

        # BookingUrgency
        def determine_booking_urgency(lead_time):
            if lead_time > 60:
                return "Early_Booking"
            elif 30 <= lead_time <= 60:
                return "Medium_Term_Booking"
            else:
                return "Last_Minute_Booking"
        
        df["BookingUrgency"] = df["LeadTime"].apply(determine_booking_urgency)

        # HighDemand
        df["HighDemand"] = df["TotalOfSpecialRequests"].apply(
            lambda x: "High" if x >= 2 else "Low"
        )

        # LoyaltyScore
        df["LoyaltyScore"] = df["IsRepeatedGuest"] * (df["PreviousBookingsNotCanceled"] + 1)

        # ChildRatio
        df["ChildRatio"] = (df["Children"] + df["Babies"]) / (
            df["Adults"] + df["Children"] + df["Babies"]
        )
        df["ChildRatio"].fillna(0, inplace=True)

        # Occupancy
        df["Occupancy"] = (df["Adults"] + df["Children"] + df["Babies"]) / df["TotalStay"]
        df["Occupancy"].fillna(0, inplace=True)

        # WaitingTimeCategory
        def determine_waiting_time_category(days):
            if days < 7:
                return "Short"
            elif 7 <= days <= 30:
                return "Medium"
            else:
                return "Long"

        df["WaitingTimeCategory"] = df["DaysInWaitingList"].apply(
            determine_waiting_time_category
        )

        # CancellationRisk
        df["CancellationRisk"] = df["PreviousCancellations"] * df["LeadTime"]


        # Check for infinity or very large values
        df = df.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
        df = df.fillna(0)  # Replace NaN with 0

        return df


# Create a feature_engineering pipeline
feature_engineering_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineeringTransformer()),
])

In [63]:
# Let's manually Encode the columns we created.
# Custom mappings for categorical encodings
group_type_mapping = {"Single": 0, "Couple": 1, "Family": 2, "Big_Group": 3}
booking_urgency_mapping = {
    "Early_Booking": 0,
    "Medium_Term_Booking": 1,
    "Last_Minute_Booking": 2,
}
high_demand_mapping = {"Low": 0, "High": 1}
waiting_time_category_mapping = {"Short": 0, "Medium": 1, "Long": 2}


# Custom function for mapping categories
class MapCategories:
    def __init__(self, mapping):
        self.mapping = mapping

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.replace(self.mapping)


# Pipeline for categorical columns with custom mapping
categorical_mapped_transformer = Pipeline(
    [
        (
            "map_categories",
            MapCategories(
                {
                    "GroupType": group_type_mapping,
                    "BookingUrgency": booking_urgency_mapping,
                    "HighDemand": high_demand_mapping,
                    "WaitingTimeCategory": waiting_time_category_mapping,
                }
            ),
        ),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

In [64]:
# Create Pipelines for General Numeric and Categorical Features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [65]:
# Full pipeline
transformer_pipeline = Pipeline([
    ('feature_engineering', feature_engineering_pipeline),  # First, apply feature engineering
    ('preprocessor', ColumnTransformer(
        transformers=[
            ("map_categories", categorical_mapped_transformer, ["GroupType", "BookingUrgency", "HighDemand", "WaitingTimeCategory"]),
            ("num", numeric_transformer, make_column_selector(dtype_include=np.number)),
            ("cat", categorical_transformer, make_column_selector(dtype_include=object)),
        ]
    )),
])

In [66]:
# TEST
df_transformed = transformer_pipeline.fit_transform(df)
df_transformed

array([[0., 2., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]], shape=(53267, 54))