In [140]:
# Run the initial data-preparation notebook first (later cells use `df`, `np` and `pd`, which are presumably defined there)
%run 1_prep_data.ipynb

Total Number of Duplicate Rows: 25876
ArrivalDateMonth: 12 unique values
Meal: 4 unique values
MarketSegment: 7 unique values
DistributionChannel: 4 unique values
ReservedRoomType: 7 unique values
AssignedRoomType: 8 unique values
DepositType: 3 unique values
CustomerType: 4 unique values
ReservationStatus: 3 unique values


In [141]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [142]:
# Check what the data looks like
df.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2


#### 1. Feature Engineering Pipeline

In [143]:
# Define custom transformations as a class
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends engineered booking features.

    ``transform`` works on a copy of the input frame and adds, in this order:
    ``TotalStay``, ``SpendingPerPerson``, ``GroupType``, ``BookingUrgency``,
    ``HighDemand``, ``LoyaltyScore``, ``ChildRatio``, ``Occupancy``,
    ``WaitingTimeCategory`` and ``CancellationRisk``.

    The input must provide the columns ``StaysInWeekendNights``,
    ``StaysInWeekNights``, ``ADR``, ``Adults``, ``Children``, ``Babies``,
    ``LeadTime``, ``TotalOfSpecialRequests``, ``IsRepeatedGuest``,
    ``PreviousBookingsNotCanceled``, ``DaysInWaitingList`` and
    ``PreviousCancellations``; a missing column raises ``KeyError``.
    """

    def __init__(self):
        # No hyper-parameters: every threshold below is hard-coded.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Booking records containing the columns listed on the class.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus the ten engineered columns. As a last step,
            ``+/-inf`` and ``NaN`` anywhere in the frame (original columns
            included) are replaced with ``0``.
        """
        # Work on a copy so the caller's frame is never modified.
        out = X.copy()

        # Shared intermediates used by several features.
        minors = out["Children"] + out["Babies"]
        party_size = out["Adults"] + out["Children"] + out["Babies"]

        # TotalStay: total number of nights booked.
        out["TotalStay"] = out["StaysInWeekendNights"] + out["StaysInWeekNights"]

        # SpendingPerPerson: ADR split across the party. A zero-sized party
        # yields +/-inf, which is mapped to 0. The result is assigned back
        # explicitly because `out[col].replace(..., inplace=True)` does not
        # update the parent frame under pandas Copy-on-Write.
        with np.errstate(divide="ignore", invalid="ignore"):
            spending = out["ADR"] / party_size
        out["SpendingPerPerson"] = spending.replace([np.inf, -np.inf], 0)

        # GroupType: the first matching rule wins, exactly like an if/elif
        # chain. Rows matching no rule (e.g. zero adults and no children, or
        # missing counts) fall through to "Other".
        no_minors = minors == 0
        out["GroupType"] = np.select(
            [
                (out["Adults"] == 1) & no_minors,
                (out["Adults"] == 2) & no_minors,
                minors > 0,
                out["Adults"] > 2,
            ],
            ["Single", "Couple", "Family", "Big_Group"],
            default="Other",
        )

        # BookingUrgency: >60 days is early, 30-60 (inclusive) is medium,
        # everything else (including missing lead time) is last minute.
        lead_time = out["LeadTime"]
        out["BookingUrgency"] = np.select(
            [lead_time > 60, lead_time.between(30, 60)],
            ["Early_Booking", "Medium_Term_Booking"],
            default="Last_Minute_Booking",
        )

        # HighDemand: two or more special requests.
        out["HighDemand"] = np.where(out["TotalOfSpecialRequests"] >= 2, "High", "Low")

        # LoyaltyScore: zero for first-time guests, otherwise prior stays + 1.
        out["LoyaltyScore"] = out["IsRepeatedGuest"] * (
            out["PreviousBookingsNotCanceled"] + 1
        )

        # ChildRatio: share of children/babies in the party (0/0 -> 0).
        out["ChildRatio"] = (minors / party_size).fillna(0)

        # Occupancy: guests per night (0/0 -> 0; x/0 = inf is cleaned below).
        out["Occupancy"] = (party_size / out["TotalStay"]).fillna(0)

        # WaitingTimeCategory: <7 days short, 7-30 (inclusive) medium,
        # everything else (including missing values) long.
        waiting_days = out["DaysInWaitingList"]
        out["WaitingTimeCategory"] = np.select(
            [waiting_days < 7, waiting_days.between(7, 30)],
            ["Short", "Medium"],
            default="Long",
        )

        # CancellationRisk: past cancellations weighted by lead time.
        out["CancellationRisk"] = out["PreviousCancellations"] * out["LeadTime"]

        # Final sanitisation over the whole frame: infinities become NaN and
        # every NaN becomes 0. This also fills missing values in the original
        # columns, so later imputers receive no NaN from this step.
        out = out.replace([np.inf, -np.inf], np.nan)
        out = out.fillna(0)

        return out


# Wrap the feature-engineering transformer in a single-step pipeline
feature_engineering_pipeline = Pipeline(
    steps=[("feature_engineering", FeatureEngineeringTransformer())]
)



In [144]:
# Manually encode the engineered categorical columns.
# Each dictionary maps a category label to its integer code.
# "Other" is the fallback GroupType label for bookings that match none of the
# Single/Couple/Family/Big_Group rules; it needs a code too, otherwise the raw
# string would survive the mapping step.
group_type_mapping = {
    "Single": 0,
    "Couple": 1,
    "Family": 2,
    "Big_Group": 3,
    "Other": 4,
}
booking_urgency_mapping = {
    "Early_Booking": 0,
    "Medium_Term_Booking": 1,
    "Last_Minute_Booking": 2,
}
high_demand_mapping = {"Low": 0, "High": 1}
waiting_time_category_mapping = {"Short": 0, "Medium": 1, "Long": 2}


# Custom transformer for mapping categories
class MapCategories(BaseEstimator, TransformerMixin):
    """Replace category labels with codes using ``DataFrame.replace``.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` provides
    ``get_params``/``set_params`` (needed for cloning and parameter search)
    and ``fit_transform``.

    Parameters
    ----------
    mapping : dict
        Passed unchanged to ``X.replace``. A nested dictionary of the form
        ``{column: {label: code}}`` restricts each replacement to one column.
        Labels that are absent from the mapping are left as they are.
    """

    def __init__(self, mapping):
        self.mapping = mapping

    def fit(self, X, y=None):
        """Record the input column names (when available) and return ``self``."""
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Drop names left over from an earlier fit on a DataFrame.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Return ``X`` with the configured replacements applied."""
        return X.replace(self.mapping)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names; the columns are passed through as-is.

        ``input_features`` is returned unchanged when given. Otherwise the
        names recorded during ``fit`` are used.

        Raises
        ------
        AttributeError
            If ``input_features`` is None and no column names were recorded
            during ``fit`` (not fitted, or fitted on data without columns).
        """
        if input_features is not None:
            return input_features
        recorded = getattr(self, "feature_names_in_", None)
        if recorded is None:
            raise AttributeError(
                "MapCategories has no feature_names_in_; fit it on a DataFrame "
                "or pass input_features explicitly."
            )
        return recorded


# Ordinal-encode the engineered categorical columns with the custom mappings,
# then fill any remaining gaps with the most frequent value per column.
categorical_mapped_transformer = Pipeline(
    steps=[
        (
            "map_categories",
            MapCategories(
                dict(
                    GroupType=group_type_mapping,
                    BookingUrgency=booking_urgency_mapping,
                    HighDemand=high_demand_mapping,
                    WaitingTimeCategory=waiting_time_category_mapping,
                )
            ),
        ),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

In [145]:
# Generic preprocessing pipelines.
# Numeric features: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [146]:
# Full pipeline: feature engineering first, then column-wise preprocessing.
# NOTE(review): the four engineered label columns are listed explicitly for
# "map_categories" and, being object dtype, are also matched by the "cat"
# selector, so they appear both ordinal- and one-hot-encoded in the output -
# confirm that this duplication is intended.
# NOTE(review): the numeric selector matches every numeric column, which
# includes IsCanceled and ReservationStatus - if either is the prediction
# target, drop it before fitting to avoid leakage.
transformer_pipeline = Pipeline(
    steps=[
        ("feature_engineering", feature_engineering_pipeline),
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "map_categories",
                        categorical_mapped_transformer,
                        ["GroupType", "BookingUrgency", "HighDemand", "WaitingTimeCategory"],
                    ),
                    (
                        "num",
                        numeric_transformer,
                        make_column_selector(dtype_include=np.number),
                    ),
                    (
                        "cat",
                        categorical_transformer,
                        make_column_selector(dtype_include=object),
                    ),
                ]
            ),
        ),
    ]
)

In [147]:
# Fit the transformer pipeline on the data, then transform the same data as a
# quick check (`fit` returns the pipeline itself, so the calls can be chained).
transformed_data = transformer_pipeline.fit(df).transform(df)
transformed_data  # NumPy ndarray

array([[0., 2., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]], shape=(53267, 54))

In [148]:
# Ask the fitted ColumnTransformer step for its output feature names
feature_names = transformer_pipeline["preprocessor"].get_feature_names_out()
print(feature_names)

['map_categories__GroupType' 'map_categories__BookingUrgency'
 'map_categories__HighDemand' 'map_categories__WaitingTimeCategory'
 'num__IsCanceled' 'num__LeadTime' 'num__ArrivalDateYear'
 'num__ArrivalDateMonth' 'num__ArrivalDateWeekNumber'
 'num__ArrivalDateDayOfMonth' 'num__StaysInWeekendNights'
 'num__StaysInWeekNights' 'num__Adults' 'num__Children' 'num__Babies'
 'num__Meal' 'num__MarketSegment' 'num__DistributionChannel'
 'num__IsRepeatedGuest' 'num__PreviousCancellations'
 'num__PreviousBookingsNotCanceled' 'num__ReservedRoomType'
 'num__AssignedRoomType' 'num__BookingChanges' 'num__DepositType'
 'num__Agent' 'num__Company' 'num__DaysInWaitingList' 'num__CustomerType'
 'num__ADR' 'num__RequiredCarParkingSpaces' 'num__TotalOfSpecialRequests'
 'num__ReservationStatus' 'num__ReservationStatusYear'
 'num__ReservationStatusMonth' 'num__ReservationStatusDay'
 'num__TotalStay' 'num__SpendingPerPerson' 'num__LoyaltyScore'
 'num__ChildRatio' 'num__Occupancy' 'num__CancellationRisk'
 'cat

In [149]:
# Wrap the transformed array in a DataFrame labelled with the feature names
transformed_df = pd.DataFrame(data=transformed_data, columns=feature_names)
transformed_df

Unnamed: 0,map_categories__GroupType,map_categories__BookingUrgency,map_categories__HighDemand,map_categories__WaitingTimeCategory,num__IsCanceled,num__LeadTime,num__ArrivalDateYear,num__ArrivalDateMonth,num__ArrivalDateWeekNumber,num__ArrivalDateDayOfMonth,num__StaysInWeekendNights,num__StaysInWeekNights,num__Adults,num__Children,num__Babies,num__Meal,num__MarketSegment,num__DistributionChannel,num__IsRepeatedGuest,num__PreviousCancellations,num__PreviousBookingsNotCanceled,num__ReservedRoomType,num__AssignedRoomType,num__BookingChanges,num__DepositType,num__Agent,num__Company,num__DaysInWaitingList,num__CustomerType,num__ADR,num__RequiredCarParkingSpaces,num__TotalOfSpecialRequests,num__ReservationStatus,num__ReservationStatusYear,num__ReservationStatusMonth,num__ReservationStatusDay,num__TotalStay,num__SpendingPerPerson,num__LoyaltyScore,num__ChildRatio,num__Occupancy,num__CancellationRisk,cat__GroupType_Big_Group,cat__GroupType_Couple,cat__GroupType_Family,cat__GroupType_Single,cat__BookingUrgency_Early_Booking,cat__BookingUrgency_Last_Minute_Booking,cat__BookingUrgency_Medium_Term_Booking,cat__HighDemand_High,cat__HighDemand_Low,cat__WaitingTimeCategory_Long,cat__WaitingTimeCategory_Medium,cat__WaitingTimeCategory_Short
0,0.0,2.0,0.0,0.0,-0.656114,-0.874411,-1.902984,0.185429,0.027139,-1.676536,-0.962296,-0.170768,-1.646194,-0.298667,-0.071574,1.163539,-0.164441,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,-0.322338,-0.176568,-0.085664,-0.14545,-2.328143,-0.191229,-0.853375,0.573434,-1.879786,0.241861,-1.452015,-0.559220,-2.005682,-0.101771,-0.30974,-0.597570,-0.061833,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.524125,0.124226,-1.902984,0.185429,0.027139,-1.676536,-0.962296,1.088495,0.220775,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,-0.727647,-0.191229,0.346106,-1.527907,-1.879786,0.241861,-1.680519,0.420197,-0.714124,-0.101771,-0.30974,-0.597570,-0.061833,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.524125,-0.155879,-1.902984,0.185429,0.027139,-1.676536,-0.962296,1.088495,-1.646194,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,-0.905480,-0.191229,0.346106,-1.527907,-1.879786,-0.691841,1.632780,0.420197,0.290421,-0.101771,-0.30974,-0.989074,-0.061833,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,1.524125,0.172940,-1.902984,0.185429,0.027139,-1.676536,1.248033,1.088495,0.220775,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,-0.727647,-0.191229,1.545586,-1.527907,-1.879786,-0.069373,0.833018,1.399613,-0.714124,-0.101771,-0.30974,-0.858573,-0.061833,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.524125,0.270368,-1.902984,0.185429,0.027139,-1.562928,-0.962296,-0.170768,0.220775,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,-0.727647,-0.191229,0.346106,-1.527907,-1.879786,-0.691841,-1.566267,-0.559220,-0.714124,-0.101771,-0.30974,0.185439,-0.061833,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53262,1.0,2.0,0.0,0.0,-0.656114,-0.667376,1.131434,0.510832,0.616286,1.618099,1.248033,1.718127,0.220775,-0.298667,-0.071574,-0.534495,-0.164441,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,8.356016,-0.176568,-0.085664,-0.14545,-0.316749,-0.191229,-0.853375,0.573434,1.183060,0.864329,-1.109260,1.889321,-0.382540,-0.101771,-0.30974,-0.933145,-0.061833,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
53263,3.0,0.0,1.0,0.0,-0.656114,0.294725,1.131434,0.510832,0.616286,1.731707,1.248033,1.718127,2.087745,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,1.923134,1.616124,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,2.388193,-0.191229,1.545586,0.573434,1.183060,0.864329,-0.995009,1.889321,0.531624,-0.101771,-0.30974,-0.709428,-0.061833,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
53264,1.0,1.0,1.0,0.0,-0.656114,-0.533413,1.131434,0.510832,0.616286,1.731707,1.248033,1.718127,0.220775,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,1.292185,1.034658,-0.356899,-0.126961,-0.255237,-0.176568,-0.085664,-0.14545,0.971389,-0.191229,3.944546,0.573434,1.183060,0.864329,-0.995009,1.889321,0.656953,-0.101771,-0.30974,-0.933145,-0.061833,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
53265,1.0,0.0,0.0,0.0,-0.656114,0.379975,1.131434,0.510832,0.616286,1.731707,1.248033,1.718127,0.220775,-0.298667,-0.071574,-0.534495,0.601096,0.433788,-0.179182,-0.095855,-0.093869,-0.600661,-0.709741,-0.356899,-0.126961,1.534114,-0.176568,-0.085664,-0.14545,-0.143937,-0.191229,-0.853375,0.573434,1.183060,0.864329,-0.995009,1.889321,-0.243086,-0.101771,-0.30974,-0.933145,-0.061833,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
