In [1]:
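# Run the data-preparation notebook first. Nothing below defines `df`, `np`, `pd`
# or `split_columns`, so they are presumably created by this %run (confirm in 1_prep_data.ipynb).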
%run 1_prep_data.ipynb

Total Number of Duplicate Rows: 25876
ArrivalDateMonth: 12 unique values
Meal: 4 unique values
MarketSegment: 7 unique values
DistributionChannel: 4 unique values
ReservedRoomType: 7 unique values
AssignedRoomType: 8 unique values
DepositType: 3 unique values
CustomerType: 4 unique values
ReservationStatus: 3 unique values


In [2]:
# Import Libraries
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Preview the first rows of the prepared data
df.head(n=5)

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2


#### 1. Feature Engineering Pipeline

In [4]:
# Define custom transformations as a class
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived booking features.

    ``transform`` returns a copy of the input frame with these columns added
    (in this order): ``TotalStay``, ``SpendingPerPerson``, ``GroupType``,
    ``BookingUrgency``, ``HighDemand``, ``LoyaltyScore``, ``ChildRatio``,
    ``Occupancy``, ``WaitingTimeCategory`` and ``CancellationRisk``.

    Required input columns: ``StaysInWeekendNights``, ``StaysInWeekNights``,
    ``ADR``, ``Adults``, ``Children``, ``Babies``, ``LeadTime``,
    ``TotalOfSpecialRequests``, ``IsRepeatedGuest``,
    ``PreviousBookingsNotCanceled``, ``DaysInWaitingList`` and
    ``PreviousCancellations``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the required input columns listed on the class.

        Returns
        -------
        pandas.DataFrame
            New frame; ``X`` itself is not modified. As a final step every
            +/-inf and NaN in the *whole* frame (input columns included) is
            replaced by 0.

        Raises
        ------
        KeyError
            If a required input column is missing.
        """
        # Work on a copy so the caller's frame is never mutated.
        df = X.copy()

        adults = df["Adults"]
        minors = df["Children"] + df["Babies"]
        guests = adults + minors

        # TotalStay
        df["TotalStay"] = df["StaysInWeekendNights"] + df["StaysInWeekNights"]

        # SpendingPerPerson: a zero guest count yields +/-inf (or NaN for 0/0).
        # Assign the cleaned result back instead of using a chained
        # ``inplace=True`` call, which acts on a temporary under copy-on-write.
        with np.errstate(divide="ignore", invalid="ignore"):
            spending = df["ADR"] / guests
        df["SpendingPerPerson"] = spending.replace([np.inf, -np.inf], 0)

        # GroupType: np.select picks the first matching condition, which
        # reproduces the if/elif priority; NaN comparisons are False and
        # therefore fall through towards "Other".
        no_minors = minors == 0
        df["GroupType"] = np.select(
            [
                (adults == 1) & no_minors,
                (adults == 2) & no_minors,
                minors > 0,
                adults > 2,
            ],
            ["Single", "Couple", "Family", "Big_Group"],
            default="Other",
        )

        # BookingUrgency: > 60 days early, 30-60 medium, everything else
        # (including missing lead times) counts as last minute.
        lead_time = df["LeadTime"]
        df["BookingUrgency"] = np.select(
            [lead_time > 60, lead_time.between(30, 60)],
            ["Early_Booking", "Medium_Term_Booking"],
            default="Last_Minute_Booking",
        )

        # HighDemand
        df["HighDemand"] = np.where(df["TotalOfSpecialRequests"] >= 2, "High", "Low")

        # LoyaltyScore
        df["LoyaltyScore"] = df["IsRepeatedGuest"] * (df["PreviousBookingsNotCanceled"] + 1)

        # ChildRatio: 0/0 (no guests at all) is reported as 0.
        df["ChildRatio"] = (minors / guests).fillna(0)

        # Occupancy: guests per night; a 0/0 result is reported as 0 here and
        # any infinity (zero-night stay) is zeroed by the final clean-up.
        df["Occupancy"] = (guests / df["TotalStay"]).fillna(0)

        # WaitingTimeCategory: < 7 days short, 7-30 medium, otherwise long
        # (missing values also land in "Long").
        waiting_days = df["DaysInWaitingList"]
        df["WaitingTimeCategory"] = np.select(
            [waiting_days < 7, waiting_days.between(7, 30)],
            ["Short", "Medium"],
            default="Long",
        )

        # CancellationRisk
        df["CancellationRisk"] = df["PreviousCancellations"] * df["LeadTime"]

        # Final clean-up across the whole frame: infinities -> NaN -> 0.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0)

        return df


# Wrap the custom transformer in a single-step pipeline
feature_engineering_pipeline = Pipeline(
    steps=[("feature_engineering", FeatureEngineeringTransformer())]
)

# Run the pipeline; note that `df` is rebound to the enriched frame
df = feature_engineering_pipeline.fit_transform(df)

In [5]:
# Preview the data again after feature engineering
df.head(n=5)

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay,TotalStay,SpendingPerPerson,GroupType,BookingUrgency,HighDemand,LoyaltyScore,ChildRatio,Occupancy,WaitingTimeCategory,CancellationRisk
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3,2,0.0,Single,Last_Minute_Booking,Low,0,0.0,0.5,Short,0
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1,4,38.25,Couple,Early_Booking,Low,0,0.0,0.5,Short,0
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30,4,68.0,Single,Early_Booking,Low,0,0.0,0.25,Short,0
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23,6,38.25,Couple,Early_Booking,High,0,0.0,0.333333,Short,0
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2,2,38.25,Couple,Early_Booking,Low,0,0.0,1.0,Short,0


In [6]:
# Manually encode the categorical columns created during feature engineering.
# Each mapping turns a category label into an integer code.

# GroupType can also be "Other" (the fallback when no other rule matches,
# e.g. zero adults and no children); it gets the sentinel code -1 so such rows
# do not stay behind as strings in an otherwise numeric column.
group_type_mapping = {
    "Single": 0,
    "Couple": 1,
    "Family": 2,
    "Big_Group": 3,
    "Other": -1,
}
booking_urgency_mapping = {
    "Early_Booking": 0,
    "Medium_Term_Booking": 1,
    "Last_Minute_Booking": 2,
}
high_demand_mapping = {"Low": 0, "High": 1}
waiting_time_category_mapping = {"Short": 0, "Medium": 1, "Long": 2}


# Custom function for mapping categories
class MapCategories:
    """Replace category labels with codes using a user-supplied mapping.

    Parameters
    ----------
    mapping : dict
        Passed to ``DataFrame.replace``. In this notebook it is a nested dict
        of the form ``{column_name: {old_value: new_value}}``.
    """

    def __init__(self, mapping):
        self.mapping = mapping

    def fit(self, X, y=None):
        """Record the input column names (when ``X`` has any) and return ``self``."""
        columns = getattr(X, "columns", None)
        if columns is not None:
            # Needed by get_feature_names_out() when it is called without arguments.
            self.feature_names_in_ = columns.to_numpy(dtype=object)
        return self

    def transform(self, X):
        """Return a new frame with the mapping applied; ``X`` is not modified."""
        mapped = X.replace(self.mapping)
        # Newer pandas versions no longer downcast object columns after
        # ``replace``; infer the dtype explicitly so fully mapped columns come
        # out numeric. Only columns named in the mapping are touched.
        for column in self.mapping:
            if column in mapped.columns:
                mapped[column] = mapped[column].infer_objects()
        return mapped

    def get_feature_names_out(self, input_features=None):
        """Return ``input_features`` if given, else the names seen during ``fit``.

        Raises
        ------
        AttributeError
            If ``input_features`` is None and ``fit`` has not recorded any
            column names.
        """
        if input_features is not None:
            return input_features
        if not hasattr(self, "feature_names_in_"):
            raise AttributeError(
                "MapCategories has no feature_names_in_; call fit() with a "
                "DataFrame or pass input_features explicitly."
            )
        return self.feature_names_in_


# Apply the custom mappings to the dataframe
mapper = MapCategories(
    dict(
        GroupType=group_type_mapping,
        BookingUrgency=booking_urgency_mapping,
        HighDemand=high_demand_mapping,
        WaitingTimeCategory=waiting_time_category_mapping,
    )
)

# Fit the mapper, then rebind `df` to the encoded frame
df = mapper.fit(df).transform(df)

df.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusYear,ReservationStatusMonth,ReservationStatusDay,TotalStay,SpendingPerPerson,GroupType,BookingUrgency,HighDemand,LoyaltyScore,ChildRatio,Occupancy,WaitingTimeCategory,CancellationRisk
0,0,6,2015,6,27,1,0,2,1,0,0,2,5,3,0,0,0,0,0,0,0,6,0,0,2,0.0,0,0,1,2015,7,3,2,0.0,0,2,0,0,0.0,0.5,0,0
1,1,88,2015,6,27,1,0,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,7,1,4,38.25,1,0,0,0,0.0,0.5,0,0
2,1,65,2015,6,27,1,0,4,1,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,68.0,0,1,0,2015,4,30,4,68.0,0,0,0,0,0.0,0.25,0,0
3,1,92,2015,6,27,1,2,4,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,2,0,2015,6,23,6,38.25,1,0,1,0,0.0,0.333333,0,0
4,1,100,2015,6,27,2,0,2,2,0,0,0,6,3,0,0,0,0,0,0,0,9,0,0,2,76.5,0,1,0,2015,4,2,2,38.25,1,0,0,0,0.0,1.0,0,0


In [7]:
# Remove the columns that are not needed for the clustering analysis
columns_to_drop = [
    "IsCanceled",
    "ReservationStatus",
    "ReservationStatusYear",
    "ReservationStatusMonth",
    "ReservationStatusDay",
    "Agent",
    "Company",
]
df = df.drop(columns_to_drop, axis="columns")

In [8]:
# Generic preprocessing for numeric features: median imputation, then standardisation
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Generic preprocessing for categorical features: most-frequent imputation, then one-hot encoding
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [9]:
# split_columns is not defined in this notebook; it returns the categorical
# column names first and the numerical ones second.
categorical_columns, numerical_columns = split_columns(df)

# Route each column group through its matching preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns),
])

transformer_pipeline = Pipeline([("preprocessor", preprocessor)])

In [10]:
# Fit the transformer pipeline on the data, then transform that same data as a check
transformed_data = transformer_pipeline.fit(df).transform(df)
transformed_data  # displayed as a NumPy array

array([[-0.95698887, -1.87303831,  0.19028319, ..., -0.60799036,
        -0.05297688, -0.0863955 ],
       [ 0.17792266, -1.87303831,  0.19028319, ..., -0.60799036,
        -0.05297688, -0.0863955 ],
       [-0.14040618, -1.87303831,  0.19028319, ..., -1.04114252,
        -0.05297688, -0.0863955 ],
       ...,
       [ 0.37168804,  1.13231667,  0.51752321, ..., -0.73174812,
        -0.05297688, -0.0863955 ],
       [ 0.46857073,  1.13231667,  0.51752321, ..., -0.97926364,
        -0.05297688, -0.0863955 ],
       [ 1.79724765,  1.13231667,  0.51752321, ..., -1.08927053,
        -0.05297688, -0.0863955 ]], shape=(43097, 35))

In [11]:
# Look up the fitted preprocessing step and ask it for the output column names
fitted_preprocessor = transformer_pipeline.named_steps["preprocessor"]
feature_names = fitted_preprocessor.get_feature_names_out()
print(feature_names)

['num__LeadTime' 'num__ArrivalDateYear' 'num__ArrivalDateMonth'
 'num__ArrivalDateWeekNumber' 'num__ArrivalDateDayOfMonth'
 'num__StaysInWeekendNights' 'num__StaysInWeekNights' 'num__Adults'
 'num__Children' 'num__Babies' 'num__Meal' 'num__MarketSegment'
 'num__DistributionChannel' 'num__IsRepeatedGuest'
 'num__PreviousCancellations' 'num__PreviousBookingsNotCanceled'
 'num__ReservedRoomType' 'num__AssignedRoomType' 'num__BookingChanges'
 'num__DepositType' 'num__DaysInWaitingList' 'num__CustomerType'
 'num__ADR' 'num__RequiredCarParkingSpaces' 'num__TotalOfSpecialRequests'
 'num__TotalStay' 'num__SpendingPerPerson' 'num__GroupType'
 'num__BookingUrgency' 'num__HighDemand' 'num__LoyaltyScore'
 'num__ChildRatio' 'num__Occupancy' 'num__WaitingTimeCategory'
 'num__CancellationRisk']


In [12]:
# Wrap the transformed array in a DataFrame labelled with the output feature names
transformed_df = pd.DataFrame(data=transformed_data, columns=feature_names)
transformed_df

Unnamed: 0,num__LeadTime,num__ArrivalDateYear,num__ArrivalDateMonth,num__ArrivalDateWeekNumber,num__ArrivalDateDayOfMonth,num__StaysInWeekendNights,num__StaysInWeekNights,num__Adults,num__Children,num__Babies,num__Meal,num__MarketSegment,num__DistributionChannel,num__IsRepeatedGuest,num__PreviousCancellations,num__PreviousBookingsNotCanceled,num__ReservedRoomType,num__AssignedRoomType,num__BookingChanges,num__DepositType,num__DaysInWaitingList,num__CustomerType,num__ADR,num__RequiredCarParkingSpaces,num__TotalOfSpecialRequests,num__TotalStay,num__SpendingPerPerson,num__GroupType,num__BookingUrgency,num__HighDemand,num__LoyaltyScore,num__ChildRatio,num__Occupancy,num__WaitingTimeCategory,num__CancellationRisk
0,-0.956989,-1.873038,0.190283,0.031539,-1.669494,-1.040662,-0.180909,-1.844884,-0.246941,0.0,1.059910,-0.362871,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-2.992429,0.0,-0.883343,-0.678557,-2.258944,-1.418423,1.231519,-0.446099,0.0,-0.244706,-0.607990,-0.052977,-0.086395
1,0.177923,-1.873038,0.190283,0.031539,-1.669494,-1.040662,1.315280,0.139071,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-0.905175,0.0,0.362670,0.538868,-0.793491,-0.108003,-0.999357,-0.446099,0.0,-0.244706,-0.607990,-0.052977,-0.086395
2,-0.140406,-1.873038,0.190283,0.031539,-1.669494,-1.040662,1.315280,-1.844884,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-1.137092,0.0,0.362670,0.538868,0.346307,-1.418423,-0.999357,-0.446099,0.0,-0.244706,-1.041143,-0.052977,-0.086395
3,0.233284,-1.873038,0.190283,0.031539,-1.669494,1.343676,1.315280,0.139071,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-0.905175,0.0,1.608683,1.756292,-0.793491,-0.108003,-0.999357,2.241657,0.0,-0.244706,-0.896758,-0.052977,-0.086395
4,0.344007,-1.873038,0.190283,0.031539,-1.556199,-1.040662,-0.180909,0.139071,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-0.905175,0.0,0.362670,-0.678557,-0.793491,-0.108003,-0.999357,-0.446099,0.0,-0.244706,0.258314,-0.052977,-0.086395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43092,0.828421,1.132317,0.517523,0.624196,1.616073,1.343676,1.315280,2.123025,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,3.668695,3.142668,-0.36887,-0.103179,-0.054073,-0.103445,2.710003,0.0,-0.883343,1.756292,0.410161,2.512837,-0.999357,-0.446099,0.0,-0.244706,-0.607990,-0.052977,-0.086395
43093,1.229792,1.132317,0.517523,0.624196,1.729369,1.343676,1.315280,0.139071,-0.246941,0.0,-0.568486,-0.362871,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-0.602318,0.0,-0.883343,1.756292,-0.580856,-0.108003,-0.999357,-0.446099,0.0,-0.244706,-0.896758,-0.052977,-0.086395
43094,0.371688,1.132317,0.517523,0.624196,1.729369,1.343676,2.063375,2.123025,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,2.251058,1.864575,-0.36887,-0.103179,-0.054073,-0.103445,3.158285,0.0,1.608683,2.365004,0.619986,2.512837,-0.999357,2.241657,0.0,-0.244706,-0.731748,-0.052977,-0.086395
43095,0.468571,1.132317,0.517523,0.624196,1.729369,1.343676,2.063375,0.139071,-0.246941,0.0,-0.568486,0.543612,0.36699,0.0,-0.114531,-0.039132,-0.584216,-0.691612,-0.36887,-0.103179,-0.054073,-0.103445,-0.143941,0.0,-0.883343,2.365004,-0.259031,-0.108003,-0.999357,-0.446099,0.0,-0.244706,-0.979264,-0.052977,-0.086395
