In [6]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor 

In [7]:
# Load the bookings dataset; a bare filename resolves against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer='final_data.csv')

In [8]:
# Preview the first five rows to see which columns the raw file carries.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,property_id,check_in_date,room_id,successful_bookings,capacity,mmm yy,week no,occupancy_rate,property_name,is_weekend
0,0,16559,2022-05-01,RT1,25,30,May 22,19,83.0,Atliq Exotica,1
1,1,19562,2022-05-01,RT1,28,30,May 22,19,93.0,Atliq Bay,1
2,2,19563,2022-05-01,RT1,23,30,May 22,19,77.0,Atliq Palace,1
3,3,17558,2022-05-01,RT1,13,19,May 22,19,68.0,Atliq Grands,1
4,4,16558,2022-05-01,RT1,18,19,May 22,19,95.0,Atliq Grands,1


In [9]:
# Trim the frame down to the columns used for modelling.
# - successful_bookings / capacity: in the previewed rows occupancy_rate matches
#   round(100 * successful_bookings / capacity), so keeping them would leak the
#   target into the features (presumably true for all rows - verify).
# - 'mmm yy' / 'week no': calendar info that overlaps with check_in_date.
# - property_name: coarser than property_id (one name maps to several ids in
#   the preview).
# - 'Unnamed: 0': looks like a row index saved into the CSV - TODO confirm.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution no longer raises KeyError for columns
# that are already gone.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'property_name',
        'successful_bookings',
        'mmm yy',
        'week no',
        'capacity',
    ],
    errors='ignore',
)

In [10]:
# Confirm which columns are left after trimming.
df.head(n=5)

Unnamed: 0,property_id,check_in_date,room_id,occupancy_rate,is_weekend
0,16559,2022-05-01,RT1,83.0,1
1,19562,2022-05-01,RT1,93.0,1
2,19563,2022-05-01,RT1,77.0,1
3,17558,2022-05-01,RT1,68.0,1
4,16558,2022-05-01,RT1,95.0,1


In [11]:
# Parse check_in_date into real datetimes and store them back on the frame.
check_in = pd.to_datetime(df['check_in_date'])
df['check_in_date'] = check_in

# Expose the calendar parts as separate columns, appended in this order:
# year, month, day (day of month).
for part in ('year', 'month', 'day'):
    df[part] = getattr(check_in.dt, part)

In [12]:
# Check that the year / month / day columns were added.
df.head(n=5)

Unnamed: 0,property_id,check_in_date,room_id,occupancy_rate,is_weekend,year,month,day
0,16559,2022-05-01,RT1,83.0,1,2022,5,1
1,19562,2022-05-01,RT1,93.0,1,2022,5,1
2,19563,2022-05-01,RT1,77.0,1,2022,5,1
3,17558,2022-05-01,RT1,68.0,1,2022,5,1
4,16558,2022-05-01,RT1,95.0,1,2022,5,1


In [13]:
# The raw datetime column is no longer needed once its year / month / day parts
# exist as separate columns, and it is not a numeric or categorical feature the
# downstream transformers handle.
# Reassigning with errors='ignore' (rather than inplace=True) keeps the cell
# re-runnable: executing it again does not raise KeyError.
df = df.drop(columns=['check_in_date'], errors='ignore')

In [14]:
# Final look at the model-ready frame before splitting.
df.head(n=5)

Unnamed: 0,property_id,room_id,occupancy_rate,is_weekend,year,month,day
0,16559,RT1,83.0,1,2022,5,1
1,19562,RT1,93.0,1,2022,5,1
2,19563,RT1,77.0,1,2022,5,1
3,17558,RT1,68.0,1,2022,5,1
4,16558,RT1,95.0,1,2022,5,1


In [15]:
# Step 1 -> train/test/split
# occupancy_rate is the regression target; every other column is a feature.
# 20% of the rows are held out for evaluation, and the fixed seed keeps the
# split repeatable between runs.
target_column = 'occupancy_rate'
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target_column]),
    df[target_column],
    test_size=0.2,
    random_state=42,
)

In [16]:
# Inspect the feature columns and their order; the encoder below addresses
# columns by position, so the order matters.
X_train.head(n=5)

Unnamed: 0,property_id,room_id,is_weekend,year,month,day
706,17560,RT1,1,2022,5,8
8871,18558,RT3,0,2022,7,28
4855,17562,RT3,1,2022,6,18
5193,16559,RT4,0,2022,6,21
6886,19562,RT4,0,2022,7,8


In [17]:
# Spot-check five random target values. No seed is passed, so the rows shown
# change on every execution.
y_train.sample(n=5)

8271    46.0
4365    50.0
8306    62.0
7252    48.0
2056    63.0
Name: occupancy_rate, dtype: float64

In [20]:
# Step 2 -> one-hot encode the categorical inputs.
# Columns are addressed by position in X_train:
#   0 = property_id, 1 = room_id, 2 = is_weekend.
# handle_unknown='ignore' means a category not seen during fit is encoded as
# all zeros instead of raising; the remaining columns (year, month, day) are
# passed through untouched and appended after the encoded ones.
categorical_positions = [0, 1, 2]
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
trf1 = ColumnTransformer(
    transformers=[
        ('ohe_property_room_weekend', one_hot, categorical_positions),
    ],
    remainder='passthrough',
)

In [21]:
# Scaling
# Rescale every column produced by trf1 to the [0, 1] range.
# A bare MinMaxScaler replaces ColumnTransformer([... slice(0, 36)]): that
# wrapper defaults to remainder='drop', so any column at position 36 or beyond
# (for example after additional property_id / room_id categories widen the
# one-hot output) would have been silently discarded before reaching the
# model. Scaling all columns directly yields the same matrix whenever trf1
# emits at most 36 columns and never loses features otherwise.
trf2 = MinMaxScaler()

In [22]:
# Final estimator (constructed here, fitted later through the pipeline):
# gradient-boosted regression trees with a fixed seed so fitting is repeatable.
trf3 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

In [23]:
# Chain encoding -> scaling -> regressor so that exactly the same
# preprocessing is applied at fit time and at predict time.
pipeline_steps = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
]
pipe = Pipeline(steps=pipeline_steps)

In [24]:

# Fit the preprocessing steps and the regressor on the training split only,
# so the held-out rows stay unseen until evaluation.
pipe.fit(X=X_train, y=y_train)

In [25]:
# Predicted occupancy rates for the held-out rows.
# NOTE(review): y_pred is not referenced by any later cell shown here.
y_pred = pipe.predict(X=X_test)

In [26]:
# For a regressor, score() reports R^2 (coefficient of determination), not a
# classification accuracy; it is shown here as a percentage. The printed label
# is left unchanged so it keeps matching the saved cell output.
test_score_pct = pipe.score(X_test, y_test) * 100
print("Testing Accuracy:", test_score_pct)

Testing Accuracy: 79.90207802113353


In [27]:
# Same metric on the training split (R^2 as a percentage, despite the label).
# Comparing it with the held-out score gives a quick overfitting check.
train_score_pct = pipe.score(X_train, y_train) * 100
print("Training Accuracy:", train_score_pct)

Training Accuracy: 80.06180398637987


In [28]:
import pickle

# Persist the fitted pipeline (preprocessing + regressor) so it can be
# reloaded for inference without retraining.
# NOTE(review): only unpickle this file from a trusted location - loading a
# pickle can execute arbitrary code.
model_path = 'model_pipeline.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(pipe, model_file)

print("Model saved successfully!")

Model saved successfully!


In [29]:
# One hand-built row with the same columns, in the same order, as X_train
# (the encoder selects columns by position).
# NOTE(review): the training rows previewed in this notebook are all from
# 2022, so a 2025 date lies outside what the model has seen - treat this
# as a smoke test of the pipeline rather than a meaningful forecast.
sample_row = {
    'property_id': 16558,  # example categorical
    'room_id': 'RT1',      # example categorical
    'is_weekend': 0,       # example binary
    'year': 2025,
    'month': 12,
    'day': 2,
}
new_data = pd.DataFrame([sample_row])

# Predict occupancy rate
predicted_occupancy_rate = pipe.predict(new_data)

print("Predicted Occupancy Rate:", predicted_occupancy_rate)

Predicted Occupancy Rate: [57.20244183]
