In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.metrics import RocCurveDisplay
import datetime

In [3]:
# Read the pricing CSV from the working directory into a DataFrame and
# display it as the cell output.
dp = pd.read_csv(filepath_or_buffer='dynamic_pricing.csv')
dp

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422
...,...,...,...,...,...,...,...,...,...,...
995,33,23,Urban,Gold,24,4.21,Morning,Premium,11,91.389526
996,84,29,Urban,Regular,92,4.55,Morning,Premium,94,424.155987
997,44,6,Suburban,Gold,80,4.13,Night,Premium,40,157.364830
998,53,27,Suburban,Regular,78,3.63,Night,Premium,58,279.095048


In [4]:
# One-hot encode the four categorical columns. drop_first=True omits the
# first level of each column so the indicator columns are not redundant.
# NOTE: this rebinds `dp`, so the cell expects the freshly loaded frame;
# re-running it on an already-encoded `dp` will not find these columns.
categorical_cols = [
    "Location_Category",
    "Customer_Loyalty_Status",
    "Time_of_Booking",
    "Vehicle_Type",
]
dp = pd.get_dummies(dp, columns=categorical_cols, drop_first=True)
dp

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride,Location_Category_Suburban,Location_Category_Urban,Customer_Loyalty_Status_Regular,Customer_Loyalty_Status_Silver,Time_of_Booking_Evening,Time_of_Booking_Morning,Time_of_Booking_Night,Vehicle_Type_Premium
0,90,45,13,4.47,90,284.257273,False,True,False,True,False,False,True,True
1,58,39,72,4.06,43,173.874753,True,False,False,True,True,False,False,False
2,42,31,0,3.99,76,329.795469,False,False,False,True,False,False,False,True
3,89,28,67,4.31,134,470.201232,False,False,True,False,False,False,False,True
4,78,22,74,3.77,149,579.681422,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,33,23,24,4.21,11,91.389526,False,True,False,False,False,True,False,True
996,84,29,92,4.55,94,424.155987,False,True,True,False,False,True,False,True
997,44,6,80,4.13,40,157.364830,True,False,False,False,False,False,True,True
998,53,27,78,3.63,58,279.095048,True,False,True,False,False,False,True,True


In [5]:
# Target is the historical ride cost; every remaining column is a feature.
y = dp["Historical_Cost_of_Ride"]
X = dp.drop(columns=["Historical_Cost_of_Ride"])

In [6]:
# Hold out 20% of the rows for validation. random_state is pinned so the
# split (and every score computed from it below) is identical on each
# Restart & Run All instead of changing with every execution.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)
len(X_train)

800

In [7]:
# Print the column names, non-null counts and dtypes of the encoded frame
# to check for missing values and confirm the dummy columns are boolean.
dp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Number_of_Riders                 1000 non-null   int64  
 1   Number_of_Drivers                1000 non-null   int64  
 2   Number_of_Past_Rides             1000 non-null   int64  
 3   Average_Ratings                  1000 non-null   float64
 4   Expected_Ride_Duration           1000 non-null   int64  
 5   Historical_Cost_of_Ride          1000 non-null   float64
 6   Location_Category_Suburban       1000 non-null   bool   
 7   Location_Category_Urban          1000 non-null   bool   
 8   Customer_Loyalty_Status_Regular  1000 non-null   bool   
 9   Customer_Loyalty_Status_Silver   1000 non-null   bool   
 10  Time_of_Booking_Evening          1000 non-null   bool   
 11  Time_of_Booking_Morning          1000 non-null   bool   
 12  Time_of_Booking_Night

In [8]:
# Baseline: random forest with default hyperparameters, seeded for
# repeatable fits and using all available cores. fit() returns the
# estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(random_state=42, n_jobs=-1).fit(X_train, y_train)

# R^2 on the held-out validation split.
model.score(X_valid, y_valid)

0.8655479146623615

In [14]:
# First ten training targets (positional slice), shown for an eyeball
# comparison against the model's predictions on the same rows.
y_train.iloc[:10]

516    206.147332
379    651.730389
468    135.005874
960    467.019895
115    192.879806
655    289.129681
891    513.990077
450    443.775020
376    430.785851
848    381.391205
Name: Historical_Cost_of_Ride, dtype: float64

In [13]:
# Predictions for the first ten training rows. These rows were used to fit
# `model`, so this is an in-sample sanity check, not a measure of
# generalisation.
model.predict(X_train.iloc[:10])

array([189.00571546, 623.59497004, 127.97797158, 471.61363642,
       205.74695673, 332.39459559, 512.35611954, 482.31712987,
       408.86480984, 375.95499126])

In [17]:

# Search space for the random forest hyperparameters.
rs_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1.0, "sqrt"]}

# Randomized search: 100 sampled candidates, each evaluated with 5-fold
# cross-validation (500 fits in total).
# random_state is set on both the estimator and the search so the sampled
# candidates and the fitted forests are reproducible across runs; without
# it the selected parameters and the resulting score change every run.
# n_jobs=-1 runs the independent fits in parallel, matching the baseline
# model's use of all cores.
rs_model = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                              param_distributions=rs_grid,
                              n_iter=100,
                              cv=5,
                              n_jobs=-1,
                              random_state=42,
                              verbose=True)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [19]:
# Score the fitted search object on the validation split, using the same
# X_valid / y_valid the baseline model was scored on, for comparison.
rs_model.score(X_valid, y_valid)

0.8685263055107194