In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Rides_Data.csv")

In [3]:
df.columns = df.columns.str.lower().str.replace(" ", "_")

In [4]:
df["start_date*"] = pd.to_datetime(df["start_date*"])
df["end_date*"] = pd.to_datetime(df["end_date*"])

In [5]:
df["hour"] = df["start_date*"].dt.hour

In [6]:
df["dayofweek"] = df["start_date*"].dt.dayofweek

In [7]:
df["is_weekday"] = df["dayofweek"] < 5

### Assumptions for the Project

###### REVENUE_PER_MILE = 1
###### COST_PER_MILE = 0.5

#### Recommendations:

##### Recommendation 1: Prioritize rides during peak hours on weekdays (8-11 AM and 5-8 PM) to maximize profitability.

In [8]:
REVENUE_PER_MILE = 1
COST_PER_MILE = 0.5

In [9]:
df["revenue"] = df["miles*"] * REVENUE_PER_MILE
df["cost"] = df["miles*"] * COST_PER_MILE
df["profit"] = df["revenue"] - df["cost"]


In [10]:
peak_hours = df[df["is_weekday"] & df["hour"].between(8, 11) | df["hour"].between(17, 20)]

In [11]:
recommendation_1 = (peak_hours.groupby("category*").agg(rides = ("miles*", "count"), avg_profit = ("profit", "mean"), total_profit = ("profit", "sum")))

In [12]:
print(recommendation_1)

           rides  avg_profit  total_profit
category*                                 
Business     461   10.595119       4884.35
Personal      32    2.607813         83.45


#### Recommendation 2: Target Audience:

In [13]:
recommendation_2 = df.groupby("purpose*").agg(rides = ("miles*", "count"), avg_profit = ("profit", "mean"), total_profit = ("profit", "sum")).sort_values("avg_profit", ascending=False)

In [14]:
print(recommendation_2)

                 rides  avg_profit  total_profit
purpose*                                        
Commute              1   90.100000         90.10
Customer Visit     111   10.874775       1207.10
Meeting            184    7.686685       1414.35
Charity ($)          1    7.550000          7.55
Meal/Entertain     163    6.853988       1117.20
Between Offices     18    5.472222         98.50
Temporary Site      48    5.293750        254.10
Errand/Supplies    117    4.159829        486.70
Airport/Travel       3    2.750000          8.25
Moving               4    2.275000          9.10


##### So this recommendation suggests target audience as ride-sharing companies and drivers who can optimize their schedules based on ride purpose to enhance profitability.

#### Recommendation 3: Profitability by hour

In [15]:
recommendation_3 = df.groupby("hour").agg(rides = ("miles*", "count"), avg_profit = ("profit", "mean"), total_profit_per_hour = ("profit", "sum")).sort_values("avg_profit", ascending = False)

In [16]:
print(recommendation_3)

      rides  avg_profit  total_profit_per_hour
hour                                          
2.0       2   41.325000                  82.65
17.0    101   19.876238                2007.50
16.0     92   17.022283                1566.05
19.0     66   16.215909                1070.25
12.0     77   11.837662                 911.50
18.0     91    9.489011                 863.50
3.0       3    9.200000                  27.60
22.0     29    8.315517                 241.15
6.0       4    7.025000                  28.10
23.0     26    6.961538                 181.00
7.0      12    6.229167                  74.75
15.0     94    5.937766                 558.15
13.0     93    5.355376                 498.05
21.0     53    5.261321                 278.85
20.0     67    5.097015                 341.50
9.0      49    4.843878                 237.35
5.0       4    4.550000                  18.20
14.0    100    4.244000                 424.40
1.0       5    4.180000                  20.90
10.0     64  

#### so this recommendation suggests profitability varies by hour, with certain hours yielding higher average profits.

#### ML Modeling:

In [17]:
df["profit"] = (df["miles*"] * REVENUE_PER_MILE) - (df["miles*"] * COST_PER_MILE)

##### High profit vs low profit classification

In [18]:
features = ["category*", "start*", "stop*", "purpose*", "hour", "dayofweek", "is_weekday"]

In [19]:
X = df[features]

In [22]:
df["miles*"] = df["miles*"].fillna(0)

In [23]:
y = df["miles*"]

##### split into train-test sets

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state = 42)

#### preprocessing pipeline

In [27]:
num_features = ["is_weekday", "hour", "dayofweek"]
cat_features = ["category*", "start*", "stop*", "purpose*"]

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
preprocessor = ColumnTransformer(transformers=[("num", StandardScaler(), num_features), ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output = False), cat_features)])

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [("preprocess", preprocessor), ("regressor", RandomForestRegressor(n_estimators=200, random_state=42))])

#### Training

In [30]:
model.fit(X_train, y_train)

#### Model Evaluation

In [31]:
y_pred = model.predict(X_test)

In [43]:
results = X_test.copy()
results["actual_miles"] = y_test.values
results["predicted_miles"] = y_pred
results["error"] = results["actual_miles"] - results["predicted_miles"]
results["abs_error"] = results["error"].abs()

In [44]:
results.head(10)

Unnamed: 0,category*,start*,stop*,purpose*,hour,dayofweek,is_weekday,actual_miles,predicted_miles,error,abs_error
390,Business,Savon Height,Whitebridge,Errand/Supplies,21.0,0.0,True,3.6,7.315833,-3.715833,3.715833
247,Business,Midtown,Midtown,Meal/Entertain,8.0,4.0,True,1.1,3.8105,-2.7105,2.7105
260,Business,Midtown,Washington Avenue,Meeting,7.0,6.0,False,5.9,7.288,-1.388,1.388
155,Business,Unknown Location,Islamabad,Temporary Site,14.0,6.0,False,13.0,11.2435,1.7565,1.7565
984,Business,Berkeley,El Cerrito,Temporary Site,20.0,2.0,True,2.3,6.0895,-3.7895,3.7895
413,Business,Cary,Durham,Meeting,21.0,6.0,False,9.9,9.980975,-0.080975,0.080975
802,Business,Unknown Location,Lahore,,10.0,4.0,True,33.2,7.68,25.52,25.52
58,Business,Cary,Cary,Customer Visit,9.0,4.0,True,4.6,9.617,-5.017,5.017
752,Business,Unknown Location,Islamabad,,12.0,1.0,True,8.8,6.501,2.299,2.299
901,Business,Whitebridge,Whitebridge,,19.0,1.0,True,900.0,3.189,896.811,896.811


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)

In [34]:
rmse = mean_squared_error(y_test, y_pred)

In [35]:
r2 = r2_score(y_test, y_pred)

In [37]:
print(mae)
print(rmse)
print(r2)

75.64367984598853
1153164.6518879624
-0.007520844798165616


#### Feature Importance

In [38]:
feature_names = model.named_steps["preprocess"].get_feature_names_out()

In [39]:
importance = model.named_steps["regressor"].feature_importances_

In [40]:
importance_df = pd.DataFrame({"feature": feature_names, "importance": importance})

In [41]:
print(importance_df.head(5))

                   feature  importance
0          num__is_weekday    0.048096
1                num__hour    0.327021
2           num__dayofweek    0.149248
3  cat__category*_Business    0.000213
4  cat__category*_Personal    0.000241


#### Conclusion:

##### This model can help in making informed decisions to optimize ride scheduling and improve overall profitability for the ride-sharing service.