In [67]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [41]:
# Read the four masked source tables from parquet. The paths are relative, so
# they resolve against the current working directory.
df_orders, df_driver_order_mapping, df_service_times, df_order_articles = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/masked_orders.parquet',
        'data/masked_driver_order_mapping.parquet',
        'data/masked_service_times.parquet',
        'data/masked_order_articles.parquet',
    )
)

In [42]:
# Exploration
# Render the orders table as loaded; pandas abbreviates the display to the
# first and last rows.
df_orders

Unnamed: 0,warehouse_id,order_time,has_elevator,floor,is_pre_order,is_business,web_order_id,customer_id
0,18,2024-12-17 09:00:00+01:00,False,0.0,True,True,806432,166859
1,8,2024-12-10 09:45:00+01:00,False,0.0,True,True,678738,167463
2,8,2024-11-21 10:45:00+01:00,False,,True,True,347665,49336
3,13,2024-11-08 10:00:00+01:00,False,0.0,True,True,110643,172552
4,13,2025-01-06 13:15:48.730000+01:00,False,,False,True,1139043,136281
...,...,...,...,...,...,...,...,...
1534684,4,2024-11-27 15:45:00+01:00,False,0.0,True,True,460582,433427
1534685,6,2025-01-20 09:11:40.430000+01:00,False,0.0,False,True,1382719,404857
1534686,6,2024-11-26 08:45:00+01:00,False,0.0,True,True,419104,404857
1534687,6,2024-12-02 13:15:00+01:00,False,0.0,True,True,542528,393199


In [43]:
# Inspect the order/article table: article ids and their weights in grams,
# keyed by web_order_id.
df_order_articles

Unnamed: 0,warehouse_id,box_id,article_id,article_weight_in_g,web_order_id
0,3,,3657,17250,1250399
1,3,,3657,17250,1250154
2,32,,2576,17000,1296951
3,32,,3975,15100,978668
4,25,,3670,11000,104995
...,...,...,...,...,...
15586377,35,,3953,17800,1533370
15586378,35,,3953,17800,1533370
15586379,35,,3953,17800,1533370
15586380,35,,3953,17800,1533370


In [44]:
# Inspect the table that links each web_order_id to a driver_id.
df_driver_order_mapping

Unnamed: 0,web_order_id,driver_id
0,51222,182
1,53503,182
2,166709,182
3,136190,182
4,176394,182
...,...,...
1534811,473982,9240
1534812,474113,9240
1534813,475628,9240
1534814,468676,9240


In [45]:
# Inspect the service-time records: start/end timestamps and the resulting
# duration in minutes, keyed by web_order_id.
df_service_times

Unnamed: 0,service_time_start,service_time_end,service_time_in_minutes,order_datetime,web_order_id,driver_id,trip_id,customer_id
0,2024-11-11 21:48:59+01:00,2024-11-11 21:57:56+01:00,8.950000,2024-11-11 20:45:00+01:00,1,3621,29687,35422
1,2024-11-05 13:27:30+01:00,2024-11-05 13:45:05+01:00,17.583333,2024-11-05 13:15:00+01:00,2,6575,10097,159165
2,2024-11-05 21:52:03+01:00,2024-11-05 22:09:28+01:00,17.416666,2024-11-05 20:00:00+01:00,3,6811,12316,377355
3,2024-11-12 21:01:30.013000+01:00,2024-11-12 21:15:30.087000+01:00,14.000000,2024-11-12 19:00:00+01:00,4,5183,31795,146455
4,2024-11-01 15:28:07+01:00,2024-11-01 15:37:01+01:00,8.900000,2024-11-01 13:45:00+01:00,5,694,966,210296
...,...,...,...,...,...,...,...,...
1534829,2025-01-30 09:28:42+01:00,2025-01-30 09:33:43+01:00,5.016666,2025-01-30 08:10:51.200000+01:00,1534830,10536,250427,454002
1534830,2025-01-30 09:36:48+01:00,2025-01-30 09:45:29+01:00,8.683333,2025-01-30 08:13:38.747000+01:00,1534831,10755,250425,221139
1534831,2025-01-30 09:49:06+01:00,2025-01-30 09:58:29+01:00,9.383333,2025-01-30 08:15:32.833000+01:00,1534832,6570,250428,529887
1534832,2025-01-30 09:59:50+01:00,2025-01-30 10:07:27+01:00,7.616666,2025-01-30 08:47:31.837000+01:00,1534833,10925,250431,365722


In [46]:
# Preprocessing - merge tables
# Orders are the base table (left side), so every order row is kept even
# when no service-time record exists for it.
# Take from df_service_times only the join key plus the columns df_orders
# does not already have. This yields the same frame as merging everything
# with a "_y" suffix and dropping the suffixed duplicates afterwards, without
# the regex clean-up or the in-place mutation.
service_time_columns = ["web_order_id"] + [
    column for column in df_service_times.columns if column not in df_orders.columns
]
df = df_orders.merge(df_service_times[service_time_columns], on="web_order_id", how="left")

# df_driver_order_mapping is intentionally not joined here: its only non-key
# column, driver_id, is already supplied by df_service_times, so the copy
# from the mapping was always discarded as "driver_id_y".

# A left join repeats an order row when the right side holds several rows for
# one web_order_id; fail loudly instead of silently inflating the data.
assert len(df) == len(df_orders), (
    "merge duplicated order rows: web_order_id is not unique in df_service_times"
)

In [49]:
# Preprocessing - get article total weight
# Sum the article weights (grams) of all rows belonging to the same order.
# The result is a one-column frame indexed by web_order_id; the column keeps
# the name "article_weight_in_g" although it now holds the per-order total.
article_total_weight = (
    df_order_articles
    .groupby("web_order_id")[["article_weight_in_g"]]
    .sum()
)

# Remove a total left behind by an earlier run of this cell before attaching
# the new one; otherwise a re-run would produce "article_weight_in_g_x" /
# "article_weight_in_g_y" and the plain column name would disappear.
# Orders without any article row get NaN as their total (left join).
df = (
    df
    .drop(columns="article_weight_in_g", errors="ignore")
    .merge(article_total_weight, on="web_order_id", how="left")
)

In [54]:
# Preprocessing - drop na
# Keep only fully populated rows: a row is removed as soon as any one of its
# columns is missing.
df = df.dropna(axis="index", how="any")

In [58]:
# Preprocessing - train test split
# Draw a reproducible (fixed seed) random 80 % of the rows for training ...
df_train = df.sample(random_state=0, frac=0.8)
# ... and keep every row whose index label was not sampled as the test set.
df_test = df.loc[~df.index.isin(df_train.index)]

In [50]:
# Inspect the merged frame (orders + service times + total article weight).
# NOTE(review): the saved output below still contains rows with a missing
# floor, so it appears to have been captured before the drop-na cell ran.
df

Unnamed: 0,warehouse_id,order_time,has_elevator,floor,is_pre_order,is_business,web_order_id,customer_id,service_time_start,service_time_end,service_time_in_minutes,order_datetime,driver_id,trip_id,article_weight_in_g
0,18,2024-12-17 09:00:00+01:00,False,0.0,True,True,806432,166859,2024-12-17 10:09:30.067000+01:00,2024-12-17 10:27:00.030000+01:00,17.500000,2024-12-17 09:00:00+01:00,9045,133625,164750.0
1,8,2024-12-10 09:45:00+01:00,False,0.0,True,True,678738,167463,2024-12-10 09:26:00.077000+01:00,2024-12-10 09:35:30.100000+01:00,9.500000,2024-12-10 09:45:00+01:00,9480,111163,148400.0
2,8,2024-11-21 10:45:00+01:00,False,,True,True,347665,49336,2024-11-21 12:16:00.100000+01:00,2024-11-21 12:29:30.047000+01:00,13.500000,2024-11-21 10:45:00+01:00,4418,55292,211400.0
3,13,2024-11-08 10:00:00+01:00,False,0.0,True,True,110643,172552,2024-11-08 12:00:04.260000+01:00,2024-11-08 12:42:00.157000+01:00,41.933333,2024-11-08 10:00:00+01:00,4661,19021,171300.0
4,13,2025-01-06 13:15:48.730000+01:00,False,,False,True,1139043,136281,2025-01-06 14:42:16+01:00,2025-01-06 14:52:02+01:00,9.766666,2025-01-06 13:15:48.730000+01:00,4847,183626,129390.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534684,4,2024-11-27 15:45:00+01:00,False,0.0,True,True,460582,433427,2024-11-27 17:21:13+01:00,2024-11-27 17:36:58+01:00,15.750000,2024-11-27 15:45:00+01:00,3793,74157,260501.0
1534685,6,2025-01-20 09:11:40.430000+01:00,False,0.0,False,True,1382719,404857,2025-01-20 10:58:21+01:00,2025-01-20 11:21:11+01:00,22.833333,2025-01-20 09:11:40.430000+01:00,5245,223705,176760.0
1534686,6,2024-11-26 08:45:00+01:00,False,0.0,True,True,419104,404857,2024-11-26 10:18:30.110000+01:00,2024-11-26 10:24:00.090000+01:00,5.500000,2024-11-26 08:45:00+01:00,1997,68794,149117.0
1534687,6,2024-12-02 13:15:00+01:00,False,0.0,True,True,542528,393199,2024-12-02 14:24:34.453000+01:00,2024-12-02 14:36:00.097000+01:00,11.433333,2024-12-02 13:15:00+01:00,5357,87211,232700.0


# Linear Regression

In [59]:
# Column to predict and the columns used as model inputs.
target = "service_time_in_minutes"
features = [
    "article_weight_in_g",
    "warehouse_id",
    "driver_id",
    "is_business",
    "is_pre_order",
    "has_elevator",
    "floor",
]

# NOTE(review): warehouse_id and driver_id look like identifiers; after the
# float cast the regression treats them as ordinary numeric magnitudes —
# confirm that this is intended.
# Casting to float turns the boolean flags into 0.0 / 1.0.
X = df_train.loc[:, features].astype(float)
y = df_train.loc[:, target].astype(float)

# Ordinary least-squares fit on the training rows.
model = LinearRegression()
model.fit(X, y)

In [60]:
# Fitted coefficient per input column, in the same order as `features`.
model.coef_

array([ 4.98785798e-05,  2.53973162e-02,  2.88150191e-04,  8.04893018e-01,
       -1.07118138e-01,  2.26725462e+00,  8.37650852e-04])

In [61]:
# Fitted constant term of the regression, in the units of the target column.
model.intercept_

np.float64(3.109716798014892)

In [64]:
# Prediction
# Build the test inputs and target with the same columns and float cast that
# were used for fitting.
X_test = df_test.loc[:, features].astype(float)
y_test = df_test.loc[:, target].astype(float)

# Model output for every test row.
y_pred = model.predict(X_test)

In [68]:
# Evaluation
# Report mean squared error, mean absolute error and the coefficient of
# determination on the test rows, one metric per line.
print(
    f"MSE = {mean_squared_error(y_test, y_pred)}",
    f"MAE = {mean_absolute_error(y_test, y_pred)}",
    f"R2 = {model.score(X_test, y_test)}",
    sep="\n",
)

MSE = 27.622596911310186
MAE = 3.581488941580563
R2 = 0.28456199479555244
