In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from GRANDE import GRANDE

In [2]:
df_orders = pd.read_parquet('data/masked_orders.parquet')
df_driver_order_mapping = pd.read_parquet('data/masked_driver_order_mapping.parquet')
df_service_times = pd.read_parquet('data/masked_service_times.parquet')
df_order_articles = pd.read_parquet('data/masked_order_articles.parquet')

In [3]:
# Preprocessing - merge tables
df = pd.merge(df_orders, df_service_times, on="web_order_id", how='left', suffixes=('', '_y'))
df = pd.merge(df, df_driver_order_mapping, on="web_order_id", how='left', suffixes=('', '_y'))

df.drop(df.filter(regex='_y$').columns, axis=1, inplace=True)

In [4]:
# Preprocessing - get article total weight
article_total_weight = df_order_articles[["article_weight_in_g", "web_order_id"]].groupby("web_order_id").sum()
df = pd.merge(df, article_total_weight, on="web_order_id", how='left')

In [5]:
# Preprocessing - one hot encoding for warehouse_id
# df = pd.get_dummies(df, columns=["warehouse_id", "driver_id"])

In [6]:
# Preprocessing - drop na
df = df.dropna()

In [7]:
df

Unnamed: 0,warehouse_id,order_time,has_elevator,floor,is_pre_order,is_business,web_order_id,customer_id,service_time_start,service_time_end,service_time_in_minutes,order_datetime,driver_id,trip_id,article_weight_in_g
0,18,2024-12-17 09:00:00+01:00,False,0.0,True,True,806432,166859,2024-12-17 10:09:30.067000+01:00,2024-12-17 10:27:00.030000+01:00,17.500000,2024-12-17 09:00:00+01:00,9045,133625,164750.0
1,8,2024-12-10 09:45:00+01:00,False,0.0,True,True,678738,167463,2024-12-10 09:26:00.077000+01:00,2024-12-10 09:35:30.100000+01:00,9.500000,2024-12-10 09:45:00+01:00,9480,111163,148400.0
3,13,2024-11-08 10:00:00+01:00,False,0.0,True,True,110643,172552,2024-11-08 12:00:04.260000+01:00,2024-11-08 12:42:00.157000+01:00,41.933333,2024-11-08 10:00:00+01:00,4661,19021,171300.0
5,9,2025-01-24 11:31:48.413000+01:00,False,0.0,False,True,1463699,193987,2025-01-24 13:35:19+01:00,2025-01-24 13:58:59+01:00,23.666666,2025-01-24 11:31:48.413000+01:00,9591,237536,111914.0
6,8,2025-01-22 10:15:00+01:00,False,0.0,True,True,1411462,193954,2025-01-22 10:53:12.747000+01:00,2025-01-22 11:00:00.057000+01:00,6.800000,2025-01-22 10:15:00+01:00,362,230239,4404.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534684,4,2024-11-27 15:45:00+01:00,False,0.0,True,True,460582,433427,2024-11-27 17:21:13+01:00,2024-11-27 17:36:58+01:00,15.750000,2024-11-27 15:45:00+01:00,3793,74157,260501.0
1534685,6,2025-01-20 09:11:40.430000+01:00,False,0.0,False,True,1382719,404857,2025-01-20 10:58:21+01:00,2025-01-20 11:21:11+01:00,22.833333,2025-01-20 09:11:40.430000+01:00,5245,223705,176760.0
1534686,6,2024-11-26 08:45:00+01:00,False,0.0,True,True,419104,404857,2024-11-26 10:18:30.110000+01:00,2024-11-26 10:24:00.090000+01:00,5.500000,2024-11-26 08:45:00+01:00,1997,68794,149117.0
1534687,6,2024-12-02 13:15:00+01:00,False,0.0,True,True,542528,393199,2024-12-02 14:24:34.453000+01:00,2024-12-02 14:36:00.097000+01:00,11.433333,2024-12-02 13:15:00+01:00,5357,87211,232700.0


In [8]:
# Preprocessing - convert service-time-start to time of day ordinal buckets
df["service_time_start"] = pd.to_datetime(df["service_time_start"])
df["service_time_start"] = df["service_time_start"].dt.hour

In [9]:
# Preprocessing - train test split
df_train = df.sample(frac=0.8, random_state=0)
df_test = df.drop(df_train.index)

In [10]:
# Preprocessing - add num_previous_orders_customer column
num_previous_orders_customer = df_train["customer_id"].value_counts().to_dict()
df_train["num_previous_orders_customer"] = df_train["customer_id"].map(num_previous_orders_customer).fillna(0)
df_test["num_previous_orders_customer"] = df_test["customer_id"].map(num_previous_orders_customer).fillna(0)

In [11]:
# Preprocessing - create buckets categorizing the customers into slow and fast customer (10 buckets ordinal scale) based on the average service time
# ONLY IF THERE ARE MORE THAN 5 ORDERS PER CUSTOMER
# customer_avg_service_time = df_train[["customer_id", "service_time_in_minutes"]].groupby("customer_id").mean()
# customer_avg_service_time["customer_speed"] = pd.cut(customer_avg_service_time["service_time_in_minutes"], bins=9, labels=False)
# df_train["customer_speed"] = df_train["customer_id"].map(customer_avg_service_time["customer_speed"]).fillna(4)
# df_test["customer_speed"] = df_test["customer_id"].map(customer_avg_service_time["customer_speed"]).fillna(4)

# Count the number of orders per customer
customer_order_counts = df_train["customer_id"].value_counts()
# Filter customers with more than 5 orders
customers_with_more_than_5_orders = customer_order_counts[customer_order_counts > 5].index
# Calculate the average service time for these customers
customer_avg_service_time = df_train[df_train["customer_id"].isin(customers_with_more_than_5_orders)][["customer_id", "service_time_in_minutes"]].groupby("customer_id").mean()

customer_avg_service_time["customer_speed"] = pd.cut(customer_avg_service_time["service_time_in_minutes"], bins=9, labels=False)
df_train["customer_speed"] = df_train["customer_id"].map(customer_avg_service_time["customer_speed"]).fillna(4)
df_test["customer_speed"] = df_test["customer_id"].map(customer_avg_service_time["customer_speed"]).fillna(4)

# Linear Regression

In [12]:
# Get all column names for one hot encoding
warehouse_id_cols = df.columns[df.columns.str.contains("warehouse_id")]
driver_id_cols = df.columns[df.columns.str.contains("driver_id")]

In [13]:
target = "service_time_in_minutes"
features = ["article_weight_in_g", "is_business", "is_pre_order", "has_elevator", "floor", "num_previous_orders_customer", "customer_speed"]
# features.extend(warehouse_id_cols)
# features.extend(driver_id_cols)

X = df_train[features].astype(float)
y = df_train[target].astype(float)

model = LinearRegression()
model.fit(X, y)

In [14]:
model.coef_

array([ 4.93907065e-05,  7.65854166e-01, -2.37650826e-01,  2.29948927e+00,
        8.13195329e-04,  7.64430103e-02,  2.88083640e-01])

In [15]:
model.intercept_

3.84748775873282

In [16]:
# Prediction
X_test = df_test[features].astype(float)
y_test = df_test[target].astype(float)

y_pred = model.predict(X_test)

In [17]:
# Evaluation
print(f"MSE = {mean_squared_error(y_test, y_pred)}")
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"R2 = {model.score(X_test, y_test)}")

MSE = 28.448773375136547
MAE = 3.6431403083170877
R2 = 0.2631636431805826


# GRANDE

In [42]:
params = {
        'depth': 5, # tree depth
        'n_estimators': 64, # number of estimators / trees

        'learning_rate_weights': 0.005, # learning rate for leaf weights
        'learning_rate_index': 0.01, # learning rate for split indices
        'learning_rate_values': 0.01, # learning rate for split values
        'learning_rate_leaf': 0.01, # learning rate for leafs (logits)

        'optimizer': 'adam', # optimizer
        'cosine_decay_steps': 0, # decay steps for lr schedule (CosineDecayRestarts)

        'loss': 'mse', # loss function (default 'crossentropy' for binary & multi-class classification and 'mse' for regression)
        'focal_loss': False, # use focal loss {True, False}
        'temperature': 0.0, # temperature for stochastic re-weighted GD (0.0, 1.0)

        'from_logits': True, # use logits for weighting {True, False}
        'use_class_weights': True, # use class weights for training {True, False}

        'dropout': 0.0, # dropout rate (here, dropout randomly disables individual estimators of the ensemble during training)

        'selected_variables': 0.5, # feature subset percentage (0.0, 1.0)
        'data_subset_fraction': 0.1, # data subset percentage (0.0, 1.0)
}

args = {
    'epochs': 10, # number of epochs for training
    'early_stopping_epochs': 25, # patience for early stopping (best weights are restored)
    'batch_size': 128,  # batch size for training

    'cat_idx': [], # put list of categorical indices
    'objective': 'regression', # objective / task {'binary', 'classification', 'regression'}
    
    'random_seed': 42,
    'verbose': 1,       
}

In [43]:
model_grande = GRANDE(params=params, args=args)

In [None]:
target = "service_time_in_minutes"
features = ["article_weight_in_g", "is_business", "is_pre_order", "has_elevator", "floor", "customer_speed"]
# features.extend(warehouse_id_cols)
# features.extend(driver_id_cols)

X = df_train[features].astype(float)
y = df_train[target].astype(float)

# Split data into training and validation
X_train = X.sample(frac=0.8, random_state=0)
X_val = X.drop(X_train.index)
y_train = y[X_train.index]
y_val = y[X_val.index]

model_grande.fit(X_train, y_train, X_val, y_val, )

Epoch 1/10
[1m  12/7052[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:55[0m 16ms/step - loss: 1.1739

In [None]:
# Prediction
y_pred = model.predict(X_test)

In [None]:
# Evaluation
print(f"MSE = {mean_squared_error(y_test, y_pred)}")
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"R2 = {model.score(X_test, y_test)}")