In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mutual_info_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report,mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold 
from scipy.stats import randint, uniform
import mord as m
from catboost import CatBoostClassifier, CatBoostRegressor
import pickle
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy.stats import randint, uniform


### Dataset: https://www.kaggle.com/datasets/ahsan81/food-ordering-and-delivery-app-dataset/data

In [2]:
df = pd.read_csv("../data/food_order.csv")

In [3]:
df

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week,rating,food_preparation_time,delivery_time
0,1477147,337525,Hangawi,Korean,30.75,Weekend,Not given,25,20
1,1477685,358141,Blue Ribbon Sushi Izakaya,Japanese,12.08,Weekend,Not given,25,23
2,1477070,66393,Cafe Habana,Mexican,12.23,Weekday,5,23,28
3,1477334,106968,Blue Ribbon Fried Chicken,American,29.20,Weekend,3,25,15
4,1478249,76942,Dirty Bird to Go,American,11.59,Weekday,4,25,24
...,...,...,...,...,...,...,...,...,...
1893,1476701,292602,Chipotle Mexican Grill $1.99 Delivery,Mexican,22.31,Weekend,5,31,17
1894,1477421,397537,The Smile,American,12.18,Weekend,5,31,19
1895,1477819,35309,Blue Ribbon Sushi,Japanese,25.22,Weekday,Not given,31,24
1896,1477513,64151,Jack's Wife Freda,Mediterranean,12.18,Weekday,5,23,31


### As stated in the notebook in EDA, we drop rating column because it has many missing values, and it does not have any statistical correlation or mutual information with the food_preparation_time. Only ratings had missing values.

In [4]:
df = df.drop(["rating"], axis=1)

In [5]:
df

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week,food_preparation_time,delivery_time
0,1477147,337525,Hangawi,Korean,30.75,Weekend,25,20
1,1477685,358141,Blue Ribbon Sushi Izakaya,Japanese,12.08,Weekend,25,23
2,1477070,66393,Cafe Habana,Mexican,12.23,Weekday,23,28
3,1477334,106968,Blue Ribbon Fried Chicken,American,29.20,Weekend,25,15
4,1478249,76942,Dirty Bird to Go,American,11.59,Weekday,25,24
...,...,...,...,...,...,...,...,...
1893,1476701,292602,Chipotle Mexican Grill $1.99 Delivery,Mexican,22.31,Weekend,31,17
1894,1477421,397537,The Smile,American,12.18,Weekend,31,19
1895,1477819,35309,Blue Ribbon Sushi,Japanese,25.22,Weekday,31,24
1896,1477513,64151,Jack's Wife Freda,Mediterranean,12.18,Weekday,23,31


In [6]:
categorical_cols =list(df.dtypes[df.dtypes == 'object'].index)
categorical_cols

['restaurant_name', 'cuisine_type', 'day_of_the_week']

In [7]:
categorical_cols

['restaurant_name', 'cuisine_type', 'day_of_the_week']

In [8]:
hashed_cols = ["order_id", "customer_id"]

In [9]:
numeric_cols = ["cost_of_the_order"]

### I dropped "delivery time" as this is post food processing time, and ideally has no information about the food processing time.

In [10]:
df["food_preparation_time"].value_counts()

food_preparation_time
21    135
23    123
27    123
22    123
28    121
24    121
20    119
30    119
33    118
35    117
31    116
26    115
25    113
34    113
32    113
29    109
Name: count, dtype: int64

### We see that the food preparation time is not really all continuous. It is sort of categorical with multiple values for different times/

### We also check if there is any signal at all in the data that will help to predict food_preparation time

In [12]:
#For categorical feature to categorical target
mutual_info = {}

for column in categorical_cols:
    mi = mutual_info_score(df[column], df["food_preparation_time"])
    mutual_info[column] = mi
    # mi = mutual_info_regression([df[column]], df["food_preparation_time"], random_state=0)
mutual_info   

{'restaurant_name': 0.5316188675170863,
 'cuisine_type': 0.06007066623943955,
 'day_of_the_week': 0.0024528753374482987}

In [13]:
#For categorical feature to numerical target
mutual_info = {}

for column in categorical_cols:
    X = pd.get_dummies(df[column])
    y = df["food_preparation_time"]
    mi = mutual_info_regression(X, y)
    mutual_info[column] = mi.mean()
    # mi = mutual_info_regression([df[column]], df["food_preparation_time"], random_state=0)
mutual_info  

{'restaurant_name': 0.005086319990385343,
 'cuisine_type': 0.002048572136738306,
 'day_of_the_week': 0.01504816979143353}

### As we see, when the target is categorical, we tend to have more mutual information for restaurant name, indicating that some restaurants have consistent patterns with processing time. This means it will be more beneficial for modeling to treat this task as a classification task. Surprisingly, day of the week and cuisine types have weak mutual information with food processing time.

### Split data

In [14]:
df

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week,food_preparation_time,delivery_time
0,1477147,337525,Hangawi,Korean,30.75,Weekend,25,20
1,1477685,358141,Blue Ribbon Sushi Izakaya,Japanese,12.08,Weekend,25,23
2,1477070,66393,Cafe Habana,Mexican,12.23,Weekday,23,28
3,1477334,106968,Blue Ribbon Fried Chicken,American,29.20,Weekend,25,15
4,1478249,76942,Dirty Bird to Go,American,11.59,Weekday,25,24
...,...,...,...,...,...,...,...,...
1893,1476701,292602,Chipotle Mexican Grill $1.99 Delivery,Mexican,22.31,Weekend,31,17
1894,1477421,397537,The Smile,American,12.18,Weekend,31,19
1895,1477819,35309,Blue Ribbon Sushi,Japanese,25.22,Weekday,31,24
1896,1477513,64151,Jack's Wife Freda,Mediterranean,12.18,Weekday,23,31


In [15]:
#Convert food preparation_time to categorical
bins = [0, 24, 29, 35]  
labels = ['short', 'medium', 'long']

# Create new categorical column
df['prep_time_class'] = pd.cut(
    df['food_preparation_time'],
    bins=bins,
    labels=labels,
    include_lowest=True
)

# Check class balance
print(df['prep_time_class'].value_counts())

prep_time_class
long      696
short     621
medium    581
Name: count, dtype: int64


In [16]:
df

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week,food_preparation_time,delivery_time,prep_time_class
0,1477147,337525,Hangawi,Korean,30.75,Weekend,25,20,medium
1,1477685,358141,Blue Ribbon Sushi Izakaya,Japanese,12.08,Weekend,25,23,medium
2,1477070,66393,Cafe Habana,Mexican,12.23,Weekday,23,28,short
3,1477334,106968,Blue Ribbon Fried Chicken,American,29.20,Weekend,25,15,medium
4,1478249,76942,Dirty Bird to Go,American,11.59,Weekday,25,24,medium
...,...,...,...,...,...,...,...,...,...
1893,1476701,292602,Chipotle Mexican Grill $1.99 Delivery,Mexican,22.31,Weekend,31,17,long
1894,1477421,397537,The Smile,American,12.18,Weekend,31,19,long
1895,1477819,35309,Blue Ribbon Sushi,Japanese,25.22,Weekday,31,24,long
1896,1477513,64151,Jack's Wife Freda,Mediterranean,12.18,Weekday,23,31,short


### Now we split the data. Remember to split the data before any preprocessing or fitting the model to avoid data leakage. We don't want our model to have any glance on our test dataset.

In [17]:
X = df.drop(["delivery_time", "food_preparation_time","prep_time_class" ], axis=1)
y = df["prep_time_class"]
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, stratify=y_full_train, test_size=0.25, random_state=42)

In [18]:
X_train

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week
437,1477154,143796,S'MAC,American,5.92,Weekday
1183,1477175,47235,Blue Ribbon Fried Chicken,American,14.60,Weekend
1730,1477096,158578,Xi'an Famous Foods,Chinese,14.12,Weekday
664,1476909,366975,Blue Ribbon Sushi Bar & Grill,Japanese,4.85,Weekend
782,1477684,39275,TAO,Japanese,15.28,Weekend
...,...,...,...,...,...,...
959,1477097,132906,Sushi of Gari 46,Japanese,13.05,Weekday
9,1477311,39705,Bukhara Grill,Indian,7.18,Weekday
737,1478280,91722,Shake Shack,American,19.40,Weekend
1565,1478144,155124,The Meatball Shop,Italian,33.03,Weekend


In [20]:
X_test;

In [21]:
X_val;

In [22]:
y_train;

In [23]:
# helper: turn selected columns into list of dicts for FeatureHasher
def to_dict_rows(X):
    # X is a DataFrame with the hashed_cols subset
    return X.astype(str).to_dict(orient='records')

hashed_pipe = Pipeline([
    ("to_dict", FunctionTransformer(to_dict_rows, feature_names_out="one-to-one")),
    ("hasher", FeatureHasher(n_features=2**18, input_type="dict"))  # adjust n_features as needed
])

cat_pipe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

num_pipe = StandardScaler(with_mean=False)  # keep False because we may end up with sparse matrices

preproc = ColumnTransformer(
    transformers=[
        ("hash", hashed_pipe, hashed_cols),
        ("cat", cat_pipe, categorical_cols),
        ("num", num_pipe, numeric_cols),
    ],
    remainder="drop",    # or "passthrough" if you want to keep other columns
    sparse_threshold=1.0 # keep result sparse (good with hashing + OHE)
)

model = LogisticRegression(max_iter=1000)

pipeline = Pipeline([
    ("preproc", preproc),
    ("model", model)
])


In [24]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [25]:
y_pred

array(['long', 'short', 'short', 'medium', 'long', 'long', 'long',
       'medium', 'short', 'long', 'long', 'long', 'long', 'long', 'short',
       'short', 'medium', 'short', 'long', 'medium', 'long', 'medium',
       'short', 'short', 'short', 'long', 'short', 'long', 'medium',
       'long', 'short', 'short', 'medium', 'short', 'short', 'long',
       'short', 'long', 'short', 'long', 'long', 'long', 'long', 'long',
       'short', 'medium', 'short', 'medium', 'long', 'short', 'long',
       'short', 'long', 'long', 'long', 'short', 'long', 'long', 'short',
       'short', 'long', 'short', 'long', 'medium', 'long', 'long',
       'short', 'long', 'short', 'long', 'long', 'short', 'long', 'short',
       'long', 'long', 'medium', 'medium', 'short', 'medium', 'short',
       'long', 'medium', 'long', 'long', 'long', 'short', 'short',
       'short', 'short', 'long', 'long', 'long', 'long', 'medium', 'long',
       'long', 'short', 'long', 'long', 'long', 'long', 'medium', 'short',
  

In [26]:
y_test

1370     short
1545      long
127     medium
1048      long
520     medium
         ...  
1019      long
619       long
0       medium
755     medium
1148    medium
Name: prep_time_class, Length: 380, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

### Evaluation

In [27]:
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.334


#### We see that the model is not so good, and works just as good as a random predictor. Let's try a tree-based classifiers

In [38]:
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss"
)

pipeline = Pipeline([
    ("preproc", preproc),
    ("model", model)
])

le = LabelEncoder()

y_train_enc = le.fit_transform(y_train)   # -> [0,1,2]
y_test_enc  = le.transform(y_test)

pipeline.fit(X_train, y_train_enc)
y_pred = pipeline.predict(X_test)

acc = accuracy_score(y_test_enc, y_pred)
print(f"XGBosst Accuracy: {acc:.3f}")

XGBosst Accuracy: 0.337


In [39]:
model = RandomForestClassifier()

pipeline = Pipeline([
    ("preproc", preproc),
    ("model", model)
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {acc:.3f}")


Random Forest Accuracy: 0.363


#### This is a little better, but still not so great. The class label we have here is long, short and medium, which does not make sense much to the user. Let's see if we can get back our prediction in minutes. First, we split on food_preparation_time

In [30]:
X = df.drop(["delivery_time", "food_preparation_time","prep_time_class" ], axis=1)
y = df["food_preparation_time"]
X_full_train, X_test_num, y_full_train, y_test_num = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train_num, X_val_num, y_train_num, y_val_num = train_test_split(X_full_train, y_full_train, stratify=y_full_train, test_size=0.25, random_state=42)

In [31]:
y_pred_labels = pipeline.predict(X_test)                 # predicted class labels (e.g., 20..35)
proba          = pipeline.predict_proba(X_test)          # shape: (n_samples, K)
classes        = pipeline.named_steps["model"].classes_  # the label set seen by the model

# 2) Map each label to its interval [low, high)
label_to_interval = {lab: (bins[i], bins[i+1]) for i, lab in enumerate(labels)}

# Safety check to ensure all predicted labels exist in our mapping
unknown = sorted(set(y_pred_labels) - set(labels))
if unknown:
    raise ValueError(f"Predicted labels not in your labels list: {unknown}")

lows  = np.array([label_to_interval[lab][0] for lab in y_pred_labels], dtype=float)
highs = np.array([label_to_interval[lab][1] for lab in y_pred_labels], dtype=float)

# 3) Midpoint of predicted bin
mid = 0.5 * (lows + highs)

y_pred_point = mid             # midpoint as the prediction
# print(y_pred_point)

mae = mean_absolute_error(y_test_num, y_pred_point)
print(f"MAE: {mae:.3f}")



MAE: 7.350


In [33]:
X_train_num;

In [35]:
y_test_num;

#### What if we consider the task as a regression task instead of classification, what MAE will we get?

In [58]:
model = RandomForestRegressor()

pipeline = Pipeline([
    ("preproc", preproc),
    ("model", model)
])

pipeline.fit(X_train_num, y_train_num)
y_pred = pipeline.predict(X_test_num)


mae = mean_absolute_error(y_test_num, y_pred)
print(f"MAE: {mae:.3f}")


MAE: 4.080


In [60]:
y_pred;

### It seems handling it as a regression model gives a better performance which is within less than 5 minutes of the actual preparation time. So we proceed with regression.

### Hyperparameter Tuning

In [61]:
model = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline = Pipeline([
    ("preproc", preproc),
    ("model", model)
])

# --- search space (small & fast) ---
param_distributions = {
    "model__n_estimators": randint(200, 800),
    "model__max_depth": randint(5, 30),
    "model__min_samples_split": randint(2, 20),
}

cv = KFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=25,                        
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    refit=True                       
)

search.fit(X_train_num, y_train_num)

print("Best CV score (−MAE):", -search.best_score_)
print("Best params:\n", search.best_params_)

# Evaluate on hold-out test
y_pred = search.predict(X_test_num)
mae = mean_absolute_error(y_test_num, y_pred)
print(f"Test MAE: {mae:.3f}")

# Access the tuned pipeline
best_pipeline = search.best_estimator_


Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best CV score (−MAE): 4.054436510474862
Best params:
 {'model__max_depth': 8, 'model__min_samples_split': 15, 'model__n_estimators': 441}
Test MAE: 3.998


In [62]:
best_pipeline

0,1,2
,steps,"[('preproc', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('hash', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,1.0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function to_...t 0x156bdb7e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,n_features,262144
,input_type,'dict'
,dtype,<class 'numpy.float64'>
,alternate_sign,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,n_estimators,441
,criterion,'squared_error'
,max_depth,8
,min_samples_split,15
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### After hyperparameter tuning, I was able to get an MAE of less than 4 seconds

### Saving and loading the pipeline

In [64]:
# save
with open("prep_time_pipeline.pkl", "wb") as f:
    pickle.dump(best_pipeline, f)


In [65]:
# load
with open("prep_time_pipeline.pkl", "rb") as f:
    pipeline = pickle.load(f)

In [66]:
X_test.iloc[0]

order_id                 1478329
customer_id               116992
restaurant_name      Tres Carnes
cuisine_type             Mexican
cost_of_the_order          33.32
day_of_the_week          Weekday
Name: 1370, dtype: object

In [67]:
datapoint= {"order_id" :  1478329,
            "customer_id"  : 116992,
            "restaurant_name" : "Tres Carnes",
            "cuisine_type" : "Mexican",
            "cost_of_the_order" : 33.32,
            "day_of_the_week" : "Weekday"}

In [68]:
y_pred = pipeline.predict(pd.DataFrame([datapoint]))
y_pred

array([27.1015445])

In [69]:
y_test_num.iloc[0]

28

#### As we see, this is about 1 minute within the preparation time. This is good