## Delivery Time Prediction

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Step 1: Gathering the data

In [2]:
import pandas as pd 
path = r"G:\Machine Learning\Projects\Dataset for project.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,22:24:17,23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,21:49:25,22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,20:39:28,21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,21:21:45,22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,02:40:36,03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


## Step 2: Data Understanding and basic data quality checks

In [3]:
df.shape

(197428, 16)

In [4]:
df.columns

Index(['market_id', 'created_at', 'actual_delivery_time', 'store_id',
       'store_primary_category', 'order_protocol', 'total_items', 'subtotal',
       'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_dashers', 'total_busy_dashers',
       'total_outstanding_orders', 'estimated_order_place_duration',
       'estimated_store_to_consumer_driving_duration'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197428 entries, 0 to 197427
Data columns (total 16 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   market_id                                     196441 non-null  float64
 1   created_at                                    197428 non-null  object 
 2   actual_delivery_time                          197421 non-null  object 
 3   store_id                                      197428 non-null  int64  
 4   store_primary_category                        192668 non-null  object 
 5   order_protocol                                196433 non-null  float64
 6   total_items                                   197428 non-null  int64  
 7   subtotal                                      197428 non-null  int64  
 8   num_distinct_items                            197428 non-null  int64  
 9   min_item_price                                19

In [6]:
## Missing values
missing = df.isna().sum()
missing[missing>0]

market_id                                         987
actual_delivery_time                                7
store_primary_category                           4760
order_protocol                                    995
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_store_to_consumer_driving_duration      526
dtype: int64

In [7]:
df.nunique()

market_id                                           6
created_at                                      46077
actual_delivery_time                            46088
store_id                                         6743
store_primary_category                             74
order_protocol                                      7
total_items                                        57
subtotal                                         8368
num_distinct_items                                 20
min_item_price                                   2312
max_item_price                                   2652
total_onshift_dashers                             172
total_busy_dashers                                159
total_outstanding_orders                          281
estimated_order_place_duration                     98
estimated_store_to_consumer_driving_duration     1336
dtype: int64

In [8]:
## Check the duplicates
df.duplicated().sum()

np.int64(0)

In [9]:
## Drop the dupolicates
df = df.drop_duplicates()

#### There are no duplicates rows now, but in the future, duplicate rows might appear, so we will need to drop them.

## Change The Datatype

In [10]:
df ['created_at']=pd.to_datetime(df ['created_at'])
df ['actual_delivery_time']=pd.to_datetime(df ['actual_delivery_time'])

#### Create a new column

In [11]:
df['delivery_duration_seconds']=(df['actual_delivery_time']-df['created_at']).dt.total_seconds()

#### Separate X and Y features
    Y: total_delivery_time
    X: all remaining features 

In [12]:
X = df.drop(columns=['delivery_duration_seconds','created_at','actual_delivery_time'])
Y = df[['delivery_duration_seconds']]

In [13]:
df['delivery_duration_seconds'].isna().sum()

np.int64(7)

In [14]:
X.head()

Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


In [15]:
Y.head()

Unnamed: 0,delivery_duration_seconds
0,3779.0
1,4024.0
2,1781.0
3,3075.0
4,2390.0


## Step 3: Data Preprocessing and Data Cleaning

In [16]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes == 'int64']) or (X.columns[X.dtypes=='float64'])

In [17]:
con

['store_id',
 'total_items',
 'subtotal',
 'num_distinct_items',
 'min_item_price',
 'max_item_price',
 'estimated_order_place_duration']

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [19]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  
OneHotEncoder(handle_unknown='ignore',sparse_output=False) # feature scaling
#OrdinalEncoder()
)

In [20]:
con_pipe = make_pipeline(
    SimpleImputer(strategy="mean"), # data cleaning
    StandardScaler()
)

### Handle using Forward fill

In [21]:

Y['delivery_duration_seconds']=Y['delivery_duration_seconds'].fillna(method='ffill')

In [22]:
pre= ColumnTransformer([
    ("cat",cat_pipe,cat),
    ("con",con_pipe,con)
]).set_output(transform='pandas')

In [23]:
pre

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [24]:
X_pre= pre.fit_transform (X)

In [25]:
new_cols =[]
for i in X_pre:
    c = i.split('__')[-1]
    new_cols.append(c)
print(new_cols)

['store_primary_category_afghan', 'store_primary_category_african', 'store_primary_category_alcohol', 'store_primary_category_alcohol-plus-food', 'store_primary_category_american', 'store_primary_category_argentine', 'store_primary_category_asian', 'store_primary_category_barbecue', 'store_primary_category_belgian', 'store_primary_category_brazilian', 'store_primary_category_breakfast', 'store_primary_category_british', 'store_primary_category_bubble-tea', 'store_primary_category_burger', 'store_primary_category_burmese', 'store_primary_category_cafe', 'store_primary_category_cajun', 'store_primary_category_caribbean', 'store_primary_category_catering', 'store_primary_category_cheese', 'store_primary_category_chinese', 'store_primary_category_chocolate', 'store_primary_category_comfort-food', 'store_primary_category_convenience-store', 'store_primary_category_dessert', 'store_primary_category_dim-sum', 'store_primary_category_ethiopian', 'store_primary_category_european', 'store_primar

In [26]:
X_pre.columns = new_cols
X_pre

Unnamed: 0,store_primary_category_afghan,store_primary_category_african,store_primary_category_alcohol,store_primary_category_alcohol-plus-food,store_primary_category_american,store_primary_category_argentine,store_primary_category_asian,store_primary_category_barbecue,store_primary_category_belgian,store_primary_category_brazilian,...,store_primary_category_vegan,store_primary_category_vegetarian,store_primary_category_vietnamese,store_id,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,estimated_order_place_duration
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.820802,0.301368,0.416145,0.815340,-0.247527,0.142210,1.524747
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.947893,-0.823686,-0.429124,-1.024867,1.367300,0.430528,1.524747
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.947893,-0.823686,-0.429124,-1.024867,2.325086,1.325928,1.524747
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.947893,1.051404,2.313474,1.428742,-0.165158,1.146848,1.524747
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.947893,-0.073650,0.667915,0.201937,0.792628,0.788688,1.524747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.279772,-0.073650,-0.709418,0.201937,-0.653628,-0.914362,-0.638568
197424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.279772,1.051404,0.179733,0.815340,-0.538694,-0.599181,-0.638568
197425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.279772,0.676386,-0.464229,0.201937,-0.739829,-1.362061,-0.638568
197426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.048449,-0.823686,-0.826801,-1.024867,-0.289670,-1.118513,1.524747


## Train Test Split

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.7,test_size=0.3,random_state=21)

In [29]:
xtrain.head()

Unnamed: 0,store_primary_category_afghan,store_primary_category_african,store_primary_category_alcohol,store_primary_category_alcohol-plus-food,store_primary_category_american,store_primary_category_argentine,store_primary_category_asian,store_primary_category_barbecue,store_primary_category_belgian,store_primary_category_brazilian,...,store_primary_category_vegan,store_primary_category_vegetarian,store_primary_category_vietnamese,store_id,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,estimated_order_place_duration
191851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.194542,-0.07365,0.220872,0.201937,0.399936,-0.115665,1.524747
128175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.633804,-0.823686,-0.86026,-1.024867,0.265846,-0.599181,-0.638568
150540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.999512,1.051404,0.101843,0.81534,-1.01184,0.555884,-0.638568
80906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.359874,-0.448668,-0.407183,-0.411465,0.495715,-0.294745,-0.638568
137931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.333577,0.301368,0.741965,0.81534,-0.260936,0.958814,-0.638568


In [30]:
xtest.head()

Unnamed: 0,store_primary_category_afghan,store_primary_category_african,store_primary_category_alcohol,store_primary_category_alcohol-plus-food,store_primary_category_american,store_primary_category_argentine,store_primary_category_asian,store_primary_category_barbecue,store_primary_category_belgian,store_primary_category_brazilian,...,store_primary_category_vegan,store_primary_category_vegetarian,store_primary_category_vietnamese,store_id,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,estimated_order_place_duration
67513,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.023384,-0.07365,-0.113177,0.201937,-0.167073,-0.669022,-0.638568
178218,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.052842,-0.823686,-0.587646,-1.024867,0.706428,-0.121038,1.524747
61453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.271254,-0.07365,0.914749,0.201937,1.079964,0.788688,-0.638568
22816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.49575,1.051404,1.764954,0.201937,-0.165158,0.609608,-0.638568
39264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.619919,-0.448668,1.984362,-0.411465,3.474429,4.728446,1.524747


In [31]:
ytrain.head()

Unnamed: 0,delivery_duration_seconds
191851,2128.0
128175,-84301.0
150540,2200.0
80906,2298.0
137931,1600.0


In [32]:
ytest.head()

Unnamed: 0,delivery_duration_seconds
67513,2064.0
178218,1976.0
61453,5230.0
22816,3801.0
39264,3499.0


## Final model building

In [33]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [34]:
model.intercept_

array([696.63631723])

In [35]:
model.coef_

array([[ 2301.84013043,  2443.57717514,  -408.82557834,  2546.29702844,
         -466.70674137, -1690.83959382, -1264.25872809, -1186.68251725,
         2560.77005141,  -654.02329187,   129.56591035,   281.50986418,
        -5014.85626932, -1458.13731709,  1694.3171936 , -1524.43140124,
         -980.04678574,  1720.70604302,  -238.35016066, -3578.34711227,
         -415.03394544,  1053.47029573, -1673.19174723, -1542.47000429,
         -743.37992278,   143.0438241 ,  -579.46872475, -3900.8121224 ,
         -872.14909427,  -620.75902545,  -857.27619159,   326.9686261 ,
         1775.56647792,   227.61344097,   592.4245532 ,  -696.87089519,
         1098.37512839,  1640.72782647,  1930.73371814,    84.30528517,
           72.42784242,   471.22490853,  1824.60075648,  -302.26517567,
         1561.152005  ,  2284.03045446,  -636.95497623,  -505.50090252,
          468.27722639,  1508.30672074,   743.2474803 , -1180.80823348,
         2219.45495951,  -222.85251365, -2513.67417796,  -666.33

In [36]:

ypreds = model.predict(xtrain)
ypreds

array([[1815.16007661],
       [-988.6109207 ],
       [-603.40368615],
       ...,
       [ 396.95684359],
       [ 940.9038521 ],
       [2350.69216498]], shape=(138199, 1))

In [37]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [38]:
mse = mean_squared_error(ytrain,ypreds)
mae = mean_absolute_error(ytrain,ypreds)
rmse = mse**(1/2)
r2 = r2_score(ytrain,ypreds)

print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2*100:.2f}%")

MSE: 216210759.98
MAE: 5004.56
RMSE: 14704.11
R2 score: 0.47%
