In [1]:
import pandas as pd

df = pd.read_csv('./data/restaurant_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,multiple_deliveries,City,distance,time_diff_minutes,Time_taken (min)
0,0,36.0,4.2,Fog,Jam,2,Snack,3.0,Metropolitian,10.283809,15,46
1,1,21.0,4.7,Stormy,High,1,Meal,1.0,Metropolitian,6.244278,10,23
2,2,23.0,4.7,Sandstorms,Medium,1,Drinks,1.0,Metropolitian,13.792189,10,21
3,3,34.0,4.3,Sandstorms,Low,0,Buffet,0.0,Metropolitian,2.931178,10,20
4,4,24.0,4.7,Fog,Jam,1,Snack,1.0,Metropolitian,19.402707,15,41


In [2]:
# df.drop(['Type_of_order','Vehicle_condition','time_diff_minutes'])
df.drop(columns=['Type_of_order','time_diff_minutes'], inplace=True)

In [3]:
## Independent and dependent features
X = df.drop(labels=['Time_taken (min)'],axis=1)
Y = df[['Time_taken (min)']]

In [4]:
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
45001,32
45002,36
45003,16
45004,26


In [5]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [6]:
numerical_cols

Index(['Unnamed: 0', 'Delivery_person_Age', 'Delivery_person_Ratings',
       'Vehicle_condition', 'multiple_deliveries', 'distance'],
      dtype='object')

In [7]:
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'City'], dtype='object')

In [8]:
numerical_cols

Index(['Unnamed: 0', 'Delivery_person_Age', 'Delivery_person_Ratings',
       'Vehicle_condition', 'multiple_deliveries', 'distance'],
      dtype='object')

In [9]:
for i in categorical_cols:
    print(df[i].unique())

['Fog' 'Stormy' 'Sandstorms' 'Windy' 'Cloudy' 'Sunny']
['Jam' 'High' 'Medium' 'Low']
['Metropolitian' 'Urban' 'Semi-Urban']


In [10]:
# Define the custom ranking for each ordinal variable
Weather_conditions_categories = ['Fog','Stormy','Sandstorms','Windy','Cloudy','Sunny']
Road_traffic_density_categories = ['Low', 'Medium', 'High', 'Jam']
City_categories = ['Semi-Urban','Urban','Metropolitian']


In [11]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Weather_conditions_categories,Road_traffic_density_categories,City_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer(transformers=[
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [13]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [14]:
X_train.shape, X_test.shape

((31504, 9), (13502, 9))

In [15]:
y_train.shape, y_test.shape

((31504, 1), (13502, 1))

In [16]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns= preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns= preprocessor.get_feature_names_out())

In [17]:
X_train.head()

Unnamed: 0,num_pipeline__Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__distance,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__City
0,1.728934,-0.272594,0.829496,-0.016117,0.470023,-0.908856,-0.264475,1.306565,0.532706
1,-0.606794,-1.151894,0.829496,-1.219351,-1.26857,0.748897,0.319327,-1.09907,0.532706
2,1.11323,-0.976034,-0.107749,-0.016117,-1.26857,0.156454,1.486932,-0.297192,0.532706
3,0.824813,0.958426,1.141911,1.187116,0.470023,-0.376782,-0.264475,1.306565,0.532706
4,1.066388,0.782566,-0.107749,-1.219351,2.208615,0.444162,-0.848278,1.306565,-1.803696


In [18]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [19]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [20]:
regression.coef_

array([[-0.03640257,  2.3323797 , -2.20260893, -1.95192404,  2.11143164,
         1.88396747, -1.1493882 ,  2.97606395,  0.66793806]])

In [21]:
regression.intercept_

array([26.30072372])

In [22]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [23]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.546913542893007
MAE: 5.206277531902437
R2 score 51.36354263932513


Lasso
Model Training Performance
RMSE: 6.985280153532742
MAE: 5.581621673123432
R2 score 44.63231318075115


Ridge
Model Training Performance
RMSE: 6.546913130523226
MAE: 5.20627574175965
R2 score 51.36354876624418


Elasticnet
Model Training Performance
RMSE: 7.058972280521391
MAE: 5.651806803335893
R2 score 43.45793373636047


