In [43]:
## Importing necessary libraries ##
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
# Location of the raw training CSV (absolute path inside the workspace container).
TRAIN_CSV_PATH = "/config/workspace/notebooks/data/finalTrain.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [45]:
data.head(3)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21


In [46]:
# Remove the identifier columns and the raw order/pick-up time columns in a
# single call instead of four separate copies of the frame.
data = data.drop(
    columns=["ID", "Delivery_person_ID", "Time_Orderd", "Time_Order_picked"]
)

In [47]:
data.head(3)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21


# Segregating Dependent and Independent Features

In [48]:
# Give the target column an identifier-friendly name (no space / parentheses).
data = data.rename(columns={"Time_taken (min)": "Time_taken_min"})

In [49]:
# Features: every column except the target.
X = data.drop(columns=["Time_taken_min"])
# Target: selected with a list so it stays a one-column DataFrame, not a Series.
y = data.loc[:, ["Time_taken_min"]]

In [50]:
y.head(3)

Unnamed: 0,Time_taken_min
0,46
1,23
2,21


In [51]:
X.head(3)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian
2,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian


# Segregating Categorical and Numerical Features

In [52]:
# Parse the day-month-year order date once and derive the calendar parts from
# that single parsed Series (the original re-parsed the column for every part).
order_date = pd.to_datetime(X["Order_Date"], format='%d-%m-%Y')
X["Day"] = order_date.dt.day
X["Month"] = order_date.dt.month
X["Year"] = order_date.dt.year
# The raw date column is no longer needed once its parts are extracted.
X = X.drop(columns="Order_Date")

In [53]:
# Make it explicit that the calendar parts describe the order date.
X = X.rename(
    columns={
        "Day": "Day_Ordered",
        "Month": "Month_Ordered",
        "Year": "Year_Ordered",
    }
)

In [54]:
# Group feature names by dtype: object columns are treated as categorical and
# everything else as numerical.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns
# Order_Date was dropped after its parts were extracted, so this comes back empty.
datetime_columns = X.select_dtypes(include=["datetime64[ns]"]).columns

In [55]:
categorical_columns

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [56]:
numerical_columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries', 'Day_Ordered', 'Month_Ordered', 'Year_Ordered'],
      dtype='object')

In [57]:
datetime_columns

Index([], dtype='object')

# Automating EDA

In [58]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoder
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Segregating null values based on dtype

In [59]:
# Count nulls per column once, then report the non-zero counts as
# "<column> = <count>" strings, bucketed by the dtype group of the column.
null_counts = data.isnull().sum()
columns_with_nulls = [name for name in data.columns if null_counts[name] > 0]

cat_col_null_list = [
    f"{name} = {null_counts[name]}"
    for name in columns_with_nulls
    if name in categorical_columns
]
num_col_null_list = [
    f"{name} = {null_counts[name]}"
    for name in columns_with_nulls
    if name in numerical_columns
]

In [60]:
cat_col_null_list

['Weather_conditions = 616',
 'Road_traffic_density = 601',
 'Festival = 228',
 'City = 1200']

In [61]:
# Each entry looks like "<column> = <count>"; take the part after '=' as the count.
cat_null_counts = [int(entry.split('=')[1].strip()) for entry in cat_col_null_list]
null_sum_catcols = sum(cat_null_counts)
print("Sum of null values in categorical columns:", null_sum_catcols)

Sum of null values in categorical columns: 2645


In [62]:
num_col_null_list

['Delivery_person_Age = 1854',
 'Delivery_person_Ratings = 1908',
 'multiple_deliveries = 993']

In [63]:
# Each entry looks like "<column> = <count>"; take the part after '=' as the count.
num_null_counts = [int(entry.split('=')[1].strip()) for entry in num_col_null_list]
null_sum_numcols = sum(num_null_counts)
print("Sum of null values in numerical columns:", null_sum_numcols)

Sum of null values in numerical columns: 4755


In [64]:
null_sum_catcols + null_sum_numcols

7400

## Numerical Pipeline

In [65]:
null_sum_numcols

4755

In [66]:
X.shape

(45584, 17)

Imputing the 4755 missing numerical values (spread across three columns of 45584 rows each) with the median is acceptable: they are a small fraction of the data and are assumed to be missing at random.

## Categorical Pipeline

- Weather_conditions: Categorical/Qualitative Data - Ordinal.<br> 

- Road_traffic_density: Categorical/Qualitative Data - Ordinal.<br> 

-  **_Vehicle_condition : Categorical/Qualitative Data - Ordinal._**<br> 

-  City: Categorical/Qualitative Data - Ordinal.<br>
-----------------------------------------------------------------------------------------
-  Type_of_order: Categorical/Qualitative Data - Nominal.<br> 

-  Type_of_vehicle: Categorical/Qualitative Data - Nominal.<br> 

-  Festival: Categorical/Qualitative Data - Nominal.<br>
-----------------------------------------------------------------------------------------
-  multiple_deliveries: Numerical/Quantitative Data - Ratio Scale Data.<br> 




As we can see, Vehicle_condition is ordinal, but because its dtype is int it was placed in numerical_columns.

In [67]:
# Custom category order for each ordinally encoded feature; the position in
# the list is the integer the OrdinalEncoder assigns.
# NOTE(review): the trailing 'nan' entries are literal strings. The pipelines
# impute missing values before encoding, so these are presumably never
# matched - confirm before removing them.
Weather_conditions_cat = [
    'Sunny',
    'Stormy',
    'Sandstorms',
    'Windy',
    'Cloudy',
    'Fog',
    'nan',
]
Road_traffic_density_cat = [
    'Low',
    'Medium',
    'High',
    'Jam',
    'nan',
]
City_cat = [
    'Urban',
    'Metropolitian',
    'Semi-Urban',
    'nan',
]

In [68]:
# Explicit category lists for the one-hot encoded (nominal) features; each
# listed value becomes its own output column.
# NOTE(review): 'nan' in Festival_cat is a literal string and presumably
# yields an all-zero column once missing values are imputed - confirm.
Type_of_order_cat = [
    'Buffet',
    'Drinks',
    'Meal',
    'Snack',
]
Type_of_vehicle_cat = [
    'bicycle',
    'electric_scooter',
    'motorcycle',
    'scooter',
]
Festival_cat = [
    'No',
    'Yes',
    'nan',
]

In [69]:
# Columns routed to each preprocessing pipeline.
# Vehicle_condition is integer-typed and is kept with the numeric features here.
numerical_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Vehicle_condition',
    'multiple_deliveries',
    'Day_Ordered',
    'Month_Ordered',
    'Year_Ordered',
]
cat_ordinal_columns = [
    "Weather_conditions",
    "Road_traffic_density",
    "City",
]
cat_ohe_columns = [
    "Type_of_order",
    "Type_of_vehicle",
    "Festival",
]

In [70]:
# Numerical features: fill missing values with the column median, then scale.
num_pipeline = Pipeline(
    steps = [
        ("imputer",SimpleImputer(strategy="median")), # Median is used so the fill value is robust to outliers.
        ("scaler_numerical",StandardScaler(with_mean=False)) # Scales to unit variance only; with_mean=False means the data is NOT centered (this is not min-max scaling).
    ])
    
# Ordinal categorical features: fill missing values with the most frequent
# category, map categories to integers in the order given by the *_cat lists,
# then scale to unit variance (no centering).
cat_pipeline_ordinal = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories= [Weather_conditions_cat,Road_traffic_density_cat,City_cat])),
    ('scaler_ordinal', StandardScaler(with_mean=False))
    ]

)
# Nominal categorical features: fill missing values with the most frequent
# category, one-hot encode using the explicit category lists, then scale to
# unit variance (no centering).
# NOTE(review): no `drop=` is set, so every listed category gets its own
# column and each one-hot group is collinear with the intercept; this
# presumably explains the very large LinearRegression coefficients shown
# further down. Setting `drop` would change the output column names that
# later cells select by name, so confirm before changing.
cat_pipeline_ohe = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(categories = [Type_of_order_cat,Type_of_vehicle_cat,Festival_cat])),
    ('scaler_ohe', StandardScaler(with_mean=False))
    ]
    )

# Combine the numerical and categorical pipelines with a ColumnTransformer;
# output feature names are prefixed with the transformer name given here.
pre_processor = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, numerical_columns),
    ("cat_pipeline_ordinal", cat_pipeline_ordinal, cat_ordinal_columns),
    ("cat_pipeline_ohe", cat_pipeline_ohe, cat_ohe_columns)
], remainder='passthrough')

# remainder='passthrough' keeps any input column that is not listed in one of the transformers above unchanged in the output instead of dropping it.


# Train Test Split:

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=20,
)

In [72]:
X_train.head(3)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Day_Ordered,Month_Ordered,Year_Ordered
2389,34.0,4.7,15.56155,73.749092,15.58155,73.769092,Windy,Low,2,Snack,scooter,1.0,No,Metropolitian,17,2,2022
37768,31.0,4.2,22.753659,75.903365,22.833659,75.983365,Windy,Jam,1,Snack,motorcycle,0.0,No,Metropolitian,2,4,2022
21005,38.0,4.6,11.000762,76.981876,11.110762,77.091876,Sunny,Low,2,Snack,electric_scooter,1.0,No,Metropolitian,14,3,2022


In [73]:
X_test.head(3)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Day_Ordered,Month_Ordered,Year_Ordered
41099,26.0,4.6,18.53408,73.89852,18.62408,73.98852,Stormy,Low,1,Drinks,motorcycle,1.0,No,Metropolitian,4,4,2022
805,26.0,3.9,22.761226,75.887522,22.791226,75.917522,Sunny,Jam,0,Meal,motorcycle,0.0,No,Metropolitian,13,3,2022
27719,24.0,4.7,23.359194,85.325447,23.399194,85.365447,Windy,High,1,Meal,scooter,1.0,No,Urban,30,3,2022


In [74]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
X_train_transformed = pre_processor.fit_transform(X_train)
# Feature names are only available after fitting; fetch them once and reuse.
feature_names = pre_processor.get_feature_names_out()

# Carry the original row index over to the transformed frames. Without
# `index=` they would get a fresh 0..n-1 index while y_train / y_test keep the
# shuffled original index, so any pandas-level alignment (concat, join,
# arithmetic) between features and target would pair up the wrong rows.
X_train = pd.DataFrame(
    X_train_transformed, columns=feature_names, index=X_train.index
)
X_test = pd.DataFrame(
    pre_processor.transform(X_test), columns=feature_names, index=X_test.index
)

In [75]:
X_train[["cat_pipeline_ohe__Type_of_order_Snack","cat_pipeline_ohe__Type_of_order_Buffet","cat_pipeline_ohe__Type_of_order_Drinks","cat_pipeline_ohe__Type_of_order_Meal"]].sample(5)

Unnamed: 0,cat_pipeline_ohe__Type_of_order_Snack,cat_pipeline_ohe__Type_of_order_Buffet,cat_pipeline_ohe__Type_of_order_Drinks,cat_pipeline_ohe__Type_of_order_Meal
27638,2.303844,0.0,0.0,0.0
19983,0.0,2.31602,0.0,0.0
8723,0.0,2.31602,0.0,0.0
2401,2.303844,0.0,0.0,0.0
23663,0.0,0.0,0.0,2.30242


# Model Training

In [76]:
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [77]:
# Baseline: ordinary least squares on the preprocessed training features.
# y_train is a one-column DataFrame, so coef_ / intercept_ come back with an
# extra leading dimension (see the outputs of the next two cells).
regression = LinearRegression()
regression.fit(X_train,y_train)

In [78]:
regression.coef_

array([[ 2.24042349e+00, -2.34886902e+00, -1.33935603e-01,
        -6.46858163e-01,  1.72927056e-01,  5.68079817e-01,
        -1.62674614e+00,  1.91053372e+00,  5.58547729e-02,
        -1.52508758e-02, -9.43219807e+00,  1.82063772e+00,
         3.11171334e+00,  1.01940942e+00, -1.68174404e+04,
        -1.68210203e+04, -1.69167211e+04, -1.69062619e+04,
        -2.16267626e+03, -1.56374342e+04, -2.78273840e+04,
        -2.65949609e+04,  5.87235312e+05,  5.87236807e+05,
         0.00000000e+00]])

In [79]:
regression.intercept_

array([-4170434.57313473])

## Model Evaluation

In [80]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Train Multiple Models

In [81]:
# Candidate linear models, all with default hyper-parameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet()
}

# Names of the fitted models and their R^2 scores, in matching order.
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    # Metrics are computed on the held-out test split.
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the heading says "Training" but the numbers below are
    # test-set metrics; the text is kept as-is so the printed output does not
    # change.
    print("Model Training Performance")
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print("R2 Score: ", r2_square*100)
    r2_list.append(r2_square)
    print("="*100)

LinearRegression
Model Training Performance
RMSE:  6.418077525921053
MAE:  5.117978261661924
R2 Score:  53.7133179123668
Lasso
Model Training Performance
RMSE:  6.898925680701302
MAE:  5.54421753219891
R2 Score:  46.51782430825371
Ridge
Model Training Performance
RMSE:  6.418077857226926
MAE:  5.117979032655532
R2 Score:  53.71331313366166
ElasticNet
Model Training Performance
RMSE:  6.969910598548741
MAE:  5.617728471531431
R2 Score:  45.411576969929754


In [82]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [83]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_Age          43730 non-null  float64
 1   Delivery_person_Ratings      43676 non-null  float64
 2   Restaurant_latitude          45584 non-null  float64
 3   Restaurant_longitude         45584 non-null  float64
 4   Delivery_location_latitude   45584 non-null  float64
 5   Delivery_location_longitude  45584 non-null  float64
 6   Order_Date                   45584 non-null  object 
 7   Weather_conditions           44968 non-null  object 
 8   Road_traffic_density         44983 non-null  object 
 9   Vehicle_condition            45584 non-null  int64  
 10  Type_of_order                45584 non-null  object 
 11  Type_of_vehicle              45584 non-null  object 
 12  multiple_deliveries          44591 non-null  float64
 13  Festival        