In [454]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from geopy.distance import geodesic
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb


In [455]:
# Load data
df = pd.read_csv('data/food_delivery.csv')

In [456]:
df.drop('ID',axis=1,inplace=True)

In [457]:
# Handling missing values 
# Data Preprocessing
# Feature Engineering

df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].str.strip()
mode_ratings = df['Delivery_person_Ratings'].mode()[0]
df['Delivery_person_Ratings'] = pd.to_numeric(df['Delivery_person_Ratings'], errors='coerce')
df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].fillna(mode_ratings)
df['Delivery_person_Ratings'] =  df['Delivery_person_Ratings'].replace('6',mode_ratings)
df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].astype('float64')


In [458]:
df['Delivery_person_Age'] = df['Delivery_person_Age'].replace('NaN', np.nan)
df['Delivery_person_Age'] = df['Delivery_person_Age'].astype('float64')
df['Delivery_person_Age'] = df['Delivery_person_Age'].fillna(df['Delivery_person_Age'].mean())
df['Delivery_person_Age'] = df['Delivery_person_Age'].astype(int)

In [459]:

df['City'] = df['City'].replace(r'^\s*$', np.nan, regex=True)
df['City'] = df['City'].replace('', np.nan)
city_mode = df['City'].mode()[0]
df['City'] = df['City'].fillna(city_mode)
df['City'] = df['City'].replace('NaN ','Metropolitian ')

In [460]:
mode_multiple_deliveries = df['multiple_deliveries'].mode()[0]
df['multiple_deliveries'] = df['multiple_deliveries'].replace('NaN ',mode_multiple_deliveries)
df['multiple_deliveries'] = df['multiple_deliveries'].astype('int64')

In [461]:
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y')
df['Order_day'] = df['Order_Date'].dt.day.astype('int64')
df['Order_month'] = df['Order_Date'].dt.month.astype('int64')
df['Order_year'] = df['Order_Date'].dt.year.astype('int64')

In [462]:
df['Time_Orderd']=df['Time_Orderd'].fillna(df.Time_Orderd.mode()[0])
df['Time_Orderd'] = pd.to_datetime(df['Time_Orderd'], format='%H:%M:%S', errors='coerce').dt.time
df['Time_Order_picked'] = pd.to_datetime(df['Time_Order_picked'], format='%H:%M:%S', errors='coerce').dt.time
df['Time_taken(min)'] = df['Time_taken(min)'].str.extract(r'(\d+)').astype('Int64')

In [463]:
#df['Order_Time'] = df['Time_Orderd'].dt.time
#df['Pick_Time'] = df['Time_Order_picked'].dt.time


In [464]:

df['Weatherconditions']=df['Weatherconditions'].fillna(df['Weatherconditions'].mode()[0])
df['Weatherconditions'] = df['Weatherconditions'].str.replace('conditions ', '')
df = df.rename(columns={'Weatherconditions': 'Weather_Conditions'})

In [465]:
festival_mode = df['Festival'].mode()
df['Festival'] = df['Festival'].fillna(festival_mode)
df['Festival'] = df['Festival'].replace('NaN ', 'No ')

In [466]:
df['Road_traffic_density']=df['Road_traffic_density'].fillna(df.Road_traffic_density.mode()[0])
df['Road_traffic_density'] = df['Road_traffic_density'].str.replace('NaN ' , df['Road_traffic_density'].mode()[0])

In [467]:
def calculate_distance(row):
    start = (row['Restaurant_latitude'], row['Restaurant_longitude'])
    end = (row['Delivery_location_latitude'], row['Delivery_location_longitude'])
    return geodesic(start, end).kilometers

df['distance'] =df.apply(calculate_distance, axis=1)

In [468]:
df['Repeat_Customer'] = df['Delivery_person_ID'].duplicated(keep=False).astype(int)

In [469]:
df['Delivery_Speed'] = df['distance'] / df['Time_taken(min)']

In [470]:
weather_severity_map = {
    'Sunny': 'Mild',
    'Windy': 'Moderate',
    'Cloudy': 'Moderate',
    'Fog': 'Severe',
    'Stormy': 'Severe',
    'Sandstorms': 'Severe'
}
df['Weather_Severity'] = df['Weather_Conditions'].map(weather_severity_map)


In [471]:
df['Traffic_Impact'] = df['Road_traffic_density'].apply(lambda x: 1 if x in ['Jam', 'High'] else 0)

In [472]:
# Setting X, y variables
X= df.drop('Time_taken(min)', axis=1)
y =df['Time_taken(min)']

In [473]:
y.head()

0    24
1    33
2    26
3    21
4    30
Name: Time_taken(min), dtype: Int64

In [474]:
# Define categorical and numerical columns
categorical_cols = [
    'Weather_Conditions',
    'Road_traffic_density',
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City'
]

numerical_cols = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Vehicle_condition',
    'distance',
    'Order_day',
    'Order_month',
    'Order_year',
    'multiple_deliveries'
]

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition', 'distance', 'Order_day', 'Order_month', 'Order_year', 'multiple_deliveries']
Categorical Columns: ['Weather_Conditions', 'Road_traffic_density', 'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']


In [475]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

In [477]:
X_transformed = preprocessor.fit_transform(X)

In [478]:
X.shape

(45593, 26)

In [479]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size =0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((36474, 26), (9119, 26), (36474,), (9119,))

In [480]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, mse, rmse, r2

In [None]:
models = {
    LinearRegression(): "Linear Regression",
    RandomForestRegressor(): "Random Forest",
    GradientBoostingRegressor(): "Gradient Boosting",
    xgb.XGBRegressor(): "XGBoost",
    LGBMRegressor(): "LightGBM"
}
    

In [400]:
# Baseline model
model = RandomForestRegressor()
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'VADRES11DEL01 '

In [None]:
# Model Evaluation

In [None]:
# Feature Importqance

In [None]:
# Model Save

In [None]:
#Model Load

In [None]:
# Model Prediction