In [300]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import matplotlib.pyplot as plt
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler



In [271]:
# Load data
df = pd.read_csv('data/food_delivery.csv')

In [272]:
df.drop('ID',axis=1,inplace=True)

In [273]:
# Handling missing values 
# Data Preprocessing
# Feature Engineering


mode_ratings = df['Delivery_person_Ratings'].mode()[0]
df['Delivery_person_Ratings'] =  df['Delivery_person_Ratings'].astype('float64')
df['Delivery_person_Ratings'] =  df['Delivery_person_Ratings'].fillna(mode_ratings)
df['Delivery_person_Ratings'] =  df['Delivery_person_Ratings'].replace('6',mode_ratings)


In [274]:
df['Delivery_person_Age'] = df['Delivery_person_Age'].replace('NaN', np.nan)
df['Delivery_person_Age'] = df['Delivery_person_Age'].astype('float64')
df['Delivery_person_Age'] = df['Delivery_person_Age'].fillna(df['Delivery_person_Age'].mean())
df['Delivery_person_Age'] = df['Delivery_person_Age'].astype(int)

In [275]:

df['City'] = df['City'].replace(r'^\s*$', np.nan, regex=True)
df['City'] = df['City'].replace('', np.nan)
city_mode = df['City'].mode()[0]
df['City'] = df['City'].fillna(city_mode)
df['City'] = df['City'].replace('NaN ','Metropolitian ')

In [276]:
mode_multiple_deliveries = df['multiple_deliveries'].mode()[0]
df['multiple_deliveries'] = df['multiple_deliveries'].replace('NaN ',mode_multiple_deliveries)

In [277]:
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y')
df['Order_day'] = df['Order_Date'].dt.day.astype('int64')
df['Order_month'] = df['Order_Date'].dt.month.astype('int64')
df['Order_year'] = df['Order_Date'].dt.year.astype('int64')

In [278]:
df['Time_Orderd']=df['Time_Orderd'].fillna(df.Time_Orderd.mode()[0])
df['Time_Orderd'] = pd.to_datetime(df['Time_Orderd'], format='%H:%M:%S', errors='coerce').dt.time
df['Time_Order_picked'] = pd.to_datetime(df['Time_Order_picked'], format='%H:%M:%S', errors='coerce').dt.time
df['Time_taken(min)'] = df['Time_taken(min)'].str.extract(r'(\d+)').astype('Int64')

In [279]:
#df['Order_Time'] = df['Time_Orderd'].dt.time
#df['Pick_Time'] = df['Time_Order_picked'].dt.time


In [280]:

df['Weatherconditions']=df['Weatherconditions'].fillna(df['Weatherconditions'].mode()[0])
df['Weatherconditions'] = df['Weatherconditions'].str.replace('conditions ', '')
df = df.rename(columns={'Weatherconditions': 'Weather_Conditions'})

In [281]:
festival_mode = df['Festival'].mode()
df['Festival'] = df['Festival'].fillna(festival_mode)
df['Festival'] = df['Festival'].replace('NaN ', 'No ')

In [282]:
df['Road_traffic_density']=df['Road_traffic_density'].fillna(df.Road_traffic_density.mode()[0])
df['Road_traffic_density'] = df['Road_traffic_density'].str.replace('NaN ' , df['Road_traffic_density'].mode()[0])

In [283]:
def calculate_distance(row):
    start = (row['Restaurant_latitude'], row['Restaurant_longitude'])
    end = (row['Delivery_location_latitude'], row['Delivery_location_longitude'])
    return geodesic(start, end).kilometers

df['distance'] =df.apply(calculate_distance, axis=1)

In [284]:
df['Repeat_Customer'] = df['Delivery_person_ID'].duplicated(keep=False).astype(int)

In [285]:
df['Delivery_Speed'] = df['distance'] / df['Time_taken(min)']

In [286]:
weather_severity_map = {
    'Sunny': 'Mild',
    'Windy': 'Moderate',
    'Cloudy': 'Moderate',
    'Fog': 'Severe',
    'Stormy': 'Severe',
    'Sandstorms': 'Severe'
}
df['Weather_Severity'] = df['Weather_Conditions'].map(weather_severity_map)


In [287]:
df['Traffic_Impact'] = df['Road_traffic_density'].apply(lambda x: 1 if x in ['Jam', 'High'] else 0)

In [288]:
# Setting X, y variables
X= df.drop('Time_taken(min)', axis=1)
y =df['Time_taken(min)']

In [289]:
y.head()

0    24
1    33
2    26
3    21
4    30
Name: Time_taken(min), dtype: Int64

In [290]:
print(df.shape)
print(y.shape)
print(X.shape)

(45593, 27)
(45593,)
(45593, 26)


In [291]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size =0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((36474, 26), (9119, 26), (36474,), (9119,))

In [298]:
df.dtypes.to_dict()


{'Delivery_person_Age': dtype('int64'),
 'Restaurant_latitude': dtype('float64'),
 'Restaurant_longitude': dtype('float64'),
 'Delivery_location_latitude': dtype('float64'),
 'Delivery_location_longitude': dtype('float64'),
 'Order_Date': dtype('<M8[ns]'),
 'Vehicle_condition': dtype('int64'),
 'Time_taken(min)': Int64Dtype(),
 'Order_day': dtype('int64'),
 'Order_month': dtype('int64'),
 'Order_year': dtype('int64'),
 'distance': dtype('float64'),
 'Repeat_Customer': dtype('int64'),
 'Delivery_Speed': Float64Dtype(),
 'Traffic_Impact': dtype('int64'),
 'Delivery_person_ID_AGRRES010DEL01 ': dtype('bool'),
 'Delivery_person_ID_AGRRES010DEL02 ': dtype('bool'),
 'Delivery_person_ID_AGRRES010DEL03 ': dtype('bool'),
 'Delivery_person_ID_AGRRES01DEL01 ': dtype('bool'),
 'Delivery_person_ID_AGRRES01DEL02 ': dtype('bool'),
 'Delivery_person_ID_AGRRES01DEL03 ': dtype('bool'),
 'Delivery_person_ID_AGRRES02DEL01 ': dtype('bool'),
 'Delivery_person_ID_AGRRES02DEL02 ': dtype('bool'),
 'Delivery_per

In [293]:
# Identify numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Identify categorical columns
categorical_cols = df.select_dtypes(exclude =['int64', 'float64']).columns.tolist()

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['Delivery_person_Age', 'Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude', 'Vehicle_condition', 'Time_taken(min)', 'Order_day', 'Order_month', 'Order_year', 'distance', 'Repeat_Customer', 'Delivery_Speed', 'Traffic_Impact']
Categorical Columns: ['Delivery_person_ID', 'Delivery_person_Ratings', 'Order_Date', 'Time_Orderd', 'Time_Order_picked', 'Weather_Conditions', 'Road_traffic_density', 'Type_of_order', 'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City', 'Weather_Severity']


In [294]:

# Drop original datetime columns and unnecessary columns
#columns_to_drop = [
#    'Order_Date',
#    'Time_Orderd',
#    'Time_Order_picked',
#    'Delivery_person_ID',
#    'Restaurant_latitude',
#    'Restaurant_longitude',
#    'Delivery_location_latitude',
#    'Delivery_location_longitude'
#]
#df = df.drop(columns=columns_to_drop, axis=1)


In [295]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# encode categorical variables
df = pd.get_dummies(df, categorical_cols)


In [296]:

# Scale numeric features
scaler = StandardScaler()
#numeric_features_to_scale = [
#    'Delivery_person_Age',
#    'Delivery_person_Ratings',
#    'Vehicle_condition',
#    'distance',
#    'preparation_time'
#]
#df[numeric_features_to_scale] = scaler.fit_transform(df[numeric_features_to_scale])


In [297]:
# Baseline model
model = RandomForestRegressor()
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'VADRES11DEL01 '

In [None]:
# Model Evaluation

In [None]:
# Feature Importqance

In [None]:
# Model Save

In [None]:
#Model Load

In [None]:
# Model Prediction