In [89]:
# Required libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics
import statsmodels.api as sm
# mlxtend --no-deps

# Scikit-Learn models, metrics, preprocessing
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# Importing plotly visualization library
import plotly.express as px
import plotly.figure_factory as ff

#Filter and ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [91]:
df = pd.read_csv("C:/Users/festu/OneDrive/Documents/Machine learning project/Uber data.csv")

In [92]:
df.head(3)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,8:30:00,8:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26


In [95]:
#Check the columns 
df.columns

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weatherconditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken(min)'],
      dtype='object')

In [97]:
print("The shape of the data is:", df.shape)

The shape of the data is: (45593, 20)


In [99]:
#Check the information of the data 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          45593 non-null  object 
 3   Delivery_person_Ratings      45593 non-null  object 
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  45593 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weatherconditions            45593 non-null  object 
 12  Road_traffic_density         45593 non-null  object 
 13  Vehicle_conditio

In [101]:
#Descriptive statistics for numerical variables  
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Restaurant_latitude,45593.0,17.017729,8.185109,-30.905562,12.933284,18.546947,22.728163,30.914057
Restaurant_longitude,45593.0,70.231332,22.883647,-88.366217,73.17,75.898497,78.044095,88.433452
Delivery_location_latitude,45593.0,17.465186,7.335122,0.01,12.988453,18.633934,22.785049,31.054057
Delivery_location_longitude,45593.0,70.845702,21.118812,0.01,73.28,76.002574,78.107044,88.563452
Vehicle_condition,45593.0,1.023359,0.839065,0.0,0.0,1.0,2.0,3.0


In [103]:
#Descriptive statistics for categorical 
df.describe(exclude=np.number).T

Unnamed: 0,count,unique,top,freq
ID,45593,45593,0x4607,1
Delivery_person_ID,45593,1320,PUNERES01DEL01,67
Delivery_person_Age,45593,23,35,2262
Delivery_person_Ratings,45593,29,4.8,7148
Order_Date,45593,44,15-03-2022,1192
Time_Orderd,45593,177,,1731
Time_Order_picked,45593,193,21:30:00,496
Weatherconditions,45593,7,conditions Fog,7654
Road_traffic_density,45593,5,Low,15477
Type_of_order,45593,4,Snack,11533


#Data Cleaning 

In [106]:
#Rename the following columns 
def rename_column(df):
    df.rename(columns={"Weatherconditions" : "Weather_conditions"}, inplace=True)
rename_column(df)
print(df.columns)

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken(min)'],
      dtype='object')


In [108]:
#Extract relevant values from column

def extract_column_value(df):
    '''
    Remove "min" string from target variable and get numerical target
    '''
    # Extract time and convert to int
    df['Time_taken(min)'] = df['Time_taken(min)'].str.split().str[-1]


    # Extract Weather conditions
    df['Weather_conditions'] = df['Weather_conditions'].str.split().str[-1]

    # Extract city code from Delivery person ID
    df['City_code'] = df['Delivery_person_ID'].str.split("RES", expand=True)[0]

extract_column_value(df)
df[['Time_taken(min)','Weather_conditions','City_code']].head()

Unnamed: 0,Time_taken(min),Weather_conditions,City_code
0,24,Sunny,INDO
1,33,Stormy,BANG
2,26,Sandstorms,BANG
3,21,Sunny,COIMB
4,30,Cloudy,CHEN


In [110]:
# Unique values of some categorical columns of df
columns = ['Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'Festival', 'City', 'multiple_deliveries']

for column in columns:
    unique_values = df[column].unique().tolist()
    print(column, ":", unique_values)

Weather_conditions : ['Sunny', 'Stormy', 'Sandstorms', 'Cloudy', 'Fog', 'Windy', 'NaN']
Road_traffic_density : ['High ', 'Jam ', 'Low ', 'Medium ', 'NaN ']
Vehicle_condition : [2, 0, 1, 3]
Type_of_order : ['Snack ', 'Drinks ', 'Buffet ', 'Meal ']
Type_of_vehicle : ['motorcycle ', 'scooter ', 'electric_scooter ', 'bicycle ']
Festival : ['No ', 'Yes ', 'NaN ']
City : ['Urban ', 'Metropolitian ', 'Semi-Urban ', 'NaN ']
multiple_deliveries : ['0', '1', '3', 'NaN ', '2']


In [112]:
# Drop Columns which won't be use for building model
def drop_columns(df):
    df.drop(['ID','Delivery_person_ID'],axis=1,inplace=True)

print("Before No. of columns: ",df.shape[1])
drop_columns(df)
print("After No. of columns: ",df.shape[1])

#Check for Duplicate Values
if (len(df[df.duplicated()])>0):
    print("\nThere are Duplicate values present")
else:
    print("\nThere is no duplicate value present")

Before No. of columns:  21
After No. of columns:  19

There is no duplicate value present


In [114]:

def convert_nan(df):
    df.replace('NaN', float(np.nan), regex=True,inplace=True)

convert_nan(df)

In [116]:
#Feature engineering 

In [118]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken(min)',
       'City_code'],
      dtype='object')

In [120]:
## Convert categorical variables into numerical values
categorical_features = ['Weather_conditions', 'Road_traffic_density', 'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    df[feature] = label_encoders[feature].fit_transform(df[feature])

In [122]:
## Convert datetime features
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format="mixed")
df['Time_Orderd'] = pd.to_datetime(df['Time_Orderd'], errors='coerce')
df['Time_Order_picked'] = pd.to_datetime(df['Time_Order_picked'], errors='coerce')
df['hour_ordered'] = df['Time_Orderd'].dt.hour.fillna(0)
df['minute_ordered'] = df['Time_Orderd'].dt.minute.fillna(0)
df['hour_picked'] = df['Time_Order_picked'].dt.hour.fillna(0)
df['minute_picked'] = df['Time_Order_picked'].dt.minute.fillna(0)

In [124]:
#Check the null in the columns 
df.isnull().sum().sort_values(ascending=False)

Delivery_person_Ratings        1908
Delivery_person_Age            1854
Time_Orderd                    1731
multiple_deliveries             993
Type_of_vehicle                   0
hour_picked                       0
minute_ordered                    0
hour_ordered                      0
City_code                         0
Time_taken(min)                   0
City                              0
Festival                          0
Vehicle_condition                 0
Type_of_order                     0
Road_traffic_density              0
Weather_conditions                0
Time_Order_picked                 0
Order_Date                        0
Delivery_location_longitude       0
Delivery_location_latitude        0
Restaurant_longitude              0
Restaurant_latitude               0
minute_picked                     0
dtype: int64

In [126]:

# Function to handle null values in the DataFrame
def handle_null_values(df):
    # Convert 'Delivery_person_Ratings' to numeric (if stored as strings)
    df['Delivery_person_Ratings'] = pd.to_numeric(df['Delivery_person_Ratings'], errors='coerce')

    # Fill missing values in 'Delivery_person_Age' and 'Weather_conditions' with a random choice from existing values
    df['Delivery_person_Age'].fillna(np.random.choice(df['Delivery_person_Age'].dropna()), inplace=True)
    df['Weather_conditions'].fillna(np.random.choice(df['Weather_conditions'].dropna()), inplace=True)

    # Columns to impute using the most frequent value
    mode_cols = ["Road_traffic_density", "multiple_deliveries", "Festival", "City"]
    mode_imputer = SimpleImputer(strategy='most_frequent')

    # Ensure correct reshaping for imputer
    df[mode_cols] = mode_imputer.fit_transform(df[mode_cols])

    # Impute 'Delivery_person_Ratings' using the median (after conversion to numeric)
    df['Delivery_person_Ratings'].fillna(df['Delivery_person_Ratings'].median(), inplace=True)

    # Fill missing 'Time_Orderd' values with corresponding 'Time_Order_picked' values
    df['Time_Orderd'] = df['Time_Orderd'].fillna(df['Time_Order_picked'])

# Example usage:
# Assuming df is your DataFrame
handle_null_values(df)

# Check for remaining missing values
print(df.isnull().sum())




Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Order_Date                     0
Time_Orderd                    0
Time_Order_picked              0
Weather_conditions             0
Road_traffic_density           0
Vehicle_condition              0
Type_of_order                  0
Type_of_vehicle                0
multiple_deliveries            0
Festival                       0
City                           0
Time_taken(min)                0
City_code                      0
hour_ordered                   0
minute_ordered                 0
hour_picked                    0
minute_picked                  0
dtype: int64


In [128]:
df = df.drop(columns=['Order_Date', 'Time_Orderd', 'Time_Order_picked'])  # Dropping original datetime columns

In [130]:
# Define Features and Target
X = df[['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'hour_ordered', 'minute_ordered',
       'hour_picked', 'minute_picked']]
y = df['Time_taken(min)']

In [132]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [134]:
# Import necessary libraries

# Model Selection
models = {
    "Linear Regression": LinearRegression(),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR(),
    'Decision Tree': DecisionTreeRegressor(),
'Random Forest': RandomForestRegressor()
   
}


In [138]:
best_model = None
best_mae = float("inf")

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{name} - MAE: {mae:.2f}")
    if mae < best_mae:
        best_mae = mae
        best_model = model

print(f"Best Model: {best_model}")


Linear Regression - MAE: 5.73
Gradient Boosting - MAE: 4.09
Support Vector Regressor - MAE: 6.51
Decision Tree - MAE: 4.84
Random Forest - MAE: 3.72
Best Model: RandomForestRegressor()


Linear Regression
