#### Read Data

In [216]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
from scipy import stats
import statistics as st

In [217]:
data = pd.read_csv("../dataset/dataset_clean.csv")
# Delete outlier
data = data[data['Delivery_person_Age'] != 15 ]
data = data[data['Vehicle_condition'] != 3 ]
data = data[data['Type_of_vehicle'] != 'bicycle']
data['Type_of_vehicle'].replace('electric_scooter','scooter',inplace=True)
data = data[data['City'] != 'Semi-Urban']
# Input feature
data=data.drop(columns=['Index','Restaurant_longitude','Restaurant_latitude','Delivery_location_latitude','Delivery_location_longitude',
                        'Time_order','Time_order_picked','Festival','Delivery_person_ratings','Type_of_order','Multiple_deliveries'])
print(data.shape)
data.head()

(40887, 8)


Unnamed: 0,Delivery_person_Age,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,City,Distance,Time_taken_(min)
0,36.0,Cloudy,High,1,motorcycle,Metropolitian,3.018911,25.0
1,37.0,Fog,Medium,1,scooter,Metropolitian,13.973178,40.0
2,25.0,Cloudy,High,1,scooter,Urban,6.058825,30.0
3,28.0,Sandstorms,Low,1,motorcycle,Metropolitian,19.97552,11.0
4,22.0,Fog,Jam,0,motorcycle,Metropolitian,12.43554,47.0


In [218]:
data.dtypes

Delivery_person_Age     float64
Weather_conditions       object
Road_traffic_density     object
Vehicle_condition         int64
Type_of_vehicle          object
City                     object
Distance                float64
Time_taken_(min)        float64
dtype: object

#### Encoder object features

In [219]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = le.fit_transform(data[col])
        
data.dtypes

Delivery_person_Age     float64
Weather_conditions        int32
Road_traffic_density      int32
Vehicle_condition         int64
Type_of_vehicle           int32
City                      int32
Distance                float64
Time_taken_(min)        float64
dtype: object

In [220]:
# feature
feature = data.drop(labels= 'Time_taken_(min)',axis = 1)
feature
# target
target = data['Time_taken_(min)']
print(feature.shape)
print(target.shape)

(40887, 7)
(40887,)


#### Split data

In [221]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(feature,target,test_size = 0.2, random_state = 43)
print('train set',X_train.shape,y_train.shape)
print('test set',X_test.shape,y_test.shape)

train set (32709, 7) (32709,)
test set (8178, 7) (8178,)


In [222]:
result = pd.DataFrame(
    columns=['model','mean absolute error (MAE)','mean squared error (MSE)','r2 score (R2)'] #,'root mean squared error (RMSE)'],
)
result

Unnamed: 0,model,mean absolute error (MAE),mean squared error (MSE),r2 score (R2)


## Linear Regression

In [223]:
from sklearn.linear_model import LinearRegression
Model = LinearRegression()
Model.fit(X_train,y_train)
yHat = Model.predict(X_test)
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print('MAE:',MAE(y_test,yHat))
print('MSE:',MSE(y_test,yHat))
print('R2:',R2(y_test,yHat))
result =result.append({'model':'LinearRegression','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


MAE: 6.087866197580163
MSE: 55.812643434509525
R2: 0.33934867084047593


  result =result.append({'model':'LinearRegression','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


## Lasso

In [224]:
from sklearn.linear_model import Lasso
Model = Lasso()
Model.fit(X_train,y_train)
yHat = Model.predict(X_test)
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print('MAE:',MAE(y_test,yHat))
print('MSE:',MSE(y_test,yHat))
print('R2:',R2(y_test,yHat))
result =result.append({'model':'Lasso','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)

MAE: 6.323709553905675
MSE: 60.329975554216595
R2: 0.2858772477812661


  result =result.append({'model':'Lasso','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


## XGBoost

In [225]:
from xgboost import XGBRFRegressor
Model = XGBRFRegressor()
Model.fit(X_train,y_train)
yHat = Model.predict(X_test)
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print('MAE:',MAE(y_test,yHat))
print('MSE:',MSE(y_test,yHat))
print('R2:',R2(y_test,yHat))
result =result.append({'model':'XGBRFRegressor','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)

MAE: 3.9432423194657624
MSE: 24.529447355456107
R2: 0.7096462198937763


  result =result.append({'model':'XGBRFRegressor','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


## Logistic Regression

In [226]:
from sklearn.linear_model import LogisticRegression
Model = LogisticRegression()
Model.fit(X_train,y_train)
yHat = Model.predict(X_test)
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print('MAE:',MAE(y_test,yHat))
print('MSE:',MSE(y_test,yHat))
print('R2:',R2(y_test,yHat))
result =result.append({'model':'LogisticRegression','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


MAE: 6.677060405967229
MSE: 74.43959403277084
R2: 0.11886243486116976


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  result =result.append({'model':'LogisticRegression','mean absolute error (MAE)':MAE(y_test,yHat), 'mean squared error (MSE)':MSE(y_test,yHat), 'r2 score (R2)':R2(y_test,yHat)},ignore_index=True)


In [227]:
result.set_index('model',inplace = True)
result

Unnamed: 0_level_0,mean absolute error (MAE),mean squared error (MSE),r2 score (R2)
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LinearRegression,6.087866,55.812643,0.339349
Lasso,6.32371,60.329976,0.285877
XGBRFRegressor,3.943242,24.529447,0.709646
LogisticRegression,6.67706,74.439594,0.118862
