In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
df=pd.read_csv('data/finalTrain.csv')

In [3]:
df=df.drop(labels=['Delivery_person_Age'],axis=1)
df=df.drop(labels=['Delivery_person_ID'],axis=1)
df=df.drop(labels=['Order_Date'],axis=1)
df=df.drop(labels=['ID'],axis=1)

In [4]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import haversine as hs
import numpy as np
import datetime
from sklearn.preprocessing import OrdinalEncoder


In [5]:
# Custom transformer to calculate haversine distance
class HaversineDistance(BaseEstimator, TransformerMixin):
    def fit(self, Z, y=None):
        return self
    
    def transform(self, Z):
        hav=[]
        for i in range(len(df)):
            loc1=(Z.iloc[i,1],Z.iloc[i,2])
            loc2=(Z.iloc[i,3],Z.iloc[i,4])
            hav.append(hs.haversine(loc1,loc2))
        
        Z['Distance']=hav
        Z['Distance']=Z['Distance'].round(2)
        Z=Z.drop(labels=['Restaurant_latitude'],axis=1)
        Z=Z.drop(labels=['Restaurant_longitude'],axis=1)
        Z=Z.drop(labels=['Delivery_location_longitude'],axis=1)
        Z=Z.drop(labels=['Delivery_location_latitude'],axis=1)
        
        return Z

In [6]:
# Custom transformer to split time columns and handle missing values
class TimeSplitter(BaseEstimator, TransformerMixin):
    def fit(self, Z, y=None):
        return self
    
    def transform(self, Z):
        
        converted_datetimes = []
        for value in Z['Time_Orderd']:
            if isinstance(value, str):  # Check if the value is a string
                try:
                    converted_time = datetime.datetime.strptime(value, '%H:%M').time()
                    converted_datetime = datetime.datetime.combine(datetime.datetime.min, converted_time)
                except ValueError:
                    converted_datetime = None
            else:
                converted_datetime = None  # Handle NaN values
            converted_datetimes.append(converted_datetime)

        Z['time_column'] = converted_datetimes
        # Extract numerical features from datetime.datetime column
        Z['Orderd_hour'] = Z['time_column'].apply(lambda x: x.hour if x else None)
        Z['Orderd_minute'] = Z['time_column'].apply(lambda x: x.minute if x else None)

        # Drop the original datetime.datetime column
        Z.drop(columns=['time_column'], inplace=True)


        converted_datetimes1 = []
        for value in Z['Time_Order_picked']:
            if isinstance(value, str):  # Check if the value is a string
                try:
                    converted_time = datetime.datetime.strptime(value, '%H:%M').time()
                    converted_datetime = datetime.datetime.combine(datetime.datetime.min, converted_time)
                except ValueError:
                    converted_datetime = None
            else:
                converted_datetime = None  # Handle NaN values
            converted_datetimes1.append(converted_datetime)

        Z['Altered_Time_Order_picked'] = converted_datetimes1
        # Extract numerical features from datetime.datetime column
        Z['Orderd_picked_hour'] = Z['Altered_Time_Order_picked'].apply(lambda x: x.hour if x else None)
        Z['Orderd_picked_minute'] = Z['Altered_Time_Order_picked'].apply(lambda x: x.minute if x else None)
        Z.drop(columns=['Altered_Time_Order_picked'], inplace=True)


        for index, row in Z.iterrows():
            if pd.isna(row['Orderd_hour']) and pd.isna(row['Orderd_minute']):
                picked_time_in_minutes = row['Orderd_picked_hour'] * 60 + row['Orderd_picked_minute']
                adjusted_time_in_minutes = picked_time_in_minutes - 15
                
                adjusted_hour = adjusted_time_in_minutes // 60
                adjusted_minute = adjusted_time_in_minutes % 60
                
                Z.at[index, 'Orderd_hour'] = adjusted_hour
                Z.at[index, 'Orderd_minute'] = adjusted_minute

        for index, row in Z.iterrows():
            if pd.isna(row['Orderd_picked_hour']) and pd.isna(row['Orderd_picked_minute']):
                picked_time_in_minutes = row['Orderd_hour'] * 60 + row['Orderd_minute']
                adjusted_time_in_minutes = picked_time_in_minutes - 15
                
                adjusted_hour = adjusted_time_in_minutes // 60
                adjusted_minute = adjusted_time_in_minutes % 60
                
                Z.at[index, 'Orderd_picked_hour'] = adjusted_hour
                Z.at[index, 'Orderd_picked_minute'] = adjusted_minute


        column_to_check = 'Orderd_hour'
        Z = Z.dropna(subset=[column_to_check])
        Z=Z.drop(labels=['Time_Orderd'],axis=1)
        Z=Z.drop(labels=['Time_Order_picked'],axis=1)
        
        return Z

In [7]:
# Create a combined preprocessing pipeline
combined_pipeline = Pipeline([
    ('haversine', HaversineDistance()),
    ('time_splitter', TimeSplitter()),
  
])

In [8]:
df_transformed = combined_pipeline.transform(df)

In [9]:
## Independent and dependent features
X = df_transformed.drop(labels=['Time_taken (min)'],axis=1)
Y = df_transformed[['Time_taken (min)']]

In [10]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns  

In [11]:
# Define the custom ranking for each ordinal variable
Weather_conditions_map={"Sunny":1,"Cloudy":2,"Windy":3,"Fog":4,"Stormy":5,"Sandstorms":6}
Road_traffic_density_map={"Low":1,"Medium":2,"High":3,"Jam":4}
Type_of_order_map={"Drinks":1,"Snack":2,"Meal":3,"Buffet":4}
Type_of_vehicle_map={"motorcycle":1,"scooter":2,"electric_scooter":3,"bicycle":4}
City_map={"Semi-Urban":1,"Urban":2,"Metropolitian":3}
Festival_map={"No":0,"Yes":1}

In [12]:
from sklearn.preprocessing import LabelEncoder
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

ordinal_cols = ['Weather_conditions', 'Road_traffic_density', 'Type_of_order','Type_of_vehicle', 'Festival', 'City']
# Categorical Pipeline with Ordinal Encoding
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[list(Weather_conditions_map.keys()), list(Road_traffic_density_map.keys()), list(Type_of_order_map.keys()),list(Type_of_vehicle_map.keys()),
    list(Festival_map.keys()),list(City_map.keys())])),
    ('scaler', StandardScaler())
])
# Specify the columns to be label encoded
#label_encoded_columns = ['Type_of_vehicle', 'Festival', 'City']

# Categorical Pipeline with Label Encoding
#label_pipeline = Pipeline([
    #('imputer', SimpleImputer(strategy='most_frequent')),
   # ('labelencoder', LabelEncoder()),
   # ('scaler', StandardScaler())
#])

# Combine all the preprocessing pipelines using ColumnTransformer

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('ordinal_encoder', ordinal_pipeline, ordinal_cols)
])


In [13]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [14]:
X_train_transformed = pd.DataFrame(ordinal_pipeline.fit_transform(X_train[ordinal_cols]),columns=ordinal_pipeline.get_feature_names_out())

In [15]:
X_train_transformed

Unnamed: 0,Weather_conditions,Road_traffic_density,Type_of_order,Type_of_vehicle,Festival,City
0,1.465244,-1.106963,-1.345218,-0.770179,-0.142815,0.535140
1,-1.488825,-0.304246,-0.447036,-0.770179,-0.142815,0.535140
2,1.465244,-0.304246,1.349328,0.766565,-0.142815,0.535140
3,0.874430,-0.304246,-0.447036,-0.770179,-0.142815,0.535140
4,0.874430,-0.304246,-0.447036,0.766565,-0.142815,0.535140
...,...,...,...,...,...,...
31463,-0.307198,0.498471,-0.447036,0.766565,-0.142815,0.535140
31464,-0.307198,-0.304246,-0.447036,0.766565,-0.142815,0.535140
31465,1.465244,1.301189,-0.447036,-0.770179,-0.142815,-1.787911
31466,-0.307198,-1.106963,1.349328,-0.770179,-0.142815,-1.787911


In [16]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [17]:
X_train

Unnamed: 0,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Distance,num_pipeline__Orderd_hour,num_pipeline__Orderd_minute,num_pipeline__Orderd_picked_hour,num_pipeline__Orderd_picked_minute,ordinal_encoder__Weather_conditions,ordinal_encoder__Road_traffic_density,ordinal_encoder__Type_of_order,ordinal_encoder__Type_of_vehicle,ordinal_encoder__Festival,ordinal_encoder__City
0,-0.720146,-1.213398,0.439184,-0.084777,-1.879470,-1.457320,-1.913741,-0.488076,1.465244,-1.106963,-1.345218,-0.770179,-0.142815,0.535140
1,0.196930,-0.018305,-1.313764,-0.073014,-0.114490,0.147217,-0.145878,0.515700,-1.488825,-0.304246,-0.447036,-0.770179,-0.142815,0.535140
2,-0.108762,-0.018305,-1.313764,-0.075662,0.106133,-1.136412,0.075105,-0.822668,1.465244,-0.304246,1.349328,0.766565,-0.142815,0.535140
3,-0.108762,-0.018305,-1.313764,-0.075979,-0.114490,-1.136412,-0.145878,-0.822668,0.874430,-0.304246,-0.447036,-0.770179,-0.142815,0.535140
4,-0.414454,1.176787,0.439184,-0.081851,0.106133,-0.815505,0.075105,0.181108,0.874430,-0.304246,-0.447036,0.766565,-0.142815,0.535140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31463,-0.108762,1.176787,-1.313764,-0.080344,-1.217602,-0.815505,-1.250793,-0.488076,-0.307198,0.498471,-0.447036,0.766565,-0.142815,0.535140
31464,0.808314,1.176787,0.439184,-0.073196,-0.114490,-1.136412,-0.145878,-0.153484,-0.307198,-0.304246,-0.447036,0.766565,-0.142815,0.535140
31465,-0.720146,-1.213398,-1.313764,-0.077274,0.768000,0.147217,0.738054,0.850292,1.465244,1.301189,-0.447036,-0.770179,-0.142815,-1.787911
31466,0.196930,-1.213398,-1.313764,-0.072803,0.988623,1.109939,1.180020,-1.826444,-0.307198,-1.106963,1.349328,-0.770179,-0.142815,-1.787911


In [18]:
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [19]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [20]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [21]:
LinearRegression()

In [22]:
regression.coef_

array([[-2.39199468, -1.73208143,  2.32233181,  0.27126204, -1.39393027,
        -0.04982164,  2.01925423,  0.08549569,  0.53144519,  2.95125155,
         0.03099495,  0.01511916,  1.76572311,  0.8830633 ]])

In [23]:
regression.intercept_

array([26.38060887])

In [24]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [25]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 7.003663029621975
MAE: 5.546478575641949
R2 score 43.47262078489219


Lasso
Model Training Performance
RMSE: 7.361380555573428
MAE: 5.904601984005518
R2 score 37.55079628920672


Ridge
Model Training Performance
RMSE: 7.00367794680008
MAE: 5.546504628514301
R2 score 43.47237998807544


Elasticnet
Model Training Performance
RMSE: 7.406047819479049
MAE: 5.971439687143129
R2 score 36.79064053151152




In [26]:
numerical_cols

Index(['Delivery_person_Ratings', 'Vehicle_condition', 'multiple_deliveries',
       'Distance', 'Orderd_hour', 'Orderd_minute', 'Orderd_picked_hour',
       'Orderd_picked_minute'],
      dtype='object')