In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from math import sqrt
from matplotlib import pyplot as plt
from lightgbm import LGBMClassifier,LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from ipywidgets import widgets

# Notebook-wide pandas display / warning configuration.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', None)

# Silence warnings for the whole notebook session.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# Installed at module level on purpose: a filter added inside
# `warnings.catch_warnings()` is discarded as soon as the `with` block exits,
# so it would never apply to the cells that follow.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
# Columns removed from both splits before feature engineering.
# Deliberately kept: 'Delivery_person_Ratings', 'Vehicle_condition',
# 'Type_of_vehicle', 'multiple_deliveries'.
unused_columns = [
    'ID', 'Delivery_person_ID', 'City',
    'Delivery_person_Age',
    'Type_of_order',
    'Festival',
]

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

for frame in (train_df, test_df):
    # "NaN" is used as a regex pattern here; matching cells are turned into
    # real missing values (np.nan).
    frame.replace("NaN", np.nan, regex=True, inplace=True)
    frame.drop(columns=unused_columns, inplace=True)

# train_df.sample(10)

In [3]:
# train_df.isna().sum()

In [4]:
# Order dates are written day-first; parse them in both splits.
for frame in (train_df, test_df):
    frame["Order_Date"] = pd.to_datetime(frame["Order_Date"], dayfirst=True)

# The target is stored as text with a "(min) " marker: remove the marker and
# keep the remaining number as an integer.
train_df["Time_taken(min)"] = (
    train_df["Time_taken(min)"]
    .apply(lambda raw: raw.replace("(min) ", ""))
    .astype("int")
)

In [5]:
# Convert the pick-up time column to timedeltas in both splits.
for frame in (train_df, test_df):
    frame["Time_Order_picked"] = pd.to_timedelta(frame["Time_Order_picked"])

In [6]:
# Training rows whose order time is missing (back-filled in the next cell).
order_time_missing = train_df["Time_Orderd"].isna()
time_nan = train_df[order_time_missing]

In [7]:
# Back-fill the missing order times as (pick-up time - time taken in minutes).
# Done as one vectorised assignment over the rows collected in `time_nan`
# instead of a Python loop that writes one `.loc` cell at a time.
missing_rows = time_nan.index
train_df.loc[missing_rows, "Time_Orderd"] = (
    train_df.loc[missing_rows, "Time_Order_picked"]
    - pd.to_timedelta(train_df.loc[missing_rows, "Time_taken(min)"], unit="m")
)

In [8]:
# Fill missing delivery counts with the most frequent value of the same split,
# then store the column as integers.
# The result is assigned back to the frame: `frame[col].fillna(..., inplace=True)`
# is chained assignment through an inplace method, which pandas 2.x deprecates
# and which does not modify the frame when Copy-on-Write is enabled.
for frame in (train_df, test_df):
    most_common = frame['multiple_deliveries'].mode()[0]
    frame['multiple_deliveries'] = (
        frame['multiple_deliveries'].fillna(most_common).astype("int")
    )

In [9]:
# Fill missing weather labels with the most frequent label of the same split.
# Assigned back instead of `frame[col].fillna(..., inplace=True)`: that form is
# chained assignment through an inplace method, deprecated in pandas 2.x and
# ineffective when Copy-on-Write is enabled.
for frame in (train_df, test_df):
    most_common = frame["Weatherconditions"].mode()[0]
    frame["Weatherconditions"] = frame["Weatherconditions"].fillna(most_common)

In [10]:
# Fill missing traffic labels with the most frequent label of the same split.
# Assigned back instead of `frame[col].fillna(..., inplace=True)`: that form is
# chained assignment through an inplace method, deprecated in pandas 2.x and
# ineffective when Copy-on-Write is enabled.
for frame in (train_df, test_df):
    most_common = frame["Road_traffic_density"].mode()[0]
    frame["Road_traffic_density"] = frame["Road_traffic_density"].fillna(most_common)

In [11]:
# Ordinal codes for the traffic labels (the labels carry a trailing space).
traffic_codes = {"Low ": 0, "Medium ": 1, "High ": 2, "Jam ": 3}

for frame in (train_df, test_df):
    encoded = frame["Road_traffic_density"]
    # Apply the replacements one label at a time, in the listed order.
    for label, code in traffic_codes.items():
        encoded = encoded.replace(label, code)
    frame["Road_traffic_density"] = encoded

In [12]:
# Ratings: cast to float, then fill the gaps with the most frequent rating of
# the same split.
for frame in (train_df, test_df):
    ratings = frame['Delivery_person_Ratings'].astype(float)
    frame['Delivery_person_Ratings'] = ratings.fillna(ratings.mode()[0])

In [13]:
# train_df.isna().sum()

In [14]:
# Use absolute values for the restaurant coordinates.
# Applied to BOTH splits: the `dist` feature is later computed from these same
# columns for train_df and test_df, so normalising only train_df would leave
# the two frames with inconsistently derived distances.
# NOTE(review): presumably negative values are sign errors in the raw data - confirm.
for frame in (train_df, test_df):
    for column in ('Restaurant_latitude', 'Restaurant_longitude'):
        frame[column] = frame[column].abs()

In [15]:
# Integer codes for the vehicle types.
vehicle_codes = {"motorcycle": 0, "scooter": 1, "electric_scooter": 2, "bicycle": 3}

for frame in (train_df, test_df):
    # Trim surrounding whitespace first so the labels match the keys above.
    vehicle = frame["Type_of_vehicle"].str.strip()
    for label, code in vehicle_codes.items():
        vehicle = vehicle.replace(label, code)
    frame["Type_of_vehicle"] = vehicle

In [16]:
# Integer codes for the weather labels once the "conditions " marker is removed.
weather_codes = {"Sunny": 0, "Cloudy": 1, "Windy": 2, "Fog": 3, "Stormy": 4, "Sandstorms": 5}

for frame in (train_df, test_df):
    weather = frame["Weatherconditions"].apply(
        lambda text: text.replace("conditions ", "")
    )
    for label, code in weather_codes.items():
        weather = weather.replace(label, code)
    frame["Weatherconditions"] = weather

In [17]:
# train_df.sample(5)

In [18]:
# train_df.describe().T

In [19]:
# Weatherconditionsdf = train_df.groupby("Weatherconditions")["Time_taken(min)"].mean().to_frame().reset_index()
# plt.figure(figsize=(12,6))
# fig , ax = plt.subplots(2,1,figsize = (10,10))
# ax1=sns.barplot(x=Weatherconditionsdf["Weatherconditions"],y=Weatherconditionsdf["Time_taken(min)"],ax=ax[1])
# ax1.set_ylabel("Mean of Time_taken(min)")
# ax2 = sns.countplot(x=train_df["Weatherconditions"],palette="rocket",ax=ax[0])
# plt.tight_layout()

In [20]:
# Road_traffic_densitydf = train_df.groupby("Road_traffic_density")["Time_taken(min)"].mean().to_frame().reset_index()
# plt.figure(figsize=(12,6))
# fig , ax = plt.subplots(2,1,figsize = (12,8))
# ax1=sns.barplot(x=Road_traffic_densitydf["Road_traffic_density"],y=Road_traffic_densitydf["Time_taken(min)"],ax=ax[1])
# ax1.set_ylabel("Mean of Time_taken(min)")
# ax2 = sns.countplot(x=train_df["Road_traffic_density"],palette="magma",ax=ax[0])
# plt.tight_layout()

In [21]:
# plt.figure(figsize=(12,6))
# sns.histplot(x=train_df["Time_taken(min)"], kde=True)
# plt.tight_layout()

In [22]:
# The order date is not used as a feature; remove it from both splits.
for frame in (train_df, test_df):
    frame.drop(columns=["Order_Date"], inplace=True)

# train_df.info()

In [23]:
# train_df.head(10)

In [24]:
# plt.figure(figsize=(12,6))
# sns.heatmap(train_df.corr(numeric_only=True),annot=True)
# plt.tight_layout()

In [25]:
# Express the pick-up timedeltas as a number of seconds.
for frame in (train_df, test_df):
    frame["Time_Order_picked"] = frame["Time_Order_picked"].dt.total_seconds()

In [26]:
# Parse the order time as a timedelta and express it as a number of seconds.
for frame in (train_df, test_df):
    ordered_at = pd.to_timedelta(frame["Time_Orderd"])
    frame["Time_Orderd"] = ordered_at.dt.total_seconds()

In [27]:
# vectorized haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    slightly modified version of http://stackoverflow.com/a/29546836/2901002

    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like of numbers
        Coordinates of the two points. Scalars and arrays may be mixed;
        the usual NumPy broadcasting rules apply.
    to_radians : bool, default True
        Convert the inputs from degrees to radians first. Pass False when
        the inputs are already in radians.
    earth_radius : number, default 6371
        Sphere radius; the result is expressed in the same unit
        (6371 corresponds to kilometres).

    Returns
    -------
    Distance(s) along the great circle, broadcast over the inputs.
    """
    if to_radians:
        # Convert every argument on its own. Stacking all four into a single
        # array (np.radians([lat1, lon1, lat2, lon2])) only works when they
        # have identical shapes, which rules out mixing scalars and arrays.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(coord)) for coord in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    # Rounding can push `a` marginally outside [0, 1], where sqrt/arcsin
    # return NaN; clipping keeps the result finite.
    return earth_radius * 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


# Restaurant-to-delivery-location great-circle distance, added to both splits.
for frame in (train_df, test_df):
    frame['dist'] = haversine(
        frame['Restaurant_latitude'],
        frame['Restaurant_longitude'],
        frame['Delivery_location_latitude'],
        frame['Delivery_location_longitude'],
    )

In [28]:
# train_df.sort_values(by=['dist'], ascending=False).head()

In [29]:
# train_df.sort_values(by=['dist'], ascending=True).head()

In [30]:
# Raw time and coordinate columns are removed from both splits; `dist` stays.
raw_columns = [
    'Time_Order_picked',
    'Time_Orderd',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
]

for frame in (train_df, test_df):
    frame.drop(columns=raw_columns, inplace=True)

In [31]:
# 1st / 99th percentile of the distance feature. Only the disabled outlier
# filter (kept as a comment below) refers to them.
q_low = train_df["dist"].quantile(0.01)
q_hi  = train_df["dist"].quantile(0.99)

# `df_filtered` is the same object as `train_df` (no copy is made), so the
# in-place de-duplication below also removes the duplicate rows from
# `train_df`, which the modelling cells use.
df_filtered = train_df #[(train_df["dist"] < q_hi) & (train_df["dist"] > q_low)]
df_filtered.drop_duplicates(inplace=True)

In [32]:
# plt.figure(figsize=(12,10))
# sns.heatmap(df_filtered.corr(), annot=True)
# plt.tight_layout()

In [33]:
# num_list=[i for i in df_filtered.select_dtypes(["int64","float64"])]
# cat_list=[i for i in df_filtered.select_dtypes("object")]

# k=1
# plt.tight_layout()
# plt.figure(figsize=(12,16))

# for i in df_filtered.loc[:,num_list]:
#     plt.subplot(4,2,k)
#     sns.histplot(df_filtered[i], kde=True)
#     k+=1

In [34]:
# #sns.lineplot(x=df_filtered.dist,y=df_filtered["Time_taken(min)"],data=df_filtered)

# plt.tight_layout()
# plt.figure(figsize=(14,4))

# plt.subplot(1,2,1)
# plt.hist(df_filtered['Time_taken(min)'],color = 'Orange')
# plt.xlabel('Time Taken in minutes')
# plt.ylabel('Frequency')
# plt.title("Histogram of Delivery Time Distribution")


# # fig, ax= plt.subplots()

# plt.subplot(1,2,2)
# plt.hist(df_filtered['dist'],color = 'Orange')
# plt.xlabel('Distance in kilometers')
# plt.ylabel('Frequency')
# plt.title("Histogram of Distance Distribution")


In [35]:
# fig,axes = plt.subplots(1,1, figsize=(14,4))
# plt.xlabel('Time Taken (min)')
# plt.ylabel('Distance (km)')
# sns.lineplot(x = df_filtered["Time_taken(min)"], y = df_filtered['dist'], data = df_filtered)

In [36]:
# fig,axes = plt.subplots(3,1, figsize=(15,15))

# sns.boxplot(data=df_filtered,x = 'Weatherconditions', y ='Time_taken(min)', ax=axes[0], palette="Blues")
# sns.boxplot(data=df_filtered,x ='Road_traffic_density', y='Time_taken(min)', ax=axes[1], palette="Reds")
# sns.boxplot(data=df_filtered,x='Time_taken(min)', y ='dist', ax=axes[2], palette="Oranges")

In [37]:
# df_filtered.describe().T

In [38]:
# print("Training Columns: ", train_df.columns)

In [39]:
# Separate the target from the predictors, then hold out 30% of the rows
# for evaluation (fixed seed so the split is repeatable).
target_column = "Time_taken(min)"
Y = train_df[target_column]
X = train_df.drop(target_column, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [40]:
def RSE(y_true, y_predicted):
    """Return sqrt(RSS / (n - 2)) for the given actual and predicted values.

    RSS is the sum of squared differences between `y_true` and `y_predicted`
    (both converted to NumPy arrays) and n is the length of `y_true`.
    """
    actual = np.array(y_true)
    estimated = np.array(y_predicted)

    residuals = actual - estimated
    squared_error_total = np.sum(np.square(residuals))
    degrees_of_freedom = len(actual) - 2

    return sqrt(squared_error_total / degrees_of_freedom)

In [41]:
#models=[LinearRegression(),RandomForestRegressor(n_estimators=42,n_jobs=-1),AdaBoostRegressor(),KNeighborsRegressor(),DecisionTreeRegressor(),GradientBoostingRegressor(),XGBRegressor(),LGBMRegressor()]
#model_names=["LinearRegression","RandomForestRegressor","AdaBoostRegressor","KNeighborsRegressor","DecisionTreeRegressor","GradientBoostingRegressor","XGBRegressor","LGBMRegressor"]

# Models to compare and the label shown for each in the summary table.
models = [LGBMRegressor()]
model_names = ["LGBMRegressor"]

# One entry per model, in the same order as `models`.
MSE = []
r_2 = []
RSE_ = []
accuracy = []
RMSE = []

# Iterate over the estimators directly rather than over indices that are then
# rebound to the estimator inside the loop body.
for model in models:
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)

    # Computed once and shared by the MSE and RMSE columns.
    mse = mean_squared_error(y_true=y_test, y_pred=y_pre)

    MSE.append(round(mse, 5))
    r_2.append(r2_score(y_true=y_test, y_pred=y_pre))
    RSE_.append(round(RSE(y_test, y_pre), 5))
    # "Accuracy" is the estimator's own `score` on the hold-out set, times 100.
    accuracy.append((model.score(X_test, y_test)) * 100)
    RMSE.append(sqrt(mse))

d = pd.DataFrame({'Modelling Name': model_names, 'MSE': MSE, "R_2": r_2, "RSE": RSE_, "Accuracy": accuracy, "RMSE": RMSE})
# d

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 306
[LightGBM] [Info] Number of data points in the train set: 31867, number of used features: 7
[LightGBM] [Info] Start training from score 26.286189


In [42]:
# Hold-out predictions of the first (and only) fitted model.
fitted_model = models[0]
predictions = fitted_model.predict(X_test)
# predictions

In [43]:
# fig, ax = plt.subplots()

# x_min, x_max = 9.0, 55.0
# y_min, y_max = 9.0, 55.0
# ax.set_ylim(y_min, y_max)
# ax.set_xlim(x_min, x_max)

# ax.set_xlabel('y_test  = actual values', fontsize=12, labelpad=1)
# ax.set_ylabel('y_pred = predicted values', fontsize=12)
# plt.scatter(y_test, predictions)


In [44]:
# sns.histplot((y_test-predictions), kde=True)

In [45]:
# fig, ax = plt.subplots()
# x_min, x_max = 10.0, 50.0
# y_min, y_max = 10.0, 50.0
# m = 4 # margin of error
# x = np.linspace(x_min, x_max, 50)
# ax.set_ylim(y_min, y_max)
# ax.set_xlim(x_min, x_max)

# plt.plot(x, x, 'g', linewidth=3)
# plt.scatter(y_test, predictions, color="black")
# ax.fill_between(x, x - m, x + m, alpha=0.4)

# ax.set_xlabel('y_test  = actual values', fontsize=8, labelpad=1)
# ax.set_ylabel('y_pre(d = predicted values', fontsize=8)
# plt.figtext(0.5, -0.06, "Black dots = actual values paired with predicted values. \n \
#     green line = when predicted values equal actual values. \n \
#     blue area = margin of error: 4 (min)", wrap=True, horizontalalignment='center' );

In [48]:
# Common label width so the descriptions of all inputs line up.
label_style = {'description_width': '100px'}

# Input controls. Dropdown options are (label, value) pairs; the values are
# what gets passed to the model.
person_rating = widgets.Dropdown(
    options=[(str(score), score) for score in range(1, 7)],
    value=5,
    description='Driver Rating:',
    style=dict(label_style),
)

weather_labels = ['Sunny', 'Cloudy', 'Windy', 'Fog', 'Stormy', 'Sandstorm']
weather_cond = widgets.Dropdown(
    options=[(label, code) for code, label in enumerate(weather_labels)],
    value=0,
    description='Weather:',
    style=dict(label_style),
)

traffic = widgets.FloatSlider(
    description='Traffic Density:',
    min=0,
    max=3,
    step=1,
    style=dict(label_style),
)
distance = widgets.FloatSlider(
    description='Distance (km):',
    min=1.0,
    max=20.0,
    step=0.1,
    style=dict(label_style),
)

# Buttons to request a prediction from the input values and to clear the
# results, plus the output area the results are printed into.
button_predict = widgets.Button(description='Predict Delivery Time')
button_clear = widgets.Button(description='Clear Output')
button_ouput = widgets.Output()

#Defines what happens when you click the button 
def on_click_predict(b):
    """Predict a delivery time from the current widget values and print it.

    Rating, weather, traffic density and distance come from the form widgets;
    vehicle condition, vehicle type and multiple deliveries are fixed at 0.
    The prediction is made with `models[0]` and written to `button_ouput`.

    `b` is the argument passed in by the `on_click` callback registration;
    it is not used.
    """
    data = {
        "Delivery_person_Ratings": [person_rating.value],
        "Weatherconditions": [weather_cond.value],
        "Road_traffic_density": [traffic.value],
        "Vehicle_condition": [0],
        "Type_of_vehicle": [0],
        "multiple_deliveries": [0],
        "dist": [distance.value]
    }
    udf = pd.DataFrame(data)
    # Put the columns in the order the model was fitted on, so the result does
    # not depend on the key order of `data`. A missing column raises KeyError
    # instead of silently feeding values into the wrong feature.
    udf = udf[list(X_train.columns)]

    pred = models[0].predict(udf)
    with button_ouput:
        # `pred` is a one-element array; show the number itself, not the array repr.
        print('Predicted Delivery Time (minutes) = ' + str(round(float(pred[0]), 1)))

def on_click_clear(b):
    """Clear the contents of the `button_ouput` output area.

    `b` is the argument passed in by the `on_click` callback registration;
    it is not used.
    """
    button_ouput.clear_output()

# Attach each button to its click handler.
for button, handler in (
    (button_predict, on_click_predict),
    (button_clear, on_click_clear),
):
    button.on_click(handler)

# Bold heading (ANSI escape codes), followed by the form: inputs, buttons and
# the output area stacked vertically. The VBox is the cell's last expression
# so that the notebook renders it.
print('\033[1m' 'Enter parameters and make a prediction:' '\033[0m')
form_children = [person_rating, weather_cond, traffic, distance,
                 button_predict, button_clear, button_ouput]
widgets.VBox(form_children)


[1mEnter parameters and make a prediction:[0m


VBox(children=(Dropdown(description='Driver Rating:', index=4, options=(('1', 1), ('2', 2), ('3', 3), ('4', 4)…