In [190]:
import pandas as pd
import numpy as np
import seaborn as sns

In [191]:
df = pd.read_pickle("data/sendy3.pkl")
df.head()
df.dtypes

order_no                            object
user_id                             object
vehicle_type                        object
platform_type                       object
customer_type                       object
placed_day                          object
placed_wkday                        object
placed_time                 datetime64[ns]
confirmed_day                       object
confirmed_wkday                     object
confirmed_time              datetime64[ns]
arrive_pickup_day                   object
arrive_pickup_wkday                 object
arrive_pickup_time          datetime64[ns]
pickup_day                          object
pickup_wkday                        object
pickup_time                 datetime64[ns]
delivered_day                       object
delivered_wkday                     object
delivered_time              datetime64[ns]
distance_covered_km                  int64
temp                               float64
pickup_lat                         float64
pickup_long

In [192]:
df.isnull().sum()

order_no                      0
user_id                       0
vehicle_type                  0
platform_type                 0
customer_type                 0
placed_day                    0
placed_wkday                  0
placed_time                   0
confirmed_day                 0
confirmed_wkday               0
confirmed_time                0
arrive_pickup_day             0
arrive_pickup_wkday           0
arrive_pickup_time            0
pickup_day                    0
pickup_wkday                  0
pickup_time                   0
delivered_day                 0
delivered_wkday               0
delivered_time                0
distance_covered_km           0
temp                          0
pickup_lat                    0
pickup_long                   0
delivered_lat                 0
delivered_long                0
Rider Id                      0
time_pickup_to_delivered      0
No_Of_Orders                  0
Age                           0
Average_Rating                0
No_of_Ra

In [193]:
df['confirmed_wkday'].unique()

array([5, 2, 1, 3, 4, 6, 7], dtype=object)

In [194]:
df['placed_wkday'].unique()

array([5, 2, 1, 3, 4, 6, 7], dtype=object)

In [195]:
## grouping the day variables [0 for weekend and 1 for weekday]
df.loc[(df["placed_wkday"] <= 5) , "placed_wkday_type"] = 1
df.loc[(df["placed_wkday"] > 5), "placed_wkday_type"] = 0

df.loc[(df["confirmed_wkday"] <= 5) , "confirmed_wkday_type"] = 1
df.loc[(df["confirmed_wkday"] > 5) , "confirmed_wkday_type"] = 0

df.loc[(df["arrive_pickup_wkday"] <= 5) , "arrive_pickup_wkday"] = 1
df.loc[(df["arrive_pickup_wkday"] > 5) , "arrive_pickup_wkday"] = 0

df.loc[(df["delivered_wkday"] <= 5) , "delivered_wkday"] = 1
df.loc[(df["delivered_wkday"] > 5) , "delivered_wkday"] = 0

df['pickup_wkday_type'].replace(['Weekday', 'Weekend'], [1,0], inplace = True)
## grouping the time variables [0 for inactive, and 1 for active hours]

df.loc[(df["placed_time"].dt.hour >= 7) & (df["placed_time"].dt.hour < 9) , "placed_hr_type"] = 0
df.loc[(df["placed_time"].dt.hour > 17) & (df["placed_time"].dt.hour <= 19) , "placed_hr_type"] = 0
df.loc[(df["placed_time"].dt.hour >= 9) & (df["placed_time"].dt.hour <= 17), "placed_hr_type"] = 1

df.loc[(df["confirmed_time"].dt.hour >= 7) & (df["confirmed_time"].dt.hour < 9) , "confirmed_hr_type"] = 0
df.loc[(df["confirmed_time"].dt.hour > 17) & (df["confirmed_time"].dt.hour <= 19) , "confirmed_hr_type"] = 0
df.loc[(df["confirmed_time"].dt.hour >= 9) & (df["confirmed_time"].dt.hour <= 17), "confirmed_hr_type"] = 1

df.loc[(df["arrive_pickup_time"].dt.hour >= 7) & (df["arrive_pickup_time"].dt.hour < 9) , "arrive_pickup_hr_type"] = 0
df.loc[(df["arrive_pickup_time"].dt.hour > 17) & (df["arrive_pickup_time"].dt.hour <= 19) , "arrive_pickup_hr_type"] = 0
df.loc[(df["arrive_pickup_time"].dt.hour >= 9) & (df["arrive_pickup_time"].dt.hour <= 17), "arrive_pickup_hr_type"] = 1

df['delivered_hr_type'].replace(['Active Hours', 'Inactive Hours'], [1,0], inplace = True)
df['pickup_hr_type'].replace(['Active Hours', 'Inactive Hours'], [1,0], inplace = True)


In [196]:
## Dropping all the unneeded features
df.drop(['order_no','user_id', 'placed_wkday', 'placed_time', 'confirmed_wkday', 'confirmed_time', 'arrive_pickup_wkday', 'arrive_pickup_time', 'pickup_wkday', 'pickup_time', 'delivered_wkday', 'delivered_time', 'Rider Id',
        'pickup_hr', 'delivered_hr'], axis =1, inplace=True)

In [197]:
## Converting string values into numeric
df['vehicle_type'].replace(["Bike"], [1], inplace= True)
df["platform_type"].replace(['1', '2', '3', '4'], [1,2,3,4], inplace = True)
df["customer_type"].replace(["Business", "Personal"], [0,1], inplace= True)

df["placed_day"] = df["placed_day"].astype(str).astype(int)
df["confirmed_day"] = df["confirmed_day"].astype(str).astype(int)
df["arrive_pickup_day"] = df["arrive_pickup_day"].astype(str).astype(int)
df["pickup_day"] = df["pickup_day"].astype(str).astype(int)
df["delivered_day"] = df["delivered_day"].astype(str).astype(int)



In [198]:
df['speed_type'].replace(['slow', 'average', 'fast'], [0,1,2], inplace= True)

In [199]:
df.dtypes

vehicle_type                  int64
platform_type                 int64
customer_type                 int64
placed_day                    int32
confirmed_day                 int32
arrive_pickup_day             int32
pickup_day                    int32
delivered_day                 int32
distance_covered_km           int64
temp                        float64
pickup_lat                  float64
pickup_long                 float64
delivered_lat               float64
delivered_long              float64
time_pickup_to_delivered      int64
No_Of_Orders                  int64
Age                           int64
Average_Rating              float64
No_of_Ratings                 int64
pickup_hr_type              float64
delivered_hr_type           float64
pickup_wkday_type             int64
speed_ms                    float64
speed_type                    int64
placed_wkday_type           float64
confirmed_wkday_type        float64
placed_hr_type              float64
confirmed_hr_type           

In [200]:
## Rearranging the target column
last_col = df.pop('time_pickup_to_delivered')

In [201]:
df.insert(0,'time_pickup_to_delivered', last_col)

In [202]:
## Checking for null values
df.isna().sum()

time_pickup_to_delivered      0
vehicle_type                  0
platform_type                 0
customer_type                 0
placed_day                    0
confirmed_day                 0
arrive_pickup_day             0
pickup_day                    0
delivered_day                 0
distance_covered_km           0
temp                          0
pickup_lat                    0
pickup_long                   0
delivered_lat                 0
delivered_long                0
No_Of_Orders                  0
Age                           0
Average_Rating                0
No_of_Ratings                 0
pickup_hr_type               66
delivered_hr_type           110
pickup_wkday_type             0
speed_ms                      0
speed_type                    0
placed_wkday_type             0
confirmed_wkday_type          0
placed_hr_type               43
confirmed_hr_type            42
arrive_pickup_hr_type        53
dtype: int64

In [203]:
## imputing the columns with NAs 
import missingno as msno

In [204]:
df.pickup_hr_type.fillna(df.pickup_hr_type.mean(), inplace= True)
df.delivered_hr_type.fillna(df.delivered_hr_type.mean(), inplace= True)
df.placed_hr_type.fillna(df.placed_hr_type.mean(), inplace=True)
df.confirmed_hr_type.fillna(df.confirmed_hr_type.mean(), inplace= True)
df.arrive_pickup_hr_type.fillna(df.arrive_pickup_hr_type.mean(), inplace=True)

In [205]:
df.isna().sum()
### yesss, no missing data!!

time_pickup_to_delivered    0
vehicle_type                0
platform_type               0
customer_type               0
placed_day                  0
confirmed_day               0
arrive_pickup_day           0
pickup_day                  0
delivered_day               0
distance_covered_km         0
temp                        0
pickup_lat                  0
pickup_long                 0
delivered_lat               0
delivered_long              0
No_Of_Orders                0
Age                         0
Average_Rating              0
No_of_Ratings               0
pickup_hr_type              0
delivered_hr_type           0
pickup_wkday_type           0
speed_ms                    0
speed_type                  0
placed_wkday_type           0
confirmed_wkday_type        0
placed_hr_type              0
confirmed_hr_type           0
arrive_pickup_hr_type       0
dtype: int64

In [206]:
from sklearn.model_selection import train_test_split

In [207]:
train, test = train_test_split(df, test_size = 0.2, random_state =0)

print('The training set has ', train.shape[0], 'rows')
print('The test set has ', test.shape[0], 'rows')

The training set has  14211 rows
The test set has  3553 rows


In [208]:
## Initislizing the KNN model
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [209]:
## Fitting the KNN model
knn.fit(X=train.iloc[:,1:], y=train.time_pickup_to_delivered)

KNeighborsRegressor()

In [210]:
## Standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df = scaler.fit_transform(train)

In [211]:
y_pred = knn.predict(X=test.iloc[:,1:])

In [212]:
### Calculating the evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [213]:
mean_squared_error(y_true=test['time_pickup_to_delivered'], y_pred = y_pred)

704430.7532789191

In [214]:
mean_absolute_error(y_true=test['time_pickup_to_delivered'], y_pred = y_pred)

595.7023360540388

In [215]:
r2_score(y_true=test['time_pickup_to_delivered'], y_pred =y_pred)

0.18927825268709075

In [216]:
## Choosing K
knn1 = KNeighborsRegressor(n_neighbors = 1) # K = 1
knn1.fit(X=train.iloc[:,1:], y=train.time_pickup_to_delivered)
y_pred1 = knn1.predict(X=test.iloc[:,1:])

knn3 = KNeighborsRegressor(n_neighbors = 3) # K = 3
knn3.fit(X=train.iloc[:,1:], y=train.time_pickup_to_delivered)
y_pred3 = knn3.predict(X=test.iloc[:,1:])

knn5 = KNeighborsRegressor(n_neighbors = 5) # K = 5
knn5.fit(X=train.iloc[:,1:], y=train.time_pickup_to_delivered)
y_pred5 = knn5.predict(X=test.iloc[:,1:])


In [217]:
print('MAE')
print('K = 1\t', mean_absolute_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred1))
print('K = 3\t', mean_absolute_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred3))
print('K = 5\t', mean_absolute_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred5))


MSE
K = 1	 645.9197860962566
K = 3	 589.5702223473122
K = 5	 595.7023360540388


In [218]:
print('MSE')
print('K = 1\t', mean_squared_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred1))
print('K = 3\t', mean_squared_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred3))
print('K = 5\t', mean_squared_error(y_true=test['time_pickup_to_delivered'], y_pred=y_pred5))

MSE
K = 1	 890055.1784407543
K = 3	 701824.027988867
K = 5	 704430.7532789191


In [219]:
print('R^2 Score')
print('K = 1\t', r2_score(y_true=test['time_pickup_to_delivered'], y_pred =y_pred1))
print('K = 3\t', r2_score(y_true=test['time_pickup_to_delivered'], y_pred=y_pred3))
print('K = 5\t', r2_score(y_true=test['time_pickup_to_delivered'], y_pred=y_pred5))

R^2 Score
K = 1	 -0.024354893808390177
K = 3	 0.19227830467527962
K = 5	 0.18927825268709075


In [None]:
### Notice that k =3 is the best option from the following since it has the lowest MAE and MSE and the highest R2 score

In [220]:
from sklearn import metrics #accuracy measure

In [226]:
print("The accuracy of the KNN with K = 3, is ", metrics.accuracy_score(y_pred3,test['time_pickup_to_delivered']))

ValueError: Classification metrics can't handle a mix of continuous and multiclass targets