In [1]:
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler
import os
import ast
import glob
import numpy as np
import pandas as pd
from collections import Counter
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

def point_in_polygon(data, polygons_df):
    """Find, for every row, the polygons that contain its pickup and destination points.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Pickup Long', 'Pickup Lat',
        'Destination Long' and 'Destination Lat'.
    polygons_df : mapping or DataFrame
        ``polygons_df['features']`` is iterated; each item must expose
        ``['properties']['MOVEMENT_ID']`` and ``['geometry']['coordinates'][0]``
        (used as the polygon ring; presumably GeoJSON features -- verify
        against the input file).

    Returns
    -------
    tuple(list, list)
        ``(out_pickup, out_dest)``: one inner list of matching MOVEMENT_IDs per
        row of ``data``, in row order.
    """
    # Build every polygon once instead of once per (row, polygon) pair.
    zones = [(feature['properties']['MOVEMENT_ID'],
              Polygon(feature['geometry']['coordinates'][0]))
             for feature in polygons_df['features']]

    def _containing_zones(longitudes, latitudes):
        # Pair coordinates positionally so the result does not depend on the
        # frame having a 0..n-1 index (the old data.loc[n, ...] lookup did).
        matches = []
        for lng, lat in zip(longitudes, latitudes):
            point = Point(lng, lat)
            # polygon.contains(point) and point.within(polygon) are the same
            # predicate seen from both sides, so one check is enough.
            matches.append([mov_id for mov_id, polygon in zones
                            if polygon.contains(point)])
        return matches

    out_dest = _containing_zones(data['Destination Long'], data['Destination Lat'])
    out_pickup = _containing_zones(data['Pickup Long'], data['Pickup Lat'])
    return out_pickup, out_dest


def cleanup_lists(lst):
    """Collapse each inner list to a single integer code.

    An empty inner list becomes -999, a single-element list becomes
    ``int(element)``, and any longer list becomes -888.
    """
    def _collapse(ids):
        count = len(ids)
        if count == 1:
            return int(ids[0])
        return -999 if count == 0 else -888

    return [_collapse(ids) for ids in lst]

Using TensorFlow backend.
  data = yaml.load(f.read()) or {}


In [2]:
# Load the three CSV inputs in one pass, then the hexcluster polygon file.
train_data, riders_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/Train.csv', 'Data/Riders.csv', 'Data/Test.csv')
)
polygon_points = pd.read_json('Data/uber/540_hexclusters.json', orient='values')


In [3]:
train_data.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


In [4]:
def time_from_midnight_in_seconds(data, column):
    """Return the time of day in ``data[column]`` as seconds since midnight.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is not modified.
    column : str
        Name of a column parseable by ``pd.to_datetime`` (e.g. '9:35:46 AM').

    Returns
    -------
    pandas.Series
        Float seconds elapsed since midnight of each value's own date
        (NaN where the value could not be represented).
    """
    parsed = pd.to_datetime(data[column])
    # Subtract each timestamp's own midnight rather than "today": the date the
    # parser attaches to a time-only string is an implementation detail, and
    # total_seconds() yields plain floats on every pandas version.
    return (parsed - parsed.dt.normalize()).dt.total_seconds()


def delta_time(dataset, higher_time, lower_time):
    """Return ``dataset[higher_time]`` minus ``dataset[lower_time]``."""
    later = dataset[higher_time]
    earlier = dataset[lower_time]
    return later - earlier

def time_to_day_part(time):
    """Map a number of seconds to 'Night', 'Morning', 'Afternoon' or 'Evening'.

    ``time`` is divided by 3600; the first exclusive upper bound (6, 12, 18)
    that the quotient falls below picks the label, otherwise 'Evening'.
    """
    hours = time/3600
    for upper_bound, label in ((6, 'Night'), (12, 'Morning'), (18, 'Afternoon')):
        if hours < upper_bound:
            return label
    return 'Evening'
    
def calculate_bearing(lat1, lng1, lat2, lng2):
    """Compute the initial bearing from point 1 to point 2 in degrees.

    Inputs are latitudes/longitudes in degrees (scalars or array-likes that
    numpy can operate on element-wise). The result lies in [0, 360).
    """
    phi1, phi2 = np.deg2rad(lat1), np.deg2rad(lat2)
    delta_lng = np.deg2rad(lng2 - lng1)
    east = np.sin(delta_lng) * np.cos(phi2)
    north = np.cos(phi1) * np.sin(phi2) - (np.sin(phi1)
                                           * np.cos(phi2) * np.cos(delta_lng))
    # np.arctan2 returns radians in [-pi, pi]; convert to degrees and wrap
    # the signed angle into a compass bearing.
    signed_degrees = np.rad2deg(np.arctan2(east, north))
    return (signed_degrees + 360) % 360


In [5]:
# Convert every clock-time column to seconds from midnight.
_shared_time_columns = ['Placement - Time', 'Confirmation - Time',
                        'Pickup - Time', 'Arrival at Pickup - Time']

# train data also has the arrival-at-destination time
for _column in _shared_time_columns + ['Arrival at Destination - Time']:
    train_data[_column] = time_from_midnight_in_seconds(train_data, _column)

# test data only has the four shared columns converted
for _column in _shared_time_columns:
    test_data[_column] = time_from_midnight_in_seconds(test_data, _column)


In [6]:
# calculating delta_time for train data
# Each variable holds (first named column) - (second named column) via delta_time.
# NOTE(review): none of these delta_* variables is referenced again in the
# visible part of the notebook -- confirm whether they were meant to become features.
delta_confirm_place_train = delta_time(train_data, 'Confirmation - Time', 'Placement - Time')
delta_pick_arr_confirm_train = delta_time(train_data, 'Arrival at Pickup - Time', 'Confirmation - Time')
# despite its name this is Pickup - Arrival at Pickup
delta_pickup_confirm_train = delta_time(train_data, 'Pickup - Time', 'Arrival at Pickup - Time')
# Arrival at Destination - Pickup
delta_arrival_pickup_train = delta_time(train_data, 'Arrival at Destination - Time', 'Pickup - Time')
delta_placement_arrival_train = delta_time(train_data, 'Arrival at Destination - Time', 'Placement - Time')

# same deltas for test data (no arrival-at-destination based deltas are computed here)
delta_confirm_place_test = delta_time(test_data, 'Confirmation - Time', 'Placement - Time')
delta_pick_arr_confirm_test = delta_time(test_data, 'Arrival at Pickup - Time', 'Confirmation - Time')
# despite its name this is Pickup - Arrival at Pickup
delta_pickup_confirm_test = delta_time(test_data, 'Pickup - Time', 'Arrival at Pickup - Time')


In [7]:
# Attach rider attributes to every order.
train_with_rider_info = train_data.merge(riders_data, on='Rider Id')
# Keep every test order even when its rider is missing from riders_data:
# an inner merge would silently drop such rows and the submission would
# then lack predictions for them.
test_with_rider_info = test_data.merge(riders_data, on='Rider Id', how='left')


In [8]:
# Keep only orders that were confirmed on the same weekday and day of month
# as they were placed.
_same_weekday = (train_with_rider_info['Placement - Weekday (Mo = 1)']
                 == train_with_rider_info['Confirmation - Weekday (Mo = 1)'])
_same_day_of_month = (train_with_rider_info['Placement - Day of Month']
                      == train_with_rider_info['Confirmation - Day of Month'])
train_with_rider_info = train_with_rider_info[_same_weekday & _same_day_of_month]


In [9]:
# Impute missing temperatures with the TRAINING mean for both frames so the
# test set is preprocessed with statistics learned from training data only.
# Re-assigning through .assign avoids the chained `df[col].fillna(inplace=True)`
# pattern, which is not guaranteed to write back into the frame.
_temperature_fill = train_with_rider_info['Temperature'].mean()
train_with_rider_info = train_with_rider_info.assign(
    Temperature=train_with_rider_info['Temperature'].fillna(_temperature_fill))
test_with_rider_info = test_with_rider_info.assign(
    Temperature=test_with_rider_info['Temperature'].fillna(_temperature_fill))


In [10]:
# label encoding of personal/business column
# Fit the encoder once on the training labels and reuse it for the test data,
# so both frames are guaranteed to share the same label -> integer mapping.
# (transform raises on a label never seen in training instead of silently
# assigning it a different code.)
labelencoder_personal_business = LabelEncoder()
train_with_rider_info['Personal or Business'] = labelencoder_personal_business.fit_transform(train_with_rider_info['Personal or Business'])

# apply the training mapping to the test data
test_with_rider_info['Personal or Business'] = labelencoder_personal_business.transform(test_with_rider_info['Personal or Business'])


In [11]:
# one hot encoding of the train_data['Platform Type'] column
# dtype=int pins the indicator columns to numeric 0/1; newer pandas versions
# default to bool, which changes how the columns print and convert downstream.
_platform_dummies = pd.get_dummies(train_with_rider_info['Platform Type'].astype('category'), dtype=int)
train_with_rider_info = pd.concat([train_with_rider_info.drop(columns=['Platform Type']), _platform_dummies], axis=1)



In [12]:
# one hot encoding of the test_data['Platform Type'] column
# dtype=int pins the indicator columns to numeric 0/1; newer pandas versions
# default to bool, which changes how the columns print and convert downstream.
_platform_dummies = pd.get_dummies(test_with_rider_info['Platform Type'].astype('category'), dtype=int)
test_with_rider_info = pd.concat([test_with_rider_info.drop(columns=['Platform Type']), _platform_dummies], axis=1)


In [13]:
# Bearing from pickup to destination for every order (argument order:
# pickup lat, pickup long, destination lat, destination long).
_bearing_columns = ('Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long')

bearing_test = calculate_bearing(*(test_with_rider_info[_col] for _col in _bearing_columns))

bearing_train = calculate_bearing(*(train_with_rider_info[_col] for _col in _bearing_columns))


In [14]:
# Store the computed bearings as a new 'Bearing' column on each frame.
test_with_rider_info = test_with_rider_info.assign(Bearing=bearing_test)
train_with_rider_info = train_with_rider_info.assign(Bearing=bearing_train)


```
# drop redundant columns train data
train_with_rider_info.drop(columns=['Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
                                    'Arrival at Pickup - Day of Month','Arrival at Pickup - Weekday (Mo = 1)',
                                    'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
                                    'Arrival at Destination - Day of Month',
                                    'Arrival at Destination - Weekday (Mo = 1)',
                                    'Arrival at Destination - Time',
                                    'Pickup Lat', 'Pickup Long',
                                    'Destination Lat', 'Destination Long',
                                    'Vehicle Type', 'Order No',
                                    'User Id', 'Rider Id',
                                    'Precipitation in millimeters'], inplace=True)
                                    
# drop redundant columns test data
test_with_rider_info.drop(columns=['Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
                                   'Arrival at Pickup - Day of Month', 'Arrival at Pickup - Weekday (Mo = 1)',
                                   'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
                                   'Vehicle Type', 'User Id', 'Rider Id', 'Precipitation in millimeters',
                                   'Pickup Lat', 'Pickup Long',
                                   'Destination Lat', 'Destination Long'], inplace=True)                                    
```

In [15]:
# Remove columns that are not used as training features: the later-stage
# day/weekday stamps, the arrival-at-destination time, identifiers,
# vehicle type and precipitation.
_redundant_train_columns = [
    'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
    'Arrival at Pickup - Day of Month', 'Arrival at Pickup - Weekday (Mo = 1)',
    'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
    'Vehicle Type', 'Order No',
    'User Id', 'Rider Id',
    'Precipitation in millimeters',
]
train_with_rider_info = train_with_rider_info.drop(columns=_redundant_train_columns)


In [16]:
# The platform dummy columns are labelled with the integers 1..4; give them
# readable names on both frames.
_platform_labels = {code: "Platform Type {}".format(code) for code in (1, 2, 3, 4)}

train_with_rider_info = train_with_rider_info.rename(columns=_platform_labels)

test_with_rider_info = test_with_rider_info.rename(columns=_platform_labels)


In [17]:
# Shorten the placement day columns' names on both frames.
_placement_renames = {'Placement - Day of Month': 'Day of Month',
                      'Placement - Weekday (Mo = 1)': 'Weekday (Mo = 1)'}

train_with_rider_info = train_with_rider_info.rename(columns=_placement_renames)

test_with_rider_info = test_with_rider_info.rename(columns=_placement_renames)


In [18]:
# one hot encoding of the test_data['Weekday (Mo = 1)'] column
# Pin the categories to 1..7 so all seven indicator columns exist even when a
# weekday is absent from this frame (later cells index every day by name),
# and pin dtype=int so the indicators are numeric 0/1 on every pandas version.
_weekday_dtype = pd.CategoricalDtype(categories=list(range(1, 8)))
_weekday_dummies = pd.get_dummies(test_with_rider_info['Weekday (Mo = 1)'].astype(_weekday_dtype), dtype=int)
test_with_rider_info = pd.concat([test_with_rider_info.drop(columns=['Weekday (Mo = 1)']), _weekday_dummies], axis=1)


In [19]:
# one hot encoding of the train_data['Weekday (Mo = 1)'] column
# Pin the categories to 1..7 so all seven indicator columns exist even when a
# weekday is absent from this frame (later cells index every day by name),
# and pin dtype=int so the indicators are numeric 0/1 on every pandas version.
_weekday_dtype = pd.CategoricalDtype(categories=list(range(1, 8)))
_weekday_dummies = pd.get_dummies(train_with_rider_info['Weekday (Mo = 1)'].astype(_weekday_dtype), dtype=int)
train_with_rider_info = pd.concat([train_with_rider_info.drop(columns=['Weekday (Mo = 1)']), _weekday_dummies], axis=1)


In [20]:

# The weekday dummy columns are labelled 1..7 (Mo = 1); rename them to day names.
_weekday_names = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    start=1))

train_with_rider_info = train_with_rider_info.rename(columns=_weekday_names)

test_with_rider_info = test_with_rider_info.rename(columns=_weekday_names)


In [21]:
# labeling part of the day in train data 
# train_with_rider_info['Placement - Time'] = train_with_rider_info['Placement - Time'].apply(time_to_day_part)
# train_with_rider_info['Confirmation - Time'] = train_with_rider_info['Confirmation - Time'].apply(time_to_day_part)
# train_with_rider_info['Arrival at Pickup - Time'] = train_with_rider_info['Arrival at Pickup - Time'].apply(time_to_day_part)
# train_with_rider_info['Pickup - Time'] = train_with_rider_info['Pickup - Time'].apply(time_to_day_part)


In [22]:
# labeling part of the day in test data
# test_with_rider_info['Placement - Time'] = test_with_rider_info['Placement - Time'].apply(time_to_day_part)
# test_with_rider_info['Confirmation - Time'] = test_with_rider_info['Confirmation - Time'].apply(time_to_day_part)
# test_with_rider_info['Arrival at Pickup - Time'] = test_with_rider_info['Arrival at Pickup - Time'].apply(time_to_day_part)
# test_with_rider_info['Pickup - Time'] = test_with_rider_info['Pickup - Time'].apply(time_to_day_part)


In [23]:
# label encoding, one hot encoding, and renaming of times
# labelencoder = LabelEncoder()
# train_with_rider_info['Placement - Time'] = labelencoder.fit_transform(train_with_rider_info['Placement - Time'])
# train_with_rider_info['Placement - Time'] = train_with_rider_info['Placement - Time'].astype('category')
# train_with_rider_info = pd.concat([train_with_rider_info.drop(columns=['Placement - Time']), pd.get_dummies(train_with_rider_info['Placement - Time'])], axis=1)



## Uber Data

In [24]:
# Paths of the Uber Movement aggregate files.
HOURLY_ONLYWEEKDAYS = 'Data/uber/nairobi-hexclusters-2019-3-OnlyWeekdays-HourlyAggregate.csv'
HOURLY_ONLYWEEKENDS = 'Data/uber/nairobi-hexclusters-2019-3-OnlyWeekends-HourlyAggregate.csv'
WEEKLY_AGREGATES = 'Data/uber/nairobi-hexclusters-2019-3-WeeklyAggregate.csv'

hourly_onlyweekdays = pd.read_csv(HOURLY_ONLYWEEKDAYS)
hourly_onlyweekends = pd.read_csv(HOURLY_ONLYWEEKENDS)
weekly_agregates = pd.read_csv(WEEKLY_AGREGATES)

# Preview each table (weekends first, matching the original display order).
for _preview in (hourly_onlyweekends, hourly_onlyweekdays, weekly_agregates):
    display(_preview.head(3))

# Key each hourly row as "<sourceid>_<dstid>_<hod>_<WD|WE>".
for _frame, _suffix in ((hourly_onlyweekdays, "WD"), (hourly_onlyweekends, "WE")):
    _frame['ID'] = (_frame['sourceid'].astype(str) + "_"
                    + _frame['dstid'].astype(str) + "_"
                    + _frame['hod'].astype(str) + "_" + _suffix)

uber_hourly_data = pd.concat([hourly_onlyweekdays, hourly_onlyweekends], axis=0)
uber_hourly_data.head()

Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,397,391,14,1725.07,528.66,1656.26,1.32
1,258,390,1,665.49,484.41,577.44,1.6
2,388,26,9,1134.77,493.96,1044.47,1.49


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,133,241,7,1090.94,733.73,967.71,1.55
1,130,271,7,1999.54,854.05,1829.63,1.55
2,142,151,7,606.56,368.62,487.25,2.19


Unnamed: 0,sourceid,dstid,dow,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,70,259,1,1397.93,809.34,1275.11,1.46
1,355,200,3,912.69,312.4,869.08,1.35
2,335,400,3,1280.75,428.83,1228.0,1.31


Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,ID
0,133,241,7,1090.94,733.73,967.71,1.55,133_241_7_WD
1,130,271,7,1999.54,854.05,1829.63,1.55,130_271_7_WD
2,142,151,7,606.56,368.62,487.25,2.19,142_151_7_WD
3,271,273,11,1502.18,772.19,1362.81,1.52,271_273_11_WD
4,264,343,11,1643.27,383.46,1606.93,1.22,264_343_11_WD


In [25]:
# Key each weekly row as "<sourceid>_<dstid>_<dow>".
_source_part, _dest_part, _dow_part = (
    weekly_agregates[_col].astype(str) for _col in ('sourceid', 'dstid', 'dow'))
weekly_agregates['ID'] = _source_part + "_" + _dest_part + "_" + _dow_part
weekly_agregates.head()


Unnamed: 0,sourceid,dstid,dow,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,ID
0,70,259,1,1397.93,809.34,1275.11,1.46,70_259_1
1,355,200,3,912.69,312.4,869.08,1.35,355_200_3
2,335,400,3,1280.75,428.83,1228.0,1.31,335_400_3
3,354,210,3,1752.71,742.27,1631.55,1.45,354_210_3
4,72,239,1,1425.7,395.95,1377.96,1.29,72_239_1


## Training Neural Networks

In [None]:
print(train_with_rider_info.columns)

In [26]:
# Recover a single weekday number (Monday=1 .. Sunday=7) from the one-hot columns.
_day_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Cast to int first so the result does not depend on how the dummy dtype
# prints (bool dummies would stringify as 'True'/'False' and match nothing).
_day_flags = train_with_rider_info[_day_columns].astype(int)

# 'DAYS' is kept as the 7-character 0/1 string, e.g. '0000100' for Friday.
train_with_rider_info['DAYS'] = _day_flags.astype(str).agg(''.join, axis=1)

# Weighted sum of the flags gives the day number; rows without exactly one
# flag set get 0, as before.
_day_number = _day_flags.dot(np.arange(1, 8))
train_with_rider_info['Day'] = _day_number.where(_day_flags.sum(axis=1) == 1, 0).astype(int)
train_with_rider_info['Day'].value_counts()

Index(['Personal or Business', 'Day of Month', 'Placement - Time',
       'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Pickup Lat', 'Pickup Long',
       'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival',
       'No_Of_Orders', 'Age', 'Average_Rating', 'No_of_Ratings',
       'Platform Type 1', 'Platform Type 2', 'Platform Type 3',
       'Platform Type 4', 'Bearing', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')

In [29]:
# Renumber rows 0..n-1 (the old index is kept as an 'index' column) before
# the polygon lookup, which addresses rows by position.
train_with_rider_info = train_with_rider_info.reset_index()
pickup1, dest1 = point_in_polygon(polygons_df=polygon_points, data=train_with_rider_info)

# How many polygons matched each point: destinations first, then pickups.
for _matches in (dest1, pickup1):
    print(Counter(map(len, _matches)))

# Collapse the match lists to one code per row and store them.
train_with_rider_info['sourceid'] = pickup1 = cleanup_lists(lst=pickup1)
train_with_rider_info['destid'] = dest1 = cleanup_lists(lst=dest1)

Counter({1: 21174, 0: 25})
Counter({1: 21197, 0: 2})


In [30]:
train_with_rider_info.head()

Unnamed: 0,index,Personal or Business,Day of Month,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Distance (KM),Temperature,Pickup Lat,...,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,DAYS,Day,sourceid,destid
0,0,0,9,34546.0,34810.0,36287.0,37650.0,4,20.4,-1.317755,...,0,0,0,1,0,0,100,5,386,287
1,1,1,18,56477.0,56490.0,58036.0,58417.0,20,24.5,-1.326774,...,0,0,0,1,0,0,100,5,334,88
2,2,0,31,46301.0,47569.0,48002.0,48788.0,6,24.7,-1.255189,...,0,0,0,1,0,0,100,5,110,258
3,3,1,2,25930.0,25949.0,27324.0,27526.0,18,15.2,-1.290315,...,1,0,0,0,0,0,100000,2,385,102
4,4,1,22,38458.0,38544.0,39360.0,39580.0,7,19.2,-1.273524,...,1,0,0,0,0,0,100000,2,151,322


In [52]:
# Build the join key "<sourceid>_<destid>_<Day>".
_source_part = train_with_rider_info['sourceid'].astype(str)
_dest_part = train_with_rider_info['destid'].astype(str)
_day_part = train_with_rider_info['Day'].astype(str)
train_with_rider_info['ID'] = _source_part + "_" + _dest_part + "_" + _day_part
print(train_with_rider_info.shape)
train_with_rider_info.head()

(21199, 35)


Unnamed: 0,index,Personal or Business,Day of Month,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Distance (KM),Temperature,Pickup Lat,...,Wednesday,Thursday,Friday,Saturday,Sunday,DAYS,Day,sourceid,destid,ID
0,0,0,9,34546.0,34810.0,36287.0,37650.0,4,20.4,-1.317755,...,0,0,1,0,0,100,5,386,287,386_287_5
1,1,1,18,56477.0,56490.0,58036.0,58417.0,20,24.5,-1.326774,...,0,0,1,0,0,100,5,334,88,334_88_5
2,2,0,31,46301.0,47569.0,48002.0,48788.0,6,24.7,-1.255189,...,0,0,1,0,0,100,5,110,258,110_258_5
3,3,1,2,25930.0,25949.0,27324.0,27526.0,18,15.2,-1.290315,...,0,0,0,0,0,100000,2,385,102,385_102_2
4,4,1,22,38458.0,38544.0,39360.0,39580.0,7,19.2,-1.273524,...,0,0,0,0,0,100000,2,151,322,151_322_2


In [53]:
print(weekly_agregates.shape)
weekly_agregates.head()


(408523, 8)


Unnamed: 0,sourceid,dstid,dow,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,ID
0,70,259,1,1397.93,809.34,1275.11,1.46,70_259_1
1,355,200,3,912.69,312.4,869.08,1.35,355_200_3
2,335,400,3,1280.75,428.83,1228.0,1.31,335_400_3
3,354,210,3,1752.71,742.27,1631.55,1.45,354_210_3
4,72,239,1,1425.7,395.95,1377.96,1.29,72_239_1


In [54]:
# Left-join the weekly travel-time aggregates onto the orders via the shared 'ID' key.
train_with_rider_data = train_with_rider_info.merge(weekly_agregates, how='left', on='ID')
print(train_with_rider_data.shape)

(21199, 42)


In [71]:
# Discard orders without a value in 'standard_deviation_travel_time',
# printing the shape before and after.
print(train_with_rider_data.shape)
train_with_rider_data = train_with_rider_data.dropna(subset=['standard_deviation_travel_time'])
print(train_with_rider_data.shape)


(20948, 42)
(20948, 42)


In [72]:
# Target, raw coordinates, join keys and helper columns are not model inputs.
_non_feature_columns = [
    'index', 'Time from Pickup to Arrival',
    'Destination Lat', 'Destination Long', 'Pickup Lat', 'Pickup Long',
    'DAYS', 'sourceid_x', 'destid', 'ID', 'sourceid_y', 'dstid', 'dow',
]
Y = train_with_rider_data['Time from Pickup to Arrival']
X = train_with_rider_data.drop(columns=_non_feature_columns)

In [73]:
#X = train_with_rider_info.drop(columns='Time from Pickup to Arrival')
#Y = train_with_rider_info['Time from Pickup to Arrival']
# Candidate feature-name lists; variables1 is variables without 'Bearing'.
variables = [
    'Distance (KM)', 'Bearing', 'No_Of_Orders', 'Age', 'No_of_Ratings',
    'Arrival at Pickup - Time', 'Pickup - Time', 'Confirmation - Time',
    'Placement - Time', 'Average_Rating', 'Temperature', 'Day of Month',
]
variables1 = [name for name in variables if name != 'Bearing']
#X.to_csv('X.csv',index=False, index_label=False)


In [74]:
X.columns

Index(['Personal or Business', 'Day of Month', 'Placement - Time',
       'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'No_Of_Orders', 'Age', 'Average_Rating',
       'No_of_Ratings', 'Platform Type 1', 'Platform Type 2',
       'Platform Type 3', 'Platform Type 4', 'Bearing', 'Monday', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Day',
       'mean_travel_time', 'standard_deviation_travel_time',
       'geometric_mean_travel_time',
       'geometric_standard_deviation_travel_time'],
      dtype='object')

In [97]:
def split_train_evaluate(Xdata, Ydata, params,scale=True):
    """Train a two-hidden-layer dense network and score it on a hold-out split.

    Parameters
    ----------
    Xdata, Ydata : features and target; split 80/20 with ``random_state=0``.
    params : dict
        Required keys: 'units1', 'units2', 'optimizer', 'batchsize', 'Nepochs'.
        Optional key: 'input_dim' -- defaults to the number of feature columns.
    scale : bool
        When true, a StandardScaler is fitted on the training split and applied
        to both splits. The scaler is not returned.

    Returns
    -------
    (model, mse) : the fitted model and the mean squared error computed on
        ``y_test/60`` versus ``y_pred/60`` (presumably seconds -> minutes;
        verify against the target's unit).
    """
    X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size=0.2, random_state=0)
    if scale:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
    # Derive the input width from the data unless the caller pins it, so the
    # network cannot get out of sync with the feature matrix.
    input_dim = params.get('input_dim', X_train.shape[1])
    # Initialising the ANN
    model = Sequential()
    # Adding the input layer and the first hidden layer
    model.add(Dense(units = params['units1'], kernel_initializer = 'normal', activation = 'relu', input_dim = input_dim))
    # Adding the second hidden layer
    model.add(Dense(units = params['units2'], kernel_initializer = 'normal', activation = 'relu'))
    # Adding the output layer
    model.add(Dense(units = 1, kernel_initializer='normal', activation = 'linear'))
    # Compiling the ANN
    model.compile(loss='mean_squared_error', optimizer=params['optimizer'])
    # Fitting the ANN to the Training set
    model.fit(X_train, y_train, batch_size = params['batchsize'], epochs = params['Nepochs'], verbose=True)
    # Flatten the (n, 1) prediction matrix so it lines up with the 1-D target.
    y_pred = model.predict(X_test).ravel()
    return model, mean_squared_error(y_test/60, y_pred/60)


In [98]:
# Model 1: 29 -> 15 hidden units, batch size 5.
# input_dim is taken from X instead of a hard-coded 29 so the network keeps
# matching the feature matrix if columns are added or removed.
parameters_dict = {'units1':29,'input_dim':X.shape[1], 'units2':15, 'optimizer':'adam', 'batchsize':5, 'Nepochs':100}
m1,r1 = split_train_evaluate(Xdata=X, Ydata=Y, params=parameters_dict,scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [100]:
print(r1)

157.5356609515597


In [95]:
# Model 2: 15 -> 15 hidden units, batch size 5.
# input_dim is taken from X instead of a hard-coded 29 so the network keeps
# matching the feature matrix if columns are added or removed.
parameters_dict = {'units1':15,'input_dim':X.shape[1], 'units2':15, 'optimizer':'adam', 'batchsize':5, 'Nepochs':100}
m2,r2 = split_train_evaluate(Xdata=X, Ydata=Y, params=parameters_dict,scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


157.78344633583944

In [91]:
# Model 3: 15 -> 15 hidden units, batch size 100.
# input_dim is taken from X instead of a hard-coded 29 so the network keeps
# matching the feature matrix if columns are added or removed.
parameters_dict = {'units1':15,'input_dim':X.shape[1], 'units2':15, 'optimizer':'adam', 'batchsize':100, 'Nepochs':100}
m3,r3 = split_train_evaluate(Xdata=X, Ydata=Y, params=parameters_dict,scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


160.14419040510546

## Predictions

In [106]:
print(test_with_rider_info.columns)
# Recover a single weekday number (Monday=1 .. Sunday=7) from the one-hot columns.
_day_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Cast to int first so the result does not depend on how the dummy dtype
# prints (bool dummies would stringify as 'True'/'False' and match nothing).
_day_flags = test_with_rider_info[_day_columns].astype(int)

# 'DAYS' is kept as the 7-character 0/1 string, e.g. '0000100' for Friday.
test_with_rider_info['DAYS'] = _day_flags.astype(str).agg(''.join, axis=1)

# Weighted sum of the flags gives the day number; rows without exactly one
# flag set get 0, as before.
_day_number = _day_flags.dot(np.arange(1, 8))
test_with_rider_info['Day'] = _day_number.where(_day_flags.sum(axis=1) == 1, 0).astype(int)
print(test_with_rider_info['Day'].value_counts())
# Renumber rows 0..n-1; the previous index is kept as an 'index' column.
test_with_rider_info.reset_index(inplace=True)


Index(['Order No', 'User Id', 'Vehicle Type', 'Personal or Business',
       'Day of Month', 'Placement - Time', 'Confirmation - Day of Month',
       'Confirmation - Weekday (Mo = 1)', 'Confirmation - Time',
       'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Precipitation in millimeters',
       'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long',
       'Rider Id', 'No_Of_Orders', 'Age', 'Average_Rating', 'No_of_Ratings',
       'Platform Type 1', 'Platform Type 2', 'Platform Type 3',
       'Platform Type 4', 'Bearing', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')
4    1386
5    1349
2    1303
1    1280
3    1257
6     427
7      66
Name: Day, dtype: int64


In [108]:
# Look up the containing polygon(s) for every test pickup and destination.
pickup1, dest1 = point_in_polygon(polygons_df=polygon_points, data=test_with_rider_info)

# How many polygons matched each point: destinations first, then pickups.
for _matches in (dest1, pickup1):
    print(Counter(map(len, _matches)))

# Collapse the match lists to one code per row.
pickup1 = cleanup_lists(lst=pickup1)
dest1 = cleanup_lists(lst=dest1)

Counter({1: 7059, 0: 9})
Counter({1: 7068})


In [109]:
# Store the zone codes computed above as new columns.
test_with_rider_info = test_with_rider_info.assign(sourceid=pickup1, destid=dest1)

In [110]:
# Build the join key "<sourceid>_<destid>_<Day>" and attach the weekly
# travel-time aggregates, printing the shapes along the way.
_source_part = test_with_rider_info['sourceid'].astype(str)
_dest_part = test_with_rider_info['destid'].astype(str)
_day_part = test_with_rider_info['Day'].astype(str)
test_with_rider_info['ID'] = _source_part + "_" + _dest_part + "_" + _day_part
print(test_with_rider_info.shape)
print(weekly_agregates.shape)
test_with_rider_data = test_with_rider_info.merge(weekly_agregates, how='left', on='ID')
print(test_with_rider_data.shape)

(7068, 45)
(408523, 8)
(7068, 52)


In [113]:
list(X.columns)

['Personal or Business',
 'Day of Month',
 'Placement - Time',
 'Confirmation - Time',
 'Arrival at Pickup - Time',
 'Pickup - Time',
 'Distance (KM)',
 'Temperature',
 'No_Of_Orders',
 'Age',
 'Average_Rating',
 'No_of_Ratings',
 'Platform Type 1',
 'Platform Type 2',
 'Platform Type 3',
 'Platform Type 4',
 'Bearing',
 'Monday',
 'Tuesday',
 'Wednesday',
 'Thursday',
 'Friday',
 'Saturday',
 'Sunday',
 'Day',
 'mean_travel_time',
 'standard_deviation_travel_time',
 'geometric_mean_travel_time',
 'geometric_standard_deviation_travel_time']

In [116]:
# Report how many rows have a value, then fill the gaps with the column median.
_column = 'mean_travel_time'
print(test_with_rider_data[_column].notna().value_counts())
_median = test_with_rider_data[_column].median()
test_with_rider_data[_column] = test_with_rider_data[_column].fillna(_median)



True     6995
False      73
Name: mean_travel_time, dtype: int64


In [117]:
# Report how many rows have a value, then fill the gaps with the column median.
_column = 'standard_deviation_travel_time'
print(test_with_rider_data[_column].notna().value_counts())
_median = test_with_rider_data[_column].median()
test_with_rider_data[_column] = test_with_rider_data[_column].fillna(_median)



True     6995
False      73
Name: standard_deviation_travel_time, dtype: int64


In [118]:
# Report how many rows have a value, then fill the gaps with the column median.
_column = 'geometric_mean_travel_time'
print(test_with_rider_data[_column].notna().value_counts())
_median = test_with_rider_data[_column].median()
test_with_rider_data[_column] = test_with_rider_data[_column].fillna(_median)



True     6995
False      73
Name: geometric_mean_travel_time, dtype: int64


In [119]:
# Report how many rows have a value, then fill the gaps with the column median.
_column = 'geometric_standard_deviation_travel_time'
print(test_with_rider_data[_column].notna().value_counts())
_median = test_with_rider_data[_column].median()
test_with_rider_data[_column] = test_with_rider_data[_column].fillna(_median)



True     6995
False      73
Name: geometric_standard_deviation_travel_time, dtype: int64


In [132]:
# The models were trained on features standardised with a scaler fitted on
# the 80% training split made inside split_train_evaluate (test_size=0.2,
# random_state=0). Fit this scaler on that same split -- not on all of X --
# so the submission features are scaled exactly as the model's inputs were.
_X_fit, _X_holdout = train_test_split(X, test_size=0.2, random_state=0)
sc = StandardScaler()
sc.fit(_X_fit)
Xtrain = sc.transform(X)
# Select the test features in the same column order as X.
submit_data = test_with_rider_data[list(X.columns)]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [133]:
# Scale the submission features, predict with model m1, and keep the first
# (only) value of every prediction row as the predicted target.
_scaled_submit = sc.transform(submit_data)
final_predict = m1.predict(_scaled_submit)
_first_outputs = [row[0] for row in final_predict]
test_with_rider_data['Time from Pickup to Arrival'] = _first_outputs

  """Entry point for launching an IPython kernel.


In [135]:
# Take an explicit copy: assigning into a column selection of
# test_with_rider_data triggers pandas' SettingWithCopyWarning and is not
# guaranteed to behave consistently.
submission = test_with_rider_data[['Order No','Time from Pickup to Arrival' ]].copy()
# astype(int) truncates the predicted values toward zero.
submission['Time from Pickup to Arrival'] = submission['Time from Pickup to Arrival'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [136]:
# Write the submission (predictions from model m1) without the index column.
submission.to_csv('Data/m1.csv', index=False)
