In [2]:
import datetime as dt
import warnings
from datetime import datetime, timedelta
from math import sqrt
from time import sleep
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from shapely.geometry import Point, Polygon

from IPython.core.interactiveshell import InteractiveShell
warnings.simplefilter(action='ignore', category=FutureWarning)
InteractiveShell.ast_node_interactivity = "all"

In [4]:
### Pre-dataloading
d_types = { 
             'Platform Type' : np.float16
           , 'Placement - Day of Month' : np.uint16
           , 'Placement - Weekday (Mo = 1)' : np.uint16
           , 'Confirmation - Day of Month' : np.uint16
           , 'Confirmation - Weekday (Mo = 1)' : np.uint16
           , 'Arrival at Pickup - Day of Month' : np.uint16
           , 'Arrival at Pickup - Weekday (Mo = 1)' : np.uint16
           , 'Pickup - Day of Month' : np.float16
           , 'Pickup - Weekday (Mo = 1)' : np.float16
           , 'Arrival at Destination - Day of Month' : np.uint16
           , 'Arrival at Destination - Weekday (Mo = 1)' : np.uint16
           , 'Distance (KM)' : np.float16
           , 'Temperature' : np.float16
           , 'Precipitation in millimeters': np.float16
           , 'Pickup Lat' : np.float16
           , 'Pickup Long' : np.float16
           , 'Destination Lat' : np.float16
           , 'Destination Long' : np.float16
           , 'Time from Pickup to Arrival' : np.uint16
           , 'No_Of_Orders' : np.float16
           , 'Age' : np.float16
           , 'Average_Rating' : np.float16
           , 'No_of_Ratings' : np.float16
            }

d_tpes_rider = {
      'No_Of_Orders' : np.uint16      
    , 'Age': np.uint16                
    , 'Average_Rating': np.float16    
    , 'No_of_Ratings': np.uint16      
    }

df_train = pd.read_csv('Train.csv', dtype=d_types)
df_test = pd.read_csv('Test.csv', dtype=d_types)
df_riders = pd.read_csv('Riders.csv', dtype=d_tpes_rider)

# Join rider info to train/test
df_train = pd.merge(df_train, df_riders, how='left', left_on=['Rider Id'], right_on=['Rider Id'])
df_test = pd.merge(df_test, df_riders, how='left', left_on=['Rider Id'], right_on=['Rider Id'])

hexss = pd.read_json('travel_times/540_hexclusters.json')
dict_ = {}
for itm in hexss['features']:
    dict_[int(itm['properties']['MOVEMENT_ID'])] = Polygon(itm['geometry']['coordinates'][0])


## Make pickup and destination points
df_train['pickup'] = [Point(row['Pickup Long'], row['Pickup Lat']) for i,row in df_train.iterrows()]
df_train['destination'] = [Point(row['Destination Long'], row['Destination Lat']) for i,row in df_train.iterrows()]

df_test['pickup'] = [Point(row['Pickup Long'], row['Pickup Lat']) for i,row in df_test.iterrows()]
df_test['destination'] = [Point(row['Destination Long'], row['Destination Lat']) for i,row in df_test.iterrows()]

## find the hex shape the points belong to
df_train['pickup_ID'] = df_train['pickup'].apply(lambda x: max([k if x.within(v) else -1 for k,v in dict_.items()]))
df_train['destination_ID'] = df_train['destination'].apply(lambda x: max([k if x.within(v) else -1 for k,v in dict_.items()]))

df_test['pickup_ID'] = df_test['pickup'].apply(lambda x: max([k if x.within(v) else -1 for k,v in dict_.items()]))
df_test['destination_ID'] = df_test['destination'].apply(lambda x: max([k if x.within(v) else -1 for k,v in dict_.items()]))

df_train = df_train.astype({
  'pickup_ID': np.int16
 , 'destination_ID': np.int16   
})

df_test = df_test.astype({
  'pickup_ID': np.int16
 , 'destination_ID': np.int16   
})

### Time 

train_time_col = ['Placement', 'Confirmation', 'Arrival at Pickup', 'Pickup', 'Arrival at Destination']
test_time_col  = ['Placement', 'Confirmation', 'Arrival at Pickup', 'Pickup']
time_cat = ['Day of Month', 'Weekday (Mo = 1) ', 'Time']

for col in test_time_col:
    time_col = col + ' - Time'
    
    df_train[time_col] = pd.to_datetime(df_train[time_col])
    df_train[time_col+'_hour'] = df_train[time_col].dt.hour
    df_train[time_col] = df_train[time_col].dt.hour * 60 + df_train[time_col].dt.minute
    
    df_test[time_col] = pd.to_datetime(df_test[time_col])
    df_test[time_col+'_hour'] = df_test[time_col].dt.hour
    df_test[time_col] = df_test[time_col].dt.hour * 60 + df_test[time_col].dt.minute  
    
df_train = df_train.astype({
      'Placement - Time_hour' : np.int16
    , 'Confirmation - Time_hour': np.int16
    , 'Arrival at Pickup - Time_hour' : np.int16           
    , 'Pickup - Time_hour'   : np.int16
    , 'Placement - Time': np.int16
    , 'Confirmation - Time' : np.int16
    , 'Arrival at Pickup - Time': np.int16
    , 'Pickup - Time' : np.int16
})

df_test = df_test.astype({
      'Placement - Time_hour' : np.int16
    , 'Confirmation - Time_hour': np.int16
    , 'Arrival at Pickup - Time_hour' : np.int16           
    , 'Pickup - Time_hour'   : np.int16
    , 'Placement - Time': np.int16
    , 'Confirmation - Time' : np.int16
    , 'Arrival at Pickup - Time': np.int16
    , 'Pickup - Time' : np.int16
})

cat_col = ['Personal or Business']
df_train['Personal or Business'] = df_train['Personal or Business'].astype('category').cat.codes
df_test['Personal or Business'] = df_test['Personal or Business'].astype('category').cat.codes

## Merge the average travel times to the pickup and destination shapes
types = {'sourceid' : np.uint16                                   
    , 'dstid' : np.uint16                                      
    , 'dow' : np.uint16                                       
    , 'mean_travel_time' : np.float16                            
    , 'standard_deviation_travel_time' : np.float16             
    , 'geometric_mean_travel_time'  : np.float16                
    , 'geometric_standard_deviation_travel_time': np.float16}
tt_weekly = pd.read_csv('travel_times/nairobi-hexclusters-2018-4-WeeklyAggregate.csv', dtype=types)

print ('Shape before merge:', df_train.shape)
df_train = pd.merge(df_train, tt_weekly
              , how='left'
              , left_on=['pickup_ID','destination_ID','Pickup - Weekday (Mo = 1)']
              , right_on=[ 'dstid','sourceid', 'dow']
              , suffixes=('_hour', '_week')
         )
print ('Shape after merge:', df_train.shape)
print ('Shape test before merge:', df_test.shape)
df_test = pd.merge(df_test, tt_weekly
          , how='left'
              , left_on=['pickup_ID','destination_ID','Pickup - Weekday (Mo = 1)']
              , right_on=[ 'dstid','sourceid', 'dow']
              , suffixes=('_hour', '_week')
         )
print ('Shape test after merge:', df_test.shape)

del tt_weekly

df_train.set_index('Order No', inplace=True)
df_test.set_index('Order No', inplace=True)

## Save Pickle dataframes
df_train.to_pickle('df_train.pkl')
df_test.to_pickle('df_test.pkl')

ValueError: Unexpected character found when decoding 'true'