In [1]:
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.
  data = yaml.load(f.read()) or {}


In [2]:
# Read the three CSV inputs used by the rest of the notebook
# (same files, same read order: train, riders, test).
train_data, riders_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/Train.csv', 'Data/Riders.csv', 'Data/Test.csv')
)


In [3]:
def time_from_midnight_in_seconds(data, column):
    """Return the time of day stored in ``data[column]`` as seconds since midnight.

    The column is parsed with ``pd.to_datetime`` and every value is measured
    against midnight of *its own* date. The result therefore does not depend
    on the day the notebook is executed, nor on whichever placeholder date
    pandas attaches to time-only strings such as ``'9:35:46 AM'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column holding time (or full datetime) values.

    Returns
    -------
    pandas.Series
        Float seconds elapsed since 00:00:00 for each row (NaN where the
        value could not be represented, i.e. NaT), sharing ``data``'s index.

    Notes
    -----
    As before, ``data[column]`` is replaced in place by its parsed datetime
    form; callers in this notebook overwrite it with the returned Series.
    """
    data[column] = pd.to_datetime(data[column])
    parsed = data[column]
    # normalize() yields midnight of each row's own date, so the difference
    # is purely the time-of-day component. total_seconds() returns plain
    # floats on every pandas version, unlike astype('timedelta64[s]') which
    # yields a timedelta dtype on pandas >= 2.0.
    return (parsed - parsed.dt.normalize()).dt.total_seconds()


def delta_time(dataset, higher_time, lower_time):
    """Return ``dataset[higher_time]`` minus ``dataset[lower_time]``.

    ``dataset`` is anything indexable by the two keys (the notebook passes a
    DataFrame, in which case the result is the element-wise difference).
    """
    later = dataset[higher_time]
    earlier = dataset[lower_time]
    return later - earlier

def time_to_day_part(time):
    """Map a time expressed in seconds to a coarse part-of-day label.

    The value is divided by 3600 to get hours; hours below 6 give 'Night',
    below 12 'Morning', below 18 'Afternoon', and anything else 'Evening'.
    """
    hours = time / 3600
    # (exclusive upper bound in hours, label), checked in ascending order.
    for upper_bound, label in ((6, 'Night'), (12, 'Morning'), (18, 'Afternoon')):
        if hours < upper_bound:
            return label
    return 'Evening'
    
def calculate_bearing(lat1, lng1, lat2, lng2):
    """Compute the initial compass bearing from point 1 towards point 2.

    All four arguments are coordinates in degrees (they are converted with
    ``np.deg2rad``) and may be scalars or array-likes that NumPy can
    broadcast. The result is in degrees, normalised with ``(b + 360) % 360``.
    """
    phi_start = np.deg2rad(lat1)
    phi_end = np.deg2rad(lat2)
    delta_lambda = np.deg2rad(lng2 - lng1)

    east_component = np.sin(delta_lambda) * np.cos(phi_end)
    north_component = np.cos(phi_start) * np.sin(phi_end) - (
        np.sin(phi_start) * np.cos(phi_end) * np.cos(delta_lambda))

    # arctan2 returns radians in [-pi, pi]; convert to degrees and then
    # shift/wrap so the value can be read as a compass bearing.
    bearing_degrees = np.rad2deg(np.arctan2(east_component, north_component))
    return (bearing_degrees + 360) % 360


In [4]:
# Convert every clock-time column of the training set to seconds since midnight.
for time_column in ('Placement - Time',
                    'Confirmation - Time',
                    'Pickup - Time',
                    'Arrival at Pickup - Time',
                    'Arrival at Destination - Time'):
    train_data[time_column] = time_from_midnight_in_seconds(train_data, time_column)

# Same conversion for the test set; no 'Arrival at Destination - Time'
# column is converted there.
for time_column in ('Placement - Time',
                    'Confirmation - Time',
                    'Pickup - Time',
                    'Arrival at Pickup - Time'):
    test_data[time_column] = time_from_midnight_in_seconds(test_data, time_column)


In [5]:
# Differences between consecutive order milestones, training set.
# Confirmation minus placement.
delta_confirm_place_train = delta_time(
    dataset=train_data, higher_time='Confirmation - Time', lower_time='Placement - Time')
# Arrival at pickup minus confirmation.
delta_pick_arr_confirm_train = delta_time(
    dataset=train_data, higher_time='Arrival at Pickup - Time', lower_time='Confirmation - Time')
# Pickup minus arrival at pickup (despite the "confirm" in the variable name).
delta_pickup_confirm_train = delta_time(
    dataset=train_data, higher_time='Pickup - Time', lower_time='Arrival at Pickup - Time')
# Arrival at destination minus pickup.
delta_arrival_pickup_train = delta_time(
    dataset=train_data, higher_time='Arrival at Destination - Time', lower_time='Pickup - Time')
# Arrival at destination minus placement.
delta_placement_arrival_train = delta_time(
    dataset=train_data, higher_time='Arrival at Destination - Time', lower_time='Placement - Time')

# Same differences for the test set (no destination-arrival deltas are computed).
delta_confirm_place_test = delta_time(
    dataset=test_data, higher_time='Confirmation - Time', lower_time='Placement - Time')
delta_pick_arr_confirm_test = delta_time(
    dataset=test_data, higher_time='Arrival at Pickup - Time', lower_time='Confirmation - Time')
delta_pickup_confirm_test = delta_time(
    dataset=test_data, higher_time='Pickup - Time', lower_time='Arrival at Pickup - Time')


In [6]:
# Join riders_data onto the train and test frames on 'Rider Id'.
# This is an inner join (the pandas default), so rows whose 'Rider Id' has
# no match in riders_data are dropped.
train_with_rider_info = pd.merge(train_data, riders_data, how='inner', on='Rider Id')
test_with_rider_info = pd.merge(test_data, riders_data, how='inner', on='Rider Id')


In [7]:
# Keep only training rows whose placement and confirmation share both the
# same weekday and the same day of month.
same_weekday = (train_with_rider_info['Placement - Weekday (Mo = 1)']
                == train_with_rider_info['Confirmation - Weekday (Mo = 1)'])
same_day_of_month = (train_with_rider_info['Placement - Day of Month']
                     == train_with_rider_info['Confirmation - Day of Month'])
train_with_rider_info = train_with_rider_info[same_weekday & same_day_of_month]


In [8]:
# Fill missing 'Temperature' values with the mean of the same frame.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is a chained in-place operation which may act on
# a temporary copy (it warns on recent pandas and is a no-op under
# Copy-on-Write), leaving the NaNs in the frame.
train_with_rider_info = train_with_rider_info.assign(
    Temperature=train_with_rider_info['Temperature'].fillna(
        train_with_rider_info['Temperature'].mean()))
# NOTE(review): the test frame is imputed with its own mean, as before;
# consider reusing the training mean so both sets share one imputation value.
test_with_rider_info = test_with_rider_info.assign(
    Temperature=test_with_rider_info['Temperature'].fillna(
        test_with_rider_info['Temperature'].mean()))


In [9]:
# Label-encode the 'Personal or Business' column.
# The encoder is fitted on the training data only and then reused for the
# test data, so a given label always maps to the same integer in both frames.
# Fitting a second encoder on the test set could assign different codes if
# the test set does not contain exactly the same set of labels.
labelencoder_personal_business = LabelEncoder()
train_with_rider_info['Personal or Business'] = labelencoder_personal_business.fit_transform(
    train_with_rider_info['Personal or Business'])

# transform() raises on a label never seen in training, rather than
# silently mis-encoding it.
test_with_rider_info['Personal or Business'] = labelencoder_personal_business.transform(
    test_with_rider_info['Personal or Business'])


In [10]:
# One-hot encode 'Platform Type' in the training frame: the original column
# is removed and one indicator column per category value is appended.
platform_type_train = train_with_rider_info['Platform Type'].astype('category')
train_with_rider_info = pd.concat(
    [train_with_rider_info.drop(columns=['Platform Type']),
     pd.get_dummies(platform_type_train)],
    axis=1)



In [11]:
# One-hot encode 'Platform Type' in the test frame: the original column is
# removed and one indicator column per category value is appended.
# NOTE(review): categories are derived from the test data alone, so the
# resulting columns match the training frame only if both contain the same
# platform types.
platform_type_test = test_with_rider_info['Platform Type'].astype('category')
test_with_rider_info = pd.concat(
    [test_with_rider_info.drop(columns=['Platform Type']),
     pd.get_dummies(platform_type_test)],
    axis=1)


In [12]:
# Compass bearing from the pickup point towards the destination, per order.
bearing_test = calculate_bearing(
    lat1=test_with_rider_info['Pickup Lat'],
    lng1=test_with_rider_info['Pickup Long'],
    lat2=test_with_rider_info['Destination Lat'],
    lng2=test_with_rider_info['Destination Long'])

bearing_train = calculate_bearing(
    lat1=train_with_rider_info['Pickup Lat'],
    lng1=train_with_rider_info['Pickup Long'],
    lat2=train_with_rider_info['Destination Lat'],
    lng2=train_with_rider_info['Destination Long'])


In [13]:
# Append the computed bearings as a new 'Bearing' column on each frame.
test_with_rider_info = test_with_rider_info.assign(Bearing=bearing_test)
train_with_rider_info = train_with_rider_info.assign(Bearing=bearing_train)


In [14]:
# Drop redundant columns from the training data.
train_columns_to_drop = [
    'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
    'Arrival at Pickup - Day of Month', 'Arrival at Pickup - Weekday (Mo = 1)',
    'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
    'Pickup Lat', 'Pickup Long',
    'Destination Lat', 'Destination Long',
    'Vehicle Type', 'Order No',
    'User Id', 'Rider Id',
    'Precipitation in millimeters',
]
train_with_rider_info = train_with_rider_info.drop(columns=train_columns_to_drop)


In [15]:
# Drop redundant columns from the test data ('Order No' is kept here; it is
# needed later for the submission file).
test_columns_to_drop = [
    'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
    'Arrival at Pickup - Day of Month', 'Arrival at Pickup - Weekday (Mo = 1)',
    'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
    'Vehicle Type', 'User Id', 'Rider Id', 'Precipitation in millimeters',
    'Pickup Lat', 'Pickup Long',
    'Destination Lat', 'Destination Long',
]
test_with_rider_info = test_with_rider_info.drop(columns=test_columns_to_drop)


In [16]:
# Give the integer-named platform indicator columns readable names.
# This must run before the weekday indicators are created, because those are
# also named by the integers 1..7.
platform_type_labels = {1: "Platform Type 1", 2: "Platform Type 2",
                        3: "Platform Type 3", 4: "Platform Type 4"}

train_with_rider_info = train_with_rider_info.rename(columns=platform_type_labels)
test_with_rider_info = test_with_rider_info.rename(columns=platform_type_labels)


In [17]:
# The other per-event day/weekday columns were dropped, so the placement ones
# become the generic 'Day of Month' and 'Weekday (Mo = 1)'.
placement_column_labels = {'Placement - Day of Month': 'Day of Month',
                           'Placement - Weekday (Mo = 1)': 'Weekday (Mo = 1)'}

train_with_rider_info = train_with_rider_info.rename(columns=placement_column_labels)
test_with_rider_info = test_with_rider_info.rename(columns=placement_column_labels)


In [18]:
# One-hot encode the test_data['Weekday (Mo = 1)'] column: the original
# column is removed and one indicator column per weekday value is appended.
weekday_test = test_with_rider_info['Weekday (Mo = 1)'].astype('category')
test_with_rider_info = pd.concat(
    [test_with_rider_info.drop(columns=['Weekday (Mo = 1)']),
     pd.get_dummies(weekday_test)],
    axis=1)


In [19]:
# One-hot encode the train_data['Weekday (Mo = 1)'] column: the original
# column is removed and one indicator column per weekday value is appended.
weekday_train = train_with_rider_info['Weekday (Mo = 1)'].astype('category')
train_with_rider_info = pd.concat(
    [train_with_rider_info.drop(columns=['Weekday (Mo = 1)']),
     pd.get_dummies(weekday_train)],
    axis=1)


In [20]:

# Replace the integer weekday indicator column names (Monday = 1) with day names.
weekday_labels = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday',
                  4: 'Thursday', 5: 'Friday', 6: 'Saturday',
                  7: 'Sunday'}

train_with_rider_info = train_with_rider_info.rename(columns=weekday_labels)
test_with_rider_info = test_with_rider_info.rename(columns=weekday_labels)


In [21]:
# labeling part of the day in train data 
# train_with_rider_info['Placement - Time'] = train_with_rider_info['Placement - Time'].apply(time_to_day_part)
# train_with_rider_info['Confirmation - Time'] = train_with_rider_info['Confirmation - Time'].apply(time_to_day_part)
# train_with_rider_info['Arrival at Pickup - Time'] = train_with_rider_info['Arrival at Pickup - Time'].apply(time_to_day_part)
# train_with_rider_info['Pickup - Time'] = train_with_rider_info['Pickup - Time'].apply(time_to_day_part)


In [22]:
# labeling part of the day in test data
# test_with_rider_info['Placement - Time'] = test_with_rider_info['Placement - Time'].apply(time_to_day_part)
# test_with_rider_info['Confirmation - Time'] = test_with_rider_info['Confirmation - Time'].apply(time_to_day_part)
# test_with_rider_info['Arrival at Pickup - Time'] = test_with_rider_info['Arrival at Pickup - Time'].apply(time_to_day_part)
# test_with_rider_info['Pickup - Time'] = test_with_rider_info['Pickup - Time'].apply(time_to_day_part)


In [23]:
# label encoding, one hot encoding, and renaming of times
# labelencoder = LabelEncoder()
# train_with_rider_info['Placement - Time'] = labelencoder.fit_transform(train_with_rider_info['Placement - Time'])
# train_with_rider_info['Placement - Time'] = train_with_rider_info['Placement - Time'].astype('category')
# train_with_rider_info = pd.concat([train_with_rider_info.drop(columns=['Placement - Time']), pd.get_dummies(train_with_rider_info['Placement - Time'])], axis=1)



## Training Neural Networks

In [49]:
# Split the prepared training frame into inputs (X) and target (Y).
target_column = 'Time from Pickup to Arrival'
X = train_with_rider_info.drop(columns=target_column)
Y = train_with_rider_info[target_column]

# Reduced feature set: numeric columns only, no one-hot indicators.
variables = [
    'Distance (KM)',
    'Bearing',
    'No_Of_Orders',
    'Age',
    'No_of_Ratings',
    'Arrival at Pickup - Time',
    'Pickup - Time',
    'Confirmation - Time',
    'Placement - Time',
    'Average_Rating',
    'Temperature',
    'Day of Month',
]

# Same set without 'Bearing' (order of the remaining features unchanged).
variables1 = [feature for feature in variables if feature != 'Bearing']

In [38]:
def split_train_evaluate(Xdata, Ydata, params, scale=True, return_model=False):
    """Train a small dense network on an 80/20 split and score it on the hold-out.

    Parameters
    ----------
    Xdata : array-like of shape (n_samples, n_features)
        Input features.
    Ydata : array-like of shape (n_samples,)
        Regression target.
    params : dict
        Hyper-parameters: ``units1`` and ``units2`` (hidden layer sizes),
        ``optimizer``, ``batchsize``, ``Nepochs`` and, optionally,
        ``input_dim``. When ``input_dim`` is absent it is taken from the
        number of columns of the training split, so it cannot drift out of
        sync with the feature list.
    scale : bool, default True
        Standardise the features; the scaler is fitted on the training
        split only and then applied to the hold-out split.
    return_model : bool, default False
        When True, also return the fitted model and the fitted scaler
        (``None`` if ``scale`` is False) so they can be reused for
        prediction instead of being discarded.

    Returns
    -------
    float or tuple
        Mean squared error on the hold-out split, computed after dividing
        both targets and predictions by 60 (presumably seconds to minutes;
        confirm the target unit). With ``return_model=True`` the return
        value is ``(mse, model, scaler)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size=0.2, random_state=0)
    sc = None
    if scale:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
    # Fall back to the real feature count when the caller did not pin it.
    input_dim = params.get('input_dim', X_train.shape[1])
    # Initialising the ANN
    model = Sequential()
    # Adding the input layer and the first hidden layer
    model.add(Dense(units = params['units1'], kernel_initializer = 'normal', activation = 'relu', input_dim = input_dim))
    # Adding the second hidden layer
    model.add(Dense(units = params['units2'], kernel_initializer = 'normal', activation = 'relu'))
    # Adding the output layer
    model.add(Dense(units = 1, kernel_initializer='normal', activation = 'linear'))
    # Compiling the ANN
    model.compile(loss='mean_squared_error', optimizer=params['optimizer'])
    # Fitting the ANN to the Training set
    model.fit(X_train, y_train, batch_size = params['batchsize'], epochs = params['Nepochs'], verbose=True)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test/60, y_pred/60)
    if return_model:
        return mse, model, sc
    return mse


In [40]:
# Experiment: every column of X (input_dim=24), Adam optimizer.
parameters_dict = dict(units1=12, input_dim=24, units2=6,
                       optimizer='adam', batchsize=5, Nepochs=10)
split_train_evaluate(X, Y, parameters_dict, scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


181.50540123096545

In [43]:
# Experiment: every column of X (input_dim=24), SGD optimizer.
parameters_dict = dict(units1=12, input_dim=24, units2=6,
                       optimizer='sgd', batchsize=5, Nepochs=10)
split_train_evaluate(X, Y, parameters_dict, scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


279.35327380305114

In [46]:
# Experiment: the 12 columns listed in `variables`, SGD optimizer.
parameters_dict = dict(units1=6, input_dim=12, units2=6,
                       optimizer='sgd', batchsize=5, Nepochs=10)
split_train_evaluate(X[variables], Y, parameters_dict, scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


280.0198789861787

In [48]:
# Experiment: the 12 columns listed in `variables`, Adam optimizer.
parameters_dict = dict(units1=6, input_dim=12, units2=6,
                       optimizer='adam', batchsize=5, Nepochs=10)
split_train_evaluate(X[variables], Y, parameters_dict, scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


178.56543579995798

In [51]:
# Experiment: the 11 columns listed in `variables1` (no 'Bearing'), Adam optimizer.
parameters_dict = dict(units1=6, input_dim=11, units2=6,
                       optimizer='adam', batchsize=5, Nepochs=10)
split_train_evaluate(X[variables1], Y, parameters_dict, scale=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


178.93421060373802

In [129]:
# Restrict the test frame to the columns listed in `variables`, in that order.
test_with_rider_info1 = test_with_rider_info[variables]

In [130]:
# NOTE(review): neither `model3` nor `sc` is defined in any cell visible in
# this notebook (split_train_evaluate keeps its model and scaler local), so
# this cell depends on leftover kernel state and will raise NameError after
# Restart & Run All. A fitted model and the scaler fitted on the same
# training features must be defined before this cell.
# Scale the selected test features with `sc`, then predict with `model3`.
final_predict = model3.predict(sc.transform(test_with_rider_info1))
# Attach the predictions to the test frame under the target column name.
test_with_rider_info['Time from Pickup to Arrival'] = final_predict



  """Entry point for launching an IPython kernel.


In [131]:
# Build the submission table: order identifier plus integer prediction.
# .copy() makes `submission` an independent frame; assigning a column to a
# bare column-selection of test_with_rider_info triggers pandas'
# SettingWithCopyWarning ("A value is trying to be set on a copy of a slice").
submission = test_with_rider_info[['Order No', 'Time from Pickup to Arrival']].copy()
# astype(int) truncates the fractional part of each prediction.
submission['Time from Pickup to Arrival'] = submission['Time from Pickup to Arrival'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [132]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='Data/model3.csv', index=False)
