In [24]:
import pandas as pd
import numpy as np
# Show floats with three decimal places whenever a frame or series is displayed.
pd.options.display.float_format = lambda value: '%.3f' % value

In [2]:
# Load the train and test splits from the local input directory.
df_train, df_test = map(pd.read_csv, ("input/train.csv", "input/test.csv"))

In [3]:
# Give the test frame a `trip_duration` column so it can be stacked with the
# training frame; -1 acts as a placeholder value for rows without a target.
PLACEHOLDER_TRIP_DURATION = -1
df_test["trip_duration"] = PLACEHOLDER_TRIP_DURATION

In [4]:
# Stack train on top of test (row-wise is concat's default axis) so features
# can be engineered once for both splits.
# NOTE(review): the original row labels are kept, so labels from the two
# inputs repeat in `full_df`; consider `ignore_index=True` if nothing
# downstream relies on the per-split labels.
full_df = pd.concat((df_train, df_test))

In [5]:
import haversine
def haversine_distance(x):
    """Return the haversine distance between the two points packed in ``x``.

    ``x`` must unpack into exactly four values, ordered as
    (first latitude, first longitude, second latitude, second longitude).
    The result is whatever ``haversine.haversine`` returns for the two
    (lat, lon) pairs (presumably kilometres, the package default - confirm).
    """
    start_lat, start_lon, end_lat, end_lon = x
    start_point = (start_lat, start_lon)
    end_point = (end_lat, end_lon)
    return haversine.haversine(start_point, end_point)

In [37]:
import multiprocess
# Parallel "apply" helper for pandas objects. Despite its name, it maps the
# function over the data with a pool of worker *processes* (via `multiprocess`),
# not threads, which speeds up row-wise feature generation.

N_THREADS = 8  # Number of worker processes in the pool (name kept for compatibility)


def apply_multithreaded(data, func):
    """Apply ``func`` to every element of ``data.values`` using a process pool.

    Parameters
    ----------
    data : object exposing ``.values`` (e.g. a pandas Series or DataFrame)
        ``data.values`` is iterated along its first axis, so a DataFrame
        yields one row array per call of ``func``.
    func : callable
        Called once per element/row; its return values are collected.

    Returns
    -------
    list
        One result per element/row, in input order.

    Raises
    ------
    Any exception raised by ``func`` is propagated by ``pool.map``.
    """
    pool = multiprocess.Pool(N_THREADS)  # Spawn the pool of worker processes
    try:
        return pool.map(func, data.values)
    finally:
        # Always release the workers, even when `func` raises; the original
        # skipped `close()` on error and never joined the pool.
        pool.close()
        pool.join()

In [7]:
# Preview the first rows of the combined train/test frame.
full_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [12]:
# Encode the store-and-forward flag as 0/1 with a vectorised membership test.
# Accepting both "Y" and an already-encoded 1 keeps this cell idempotent: the
# previous `x == "Y"` mapping silently zeroed the whole column when re-run.
# Anything else (including missing values) becomes 0, as before.
full_df["store_and_fwd_flag"] = full_df["store_and_fwd_flag"].isin(["Y", 1]).astype("int64")

In [14]:
# Inspect the column dtypes; per the output below, the two datetime columns
# are still `object` at this point and need parsing.
full_df.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag      int64
trip_duration           int64
dtype: object

In [15]:
# Parse the object-dtype timestamp columns into proper datetime values.
for column in ("pickup_datetime", "dropoff_datetime"):
    full_df[column] = pd.to_datetime(full_df[column])

In [20]:
# Pick-up time as (float) seconds since the Unix epoch. Subtracting the epoch
# and dividing by one second is vectorised, so it avoids a Python-level loop
# over every Timestamp, and a missing timestamp (NaT) yields NaN rather than
# the huge negative number produced by reading NaT's raw integer value.
# Requires `pickup_datetime` to be a timezone-naive datetime column.
full_df["pickup_unixtime"] = (full_df["pickup_datetime"] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

In [26]:
# Minute of the day (0-1439) at pick-up, computed with the vectorised `.dt`
# accessor instead of a Python loop over every Timestamp.
# Note: the integer width of the result follows what `.dt` returns for the
# installed pandas version.
full_df["daily_minute"] = full_df["pickup_datetime"].dt.hour * 60 + full_df["pickup_datetime"].dt.minute

In [27]:
# Check that the new `pickup_unixtime` and `daily_minute` columns look sensible.
full_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_unixtime,daily_minute
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982,40.768,-73.965,40.766,0,455,1457976295.0,1044
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98,40.739,-73.999,40.731,0,663,1465692215.0,43
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979,40.764,-74.005,40.71,0,2124,1453203324.0,695
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01,40.72,-74.012,40.707,0,429,1459971151.0,1172
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973,40.793,-73.973,40.783,0,435,1458999055.0,810


In [28]:
# Elapsed time of each trip as a timedelta: drop-off minus pick-up.
# NOTE(review): in the preview rows this equals `trip_duration` (e.g. 455 s ==
# 00:07:35), so it would leak the target if used as a model feature - confirm
# it is only used for inspection.
full_df["duration"] = full_df["dropoff_datetime"].sub(full_df["pickup_datetime"])

In [30]:
# Check the newly added `duration` column.
full_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_unixtime,daily_minute,duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982,40.768,-73.965,40.766,0,455,1457976295.0,1044,00:07:35
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98,40.739,-73.999,40.731,0,663,1465692215.0,43,00:11:03
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979,40.764,-74.005,40.71,0,2124,1453203324.0,695,00:35:24
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01,40.72,-74.012,40.707,0,429,1459971151.0,1172,00:07:09
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973,40.793,-73.973,40.783,0,435,1458999055.0,810,00:07:15


In [33]:
# Day of the week at pick-up (Monday=0 ... Sunday=6), taken from the vectorised
# `.dt` accessor instead of a Python loop over every Timestamp.
# Note: the integer width of the result follows what `.dt` returns for the
# installed pandas version.
full_df["day_of_week"] = full_df["pickup_datetime"].dt.dayofweek

In [35]:
# Check the newly added `day_of_week` column.
full_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_unixtime,daily_minute,duration,day_of_week
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982,40.768,-73.965,40.766,0,455,1457976295.0,1044,00:07:35,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98,40.739,-73.999,40.731,0,663,1465692215.0,43,00:11:03,6
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979,40.764,-74.005,40.71,0,2124,1453203324.0,695,00:35:24,1
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01,40.72,-74.012,40.707,0,429,1459971151.0,1172,00:07:09,2
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973,40.793,-73.973,40.783,0,435,1458999055.0,810,00:07:15,5


### Create Distance Features

In [38]:
# Build three pick-up -> drop-off distance features for every row of the
# combined frame.
lat_delta = full_df["pickup_latitude"] - full_df["dropoff_latitude"]
lon_delta = full_df["pickup_longitude"] - full_df["dropoff_longitude"]
# L1 (Manhattan) and L2 (Euclidean) distances on the raw coordinate differences.
full_df["dist_l1"] = lat_delta.abs() + lon_delta.abs()
full_df["dist_l2"] = np.sqrt(lat_delta ** 2 + lon_delta ** 2)
# haversine_distance works on one row at a time, so it is mapped over the rows
# with the parallel helper for speed.
coordinate_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
full_df["dist_haversine"] = apply_multithreaded(full_df[coordinate_columns], haversine_distance)

In [39]:
# Check the newly added `dist_l1`, `dist_l2` and `dist_haversine` columns.
full_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_unixtime,daily_minute,duration,day_of_week,dist_l1,dist_l2,dist_haversine
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982,40.768,-73.965,40.766,0,455,1457976295.0,1044,00:07:35,0,0.02,0.018,1.499
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98,40.739,-73.999,40.731,0,663,1465692215.0,43,00:11:03,6,0.026,0.02,1.806
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979,40.764,-74.005,40.71,0,2124,1453203324.0,695,00:35:24,1,0.08,0.06,6.385
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01,40.72,-74.012,40.707,0,429,1459971151.0,1172,00:07:09,2,0.015,0.013,1.486
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973,40.793,-73.973,40.783,0,435,1458999055.0,810,00:07:15,5,0.011,0.011,1.189
