# Exploratory Data Analysis 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

#import xgboost as xgb
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the data

In [2]:
# Read the two competition CSVs in one pass: the first path becomes the
# training frame, the second the test frame.
trainDF, testDF = (pd.read_csv(csv_path)
                   for csv_path in ('input/train.csv', 'input/test.csv'))

In [3]:
# Report the (rows, columns) shape of each raw frame.
train_rows, train_cols = trainDF.shape
test_rows, test_cols = testDF.shape
print('Size of the TRAINING set is: {} rows and {} columns'.format(train_rows, train_cols))
print('Size of the TEST set is: {} rows and {} columns'.format(test_rows, test_cols))

Size of the TRAINING set is: 1458644 rows and 11 columns
Size of the TEST set is: 625134 rows and 9 columns


In [4]:
trainDF.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
testDF.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


# Understanding Data

In [6]:
trainDF.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [7]:
testDF.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,625134.0,625134.0,625134.0,625134.0,625134.0,625134.0
mean,1.534884,1.661765,-73.973614,40.750927,-73.973458,40.751816
std,0.498782,1.311293,0.073389,0.029848,0.072565,0.035824
min,1.0,0.0,-121.933128,37.389587,-121.933327,36.601322
25%,1.0,1.0,-73.991852,40.737392,-73.991318,40.736
50%,2.0,1.0,-73.981743,40.754093,-73.979774,40.754543
75%,2.0,2.0,-73.9674,40.768394,-73.963013,40.769852
max,2.0,9.0,-69.248917,42.814938,-67.496796,48.857597


# Feature Engineering 

In [8]:
# `dt` is the datetime module alias used later to time the clustering cell.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import datetime as dt 
# Wall-clock time at which this cell ran; displayed below as the cell output.
now = dt.datetime.now()
now

datetime.datetime(2017, 9, 15, 0, 53, 37, 185406)

In [9]:
# Switch to the shorter working names used by every cell below.
train, test = trainDF, testDF
# Drop the old names; cells above that reference trainDF/testDF can no longer
# be re-run without reloading the data first.
del trainDF, testDF

In [10]:
# Columns shared by train and test: parse the pickup timestamp, derive the
# calendar date, and encode the store-and-forward flag as 0/1.
for _df in (train, test):
    _df['pickup_datetime'] = pd.to_datetime(_df['pickup_datetime'])
    _df.loc[:, 'pickup_date'] = _df['pickup_datetime'].dt.date
    # Only encode while the column still holds the raw 'Y'/'N' strings.
    # Without this guard, re-running the cell compares the already-encoded
    # integers against 'Y' and silently resets every flag to 0.
    if not pd.api.types.is_numeric_dtype(_df['store_and_fwd_flag']):
        _df['store_and_fwd_flag'] = 1 * (_df['store_and_fwd_flag'].values == 'Y')

# Train-only columns: the drop-off timestamp and the target exist only there.
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

# Recompute the duration from the two timestamps (vectorised, in seconds) to
# cross-check the provided trip_duration column.
train['check_trip_duration'] = (train['dropoff_datetime'] - train['pickup_datetime']).dt.total_seconds()

# Rows where the provided and recomputed durations differ by more than 1 second.
duration_difference = train[np.abs(train['check_trip_duration'].values - train['trip_duration'].values) > 1]

# log(1 + duration): log1p is the numerically stable spelling of np.log(x + 1).
train['log_trip_duration'] = np.log1p(train['trip_duration'].values)

if len(duration_difference) == 0:
    print('Trip_duration and datetimes are ok.')
else:
    print('Ooops.')
train.describe()

Trip_duration and datetimes are ok.


Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,check_trip_duration,log_trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,0.005515396,959.4923,959.4923,6.466978
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,0.07406066,5237.432,5237.432,0.7957595
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,0.0,1.0,1.0,0.6931472
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,0.0,397.0,397.0,5.986452
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,0.0,662.0,662.0,6.496775
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,0.0,1075.0,1075.0,6.981006
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,1.0,3526282.0,3526282.0,15.07575


In [11]:
# Feature Extraction
# Stack every pickup and drop-off point from both data sets into one
# (n_points, 2) array of [latitude, longitude]. `coords` is reused later by
# the clustering cell.
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

# With two input dimensions and default settings PCA keeps both components,
# so this is a centring + rotation of the coordinates.
pca = PCA().fit(coords)

# Transform each point set once and split the two components, instead of
# running the same transform twice (once per component) as before.
for _df in (train, test):
    for _endpoint in ('pickup', 'dropoff'):
        _rotated = pca.transform(
            _df[['%s_latitude' % _endpoint, '%s_longitude' % _endpoint]].values)
        _df['%s_pca0' % _endpoint] = _rotated[:, 0]
        _df['%s_pca1' % _endpoint] = _rotated[:, 1]


In [12]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_date,check_trip_duration,log_trip_duration,pickup_pca0,pickup_pca1,dropoff_pca0,dropoff_pca1
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,455,2016-03-14,455.0,6.122493,0.007691,0.017053,-0.009666,0.013695
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,663,2016-06-12,663.0,6.498282,0.007677,-0.012371,0.027145,-0.018652
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,2016-01-19,2124.0,7.661527,0.004803,0.012879,0.034222,-0.039337
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,429,2016-04-06,429.0,6.063785,0.038342,-0.029194,0.041343,-0.042293
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,435,2016-03-26,435.0,6.077642,-0.002877,0.041749,-0.00238,0.031071


In [13]:
pca.explained_variance_

array([ 0.00511756,  0.00114191])

In [14]:
# Distance
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle distance in kilometres between two sets of points.

    All four arguments are in degrees and may be scalars or NumPy arrays
    (they are combined element-wise). Uses the haversine formula with a
    mean Earth radius of 6371 km.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    lat1, lng1, lat2, lng2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dlat = (lat2 - lat1) * 0.5
    half_dlng = (lng2 - lng1) * 0.5
    # Haversine of the central angle between the two points.
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlng) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "city block" distance in kilometres between two points.

    Sums two haversine legs that both start at the first point: one that
    changes only the longitude (to lng2) and one that changes only the
    latitude (to lat2). Arguments are in degrees, scalars or arrays.
    """
    return (haversine_array(lat1, lng1, lat1, lng2)     # leg along the starting latitude
            + haversine_array(lat1, lng1, lat2, lng1))  # leg along the starting longitude

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing, in degrees, from point 1 towards point 2.

    Arguments are in degrees and may be scalars or NumPy arrays. The result
    comes from arctan2 and therefore lies in [-180, 180]: 0 is due north,
    90 due east, -90 due west.
    """
    # Only the longitude *difference* enters the formula, so the individual
    # longitudes are never converted to radians (the original did so and then
    # discarded the result, and also defined an unused Earth-radius constant).
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [15]:
# Distance, direction and midpoint features. The recipe is identical for
# train and test, so it is written once and applied to both frames.
for _df in (train, test):
    _p_lat = _df['pickup_latitude'].values
    _p_lng = _df['pickup_longitude'].values
    _d_lat = _df['dropoff_latitude'].values
    _d_lng = _df['dropoff_longitude'].values

    _df.loc[:, 'distance_haversine'] = haversine_array(_p_lat, _p_lng, _d_lat, _d_lng)
    _df.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(_p_lat, _p_lng, _d_lat, _d_lng)
    _df.loc[:, 'direction'] = bearing_array(_p_lat, _p_lng, _d_lat, _d_lng)
    # L1 distance between pickup and drop-off in the PCA-rotated coordinates.
    _df.loc[:, 'pca_manhattan'] = np.abs(_df['dropoff_pca1'] - _df['pickup_pca1']) + np.abs(_df['dropoff_pca0'] - _df['pickup_pca0'])

    # Arithmetic midpoint of the trip.
    _df.loc[:, 'center_latitude'] = (_p_lat + _d_lat) / 2
    _df.loc[:, 'center_longitude'] = (_p_lng + _d_lng) / 2

train.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,check_trip_duration,log_trip_duration,pickup_pca0,pickup_pca1,dropoff_pca0,dropoff_pca1,distance_haversine,distance_dummy_manhattan,direction,pca_manhattan,center_latitude,center_longitude
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,0.005515396,959.4923,959.4923,6.466978,3.560932e-05,-0.0004414311,-8.615863e-05,0.0004315364,3.440864,4.44686,-15.66892,0.04601049,40.75136,-73.97345
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,0.07406066,5237.432,5237.432,0.7957595,0.07084957,0.03299371,0.07081532,0.03554987,4.296538,5.66593,104.4915,0.06009216,0.02972943,0.06683386
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,0.0,1.0,1.0,0.6931472,-12.44609,-6.85721,-12.44609,-8.563596,0.0,0.0,-179.9927,0.0,33.44669,-121.9333
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,0.0,397.0,397.0,5.986452,-0.00726055,-0.01295075,-0.01132004,-0.01457879,1.231837,1.570939,-125.2736,0.01612315,40.73715,-73.99012
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,0.0,662.0,662.0,6.496775,0.007727282,0.003063014,0.006083989,0.003359148,2.093717,2.688798,8.199996,0.02742471,40.75323,-73.97973
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,0.0,1075.0,1075.0,6.981006,0.01890063,0.01588978,0.01825289,0.01725344,3.875337,4.99899,53.40495,0.05067191,40.76726,-73.96552
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,1.0,3526282.0,3526282.0,15.07575,48.07449,11.0423,48.07444,3.489436,1240.909,1318.48,180.0,13.01444,46.31584,-61.33553


In [16]:
# Datetime features
def _iso_week(timestamps):
    """Return the ISO week number of a datetime Series as integers.

    `Series.dt.weekofyear` was removed in pandas 2.0; `dt.isocalendar().week`
    is its replacement. Older pandas releases without `isocalendar` fall back
    to the original accessor.
    """
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        # isocalendar() yields a nullable UInt32 column; cast to plain int64 to
        # match what weekofyear used to return.
        # NOTE(review): the cast assumes there are no missing timestamps - confirm.
        return accessor.isocalendar().week.astype('int64')
    return accessor.weekofyear


# Both frames measure 'pickup_dt' from the earliest *training* pickup so the
# feature is on the same time axis in train and test.
pickup_origin = train['pickup_datetime'].min()

for _df in (train, test):
    _pickup = _df['pickup_datetime']
    _df.loc[:, 'pickup_weekday'] = _pickup.dt.weekday
    # NOTE: despite its name this column holds the week of the year.
    _df.loc[:, 'pickup_hour_weekofyear'] = _iso_week(_pickup)
    _df.loc[:, 'pickup_hour'] = _pickup.dt.hour
    _df.loc[:, 'pickup_minute'] = _pickup.dt.minute
    # Seconds elapsed since the shared origin.
    _df.loc[:, 'pickup_dt'] = (_pickup - pickup_origin).dt.total_seconds()
    # Hour of the week: weekday * 24 + hour.
    _df.loc[:, 'pickup_week_hour'] = _df['pickup_weekday'] * 24 + _df['pickup_hour']
    # Fractional day of the week: weekday plus the elapsed fraction of the day,
    # built from the columns above instead of re-extracting them.
    _df.loc[:, 'week_delta'] = _df['pickup_weekday'] + \
        ((_df['pickup_hour'] + (_df['pickup_minute'] / 60.0)) / 24.0)

In [17]:
train['pickup_datetime'].dt.month[0] , train['pickup_datetime'][0]

(3, Timestamp('2016-03-14 17:24:55'))

In [18]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pca_manhattan,center_latitude,center_longitude,pickup_weekday,pickup_hour_weekofyear,pickup_hour,pickup_minute,pickup_dt,pickup_week_hour,week_delta
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,0.020716,40.766769,-73.973392,0,11,17,24,6369878.0,17,0.725
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,0.025749,40.734858,-73.989948,6,23,0,43,14085798.0,144,6.029861
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,0.081636,40.737013,-73.99218,1,3,11,35,1596907.0,35,1.482639
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,0.0161,40.713345,-74.011154,2,14,19,32,8364734.0,67,2.813889
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,0.011175,40.787865,-73.972988,5,12,13,30,7392638.0,133,5.5625


In [19]:
# Make time features cyclic
# sin^2 of (fraction of the period * pi) returns to the same value after one
# full period: 7 days for week_delta and 24 hours for pickup_hour.
for _df in (train, test):
    _week_phase = (_df['week_delta'] / 7) * np.pi
    _hour_phase = (_df['pickup_hour'] / 24) * np.pi
    _df.loc[:, 'week_delta_sin'] = np.sin(_week_phase) ** 2
    _df.loc[:, 'hour_sin'] = np.sin(_hour_phase) ** 2

train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,center_longitude,pickup_weekday,pickup_hour_weekofyear,pickup_hour,pickup_minute,pickup_dt,pickup_week_hour,week_delta,week_delta_sin,hour_sin
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,-73.973392,0,11,17,24,6369878.0,17,0.725,0.102188,0.62941
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,-73.989948,6,23,0,43,14085798.0,144,6.029861,0.177891,0.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,-73.99218,1,3,11,35,1596907.0,35,1.482639,0.381157,0.982963
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,-74.011154,2,14,19,32,8364734.0,67,2.813889,0.908141,0.37059
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,-73.972988,5,12,13,30,7392638.0,133,5.5625,0.361582,0.982963


In [20]:
# Speed
# Distances are in km and trip_duration in seconds, so 1000 * km / s gives
# metres per second. These columns exist only for train because they need the
# target (trip_duration).
_duration_s = train['trip_duration']
train.loc[:, 'avg_speed_h'] = 1000 * train['distance_haversine'] / _duration_s
train.loc[:, 'avg_speed_m'] = 1000 * train['distance_dummy_manhattan'] / _duration_s

# Fine spatial bins: pickup coordinates rounded to 3 decimal places.
train.loc[:, 'pickup_lat_bin'] = train['pickup_latitude'].round(3)
train.loc[:, 'pickup_long_bin'] = train['pickup_longitude'].round(3)
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pickup_minute,pickup_dt,pickup_week_hour,week_delta,week_delta_sin,hour_sin,avg_speed_h,avg_speed_m,pickup_lat_bin,pickup_long_bin
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,24,6369878.0,17,0.725,0.102188,0.62941,3.293452,3.814139,40.768,-73.982
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,43,14085798.0,144,6.029861,0.177891,0.0,2.723239,3.665922,40.739,-73.98
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,35,1596907.0,35,1.482639,0.381157,0.982963,3.006167,3.862323,40.764,-73.979
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,32,8364734.0,67,2.813889,0.908141,0.37059,3.4627,3.872567,40.72,-74.01
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,30,7392638.0,133,5.5625,0.361582,0.982963,2.732387,2.757372,40.793,-73.973


In [21]:
# Average speed for regions
gby_cols = ['pickup_lat_bin', 'pickup_long_bin']
# Select the column *before* aggregating. Calling .mean() on the whole grouped
# frame averages every column only to throw most of them away, and on
# pandas >= 2.0 it raises TypeError because of the string/datetime columns.
coord_speed = train.groupby(gby_cols)[['avg_speed_h']].mean().reset_index()
coord_count = train.groupby(gby_cols)[['id']].count().reset_index()
# One row per bin: mean haversine speed plus the number of trips ('id').
coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
# Keep only bins with more than 100 trips.
coord_stats = coord_stats[coord_stats['id'] > 100]
coord_stats.head()

Unnamed: 0,pickup_lat_bin,pickup_long_bin,avg_speed_h,id
547,40.641,-73.789,7.582097,253
548,40.641,-73.788,7.808076,453
570,40.642,-73.789,7.461802,530
571,40.642,-73.788,7.797803,885
600,40.643,-73.79,6.812635,517


In [22]:
# Coarser bins used by the aggregation features: coordinates rounded to
# 2 decimal places and pickup time bucketed into 3-hour blocks. For train this
# overwrites the 3-decimal pickup bins created in the speed cell.
_three_hours_s = 3 * 3600
for _df in (train, test):
    _df.loc[:, 'pickup_lat_bin'] = _df['pickup_latitude'].round(2)
    _df.loc[:, 'pickup_long_bin'] = _df['pickup_longitude'].round(2)
    _df.loc[:, 'center_lat_bin'] = _df['center_latitude'].round(2)
    _df.loc[:, 'center_long_bin'] = _df['center_longitude'].round(2)
    _df.loc[:, 'pickup_dt_bin'] = _df['pickup_dt'] // _three_hours_s

train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,week_delta,week_delta_sin,hour_sin,avg_speed_h,avg_speed_m,pickup_lat_bin,pickup_long_bin,center_lat_bin,center_long_bin,pickup_dt_bin
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,0.725,0.102188,0.62941,3.293452,3.814139,40.77,-73.98,40.77,-73.97,589.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,6.029861,0.177891,0.0,2.723239,3.665922,40.74,-73.98,40.73,-73.99,1304.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,1.482639,0.381157,0.982963,3.006167,3.862323,40.76,-73.98,40.74,-73.99,147.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,2.813889,0.908141,0.37059,3.4627,3.872567,40.72,-74.01,40.71,-74.01,774.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,5.5625,0.361582,0.982963,2.732387,2.757372,40.79,-73.97,40.79,-73.97,684.0


In [23]:
# Clustering
t0 = dt.datetime.now()

# Fixed seed so the shuffle and the k-means initialisation - and therefore the
# cluster ids every later feature is keyed on - are the same on each run.
CLUSTER_SEED = 42

# Shuffle all stacked pickup/drop-off points before fitting.
sample_ind = np.random.RandomState(CLUSTER_SEED).permutation(len(coords))
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000,
                         random_state=CLUSTER_SEED).fit(coords[sample_ind])

# Assign each trip endpoint to its nearest cluster centre. Plain arrays are
# passed to predict, matching the ndarray the model was fitted on.
for _df in (train, test):
    _df.loc[:, 'pickup_cluster'] = kmeans.predict(_df[['pickup_latitude', 'pickup_longitude']].values)
    _df.loc[:, 'dropoff_cluster'] = kmeans.predict(_df[['dropoff_latitude', 'dropoff_longitude']].values)
t1 = dt.datetime.now()
# total_seconds() covers the whole elapsed time; timedelta.seconds would drop
# any whole days.
print('Time for clustering: %i seconds' % (t1 - t0).total_seconds())
train.head()

Time for clustering: 10 seconds


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hour_sin,avg_speed_h,avg_speed_m,pickup_lat_bin,pickup_long_bin,center_lat_bin,center_long_bin,pickup_dt_bin,pickup_cluster,dropoff_cluster
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,0.62941,3.293452,3.814139,40.77,-73.98,40.77,-73.97,589.0,16,94
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,0.0,2.723239,3.665922,40.74,-73.98,40.73,-73.99,1304.0,23,83
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,0.982963,3.006167,3.862323,40.76,-73.98,40.74,-73.99,147.0,78,44
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,0.37059,3.4627,3.872567,40.72,-74.01,40.71,-74.01,774.0,81,4
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,0.982963,2.732387,2.757372,40.79,-73.97,40.79,-73.97,684.0,31,18


In [24]:
# Temporal and geospatial aggregation
# NOTE: this cell is not idempotent - running it twice merges the same
# aggregate columns again (pandas then adds _x/_y suffixes).
# NOTE(review): the aggregates are computed from training targets and merged
# back onto the same training rows, so each row's own target contributes to
# its feature - confirm this is intended.

# Per-key means of the speed columns and the log target, attached to both frames.
agg_cols = ['avg_speed_h', 'avg_speed_m', 'log_trip_duration']
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    # Select the columns before aggregating: averaging the whole frame is
    # wasteful and fails on pandas >= 2.0 for the non-numeric columns.
    gby = train.groupby(gby_col)[agg_cols].mean()
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    train = pd.merge(train, gby, how='left', left_on=gby_col, right_index=True)
    test = pd.merge(test, gby, how='left', left_on=gby_col, right_index=True)

# Mean haversine speed and trip count per key combination. Only combinations
# with more than 100 training trips are kept, so rarer ones get NaN after the
# left merge.
for gby_cols in [['center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'pickup_cluster'],  
                 ['pickup_hour', 'dropoff_cluster'],
                 ['pickup_cluster', 'dropoff_cluster']]:
    coord_speed = train.groupby(gby_cols)[['avg_speed_h']].mean().reset_index()
    coord_count = train.groupby(gby_cols)[['id']].count().reset_index()
    coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
    coord_stats = coord_stats[coord_stats['id'] > 100]
    coord_stats.columns = gby_cols + ['avg_speed_h_%s' % '_'.join(gby_cols), 'cnt_%s' %  '_'.join(gby_cols)]
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)
    test = pd.merge(test, coord_stats, how='left', on=gby_cols)

# Slim frame of all trips (train + test) used by the counting cells below,
# plus the pickup time rounded to the nearest hour as a join key.
group_freq = '60min'
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,cnt_center_lat_bin_center_long_bin,avg_speed_h_pickup_hour_center_lat_bin_center_long_bin,cnt_pickup_hour_center_lat_bin_center_long_bin,avg_speed_h_pickup_hour_pickup_cluster,cnt_pickup_hour_pickup_cluster,avg_speed_h_pickup_hour_dropoff_cluster,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,57030.0,3.085869,3451.0,3.152372,1901.0,3.121137,1219.0,3.05722,778.0,2016-03-14 17:00:00
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,53601.0,3.553433,2701.0,4.77489,722.0,3.88944,949.0,2.867844,473.0,2016-06-12 01:00:00
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,82646.0,2.933729,3795.0,3.023724,2210.0,4.186092,578.0,4.710144,235.0,2016-01-19 12:00:00
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,9309.0,2.971749,558.0,4.369606,1383.0,4.433726,712.0,3.225647,827.0,2016-04-06 20:00:00
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,19077.0,4.312554,990.0,4.129998,1515.0,3.843676,1555.0,4.294911,2487.0,2016-03-26 14:00:00


In [25]:
# Inspection only: rebuild each per-key aggregate and print the column names
# it produces. Nothing is merged here; `gby` is left holding the last
# (dropoff_cluster) aggregate.
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    # Select the three columns before aggregating; averaging the whole frame
    # first is wasteful and fails on pandas >= 2.0 for non-numeric columns.
    gby = train.groupby(gby_col)[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']].mean()
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    print(gby.columns)

Index(['avg_speed_h_gby_pickup_hour', 'avg_speed_m_gby_pickup_hour',
       'log_trip_duration_gby_pickup_hour'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_date', 'avg_speed_m_gby_pickup_date',
       'log_trip_duration_gby_pickup_date'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_dt_bin', 'avg_speed_m_gby_pickup_dt_bin',
       'log_trip_duration_gby_pickup_dt_bin'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_week_hour', 'avg_speed_m_gby_pickup_week_hour',
       'log_trip_duration_gby_pickup_week_hour'],
      dtype='object')
Index(['avg_speed_h_gby_pickup_cluster', 'avg_speed_m_gby_pickup_cluster',
       'log_trip_duration_gby_pickup_cluster'],
      dtype='object')
Index(['avg_speed_h_gby_dropoff_cluster', 'avg_speed_m_gby_dropoff_cluster',
       'log_trip_duration_gby_dropoff_cluster'],
      dtype='object')


In [26]:
gby.head()

Unnamed: 0_level_0,avg_speed_h_gby_dropoff_cluster,avg_speed_m_gby_dropoff_cluster,log_trip_duration_gby_dropoff_cluster
dropoff_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.222116,4.126587,6.52659
1,3.390453,4.286948,6.495371
2,8.137941,11.160425,7.362408
3,4.594309,6.044043,6.249759
4,4.537347,5.992313,6.744443


In [27]:
# Count trips over 60min
# Index every trip id (train + test, via df_all) by pickup time and sort
# chronologically so a time-based rolling window can be applied.
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
# Rolling count over a window of length group_freq ('60min') on the time index.
# isnull() replaces the string ids with booleans before counting - presumably
# so the window operates on non-string values; TODO confirm against the pandas
# version in use.
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
# Attach the count to each trip by id.
# NOTE(review): assumes ids are unique; duplicate ids would multiply rows here.
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

train.head(30)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,avg_speed_h_pickup_hour_center_lat_bin_center_long_bin,cnt_pickup_hour_center_lat_bin_center_long_bin,avg_speed_h_pickup_hour_pickup_cluster,cnt_pickup_hour_pickup_cluster,avg_speed_h_pickup_hour_dropoff_cluster,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,3.085869,3451.0,3.152372,1901.0,3.121137,1219.0,3.05722,778.0,2016-03-14 17:00:00,580.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,3.553433,2701.0,4.77489,722.0,3.88944,949.0,2.867844,473.0,2016-06-12 01:00:00,652.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,2.933729,3795.0,3.023724,2210.0,4.186092,578.0,4.710144,235.0,2016-01-19 12:00:00,547.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,2.971749,558.0,4.369606,1383.0,4.433726,712.0,3.225647,827.0,2016-04-06 20:00:00,776.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,4.312554,990.0,4.129998,1515.0,3.843676,1555.0,4.294911,2487.0,2016-03-26 14:00:00,609.0
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,0,...,3.575362,4844.0,3.862582,2367.0,3.544228,1537.0,2.247211,1326.0,2016-01-30 22:00:00,659.0
6,id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,0,...,4.785684,978.0,4.132709,831.0,4.442024,1367.0,3.25881,752.0,2016-06-17 23:00:00,704.0
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,0,...,3.83421,939.0,4.653557,815.0,4.148504,149.0,,,2016-05-21 08:00:00,222.0
8,id1301050,1,2016-05-27 23:12:23,2016-05-27 23:16:38,1,-73.999481,40.7384,-73.985786,40.732815,0,...,3.918048,4133.0,4.144213,1432.0,4.142915,782.0,2.950334,487.0,2016-05-27 23:00:00,599.0
9,id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973,40.789989,0,...,4.108588,2414.0,4.231687,2101.0,5.191671,1742.0,4.20069,197.0,2016-03-10 22:00:00,764.0


In [28]:
def _rolling_cluster_counts(trips, cluster_col, count_col, freq):
    """Smoothed number of trips per cluster and time bucket.

    Trips are counted per (`freq` bucket of pickup_datetime, `cluster_col`),
    averaged per cluster over a 240-minute rolling window, and the bucket
    labels are then shifted back by 120 minutes. Returns a frame with columns
    `cluster_col`, 'pickup_datetime_group' and `count_col`.
    """
    # pd.Grouper(freq=...) replaces pd.TimeGrouper, which was removed in pandas 1.0.
    bucket_counts = trips \
        .set_index('pickup_datetime') \
        .groupby([pd.Grouper(freq=freq), cluster_col]) \
        .agg({'id': 'count'}) \
        .reset_index().set_index('pickup_datetime')
    # Roll over the 'id' column only. That way the result never contains the
    # grouping column, whichever pandas version is in use, and no drop() is needed.
    smoothed = bucket_counts.groupby(cluster_col)['id'].rolling('240min').mean()
    return smoothed \
        .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
        .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': count_col})


def _attach_cluster_count(frame, counts, cluster_col, count_col):
    """Add `count_col` to `frame` by joining on (pickup_datetime_group, cluster).

    Trips with no matching bucket get 0.
    """
    keys = ['pickup_datetime_group', cluster_col]
    matched = frame[keys].merge(counts, on=keys, how='left')
    # Assign positionally: the merge result carries a fresh 0..n-1 index, so
    # label alignment would only be correct if `frame` had the same index.
    frame[count_col] = matched[count_col].fillna(0).values


# Count how many trips are going to each cluster over time
dropoff_counts = _rolling_cluster_counts(df_all, 'dropoff_cluster', 'dropoff_cluster_count', group_freq)
_attach_cluster_count(train, dropoff_counts, 'dropoff_cluster', 'dropoff_cluster_count')
_attach_cluster_count(test, dropoff_counts, 'dropoff_cluster', 'dropoff_cluster_count')

# Count how many trips are going from each cluster over time
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = _rolling_cluster_counts(df_all, 'pickup_cluster', 'pickup_cluster_count', group_freq)
_attach_cluster_count(train, pickup_counts, 'pickup_cluster', 'pickup_cluster_count')
_attach_cluster_count(test, pickup_counts, 'pickup_cluster', 'pickup_cluster_count')


train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,avg_speed_h_pickup_hour_pickup_cluster,cnt_pickup_hour_pickup_cluster,avg_speed_h_pickup_hour_dropoff_cluster,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,3.152372,1901.0,3.121137,1219.0,3.05722,778.0,2016-03-14 17:00:00,580.0,12.0,25.5
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,4.77489,722.0,3.88944,949.0,2.867844,473.0,2016-06-12 01:00:00,652.0,10.5,7.25
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,3.023724,2210.0,4.186092,578.0,4.710144,235.0,2016-01-19 12:00:00,547.0,5.75,22.75
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,4.369606,1383.0,4.433726,712.0,3.225647,827.0,2016-04-06 20:00:00,776.0,6.0,15.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,4.129998,1515.0,3.843676,1555.0,4.294911,2487.0,2016-03-26 14:00:00,609.0,11.0,12.25


In [29]:
# OSRM Features
# Fastest-route summaries keyed by trip id. Only id, total_distance,
# total_travel_time and number_of_steps are read; the training routes are
# split across two files that are stacked into one table.
fr1 = pd.read_csv(
    'input/fastest_routes_train_part_1.csv',
    usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps']
)
fr2 = pd.read_csv(
    'input/fastest_routes_train_part_2.csv',
    usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps']
)
train_street_info = pd.concat([fr1, fr2])
test_street_info = pd.read_csv(
    'input/fastest_routes_test.csv',
    usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps']
)

# Left joins keep every trip; a trip with no route row gets NaN in the new columns.
# NOTE(review): number_of_steps shows as float in train but int in test afterwards,
# which suggests at least one training trip has no route -- confirm.
train = train.merge(train_street_info, on='id', how='left')
test = test.merge(test_street_info, on='id', how='left')
train_street_info.head()

Unnamed: 0,id,total_distance,total_travel_time,number_of_steps
0,id2875421,2009.1,164.9,5
1,id2377394,2513.2,332.0,6
2,id3504673,1779.4,235.8,4
3,id2181028,1614.9,140.1,5
4,id0801584,1393.5,189.4,5


In [30]:
# Preview train again; total_distance, total_travel_time and number_of_steps
# are expected as the trailing columns once the route table has been merged.
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count,total_distance,total_travel_time,number_of_steps
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,1219.0,3.05722,778.0,2016-03-14 17:00:00,580.0,12.0,25.5,2009.1,164.9,5.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,949.0,2.867844,473.0,2016-06-12 01:00:00,652.0,10.5,7.25,2513.2,332.0,6.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,578.0,4.710144,235.0,2016-01-19 12:00:00,547.0,5.75,22.75,11060.8,767.6,16.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,712.0,3.225647,827.0,2016-04-06 20:00:00,776.0,6.0,15.0,1779.4,235.8,4.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,1555.0,4.294911,2487.0,2016-03-26 14:00:00,609.0,11.0,12.25,1614.9,140.1,5.0


In [31]:
# Show which columns exist only in train; test lacks them, so they cannot be features.
print(np.setdiff1d(train.columns, test.columns))
# Columns excluded from the model input: the row id, raw timestamps, the target
# and columns absent from test, plus bin/group keys listed here by name.
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime', 'trip_duration', 'check_trip_duration',
                           'pickup_date', 'avg_speed_h', 'avg_speed_m', 'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin', 'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
print(feature_names)
print('We have %i features.' % len(feature_names))
# Training target is log(trip_duration + 1); log1p is the numerically safer spelling.
y = np.log1p(train['trip_duration'].values)

t1 = dt.datetime.now()
# total_seconds() covers the whole elapsed span; `.seconds` alone silently drops full days.
print('Feature extraction time: %i seconds' % (t1 - t0).total_seconds())

['avg_speed_h' 'avg_speed_m' 'check_trip_duration' 'dropoff_datetime'
 'log_trip_duration' 'trip_duration']
['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag', 'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1', 'distance_haversine', 'distance_dummy_manhattan', 'direction', 'pca_manhattan', 'center_latitude', 'center_longitude', 'pickup_weekday', 'pickup_hour_weekofyear', 'pickup_hour', 'pickup_minute', 'pickup_dt', 'pickup_week_hour', 'week_delta', 'week_delta_sin', 'hour_sin', 'pickup_cluster', 'dropoff_cluster', 'avg_speed_h_gby_pickup_hour', 'avg_speed_m_gby_pickup_hour', 'log_trip_duration_gby_pickup_hour', 'avg_speed_h_gby_pickup_date', 'avg_speed_m_gby_pickup_date', 'log_trip_duration_gby_pickup_date', 'avg_speed_h_gby_pickup_dt_bin', 'avg_speed_m_gby_pickup_dt_bin', 'log_trip_duration_gby_pickup_dt_bin', 'avg_speed_h_gby_pickup_week_hour', 'avg_speed_m_gby_pickup_week_hour', 'log_trip_d

# Feature check before modeling


In [32]:
# Per-feature comparison of train vs. test: NaN-aware mean and standard deviation,
# the fraction of missing values, and two gap measures derived from them.
feature_stats = pd.DataFrame({'feature': feature_names})
feature_stats['train_mean'] = np.nanmean(train[feature_names].values, axis=0).round(4)
feature_stats['test_mean'] = np.nanmean(test[feature_names].values, axis=0).round(4)
feature_stats['train_std'] = np.nanstd(train[feature_names].values, axis=0).round(4)
feature_stats['test_std'] = np.nanstd(test[feature_names].values, axis=0).round(4)
feature_stats['train_nan'] = np.mean(np.isnan(train[feature_names].values), axis=0).round(3)
feature_stats['test_nan'] = np.mean(np.isnan(test[feature_names].values), axis=0).round(3)
# Absolute mean gap divided by the average of the two standard deviations.
feature_stats['train_test_mean_diff'] = (
    np.abs(feature_stats['train_mean'] - feature_stats['test_mean'])
    / np.abs(feature_stats['train_std'] + feature_stats['test_std'])
    * 2
)
feature_stats['train_test_nan_diff'] = np.abs(feature_stats['train_nan'] - feature_stats['test_nan'])
# Ascending sort, so tail() lists the features whose means differ the most.
feature_stats = feature_stats.sort_values(by='train_test_mean_diff')
feature_stats[['feature', 'train_test_mean_diff']].tail()

Unnamed: 0,feature,train_test_mean_diff
46,avg_speed_h_center_lat_bin_center_long_bin,0.002543
21,pickup_dt,0.002648
7,pickup_pca0,0.002774
10,dropoff_pca1,0.002833
18,pickup_hour_weekofyear,0.002872


# Modeling 

In [33]:
# The 50 smallest targets. y holds log(trip_duration + 1), so 0.693 = log(2)
# corresponds to a trip_duration of 1 and 1.099 = log(3) to a trip_duration of 2.
np.sort(y)[:50]

array([ 0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229,
        1.09861229,  1.09861229,  1.09861229,  1.09861229,  1.09861229])

In [34]:
# The 50 shortest trips. np.argsort returns row *positions*, so they are applied
# with position-based .iloc; label-based .loc only gives the same rows while train
# keeps a default 0..n-1 index. Slicing the positions first means only 50 rows are
# materialised instead of a reordered copy of the whole frame.
train.iloc[np.argsort(train['trip_duration'].values)[:50]][['pickup_longitude',
                                                             'pickup_latitude',
                                                             'dropoff_longitude',
                                                             'dropoff_latitude',
                                                             'store_and_fwd_flag',
                                                             'distance_haversine',
                                                             'trip_duration']]

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,distance_haversine,trip_duration
207497,-73.819893,40.740822,-73.819885,40.740822,0,0.000643,1
1382872,-73.987991,40.724083,-73.987991,40.724079,0,0.000424,1
1360664,-73.991486,40.74194,-73.991478,40.741955,0,0.001814,1
346102,-73.985825,40.75576,-73.985901,40.755829,0,0.00998,1
1034341,-73.953728,40.670036,-73.953346,40.670021,0,0.032217,1
1439166,-73.975677,40.785488,-73.976372,40.785831,0,0.069815,1
35196,-73.940384,40.786423,-73.9403,40.786373,0,0.008963,1
918415,-74.004005,40.745125,-74.003998,40.745144,0,0.002216,1
767271,-73.982925,40.738781,-73.982925,40.738781,0,0.0,1
810851,-73.946075,40.705254,-73.946075,40.705254,0,0.0,1


In [35]:
# The 50 longest trips, in ascending order of duration. np.argsort returns row
# *positions*, so they are applied with position-based .iloc; label-based .loc only
# gives the same rows while train keeps a default 0..n-1 index. Slicing the
# positions first avoids building a reordered copy of the whole frame.
train.iloc[np.argsort(train['trip_duration'].values)[-50:]][['pickup_longitude',
                                                              'pickup_latitude',
                                                              'dropoff_longitude',
                                                              'dropoff_latitude',
                                                              'store_and_fwd_flag',
                                                              'distance_haversine',
                                                              'trip_duration']]

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,distance_haversine,trip_duration
913947,-73.980469,40.733913,-73.995087,40.743858,0,1.655186,86353
1298124,-74.005936,40.736198,-73.994583,40.750385,0,1.844787,86353
660239,-73.994003,40.724518,-73.985214,40.727486,0,0.81083,86354
98761,-74.009071,40.710686,-73.980965,40.764664,0,6.452339,86354
992593,-74.003242,40.732742,-74.006226,40.71188,0,2.333395,86354
447905,-74.00087,40.75758,-74.004807,40.748051,0,1.110267,86354
1107580,-73.991158,40.734909,-74.007851,40.714909,0,2.631501,86355
645313,-73.99115,40.750629,-73.987946,40.736725,0,1.569508,86356
836498,-74.001968,40.728039,-74.000465,40.742641,0,1.628673,86356
1296280,-73.992279,40.743641,-73.985779,40.74728,0,0.680896,86356


In [36]:
# Full frame shape next to the shape of the rows whose trip_duration is below 1939735.
# NOTE(review): the threshold is a hard-coded magic number -- confirm where it comes from.
(train.shape, train[train['trip_duration'] < 1939735].shape)

((1458644, 77), (1458640, 77))

# Additional data 

In [38]:
# Load the supplementary per-trip tables for train and test.
# NOTE(review): judging by the previews they are keyed by `id` and carry distance,
# duration, road-class and junction-count columns -- confirm against the data source.
train_augmented = pd.read_csv('nyc-taxi-trip-noisy/train_augmented.csv')
test_augmented = pd.read_csv('nyc-taxi-trip-noisy/test_augmented.csv')

In [39]:
# Row/column counts of the augmented tables; compare the row counts with
# train.shape and test.shape to spot trips that have no augmented record.
train_augmented.shape , test_augmented.shape

((1458643, 16), (625134, 16))

In [40]:
# Preview the columns of the augmented training table.
train_augmented.head()

Unnamed: 0,id,distance,duration,motorway,trunk,primary,secondary,tertiary,unclassified,residential,nTrafficSignals,nCrossing,nStop,nIntersection,srcCounty,dstCounty
0,id2875421,2009.1,160.9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14,5,0,4,1.0,1.0
1,id2377394,2513.4,256.5,0.0,0.0,0.0,0.348518,0.174776,0.0,0.143903,25,13,0,0,1.0,1.0
2,id3858529,9910.7,679.6,0.0,0.54282,0.0,0.372717,0.039806,0.0,0.006861,38,12,0,3,1.0,1.0
3,id3504673,1779.1,181.8,0.0,0.0,0.0,0.0,0.424452,0.0,0.039741,18,6,0,1,1.0,1.0
4,id2181028,1615.0,132.2,0.0,0.0,0.0,0.637338,0.362663,0.0,0.0,17,2,0,2,1.0,1.0


In [42]:
# Unique trip ids that appear in both train and train_augmented.
ti = np.intersect1d(train['id'], train_augmented['id'])

In [43]:
# Unique trip ids shared between train and test_augmented.
tie = np.intersect1d(train['id'], test_augmented['id'])

In [45]:
# Overlap sizes (train vs. train_augmented, train vs. test_augmented) next to
# train's shape, to see how many training ids the augmented table covers.
len(ti) , len(tie) , train.shape

(1458643, 0, (1458644, 77))

In [62]:
# Spot-check a single trip: its recorded trip_duration in train next to the
# `duration` value stored for the same id in train_augmented.
train.loc[train['id']=='id0551003'].trip_duration , '--',train_augmented.loc[train_augmented['id']=='id0551003'].duration

(66718    9451
 Name: trip_duration, dtype: int64, '--', 66718    1213.5
 Name: duration, dtype: float64)

In [54]:
# Full train row for trip id2875421, for comparison with its augmented record.
train.loc[train['id']=='id2875421']

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count,total_distance,total_travel_time,number_of_steps
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,1219.0,3.05722,778.0,2016-03-14 17:00:00,580.0,12.0,25.5,2009.1,164.9,5.0


In [50]:
# Augmented record for trip id2875421, for comparison with its row in train.
train_augmented.loc[train_augmented['id']=='id2875421']

Unnamed: 0,id,distance,duration,motorway,trunk,primary,secondary,tertiary,unclassified,residential,nTrafficSignals,nCrossing,nStop,nIntersection,srcCounty,dstCounty
0,id2875421,2009.1,160.9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14,5,0,4,1.0,1.0


In [55]:
# Unique trip ids present in both test and test_augmented, shown next to
# test's shape so the overlap can be compared with the number of test rows.
tie2 = np.intersect1d(test['id'], test_augmented['id'])
(len(tie2), test.shape)

(625134, (625134, 71))

In [56]:
# Preview the columns of the augmented test table.
test_augmented.head()

Unnamed: 0,id,distance,duration,motorway,trunk,primary,secondary,tertiary,unclassified,residential,nTrafficSignals,nCrossing,nStop,nIntersection,srcCounty,dstCounty
0,id3004672,3795.9,360.7,0.0,0.0,0.0,0.522511,0.477489,0.0,0.0,37,39,0,1,1.0,1.0
1,id3505355,2829.9,196.4,0.0,0.0,0.0,0.255318,0.724963,0.0,0.0,5,0,0,17,3.0,3.0
2,id1217141,1499.5,148.5,0.0,0.0,0.0,0.428047,0.231395,0.0,0.0,12,4,0,0,1.0,1.0
3,id2150126,6492.3,442.4,0.0,0.561299,0.0,0.213389,0.074967,0.0,0.078344,23,9,0,0,1.0,1.0
4,id1598245,1108.2,94.0,0.0,0.0,0.0,0.638693,0.195813,0.0,0.0,9,0,0,1,1.0,1.0


In [57]:
# Full test row for trip id3004672, to compare its OSRM columns
# (total_distance, total_travel_time) with the augmented distance/duration.
test.loc[test['id']=='id3004672']

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_date,...,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count,total_distance,total_travel_time,number_of_steps
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,0,2016-06-30,...,1722.0,3.343171,369.0,2016-07-01,624.0,0.0,0.0,3795.9,424.6,4


In [None]:
train_augmented = pd.read_csv('nyc-taxi-trip-noisy/train_augmented.csv')