In [1]:
import datetime
import pandas as pd
import numpy as np


In [3]:
df_train = pd.read_hdf("data/train_preprocessed_v4.h5", low_memory = False)
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,9,6,15,1.71964,21.029044,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,5,1,7,1.122217,21.940741,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,5,2,20,5.210553,20.679895,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,7,4,22,3.008668,21.891888,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,8,2,13,0.626738,21.931951,0


In [4]:
taxi = df_train

In [7]:
taxi = taxi.drop(columns = ['passenger_count'])

In [8]:
taxi.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,JFK_distance,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,21.029044,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,21.940741,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,20.679895,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,21.891888,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,21.931951,0


In [4]:
taxi = taxi.dropna()

In [5]:
taxi = taxi[taxi.fare_amount != 'fare_amount']

In [6]:
taxi['pickup_latitude'] = pd.to_numeric(taxi['pickup_latitude'])
taxi['pickup_longitude'] = pd.to_numeric(taxi['pickup_longitude'])
taxi['dropoff_latitude'] = pd.to_numeric(taxi['dropoff_latitude'])
taxi['dropoff_longitude'] = pd.to_numeric(taxi['dropoff_longitude'])
taxi['fare_amount'] = pd.to_numeric(taxi['fare_amount'])
taxi['passenger_count'] = pd.to_numeric(taxi['passenger_count'])


In [7]:
taxi = taxi[taxi['passenger_count'] <= 6]

In [8]:
def hours_after_midnight(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        time_arr = time[1].split(':')
        hours_after_midnight = int(time_arr[0]) 
        return hours_after_midnight
    else:
        return np.nan

def day_of_week(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        date_arr = time[0].split('-')
        date = datetime.date(int(date_arr[0]), int(date_arr[1]), int(date_arr[2]))
        return (date.weekday() + 1)
    else:
        return np.nan

def month(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return int(time[0].split('-')[1])
    else:
        return np.nan

def year(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return float(time[0].split('-')[0])
    else:
        return np.nan

hour_of_day_vec = np.vectorize(hours_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

In [9]:
time_column = taxi['pickup_datetime'].to_numpy()
taxi.drop(columns = ['pickup_datetime', 'key'], inplace = True)

In [10]:
taxi['year'] = year_vec(time_column)
taxi['month'] = month_vec(time_column)
taxi['weekday'] = day_of_week_vec(time_column)
taxi['hour'] = hour_of_day_vec(time_column)

In [11]:
from math import radians, cos, sin, asin, sqrt
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c  # 6371 is Radius of earth in kilometers. Use 3956 for miles
    return km

taxi['distance'] = haversine_np(taxi['pickup_latitude'], taxi['pickup_longitude'], taxi['dropoff_latitude'] , taxi['dropoff_longitude'])

In [12]:
JFK_coord = (40.6413, -73.7781)
pickup_JFK = haversine_np(taxi['pickup_latitude'], taxi['pickup_longitude'], JFK_coord[0], JFK_coord[1]) 
dropoff_JFK = haversine_np(JFK_coord[0], JFK_coord[1], taxi['dropoff_latitude'], taxi['dropoff_longitude'])
taxi['JFK_distance'] = pd.concat([pickup_JFK, dropoff_JFK], axis=1).min(axis=1)
taxi.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance
0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,9,6,15,1.71964,21.029044
1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,5,1,7,1.122217,21.940741
2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,5,2,20,5.210553,20.679895
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,7,4,22,3.008668,21.891888
4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,8,2,13,0.626738,21.931951


In [13]:
taxi = taxi[taxi['distance'] > 0]

In [14]:
def is_valid(p_lat, p_long, d_lat, d_long):
    bounds = (-74.5, -72.8, 40.5, 41.8)
    if ((p_long >= bounds[0]) & (p_long <= bounds[1]) & (p_lat >= bounds[2]) & (p_lat <= bounds[3])):
        if (d_long >= bounds[0]) & (d_long <= bounds[1]) & (d_lat >= bounds[2]) & (d_lat <= bounds[3]):
            return 0
    return 1

valid_vec = np.vectorize(is_valid)
taxi['invalid'] = valid_vec(taxi['pickup_latitude'], taxi['pickup_longitude'], taxi['dropoff_latitude'], taxi['dropoff_longitude'])
taxi.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,9,6,15,1.71964,21.029044,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,5,1,7,1.122217,21.940741,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,5,2,20,5.210553,20.679895,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,7,4,22,3.008668,21.891888,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,8,2,13,0.626738,21.931951,0


In [15]:
import matplotlib.pyplot as plt
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    pickup_indices = (df.pickup_longitude >= bounds[0]) & (df.pickup_longitude <= bounds[1]) & \
        (df.pickup_latitude >= bounds[2]) & (df.pickup_latitude <= bounds[3])

    dropoff_indices = (df.dropoff_longitude >= bounds[0]) & (df.dropoff_longitude <= bounds[1]) & \
        (df.dropoff_latitude >= bounds[2]) & (df.dropoff_latitude <= bounds[3])

    return pickup_indices & dropoff_indices

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    x = (points_x * (longitude - bounds[0]) / (bounds[1] - bounds[0])).astype('int')
    y = (points_y - points_y * (latitude - bounds[2]) / (bounds[3] - bounds[2])).astype('int')
    return x,y

def remove_points_in_water(df):
    # Create a mask of the New York City with 1 as land and 0 as water
    nyc_mask = plt.imread('img/nyc_water_mask.png')[:,:,0] > 0.9

    # Remove points outside New York
    df = df[select_within_bounds(df, nyc_bounds)]
    print("After Bounds:", df.shape[0])

    # Map the latitudes and longitudes to the points in the map
    pickup_x, pickup_y = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude, nyc_mask.shape[1],
                                         nyc_mask.shape[0], nyc_bounds)
    dropoff_x, dropoff_y = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude, nyc_mask.shape[1],
                                        nyc_mask.shape[0], nyc_bounds)
    
    pickup_y[pickup_y == 1262] = 1261
    dropoff_y[dropoff_y == 1262] = 1261
    pickup_x[pickup_x == 1242] = 1241
    dropoff_x[dropoff_x == 1242] = 1241

    # Compute the indices where pickup and dropoff locations are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    df = df[indices]
    print("Number of trips in water: ", np.sum(~indices))
    return df



In [16]:
def make_invalid_water(invalid_col):
    if (invalid_col == 1):
        return 1
    else:
        return 2
inv_vec = np.vectorize(make_invalid_water)

def get_water_invalid(df):
    df2 = remove_points_in_water(df)
    df_diff = pd.concat([df, df2])
    print("Concatenated dataframes")
    df_diff = df_diff.drop_duplicates(keep=False)
    print("dropped duplicates")
    df_diff['invalid'] = inv_vec(df_diff.invalid)
    df = pd.concat([df2, df_diff])
    df.reset_index(inplace = True)
    return df

In [17]:
taxi = get_water_invalid(taxi)

After Bounds: 42940936
Number of trips in water:  8402
Concatenated dataframes
dropped duplicates


In [18]:
taxi.drop(columns = ['index'], inplace = True)
taxi.shape

(43074441, 13)

In [19]:
taxi.to_hdf(r'data/train_preprocessed_v4.h5', 'data')

In [20]:
from sklearn.model_selection import train_test_split
y = taxi['fare_amount']
X = taxi.drop(columns=['fare_amount'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [21]:
import lightgbm as lgb

params = {
        'learning_rate': 0.65,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 200,
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [22]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=300)
gbm.save_model('model_v9.txt')

[1]	valid_0's rmse: 21.856
[2]	valid_0's rmse: 21.6396
[3]	valid_0's rmse: 21.5651
[4]	valid_0's rmse: 21.5336
[5]	valid_0's rmse: 21.5164
[6]	valid_0's rmse: 21.488
[7]	valid_0's rmse: 21.4791
[8]	valid_0's rmse: 21.4709
[9]	valid_0's rmse: 21.4645
[10]	valid_0's rmse: 21.46
[11]	valid_0's rmse: 21.4524
[12]	valid_0's rmse: 21.4455
[13]	valid_0's rmse: 21.4418
[14]	valid_0's rmse: 21.433
[15]	valid_0's rmse: 21.431
[16]	valid_0's rmse: 21.4229
[17]	valid_0's rmse: 21.4197
[18]	valid_0's rmse: 21.4134
[19]	valid_0's rmse: 21.4111
[20]	valid_0's rmse: 21.4077
[21]	valid_0's rmse: 21.4043
[22]	valid_0's rmse: 21.4034
[23]	valid_0's rmse: 21.4005
[24]	valid_0's rmse: 21.3987
[25]	valid_0's rmse: 21.3972
[26]	valid_0's rmse: 21.3961
[27]	valid_0's rmse: 21.3951
[28]	valid_0's rmse: 21.3941
[29]	valid_0's rmse: 21.3924
[30]	valid_0's rmse: 21.3912
[31]	valid_0's rmse: 21.3898
[32]	valid_0's rmse: 21.389
[33]	valid_0's rmse: 21.3877
[34]	valid_0's rmse: 21.3866
[35]	valid_0's rmse: 21.384
[3

[279]	valid_0's rmse: 21.3162
[280]	valid_0's rmse: 21.3162
[281]	valid_0's rmse: 21.3161
[282]	valid_0's rmse: 21.316
[283]	valid_0's rmse: 21.3159
[284]	valid_0's rmse: 21.3158
[285]	valid_0's rmse: 21.3157
[286]	valid_0's rmse: 21.3156
[287]	valid_0's rmse: 21.3156
[288]	valid_0's rmse: 21.3154
[289]	valid_0's rmse: 21.3154
[290]	valid_0's rmse: 21.3153
[291]	valid_0's rmse: 21.3154
[292]	valid_0's rmse: 21.3152
[293]	valid_0's rmse: 21.3152
[294]	valid_0's rmse: 21.315
[295]	valid_0's rmse: 21.3149
[296]	valid_0's rmse: 21.3153
[297]	valid_0's rmse: 21.3153
[298]	valid_0's rmse: 21.3152
[299]	valid_0's rmse: 21.3151
[300]	valid_0's rmse: 21.3162


<lightgbm.basic.Booster at 0x7fad500ed490>

In [23]:
df_test = pd.read_csv('data/test.csv', low_memory = False)
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.98585,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.76449,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2


In [24]:
time_column = df_test['pickup_datetime'].to_numpy()
df_test.drop(columns = ['pickup_datetime'], inplace = True)

In [25]:
df_test['year'] = year_vec(time_column)
df_test['month'] = month_vec(time_column)
df_test['weekday'] = day_of_week_vec(time_column)
df_test['hour'] = hour_of_day_vec(time_column)

In [27]:
pickup_JFK = haversine_np(df_test['pickup_latitude'], df_test['pickup_longitude'], JFK_coord[0], JFK_coord[1]) 
dropoff_JFK = haversine_np(JFK_coord[0], JFK_coord[1], df_test['dropoff_latitude'], df_test['dropoff_longitude'])
df_test['JFK_distance'] = pd.concat([pickup_JFK, dropoff_JFK], axis=1).min(axis=1)

In [26]:
df_test['distance'] = haversine_np(df_test['pickup_latitude'], df_test['pickup_longitude'], df_test['dropoff_latitude'] , df_test['dropoff_longitude'])

In [28]:
df_test['invalid'] = valid_vec(df_test['pickup_latitude'], df_test['pickup_longitude'], df_test['dropoff_latitude'], df_test['dropoff_longitude'])
df_test.head()

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,2009-01-01 00:01:04.0000003,-73.972484,40.742743,-73.918937,40.764496,1,2009.0,1,4,0,5.991612,16.117319,0
1,2009-01-01 00:01:26.0000001,-73.98585,40.722826,-73.986301,40.739347,1,2009.0,1,4,0,0.509264,23.237424,0
2,2009-01-01 00:04:42.0000001,-73.988917,40.740142,-73.982769,40.777291,1,2009.0,1,4,0,1.328903,23.142153,0
3,2009-01-01 00:04:54.0000001,-73.977163,40.76449,-73.914474,40.771575,1,2009.0,1,4,0,6.974103,15.690509,0
4,2009-01-01 00:04:59.0000004,-73.948849,40.778003,-73.977678,40.748692,2,2009.0,1,4,0,3.329684,19.45075,0


In [29]:
df_test = get_water_invalid(df_test)

After Bounds: 10849338
Number of trips in water:  2311
Concatenated dataframes
dropped duplicates


In [30]:
keys = df_test['key'] 
df_test.drop(columns = ['key'], inplace = True)
df_test.head()

Unnamed: 0,index,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,0,-73.972484,40.742743,-73.918937,40.764496,1,2009.0,1,4,0,5.991612,16.117319,0
1,1,-73.98585,40.722826,-73.986301,40.739347,1,2009.0,1,4,0,0.509264,23.237424,0
2,2,-73.988917,40.740142,-73.982769,40.777291,1,2009.0,1,4,0,1.328903,23.142153,0
3,3,-73.977163,40.76449,-73.914474,40.771575,1,2009.0,1,4,0,6.974103,15.690509,0
4,4,-73.948849,40.778003,-73.977678,40.748692,2,2009.0,1,4,0,3.329684,19.45075,0


In [31]:
df_test.fillna(df_test.mean())

Unnamed: 0,index,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,0,-73.972484,40.742743,-73.918937,40.764496,1,2009.0,1,4,0,5.991612,16.117319,0
1,1,-73.985850,40.722826,-73.986301,40.739347,1,2009.0,1,4,0,0.509264,23.237424,0
2,2,-73.988917,40.740142,-73.982769,40.777291,1,2009.0,1,4,0,1.328903,23.142153,0
3,3,-73.977163,40.764490,-73.914474,40.771575,1,2009.0,1,4,0,6.974103,15.690509,0
4,4,-73.948849,40.778003,-73.977678,40.748692,2,2009.0,1,4,0,3.329684,19.450750,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11084767,11084429,0.000000,0.000000,0.000000,0.000000,2,2015.0,6,2,22,0.000000,8646.710415,1
11084768,11084468,0.000000,0.000000,0.000000,0.000000,1,2015.0,6,2,22,0.000000,8646.710415,1
11084769,11084635,0.000000,0.000000,0.000000,0.000000,5,2015.0,6,2,23,0.000000,8646.710415,1
11084770,11084715,0.000000,0.000000,0.000000,0.000000,3,2015.0,6,2,23,0.000000,8646.710415,1


In [33]:
df_test.drop(columns = ['index'], inplace = True)

In [34]:
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,JFK_distance,invalid
0,-73.972484,40.742743,-73.918937,40.764496,1,2009.0,1,4,0,5.991612,16.117319,0
1,-73.98585,40.722826,-73.986301,40.739347,1,2009.0,1,4,0,0.509264,23.237424,0
2,-73.988917,40.740142,-73.982769,40.777291,1,2009.0,1,4,0,1.328903,23.142153,0
3,-73.977163,40.76449,-73.914474,40.771575,1,2009.0,1,4,0,6.974103,15.690509,0
4,-73.948849,40.778003,-73.977678,40.748692,2,2009.0,1,4,0,3.329684,19.45075,0


In [37]:
pred_fares = gbm.predict(df_test, num_iteration=gbm.best_iteration)
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})

In [38]:
df_final.to_csv(r'predictions/lightgbm_withfeaures_v4.csv', index = False)

In [39]:
df_test['key'] = keys
df_test.to_hdf(r'data/test_prepro_v4.h5', 'data')