In [1]:
import datetime
import pandas as pd
import numpy as np

In [2]:
def minutes_after_midnight(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        time_arr = time[1].split(':')
        minutes_after_midnight = 60 * int(time_arr[0]) + int(time_arr[1])
        return 2 * np.pi * minutes_after_midnight / 1440
    else:
        return np.nan

def day_of_week(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        date_arr = time[0].split('-')
        date = datetime.date(int(date_arr[0]), int(date_arr[1]), int(date_arr[2]))
        return 2 * np.pi * (date.weekday() + 1) / 7
    else:
        return np.nan

def month(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return 2 * np.pi * int(time[0].split('-')[1]) / 12
    else:
        return np.nan

def year(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return float(time[0].split('-')[0])
    else:
        return np.nan

time_of_day_vec = np.vectorize(minutes_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

# Function to calculate distance between two points and add it as a feature
from haversine import haversine
def distance(p_lat, p_long, d_lat, d_long):
    pickup = (p_lat, p_long)
    dropoff = (d_lat, d_long)
    dist = haversine(pickup, dropoff)
    return dist

dist_vector = np.vectorize(distance)

# Points in wata are bad..
import matplotlib.pyplot as plt
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    pickup_indices = (df.pickup_longitude >= bounds[0]) & (df.pickup_longitude <= bounds[1]) & \
        (df.pickup_latitude >= bounds[2]) & (df.pickup_latitude <= bounds[3])

    dropoff_indices = (df.dropoff_longitude >= bounds[0]) & (df.dropoff_longitude <= bounds[1]) & \
        (df.dropoff_latitude >= bounds[2]) & (df.dropoff_latitude <= bounds[3])

    return pickup_indices & dropoff_indices

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    x = (points_x * (longitude - bounds[0]) / (bounds[1] - bounds[0])).astype('int')
    y = (points_y - points_y * (latitude - bounds[2]) / (bounds[3] - bounds[2])).astype('int')
    return x,y

def remove_points_in_water(df):
    # Create a mask of the New York City with 1 as land and 0 as water
    nyc_mask = plt.imread('img/nyc_water_mask.png')[:,:,0] > 0.9

    # Remove points outside New York
    df = df[select_within_bounds(df, nyc_bounds)]
    print("After Bounds:", df.shape[0])

    # Map the latitudes and longitudes to the points in the map
    pickup_x, pickup_y = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude, nyc_mask.shape[1],
                                         nyc_mask.shape[0], nyc_bounds)
    dropoff_x, dropoff_y = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude, nyc_mask.shape[1],
                                        nyc_mask.shape[0], nyc_bounds)
    
    pickup_y[pickup_y == 1262] = 1261
    dropoff_y[dropoff_y == 1262] = 1261
    pickup_x[pickup_x == 1242] = 1241
    dropoff_x[dropoff_x == 1242] = 1241

    # Compute the indices where pickup and dropoff locations are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    df = df[indices]
    print("Number of trips in water: ", np.sum(~indices))
    return df

def preprocess(df):
    print("Initial number of points: ", df.shape[0])
    # Drop all null values
    df = df.dropna()

    # Cyclise time and remove key column
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['pickup_datetime', 'key'])

    time_of_day = time_of_day_vec(time_column)
    day_of_week = day_of_week_vec(time_column)
    month = month_vec(time_column)
    df['year'] = year_vec(time_column)

    df['sin_time_of_day'] = np.sin(time_of_day)
    df['cos_time_of_day'] = np.cos(time_of_day)
    df['sin_day_of_week'] = np.sin(day_of_week)
    df['cos_day_of_week'] = np.cos(day_of_week)
    df['sin_month'] = np.sin(month)
    df['cos_month'] = np.cos(month)

    df = df.dropna()
    print("Number of points after removing null:", df.shape[0])

    # Make latitude and longitude numeric
    df['pickup_latitude'] = pd.to_numeric(df['pickup_latitude'])
    df['pickup_longitude'] = pd.to_numeric(df['pickup_longitude'])
    df['dropoff_latitude'] = pd.to_numeric(df['dropoff_latitude'])
    df['dropoff_longitude'] = pd.to_numeric(df['dropoff_longitude'])
    df['fare_amount'] = pd.to_numeric(df['fare_amount'])
    df['passenger_count'] = pd.to_numeric(df['passenger_count'])

    # Add distance column
    df['distance'] = dist_vector(df['pickup_latitude'].to_numpy(), df['pickup_longitude'].to_numpy(),
                     df['dropoff_latitude'].to_numpy(), df['dropoff_longitude'].to_numpy())

    # Remove 0 passenger count and negative fare amounts
    df = df[df['passenger_count'] > 0]
    df = df[df['passenger_count'] < 7]
    df['fare_amount'] = df[df['fare_amount'] > 0]
    df = df[df['distance'] > 0]
    print("Number of points after removing semantic:", df.shape[0])
    
    
    df = df.reset_index()

    return df

def preprocess_test(df):
    print("Initial number of points: ", df.shape[0])
    df_key = df['key']
    df.drop(columns = ['key'], inplace = True)
    # Cyclise time
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['pickup_datetime'])

    time_of_day = time_of_day_vec(time_column)
    day_of_week = day_of_week_vec(time_column)
    month = month_vec(time_column)
    df['year'] = year_vec(time_column)
    print("checkpoint 1")
    df['sin_time_of_day'] = np.sin(time_of_day)
    df['cos_time_of_day'] = np.cos(time_of_day)
    df['sin_day_of_week'] = np.sin(day_of_week)
    df['cos_day_of_week'] = np.cos(day_of_week)
    df['sin_month'] = np.sin(month)
    df['cos_month'] = np.cos(month)
    print('Checkpoint 2')
    # Make illegal passenger_counts null
    df = df.mask(df['passenger_count'] <= 0)
    df = df.mask(df['passenger_count'] > 7)
    print('checkpoint 3')
    # Add distance column
    df['distance'] = dist_vector(df['pickup_latitude'].to_numpy(), df['pickup_longitude'].to_numpy(),
                     df['dropoff_latitude'].to_numpy(), df['dropoff_longitude'].to_numpy())
    df = df.reset_index(drop=True)
    print('checkpoint 4')
    df_mean = df.mean()
    print("Mean computed")
    # Impute the null points with mean
    df = df.fillna(df_mean)
    df['key'] = df_key
    df['invalid'] = valid_vec(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])
    df = get_water_invalid(df)
    print("Final number of points:", df.shape[0])
    return df

from sklearn.preprocessing import MinMaxScaler, StandardScaler
def scale(df):
    mm_scaler = MinMaxScaler()
    std_scaler = StandardScaler()

    mm_features = ['passenger_count', 'year']
    std_features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
                    'distance']

    mm_scaler.fit(df[mm_features])
    std_scaler.fit(df[std_features])

    df[mm_features] = pd.DataFrame(mm_scaler.transform(df[mm_features]), columns=mm_features)
    df[std_features] = pd.DataFrame(std_scaler.transform(df[std_features]), columns=std_features)
    return df

In [3]:
df_train = pd.read_csv("data/train.csv", low_memory = False)

In [4]:
df_train = preprocess(df_train)

Initial number of points:  44339095
Number of points after removing null: 44338777
Number of points after removing semantic: 42920854


In [5]:
df_train.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance
0,0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,-0.782608,-0.622515,-0.781831,0.62349,-1.0,-1.83697e-16,2.102831
1,1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,0.872496,-0.488621,0.781831,0.62349,0.5,-0.8660254,1.083704
2,2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,-0.861629,0.507538,0.974928,-0.222521,0.5,-0.8660254,6.028895
3,3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,-0.27144,0.962455,-0.433884,-0.900969,-0.5,-0.8660254,2.939176
4,4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,-0.321439,-0.94693,0.974928,-0.222521,-0.866025,-0.5,1.136291


In [6]:
def is_valid(p_lat, p_long, d_lat, d_long):
    bounds = (-74.5, -72.8, 40.5, 41.8)
    if ((p_long >= bounds[0]) & (p_long <= bounds[1]) & (p_lat >= bounds[2]) & (p_lat <= bounds[3])):
        if (d_long >= bounds[0]) & (d_long <= bounds[1]) & (d_lat >= bounds[2]) & (d_lat <= bounds[3]):
            return 0
    return 1

valid_vec = np.vectorize(is_valid)
df_train['invalid'] = valid_vec(df_train['pickup_latitude'], df_train['pickup_longitude'], df_train['dropoff_latitude'], df_train['dropoff_longitude'])
df_train.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance,invalid
0,0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,-0.782608,-0.622515,-0.781831,0.62349,-1.0,-1.83697e-16,2.102831,0
1,1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,0.872496,-0.488621,0.781831,0.62349,0.5,-0.8660254,1.083704,0
2,2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,-0.861629,0.507538,0.974928,-0.222521,0.5,-0.8660254,6.028895,0
3,3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,-0.27144,0.962455,-0.433884,-0.900969,-0.5,-0.8660254,2.939176,0
4,4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,-0.321439,-0.94693,0.974928,-0.222521,-0.866025,-0.5,1.136291,0


In [7]:
def make_invalid_water(invalid_col):
    if (invalid_col == 1):
        return 1
    else:
        return 2
inv_vec = np.vectorize(make_invalid_water)

def get_water_invalid(df):
    df2 = remove_points_in_water(df)
    df_diff = pd.concat([df, df2])
    print("Concatenated dataframes")
    df_diff = df_diff.drop_duplicates(keep=False)
    print("dropped duplicates")
    df_diff['invalid'] = inv_vec(df_diff.invalid)
    df = pd.concat([df2, df_diff])
    df.reset_index(inplace = True)
    return df

In [8]:
df_train = get_water_invalid(df_train)

After Bounds: 42788396
Number of trips in water:  8402
Concatenated dataframes
dropped duplicates


In [9]:
df_test = pd.read_csv('data/test.csv', low_memory = False)

In [10]:
df_test = preprocess_test(df_test)

Initial number of points:  11084772
checkpoint 1
Checkpoint 2
checkpoint 3
checkpoint 4
Mean computed
Final number of points: 11084772


In [11]:
df_train.to_hdf(r'data/temp_train.h5', 'data')
df_test.to_hdf(r'data/temp_test.h5', 'data')

In [12]:
import lightgbm as lgb

params = {
        'learning_rate': 0.10,
        'application': 'regression',
        'max_depth': 10,
        'num_leaves': 800,
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [18]:
df_train.drop(columns = ['level_0','index'], inplace = True)
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,2014.0,-0.782608,-0.622515,-0.781831,0.62349,-1.0,-1.83697e-16,2.102831,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2012.0,0.872496,-0.488621,0.781831,0.62349,0.5,-0.8660254,1.083704,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,2009.0,-0.861629,0.507538,0.974928,-0.222521,0.5,-0.8660254,6.028895,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,2010.0,-0.27144,0.962455,-0.433884,-0.900969,-0.5,-0.8660254,2.939176,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,2010.0,-0.321439,-0.94693,0.974928,-0.222521,-0.866025,-0.5,1.136291,0


In [19]:
from sklearn.model_selection import train_test_split
y = df_train['fare_amount']
X = df_train.drop(columns=['fare_amount'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=500,
                early_stopping_rounds = 10)
gbm.save_model('model_v8.txt')

[1]	valid_0's rmse: 6.40643
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 5.7425
[3]	valid_0's rmse: 5.51669
[4]	valid_0's rmse: 5.40852
[5]	valid_0's rmse: 5.34276
[6]	valid_0's rmse: 5.30946
[7]	valid_0's rmse: 5.28904
[8]	valid_0's rmse: 5.25909
[9]	valid_0's rmse: 5.23151
[10]	valid_0's rmse: 5.22019
[11]	valid_0's rmse: 5.21278
[12]	valid_0's rmse: 5.18837
[13]	valid_0's rmse: 5.17124
[14]	valid_0's rmse: 5.15906
[15]	valid_0's rmse: 5.14835
[16]	valid_0's rmse: 5.13405
[17]	valid_0's rmse: 5.12259
[18]	valid_0's rmse: 5.11246
[19]	valid_0's rmse: 5.10561
[20]	valid_0's rmse: 5.10303
[21]	valid_0's rmse: 5.0979
[22]	valid_0's rmse: 5.09653
[23]	valid_0's rmse: 5.08959
[24]	valid_0's rmse: 5.0835
[25]	valid_0's rmse: 5.07505
[26]	valid_0's rmse: 5.06865
[27]	valid_0's rmse: 5.06371
[28]	valid_0's rmse: 5.05851
[29]	valid_0's rmse: 5.05285
[30]	valid_0's rmse: 5.04923
[31]	valid_0's rmse: 5.04383
[32]	valid_0's rmse: 5.04025
[33]	valid_0's rmse: 5.

<lightgbm.basic.Booster at 0x7f8bb96253d0>

In [15]:
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance,key
0,-73.972484,40.742743,-73.918937,40.764496,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,5.118067,2009-01-01 00:01:04.0000003
1,-73.98585,40.722826,-73.986301,40.739347,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,1.837447,2009-01-01 00:01:26.0000001
2,-73.988917,40.740142,-73.982769,40.777291,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.163116,2009-01-01 00:04:42.0000001
3,-73.977163,40.76449,-73.914474,40.771575,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,5.33779,2009-01-01 00:04:54.0000001
4,-73.948849,40.778003,-73.977678,40.748692,2.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.06421,2009-01-01 00:04:59.0000004


In [16]:
keys = df_test['key'] 
df_test.drop(columns = ['key'], inplace = True)
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance
0,-73.972484,40.742743,-73.918937,40.764496,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,5.118067
1,-73.98585,40.722826,-73.986301,40.739347,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,1.837447
2,-73.988917,40.740142,-73.982769,40.777291,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.163116
3,-73.977163,40.76449,-73.914474,40.771575,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,5.33779
4,-73.948849,40.778003,-73.977678,40.748692,2.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.06421


In [21]:
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance
0,-73.972484,40.742743,-73.918937,40.764496,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,5.118067
1,-73.98585,40.722826,-73.986301,40.739347,1.0,2009.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,1.837447
2,-73.988917,40.740142,-73.982769,40.777291,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.163116
3,-73.977163,40.76449,-73.914474,40.771575,1.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,5.33779
4,-73.948849,40.778003,-73.977678,40.748692,2.0,2009.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,4.06421


In [22]:
pred_fares = gbm.predict(df_test, num_iteration=gbm.best_iteration)
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})

In [23]:
df_final.to_csv(r'predictions/lightgbm_withfeaures_v6.csv', index = False)