In [1]:
import datetime
import pandas as pd
import numpy as np


In [2]:
# Load the preprocessed training frame written by the `to_hdf` cell further down.
# `low_memory` is a `read_csv` parser option and is not a documented `read_hdf`
# argument, so it is not passed here.
df_train = pd.read_hdf("data/train_preprocessed.h5")
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance
0,7.5,-0.08654,0.270342,0.285063,0.696356,0.0,0.833333,-0.782608,-0.622515,-0.781831,0.62349,-1.0,-1.83697e-16,-0.325587
1,6.5,-0.189025,0.649418,0.048992,0.384996,0.0,0.5,0.872496,-0.488621,0.781831,0.62349,0.5,-0.8660254,-0.596447
2,12.9,-0.817116,-1.095728,0.351071,0.282635,0.0,0.0,-0.861629,0.507538,0.974928,-0.222521,0.5,-0.8660254,0.717868
3,20.1,-0.627855,-0.574698,0.045081,-0.007037,0.166667,0.166667,-0.27144,0.962455,-0.433884,-0.900969,-0.5,-0.8660254,-0.103307
4,4.1,-0.067965,-0.163866,0.040744,0.131644,0.0,0.166667,-0.321439,-0.94693,0.974928,-0.222521,-0.866025,-0.5,-0.582471


In [4]:
df_train.isna().sum()

fare_amount          0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
year                 0
sin_time_of_day      0
cos_time_of_day      0
sin_day_of_week      0
cos_day_of_week      0
sin_month            0
cos_month            0
distance             0
dtype: int64

In [3]:
# Drop every row that contains at least one missing value (mutates df_train in place).
df_train.dropna(inplace = True)

In [22]:
def minutes_after_midnight(date_time):
    """Encode the time of day of a timestamp string as an angle in radians.

    The string is split on single spaces and must yield exactly three tokens;
    the second token is read as ``HH:MM[:...]``. The minutes elapsed since
    midnight are mapped onto ``[0, 2*pi)`` (1440 minutes == one full turn).

    Returns ``np.nan`` when the string does not split into three tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan

    clock = tokens[1].split(':')
    hours = int(clock[0])
    minutes = int(clock[1])
    elapsed = 60 * hours + minutes
    return 2 * np.pi * elapsed / 1440

def day_of_week(date_time):
    """Encode the weekday of a timestamp string as an angle in radians.

    The string is split on single spaces and must yield exactly three tokens;
    the first token is read as ``YYYY-MM-DD``. ``date.weekday()`` numbers
    Monday as 0, so after the ``+ 1`` shift Monday maps to ``2*pi/7`` and
    Sunday to ``2*pi``.

    Returns ``np.nan`` when the string does not split into three tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan

    ymd = tokens[0].split('-')
    weekday = datetime.date(int(ymd[0]), int(ymd[1]), int(ymd[2])).weekday()
    return 2 * np.pi * (weekday + 1) / 7

def month(date_time):
    """Encode the month of a timestamp string as an angle in radians.

    The string is split on single spaces and must yield exactly three tokens;
    the month number is the second ``-``-separated field of the first token.
    Month ``m`` maps to ``2*pi*m/12``.

    Returns ``np.nan`` when the string does not split into three tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan

    month_number = int(tokens[0].split('-')[1])
    return 2 * np.pi * month_number / 12

def year(date_time):
    """Return the year of a timestamp string as a float.

    The string is split on single spaces and must yield exactly three tokens;
    the year is the first ``-``-separated field of the first token.

    Returns ``np.nan`` when the string does not split into three tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan

    year_text = tokens[0].split('-')[0]
    return float(year_text)

# Element-wise wrappers so the scalar timestamp parsers defined above can be called on a
# whole array of timestamp strings at once.
# NOTE: per the NumPy docs, np.vectorize is a convenience wrapper around a Python loop,
# not a performance optimisation.
time_of_day_vec = np.vectorize(minutes_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

# Function to calculate distance between two points and add it as a feature
from haversine import haversine
def distance(p_lat, p_long, d_lat, d_long):
    """Return the haversine distance between a pickup and a dropoff point.

    Each point is handed to ``haversine`` as a ``(latitude, longitude)`` tuple
    and the library's result is returned unchanged.
    """
    return haversine((p_lat, p_long), (d_lat, d_long))

# Element-wise wrapper so `distance` can be called on whole coordinate arrays.
dist_vector = np.vectorize(distance)

# Points that fall in water are invalid and must be removed.
import matplotlib.pyplot as plt
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    """Return a boolean mask of trips that start and end inside a bounding box.

    Parameters
    ----------
    df : DataFrame with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.
    bounds : sequence ``(min_longitude, max_longitude, min_latitude, max_latitude)``;
        both ends of each range are inclusive.

    Returns
    -------
    Boolean Series, True where both the pickup and the dropoff lie in the box.
    """
    lon_lo, lon_hi = bounds[0], bounds[1]
    lat_lo, lat_hi = bounds[2], bounds[3]

    pickup_ok = df.pickup_longitude.between(lon_lo, lon_hi) & \
        df.pickup_latitude.between(lat_lo, lat_hi)
    dropoff_ok = df.dropoff_longitude.between(lon_lo, lon_hi) & \
        df.dropoff_latitude.between(lat_lo, lat_hi)

    return pickup_ok & dropoff_ok

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    """Convert longitude/latitude values to integer (x, y) positions on a raster.

    Longitude is scaled linearly from ``[bounds[0], bounds[1]]`` onto
    ``[0, points_x]``. Latitude is scaled from ``[bounds[2], bounds[3]]`` onto
    ``[points_y, 0]``, i.e. the y axis is flipped so larger latitudes get
    smaller y. Results are cast with ``astype('int')``; a coordinate exactly on
    the upper longitude bound (or lower latitude bound) maps to ``points_x``
    (``points_y``), one past the last valid index.

    Returns the tuple ``(x, y)``.
    """
    lon_span = bounds[1] - bounds[0]
    lat_span = bounds[3] - bounds[2]

    col = points_x * (longitude - bounds[0]) / lon_span
    row = points_y - points_y * (latitude - bounds[2]) / lat_span

    return col.astype('int'), row.astype('int')

def remove_points_in_water(df):
    """Drop trips that start or end outside the NYC bounds or on water.

    The land/water raster is read from ``img/nyc_water_mask.png``; a pixel
    counts as land when its first channel is above 0.9. Trips are first
    restricted to ``nyc_bounds`` and then kept only if both the pickup and the
    dropoff pixel are land.

    Prints the row count after the bounds filter and the number of trips
    removed by the water filter. Returns the filtered DataFrame.
    """
    # Create a mask of New York City with True as land and False as water
    nyc_mask = plt.imread('img/nyc_water_mask.png')[:,:,0] > 0.9
    n_rows, n_cols = nyc_mask.shape

    # Remove points outside New York
    df = df[select_within_bounds(df, nyc_bounds)]
    print("After Bounds:", df.shape[0])

    # Map the latitudes and longitudes to the points in the map
    pickup_x, pickup_y = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude,
                                         n_cols, n_rows, nyc_bounds)
    dropoff_x, dropoff_y = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude,
                                           n_cols, n_rows, nyc_bounds)

    # A coordinate lying exactly on a bound maps to index == size, one past the
    # last pixel. Clamp to the mask's real shape instead of hard-coding the
    # dimensions of one particular image.
    pickup_x = np.clip(np.asarray(pickup_x), 0, n_cols - 1)
    dropoff_x = np.clip(np.asarray(dropoff_x), 0, n_cols - 1)
    pickup_y = np.clip(np.asarray(pickup_y), 0, n_rows - 1)
    dropoff_y = np.clip(np.asarray(dropoff_y), 0, n_rows - 1)

    # Compute the indices where pickup and dropoff locations are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    df = df[indices]
    print("Number of trips in water: ", np.sum(~indices))
    return df

def preprocess(df):
    """Clean the raw training frame and derive the model features.

    Steps, in order:
      1. drop rows with missing values;
      2. replace ``pickup_datetime`` by ``year`` plus sine/cosine encodings of
         time of day, day of week and month, and drop ``key``;
      3. drop rows whose timestamp could not be parsed (the parsers return NaN);
      4. coerce coordinates, fare and passenger count to numeric;
      5. remove trips outside the NYC bounds or on water;
      6. keep only rows with 1-7 passengers and a strictly positive fare;
      7. add the haversine ``distance`` column and reset the index.

    Progress counts are printed along the way. The input frame is not
    modified; a new DataFrame is returned.
    """
    print("Initial number of points: ", df.shape[0])
    # Drop all null values
    df = df.dropna()

    # Cyclise time and remove key column
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['pickup_datetime', 'key'])

    # The *_angle names avoid shadowing the module-level day_of_week / month parsers.
    time_angle = time_of_day_vec(time_column)
    weekday_angle = day_of_week_vec(time_column)
    month_angle = month_vec(time_column)
    df['year'] = year_vec(time_column)

    df['sin_time_of_day'] = np.sin(time_angle)
    df['cos_time_of_day'] = np.cos(time_angle)
    df['sin_day_of_week'] = np.sin(weekday_angle)
    df['cos_day_of_week'] = np.cos(weekday_angle)
    df['sin_month'] = np.sin(month_angle)
    df['cos_month'] = np.cos(month_angle)

    # Unparseable timestamps produced NaN features above; drop those rows.
    df = df.dropna()
    print("Number of points after removing null:", df.shape[0])

    # Make latitude and longitude numeric
    df['pickup_latitude'] = pd.to_numeric(df['pickup_latitude'])
    df['pickup_longitude'] = pd.to_numeric(df['pickup_longitude'])
    df['dropoff_latitude'] = pd.to_numeric(df['dropoff_latitude'])
    df['dropoff_longitude'] = pd.to_numeric(df['dropoff_longitude'])
    df['fare_amount'] = pd.to_numeric(df['fare_amount'])
    df['passenger_count'] = pd.to_numeric(df['passenger_count'])

    # Remove points in water
    df = remove_points_in_water(df)

    # Remove 0 passenger count and non-positive fare amounts
    df = df[df['passenger_count'] > 0]
    df = df[df['passenger_count'] <= 7]
    # Filter ROWS by fare. The previous code assigned the whole filtered frame
    # to the 'fare_amount' column instead of filtering.
    df = df[df['fare_amount'] > 0]
    print("Number of points after removing semantic:", df.shape[0])

    # Add distance column
    df['distance'] = dist_vector(df['pickup_latitude'].to_numpy(), df['pickup_longitude'].to_numpy(),
                     df['dropoff_latitude'].to_numpy(), df['dropoff_longitude'].to_numpy())
    df = df.reset_index(drop=True)

    return df

def preprocess_test(df):
    """Derive the model features for the test frame without dropping any row.

    ``pickup_datetime`` is replaced by ``year`` plus sine/cosine encodings of
    time of day, day of week and month. Passenger counts outside 1-7 are set to
    NaN, the haversine ``distance`` column is added, and every remaining NaN is
    imputed with its column mean. The ``key`` column is re-attached last so the
    numeric steps never see it.

    The caller's frame is left untouched (no in-place drop), so the cell that
    calls this function can be re-run. Returns a new DataFrame with the same
    number of rows as the input.
    """
    print("Initial number of points: ", df.shape[0])
    # Keep the keys as a plain array: they are re-attached by position, so the
    # result does not depend on the input's index labels.
    keys = df['key'].to_numpy()
    # Cyclise time
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['key', 'pickup_datetime'])

    # The *_angle names avoid shadowing the module-level day_of_week / month parsers.
    time_angle = time_of_day_vec(time_column)
    weekday_angle = day_of_week_vec(time_column)
    month_angle = month_vec(time_column)
    df['year'] = year_vec(time_column)
    print("checkpoint 1")
    df['sin_time_of_day'] = np.sin(time_angle)
    df['cos_time_of_day'] = np.cos(time_angle)
    df['sin_day_of_week'] = np.sin(weekday_angle)
    df['cos_day_of_week'] = np.cos(weekday_angle)
    df['sin_month'] = np.sin(month_angle)
    df['cos_month'] = np.cos(month_angle)
    print('Checkpoint 2')
    # Make illegal passenger_counts null. Only that column is nulled: masking
    # the whole frame would also wipe the row's valid coordinates and time
    # features and replace them with column means.
    valid_passengers = (df['passenger_count'] > 0) & (df['passenger_count'] <= 7)
    df['passenger_count'] = df['passenger_count'].where(valid_passengers)
    print('checkpoint 3')
    # Add distance column
    df['distance'] = dist_vector(df['pickup_latitude'].to_numpy(), df['pickup_longitude'].to_numpy(),
                     df['dropoff_latitude'].to_numpy(), df['dropoff_longitude'].to_numpy())
    df = df.reset_index(drop=True)
    print('checkpoint 4')
    df_mean = df.mean()
    print("Mean computed")
    # Impute the null points with mean
    df = df.fillna(df_mean)
    df['key'] = keys
    print("Final number of points:", df.shape[0])
    return df

from sklearn.preprocessing import MinMaxScaler, StandardScaler
def scale(df, mm_scaler=None, std_scaler=None, fit=True):
    """Scale the feature columns of ``df`` in place and return it.

    ``passenger_count`` and ``year`` are min-max scaled; the four coordinate
    columns and ``distance`` are standardised.

    Parameters
    ----------
    df : DataFrame containing the columns listed above.
    mm_scaler, std_scaler : optional scaler objects. When omitted, a fresh
        ``MinMaxScaler`` / ``StandardScaler`` is created, which reproduces the
        original ``scale(df)`` behaviour. Pass your own instances to keep the
        fitted scalers and reuse them on another frame.
    fit : when True (default) the scalers are fitted on ``df`` before
        transforming. Pass False together with already-fitted scalers to apply
        the training statistics to the test frame.

    Raises
    ------
    ValueError : if ``fit`` is False and a scaler was not supplied.
    """
    mm_features = ['passenger_count', 'year']
    std_features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
                    'distance']

    if not fit and (mm_scaler is None or std_scaler is None):
        raise ValueError("fit=False requires already-fitted mm_scaler and std_scaler")

    if mm_scaler is None:
        mm_scaler = MinMaxScaler()
    if std_scaler is None:
        std_scaler = StandardScaler()

    if fit:
        mm_scaler.fit(df[mm_features])
        std_scaler.fit(df[std_features])

    # Assign the transformed arrays directly. Wrapping them in a new DataFrame
    # gives them a fresh 0..n-1 index, and the assignment then aligns on index
    # labels and yields NaN for any `df` whose index is not exactly 0..n-1.
    df[mm_features] = mm_scaler.transform(df[mm_features])
    df[std_features] = std_scaler.transform(df[std_features])
    return df

In [21]:
df_test = pd.read_csv('data/test.csv', low_memory = False)

In [None]:
df_train = preprocess(df_train)

Initial number of points:  44339095
Number of points after removing null: 44338777
After Bounds: 43399655
Number of trips in water:  8977


In [None]:
df_train = scale(df_train)

In [None]:
df_train.to_hdf(r'data/train_preprocessed.h5', 'data')

In [23]:
df_test = preprocess_test(df_test)

Initial number of points:  11084772
checkpoint 1
Checkpoint 2
checkpoint 3
checkpoint 4
Mean computed
Final number of points: 11084772


In [24]:
df_test = scale(df_test)

In [25]:
df_test.to_hdf(r'data/test_preprocessed.h5', 'data')

Now that we are done preprocessing the dataset, we will move on to training and evaluating the LightGBM model.

In [5]:
# Split the label off the feature frame: `pop` returns the column and removes it
# from df_train in place.
target = df_train.pop('fare_amount')

In [51]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for validation; random_state is fixed so the split is repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(df_train, target, test_size = 0.15, random_state = 0)

In [10]:
import lightgbm as lgb
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order deterministic between runs.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train on the 85% split and monitor the held-out 15% for early stopping.
lgb_train = lgb.Dataset(xTrain, yTrain)
lgb_eval = lgb.Dataset(xTest, yTest, reference=lgb_train)
# Early stopping is requested through the callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.0, while the callback form
# is accepted by both older and newer releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(5)])
gbm.save_model('model_v2.txt')

[1]	valid_0's l2: 87.4036	valid_0's l1: 5.75879
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 80.9719	valid_0's l1: 5.52811
[3]	valid_0's l2: 75.1448	valid_0's l1: 5.31194
[4]	valid_0's l2: 69.836	valid_0's l1: 5.1083
[5]	valid_0's l2: 65.06	valid_0's l1: 4.91523
[6]	valid_0's l2: 60.7456	valid_0's l1: 4.73278
[7]	valid_0's l2: 56.798	valid_0's l1: 4.56125
[8]	valid_0's l2: 53.2478	valid_0's l1: 4.40042
[9]	valid_0's l2: 50.0471	valid_0's l1: 4.24771
[10]	valid_0's l2: 47.1435	valid_0's l1: 4.10303
[11]	valid_0's l2: 44.8017	valid_0's l1: 4.00086
[12]	valid_0's l2: 42.3672	valid_0's l1: 3.87107
[13]	valid_0's l2: 40.1745	valid_0's l1: 3.74839
[14]	valid_0's l2: 38.1835	valid_0's l1: 3.63325
[15]	valid_0's l2: 36.3615	valid_0's l1: 3.5264
[16]	valid_0's l2: 34.9112	valid_0's l1: 3.45123
[17]	valid_0's l2: 33.3805	valid_0's l1: 3.35501
[18]	valid_0's l2: 32.0144	valid_0's l1: 3.26457
[19]	valid_0's l2: 30.9314	valid_0's l1: 3.20246
[20]	valid_0's l2: 29.77

<lightgbm.basic.Booster at 0x7fe243f05890>

In [11]:
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance,key
0,-0.111649,0.085858,-0.108606,0.084,0.0,0.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,-0.039683,2009-01-01 00:01:04.0000003
1,-0.112667,0.083791,-0.1138,0.081495,0.0,0.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,-0.048554,2009-01-01 00:01:26.0000001
2,-0.112901,0.085588,-0.113528,0.085275,0.0,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.042266,2009-01-01 00:04:42.0000001
3,-0.112005,0.088116,-0.108262,0.084706,0.0,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.039089,2009-01-01 00:04:54.0000001
4,-0.109848,0.089519,-0.113136,0.082426,0.166667,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.042533,2009-01-01 00:04:59.0000004


In [26]:
# Set the submission keys aside: `pop` returns the column and removes it from
# df_test in place, leaving only model features.
keys = df_test.pop('key')
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,distance
0,-0.111649,0.085858,-0.108606,0.084,0.0,0.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,-0.039683
1,-0.112667,0.083791,-0.1138,0.081495,0.0,0.0,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,-0.048554
2,-0.112901,0.085588,-0.113528,0.085275,0.0,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.042266
3,-0.112005,0.088116,-0.108262,0.084706,0.0,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.039089
4,-0.109848,0.089519,-0.113136,0.082426,0.166667,0.0,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,-0.042533


In [27]:
pred_fares = gbm.predict(df_test, num_iteration=gbm.best_iteration)

In [28]:
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})

In [29]:
df_final.head()

Unnamed: 0,key,fare_amount
0,2009-01-01 00:01:04.0000003,9.42614
1,2009-01-01 00:01:26.0000001,9.384005
2,2009-01-01 00:04:42.0000001,9.42614
3,2009-01-01 00:04:54.0000001,9.42614
4,2009-01-01 00:04:59.0000004,9.384005


In [30]:
df_final.shape

(11084772, 2)

In [32]:
df_final.to_csv(r'predictions/pred_lightgbm.csv', index = False)

If I get the time, I will do PCA as well. Here goes...

In [52]:
from sklearn.decomposition import PCA
# Fit a 10-component PCA on the training split only (the validation split is not used for fitting).
pca = PCA(n_components=10)
pca.fit(xTrain)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [53]:
# Project both splits onto the fitted principal components.
# NOTE: xTrain / xTest are overwritten here, so re-running this cell without first
# re-creating the splits feeds already-projected data back into `pca.transform`.
xTrain = pca.transform(xTrain)
xTest = pca.transform(xTest)
xTrain.shape

(36752057, 10)

In [54]:
xTest.shape

(6485658, 10)

In [55]:
# Re-train LightGBM on the PCA-projected splits. NOTE: this rebinds `gbm`, so the
# model trained on the raw features is only available from 'model_v2.txt' afterwards.
lgb_train = lgb.Dataset(xTrain, yTrain)
lgb_eval = lgb.Dataset(xTest, yTest, reference=lgb_train)
# Early stopping is requested through the callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.0, while the callback form
# is accepted by both older and newer releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(12)])
gbm.save_model('model_pca.txt')

[1]	valid_0's l2: 88.253	valid_0's l1: 5.80276
Training until validation scores don't improve for 12 rounds
[2]	valid_0's l2: 82.5519	valid_0's l1: 5.61473
[3]	valid_0's l2: 77.3473	valid_0's l1: 5.43643
[4]	valid_0's l2: 72.6158	valid_0's l1: 5.26775
[5]	valid_0's l2: 68.287	valid_0's l1: 5.10528
[6]	valid_0's l2: 64.3351	valid_0's l1: 4.9509
[7]	valid_0's l2: 60.7685	valid_0's l1: 4.80685
[8]	valid_0's l2: 57.5284	valid_0's l1: 4.67113
[9]	valid_0's l2: 54.5933	valid_0's l1: 4.54552
[10]	valid_0's l2: 51.9373	valid_0's l1: 4.4256
[11]	valid_0's l2: 49.5472	valid_0's l1: 4.31465
[12]	valid_0's l2: 47.3528	valid_0's l1: 4.20975
[13]	valid_0's l2: 45.351	valid_0's l1: 4.10869
[14]	valid_0's l2: 43.5305	valid_0's l1: 4.0154
[15]	valid_0's l2: 41.8758	valid_0's l1: 3.92855
[16]	valid_0's l2: 40.3446	valid_0's l1: 3.84394
[17]	valid_0's l2: 38.9651	valid_0's l1: 3.76583
[18]	valid_0's l2: 37.6933	valid_0's l1: 3.6917
[19]	valid_0's l2: 36.5463	valid_0's l1: 3.6235
[20]	valid_0's l2: 35.502

<lightgbm.basic.Booster at 0x7fdbdcf9b8d0>

In [56]:
# `gbm` was trained on the 10 PCA components at this point, so the test features
# must go through the same fitted projection before prediction.
pred_fares = gbm.predict(pca.transform(df_test), num_iteration=gbm.best_iteration)

In [57]:
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})

In [58]:
df_final.to_csv(r'predictions/pred_lightgbm_pca_10.csv', index = False)

In [6]:
from sklearn.svm import SVR
# NOTE(review): the variable is named `svr_rbf` but the kernel configured here is 'linear' —
# confirm which one was intended. Per the scikit-learn SVR docs, `gamma` only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels.
svr_rbf = SVR(kernel='linear', C=100, gamma='auto', epsilon=.1, cache_size=7000)

In [None]:
svr_rbf.fit(df_train, target)

In [None]:
pred_fares = svr_rbf.predict(df_test)

In [None]:
# Assemble the submission frame from the saved keys and the SVR predictions.
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
df_final.shape

In [None]:
df_final.shape

In [None]:
df_final.to_csv(r'predictions/pred_svr.csv', index = False)