In [2]:
import datetime
import pandas as pd
import numpy as np

def minutes_after_midnight(date_time):
    """Encode the time of day of a timestamp string as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields, e.g.
        ``'2014-09-27 15:26:00 UTC'``. The second field must start with
        ``HH:MM``.

    Returns
    -------
    float
        ``2 * pi * m / 1440`` where ``m`` is the number of whole minutes
        since midnight (seconds are ignored). ``np.nan`` is returned when
        ``date_time`` is not a string or does not split into exactly three
        fields.
    """
    # Missing values arrive as None/NaN, which have no ``split`` method;
    # treat them like any other malformed timestamp instead of raising.
    if not isinstance(date_time, str):
        return np.nan
    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan
    clock = fields[1].split(':')
    elapsed_minutes = 60 * int(clock[0]) + int(clock[1])
    return 2 * np.pi * elapsed_minutes / 1440

def day_of_week(date_time):
    """Encode the weekday of a timestamp string as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields whose first field is
        ``YYYY-MM-DD``, e.g. ``'2014-09-27 15:26:00 UTC'``.

    Returns
    -------
    float
        ``2 * pi * d / 7`` where ``d`` runs from 1 (Monday) to 7 (Sunday).
        ``np.nan`` is returned when ``date_time`` is not a string or does not
        split into exactly three fields.

    Raises
    ------
    ValueError
        If the date field is not a valid calendar date.
    """
    # Missing values arrive as None/NaN, which have no ``split`` method;
    # treat them like any other malformed timestamp instead of raising.
    if not isinstance(date_time, str):
        return np.nan
    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan
    year_txt, month_txt, day_txt = fields[0].split('-')[:3]
    calendar_day = datetime.date(int(year_txt), int(month_txt), int(day_txt))
    # weekday() is 0 for Monday, so shift to the 1..7 range used by the encoding.
    return 2 * np.pi * (calendar_day.weekday() + 1) / 7

def month(date_time):
    """Encode the month of a timestamp string as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields whose first field is
        ``YYYY-MM-DD``, e.g. ``'2014-09-27 15:26:00 UTC'``.

    Returns
    -------
    float
        ``2 * pi * m / 12`` where ``m`` is the month number read from the
        date field. ``np.nan`` is returned when ``date_time`` is not a string
        or does not split into exactly three fields.
    """
    # Missing values arrive as None/NaN, which have no ``split`` method;
    # treat them like any other malformed timestamp instead of raising.
    if not isinstance(date_time, str):
        return np.nan
    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan
    month_number = int(fields[0].split('-')[1])
    return 2 * np.pi * month_number / 12

def year(date_time):
    """Extract the year of a timestamp string as a float.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields whose first field is
        ``YYYY-MM-DD``, e.g. ``'2014-09-27 15:26:00 UTC'``.

    Returns
    -------
    float
        The year (e.g. ``2014.0``), or ``np.nan`` when ``date_time`` is not a
        string or does not split into exactly three fields.
    """
    # Missing values arrive as None/NaN, which have no ``split`` method;
    # treat them like any other malformed timestamp instead of raising.
    if not isinstance(date_time, str):
        return np.nan
    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan
    return float(fields[0].split('-')[0])

# Element-wise versions of the timestamp encoders. Declaring the output type
# up front lets the wrappers accept empty arrays (np.vectorize cannot infer a
# type from zero elements) and avoids the extra probing call that np.vectorize
# otherwise makes on the first element. Every encoder returns a float or NaN.
time_of_day_vec = np.vectorize(minutes_after_midnight, otypes=[float])
day_of_week_vec = np.vectorize(day_of_week, otypes=[float])
month_vec = np.vectorize(month, otypes=[float])
year_vec = np.vectorize(year, otypes=[float])

def preprocess(df):
    """Drop incomplete rows and replace the raw timestamp with time features.

    Rows containing any null value are removed, the ``pickup_datetime`` and
    ``key`` columns are dropped, and four columns derived from the timestamp
    are appended in this order: ``time_of_day``, ``day_of_week``, ``month``
    and ``year``.

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    complete_rows = df.dropna()

    # Capture the timestamps before their column is discarded.
    timestamps = complete_rows['pickup_datetime'].to_numpy()

    return complete_rows.drop(columns=['pickup_datetime', 'key']).assign(
        time_of_day=time_of_day_vec(timestamps),
        day_of_week=day_of_week_vec(timestamps),
        month=month_vec(timestamps),
        year=year_vec(timestamps),
    )

In [4]:
# Load the raw training data. low_memory=False asks pandas to process the file
# in one pass rather than in chunks when inferring column types.
df_train = pd.read_csv("data/train.csv", low_memory = False)
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2014-09-27 15:26:00.00000043,7.5,2014-09-27 15:26:00 UTC,-73.978463,40.759072,-73.963577,40.774255,1
1,2012-05-07 07:57:00.000000142,6.5,2012-05-07 07:57:00 UTC,-73.98239699999999,40.770245,-73.97245,40.764062,1
2,2009-05-05 20:02:36.0000005,12.9,2009-05-05 20:02:36 UTC,-74.006507,40.718808,-73.961096,40.760711,1
3,2010-07-15 22:57:00.00000042,20.1,2010-07-15 22:57:00 UTC,-73.99924200000001,40.734165,-73.972597,40.751228000000005,2
4,2010-08-31 13:15:00.00000028,4.1,2010-08-31 13:15:00 UTC,-73.97775,40.746274,-73.97276,40.755768,1


In [7]:
# Replace the raw frame with its feature-engineered version. Not re-runnable:
# preprocess() drops 'pickup_datetime' and 'key', so calling it again on its
# own output raises KeyError.
df_train = preprocess(df_train)

In [9]:
# Columns that have to be numeric before they can be filtered or scaled.
numeric_columns = [
    'pickup_longitude', 'dropoff_longitude',
    'pickup_latitude', 'dropoff_latitude',
    'fare_amount', 'passenger_count',
]

# Drop rows whose fare is the literal text 'fare_amount' (presumably header
# lines repeated inside the CSV — confirm against the raw file). The explicit
# copy makes the column assignments below act on an independent frame instead
# of a slice of the previous one, which avoids chained-assignment warnings.
df_train = df_train[df_train.fare_amount != 'fare_amount'].copy()

for column in numeric_columns:
    df_train[column] = pd.to_numeric(df_train[column])

# Keep trips with a positive fare and between 1 and 7 passengers.
df_train = df_train[
    (df_train['passenger_count'] > 0)
    & (df_train['fare_amount'] > 0)
    & (df_train['passenger_count'] <= 7)
]

In [7]:
from haversine import haversine
def distance(p_lat, p_long, d_lat, d_long):
    """Return the haversine distance between a pickup and a drop-off point.

    Each point is handed to ``haversine`` as a ``(latitude, longitude)``
    tuple; the result is whatever ``haversine`` returns with its default
    settings.
    """
    return haversine((p_lat, p_long), (d_lat, d_long))

# Element-wise version of distance(). Declaring the float output type lets the
# wrapper accept empty arrays and skips np.vectorize's type-probing call.
dist_vector = np.vectorize(distance, otypes=[float])

In [16]:
# Trip length per row; argument order is pickup (lat, lon) then drop-off (lat, lon).
df_train['distance'] = dist_vector(
    df_train['pickup_latitude'].to_numpy(),
    df_train['pickup_longitude'].to_numpy(),
    df_train['dropoff_latitude'].to_numpy(),
    df_train['dropoff_longitude'].to_numpy(),
)
df_train.head()

Unnamed: 0,level_0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day,day_of_week,month,year,distance
0,0,0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,4.040437,5.385587,4.712389,2014.0,2.102831
1,1,1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2.081305,0.897598,2.617994,2012.0,1.083704
2,2,2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,5.244714,1.795196,2.617994,2009.0,6.028895
3,3,3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,6.008296,3.590392,3.665191,2010.0,2.939176
4,4,4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,3.468842,1.795196,4.18879,2010.0,1.136291


In [17]:
import matplotlib.pyplot as plt
# Bounding box as (min_longitude, max_longitude, min_latitude, max_latitude),
# which is the order in which select_within_bounds indexes the tuple.
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    """Return a boolean mask of trips that start and end inside ``bounds``.

    ``bounds`` is indexed as ``(min_longitude, max_longitude, min_latitude,
    max_latitude)``; all four limits are inclusive. A row is selected only
    when both its pickup and its drop-off coordinates fall inside the box.
    """
    lon_min, lon_max = bounds[0], bounds[1]
    lat_min, lat_max = bounds[2], bounds[3]

    pickup_inside = (
        df.pickup_longitude.between(lon_min, lon_max)
        & df.pickup_latitude.between(lat_min, lat_max)
    )
    dropoff_inside = (
        df.dropoff_longitude.between(lon_min, lon_max)
        & df.dropoff_latitude.between(lat_min, lat_max)
    )
    return pickup_inside & dropoff_inside

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    """Convert longitude/latitude values to integer grid coordinates.

    ``bounds`` is indexed as ``(min_longitude, max_longitude, min_latitude,
    max_latitude)``. Longitude is scaled linearly onto ``0..points_x``.
    Latitude is scaled onto ``0..points_y`` with the axis flipped, so the
    maximum latitude maps to 0 and the minimum latitude maps to ``points_y``.
    Results are truncated to integers with ``astype('int')``.

    Returns the pair ``(x, y)``.
    """
    lon_span = bounds[1] - bounds[0]
    lat_span = bounds[3] - bounds[2]

    column = (points_x * (longitude - bounds[0]) / lon_span).astype('int')
    row = (points_y - points_y * (latitude - bounds[2]) / lat_span).astype('int')
    return column, row

def remove_points_in_water(df):
    """Keep only the trips that start and end inside ``nyc_bounds``.

    Despite its name, this function has never consulted a land/water mask:
    the previous version loaded 'img/nyc_map.png' into a local variable and
    then ignored it. That unused load has been removed, so the function no
    longer fails when the image file is absent. The rows returned are the
    same as before.

    Returns
    -------
    DataFrame
        The rows selected by ``select_within_bounds(df, nyc_bounds)``, with
        the index reset. ``reset_index()`` keeps the previous index as a
        column named ``index``, which callers are expected to drop.
    """
    # TODO(review): actual water filtering is not implemented. The unused
    # map_to_nyc_mask helper looks like it was meant to index a land mask
    # image here — confirm the intent before adding it.
    inside_bounds = df[select_within_bounds(df, nyc_bounds)]
    return inside_bounds.reset_index()

In [19]:
# Remove leftover index columns from earlier reset_index() calls. No cell in
# this notebook creates 'level_0', so on a fresh kernel these columns may not
# exist; errors='ignore' keeps the cell from raising KeyError in that case and
# makes it safe to re-run.
df_train = df_train.drop(columns = ['level_0', 'index'], errors = 'ignore')
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day,day_of_week,month,year,distance
0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,4.040437,5.385587,4.712389,2014.0,2.102831
1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2.081305,0.897598,2.617994,2012.0,1.083704
2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,5.244714,1.795196,2.617994,2009.0,6.028895
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,6.008296,3.590392,3.665191,2010.0,2.939176
4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,3.468842,1.795196,4.18879,2010.0,1.136291


In [20]:
# Keep only trips inside nyc_bounds. The returned frame carries an extra
# 'index' column (added by reset_index() inside the helper); it is removed in
# a later cell.
df_train = remove_points_in_water(df_train)
df_train.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day,day_of_week,month,year,distance
0,0,7.5,-73.978463,40.759072,-73.963577,40.774255,1,4.040437,5.385587,4.712389,2014.0,2.102831
1,1,6.5,-73.982397,40.770245,-73.97245,40.764062,1,2.081305,0.897598,2.617994,2012.0,1.083704
2,2,12.9,-74.006507,40.718808,-73.961096,40.760711,1,5.244714,1.795196,2.617994,2009.0,6.028895
3,3,20.1,-73.999242,40.734165,-73.972597,40.751228,2,6.008296,3.590392,3.665191,2010.0,2.939176
4,4,4.1,-73.97775,40.746274,-73.97276,40.755768,1,3.468842,1.795196,4.18879,2010.0,1.136291


In [21]:
# Drop the 'index' column left behind by reset_index(). Reassigning instead of
# mutating in place, together with errors='ignore', lets the cell be re-run
# without raising KeyError.
df_train = df_train.drop(columns = ['index'], errors = 'ignore')

In [22]:
from sklearn import preprocessing
# Feature groups: these two columns are min-max scaled, the rest standardised.
mm_columns = ['passenger_count', 'year']
std_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'distance', 'time_of_day', 'day_of_week', 'month']

scaler_mm = preprocessing.MinMaxScaler()
scaler_std = preprocessing.StandardScaler()

# fit_transform() is fit() followed by transform() on the same data. The
# scaled frames reuse df_train's index so that assigning them back to df_train
# (which aligns on the index) pairs every row with its own scaled values even
# if df_train's index is not 0..n-1.
# NOTE(review): the scalers are fitted on all of df_train, i.e. before the
# train/validation split — confirm this is intended.
mm_features = pd.DataFrame(
    scaler_mm.fit_transform(df_train[mm_columns]),
    columns = mm_columns,
    index = df_train.index,
)
std_features = pd.DataFrame(
    scaler_std.fit_transform(df_train[std_columns]),
    columns = std_columns,
    index = df_train.index,
)

In [23]:
# Overwrite the raw feature columns with their scaled versions.
# NOTE(review): assigning a DataFrame aligns rows on the index, so this relies
# on df_train having the same index as mm_features / std_features — confirm.
df_train[['passenger_count', 'year']] = mm_features
df_train[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'distance', 'time_of_day', 'day_of_week', 'month']] = std_features
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day,day_of_week,month,year,distance
0,7.5,-0.086676,0.27028,0.28388,0.695598,0.0,0.219252,1.004915,0.794524,0.833333,-0.32555
1,6.5,-0.188877,0.648544,0.04849,0.384781,0.0,-0.927838,-1.560289,-0.36948,0.5,-0.596285
2,12.9,-0.815226,-1.092867,0.349698,0.282598,0.0,0.924367,-1.047249,-0.36948,0.0,0.717421
3,20.1,-0.62649,-0.572952,0.04459,-0.006569,0.166667,1.371451,-0.021167,0.212522,0.166667,-0.103373
4,4.1,-0.068153,-0.162999,0.040266,0.13187,0.0,-0.115422,-1.047249,0.503523,0.166667,-0.582315


In [26]:
# Persist the preprocessed training frame. `key` is passed by keyword because
# recent pandas versions deprecate positional arguments after the path.
df_train.to_hdf(r'data/train_preprocessed_v2.h5', key='data')

In [5]:
from sklearn import preprocessing

def preprocess_test(df, mm_scaler=None, std_scaler=None):
    """Build model-ready features for the test set.

    The raw ``pickup_datetime`` column is replaced by the ``time_of_day``,
    ``day_of_week``, ``month`` and ``year`` features, a ``distance`` column is
    added, missing numeric values are filled with the column mean, and the
    feature columns are scaled. The ``key`` column is kept as is.

    The features are scaled with scalers that were fitted on the training
    data; they are only transformed here, never re-fitted. Fitting new
    scalers on the test set would put the test features on a different scale
    from the one the model was trained on.

    Parameters
    ----------
    df : DataFrame
        Raw test data containing ``pickup_datetime``, the four coordinate
        columns and ``passenger_count``.
    mm_scaler, std_scaler : fitted scalers, optional
        Scalers for the min-max and the standardised column groups. When
        omitted, the notebook-level ``scaler_mm`` / ``scaler_std`` fitted in
        the training scaling cell are used, so that cell must have run first.

    Returns
    -------
    DataFrame
        A new frame; the argument is not modified.
    """
    # Same column groups, in the same order, as the training scaling cell.
    mm_columns = ['passenger_count', 'year']
    std_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'distance', 'time_of_day', 'day_of_week', 'month']

    if mm_scaler is None:
        mm_scaler = scaler_mm
    if std_scaler is None:
        std_scaler = scaler_std

    # Cyclise time and remove the raw timestamp column
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['pickup_datetime'])

    df['time_of_day'] = time_of_day_vec(time_column)
    df['day_of_week'] = day_of_week_vec(time_column)
    df['month'] = month_vec(time_column)
    df['year'] = year_vec(time_column)
    df['distance'] = dist_vector(df['pickup_latitude'].to_numpy(), df['pickup_longitude'].to_numpy(), df['dropoff_latitude'].to_numpy(), df['dropoff_longitude'].to_numpy())

    # Rows cannot be dropped from the test set, so fill gaps with the column
    # mean. numeric_only=True skips the text 'key' column, which recent pandas
    # versions refuse to average.
    df = df.fillna(df.mean(numeric_only=True))

    # Reuse df's index so the assignments below align row for row.
    mm_features = pd.DataFrame(mm_scaler.transform(df[mm_columns]), columns = mm_columns, index = df.index)
    std_features = pd.DataFrame(std_scaler.transform(df[std_columns]), columns = std_columns, index = df.index)
    df[mm_columns] = mm_features
    df[std_columns] = std_features
    return df

In [28]:
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day,day_of_week,month,year,distance
0,7.5,-0.086676,0.27028,0.28388,0.695598,0.0,0.219252,1.004915,0.794524,0.833333,-0.32555
1,6.5,-0.188877,0.648544,0.04849,0.384781,0.0,-0.927838,-1.560289,-0.36948,0.5,-0.596285
2,12.9,-0.815226,-1.092867,0.349698,0.282598,0.0,0.924367,-1.047249,-0.36948,0.0,0.717421
3,20.1,-0.62649,-0.572952,0.04459,-0.006569,0.166667,1.371451,-0.021167,0.212522,0.166667,-0.103373
4,4.1,-0.068153,-0.162999,0.040266,0.13187,0.0,-0.115422,-1.047249,0.503523,0.166667,-0.582315


In [32]:
from sklearn.model_selection import train_test_split
# Separate the label from the features before splitting. `target` was not
# defined by any live code in the notebook, and leaving 'fare_amount' inside
# the feature frame would leak the label into the model.
target = df_train['fare_amount']
features = df_train.drop(columns = ['fare_amount'])
xTrain, xTest, yTrain, yTest = train_test_split(features, target, test_size = 0.1, random_state = 0)

In [36]:
import lightgbm as lgb
# LightGBM settings for a gradient-boosted regression model evaluated on both
# squared (l2) and absolute (l1) error.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Disabled legacy code: at some point `target` was built here. The inputs used
# below (xTrain, yTrain, xTest, yTest) come from the train_test_split cell,
# which must run first.
# target = df_train['fare_amount']
# df_train.drop(columns = ['fare_amount'], inplace = True)
lgb_train = lgb.Dataset(xTrain, yTrain)
lgb_eval = lgb.Dataset(xTest, yTest, reference=lgb_train)
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of lgb.train() is not accepted by current
# LightGBM releases, while the callback works on old and new versions alike.
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=200,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
gbm.save_model('model.txt')

[1]	valid_0's l1: 5.76738	valid_0's l2: 87.1553
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 5.53725	valid_0's l2: 80.6698
[3]	valid_0's l1: 5.31951	valid_0's l2: 74.8073
[4]	valid_0's l1: 5.11311	valid_0's l2: 69.5667
[5]	valid_0's l1: 4.9732	valid_0's l2: 65.4238
[6]	valid_0's l1: 4.78875	valid_0's l2: 61.0299
[7]	valid_0's l1: 4.61357	valid_0's l2: 57.0374
[8]	valid_0's l1: 4.44898	valid_0's l2: 53.4139
[9]	valid_0's l1: 4.29482	valid_0's l2: 50.1571
[10]	valid_0's l1: 4.14728	valid_0's l2: 47.1861
[11]	valid_0's l1: 4.00868	valid_0's l2: 44.5018
[12]	valid_0's l1: 3.8794	valid_0's l2: 42.0632
[13]	valid_0's l1: 3.75659	valid_0's l2: 39.8516
[14]	valid_0's l1: 3.64246	valid_0's l2: 37.8479
[15]	valid_0's l1: 3.56251	valid_0's l2: 36.2538
[16]	valid_0's l1: 3.46096	valid_0's l2: 34.5898
[17]	valid_0's l1: 3.39085	valid_0's l2: 33.2841
[18]	valid_0's l1: 3.298	valid_0's l2: 31.8593
[19]	valid_0's l1: 3.23886	valid_0's l2: 30.7757
[20]	valid_0's l1: 3.1

<lightgbm.basic.Booster at 0x7f227eeac490>

In [3]:
# Load the raw test data. low_memory=False asks pandas to process the file in
# one pass rather than in chunks when inferring column types.
df_test = pd.read_csv('data/test.csv', low_memory = False)
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.98585,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.76449,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2


In [None]:
df_test = preprocess_test(df_test)

In [None]:
# Persist the preprocessed test frame. `key` is passed by keyword because
# recent pandas versions deprecate positional arguments after the path.
df_test.to_hdf(r'data/test_preprocessed_v2.h5', key='data')

In [None]:
df_test.head()

In [25]:
# Set the row identifiers aside for the submission file. pop() returns the
# 'key' column and removes it from df_test in place, leaving only features.
keys = df_test.pop('key')
df_test.head()

NameError: name 'df_test' is not defined

In [None]:
df_test.head()


In [None]:
# Reload the booster from 'model.txt', the file written by gbm.save_model()
# in the training cell.
gbm = lgb.Booster(model_file='model.txt')

In [None]:
# Predict fares for the test features. df_test must no longer contain the
# 'key' column here; it is removed in the cell that extracts `keys`.
pred_fares = gbm.predict(df_test, num_iteration=gbm.best_iteration)

In [None]:
# Assemble the submission table: each test key next to its predicted fare.
df_final = pd.DataFrame({'key':keys, 'fare_amount':pred_fares})

In [None]:
df_final.head()

In [None]:
df_final.shape

In [None]:
# Write the submission file without the DataFrame index.
# NOTE(review): the file name says 'lin_reg' although the predictions come
# from the LightGBM booster loaded above — confirm the intended name.
df_final.to_csv(r'predictions/pred_lin_reg_v2.csv', index = False)