In [1]:
import datetime
import pandas as pd
import numpy as np


In [15]:
df_train = pd.read_hdf("../data/train_airport.h5", low_memory = False)
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,21.428526,17.650658,7.667005,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,21.386847,17.914444,8.534492,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,20.418768,13.973941,7.6752,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,20.504349,15.008048,8.904018,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,20.529747,17.160455,8.765927,0


In [4]:
df_train = df_train.drop(columns = ['passenger_count'])

In [5]:
df_train.isna().sum()

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude    307
dropoff_latitude     307
dtype: int64

In [6]:
df_train.dropna()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,2014-09-27 15:26:00.00000043,7.5,2014-09-27 15:26:00 UTC,-73.978463,40.759071999999996,-73.963577,40.774255
1,2012-05-07 07:57:00.000000142,6.5,2012-05-07 07:57:00 UTC,-73.98239699999999,40.770245,-73.97245,40.764062
2,2009-05-05 20:02:36.0000005,12.9,2009-05-05 20:02:36 UTC,-74.006507,40.718808,-73.961096,40.760711
3,2010-07-15 22:57:00.00000042,20.1,2010-07-15 22:57:00 UTC,-73.99924200000001,40.734165000000004,-73.972597,40.751228000000005
4,2010-08-31 13:15:00.00000028,4.1,2010-08-31 13:15:00 UTC,-73.97775,40.746274,-73.97276,40.755767999999996
...,...,...,...,...,...,...,...
44339090,2013-03-08 20:39:00.000000143,12.0,2013-03-08 20:39:00 UTC,-73.98270699999999,40.761759999999995,-73.98209200000001,40.731111999999996
44339091,2011-06-28 18:16:00.000000160,34.9,2011-06-28 18:16:00 UTC,-73.874517,40.774058000000004,-73.97907,40.764117999999996
44339092,2011-10-18 11:28:25.0000002,8.1,2011-10-18 11:28:25 UTC,-73.98898100000001,40.748278000000006,-73.98824300000001,40.754293
44339093,2010-01-07 08:13:42.0000001,7.3,2010-01-07 08:13:42 UTC,-73.983521,40.738239,-73.999622,40.754099


In [8]:
df_train = df_train[df_train.fare_amount != 'fare_amount']

In [10]:
df_train['pickup_latitude'] = pd.to_numeric(df_train['pickup_latitude'])
df_train['pickup_longitude'] = pd.to_numeric(df_train['pickup_longitude'])
df_train['dropoff_latitude'] = pd.to_numeric(df_train['dropoff_latitude'])
df_train['dropoff_longitude'] = pd.to_numeric(df_train['dropoff_longitude'])
df_train['fare_amount'] = pd.to_numeric(df_train['fare_amount'])

In [11]:
def hours_after_midnight(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        time_arr = time[1].split(':')
        hours_after_midnight = int(time_arr[0]) 
        return hours_after_midnight
    else:
        return np.nan

def day_of_week(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        date_arr = time[0].split('-')
        date = datetime.date(int(date_arr[0]), int(date_arr[1]), int(date_arr[2]))
        return (date.weekday() + 1)
    else:
        return np.nan

def month(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return int(time[0].split('-')[1])
    else:
        return np.nan

def year(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return float(time[0].split('-')[0])
    else:
        return np.nan

hour_of_day_vec = np.vectorize(hours_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

In [12]:
time_column = df_train['pickup_datetime'].to_numpy()
df_train.drop(columns = ['pickup_datetime', 'key'], inplace = True)

In [13]:
df_train['year'] = year_vec(time_column)
df_train['month'] = month_vec(time_column)
df_train['weekday'] = day_of_week_vec(time_column)
df_train['hour'] = hour_of_day_vec(time_column)

In [16]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c  # 6371 is Radius of earth in kilometers. Use 3956 for miles
    return km

def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return distance along great radius between pickup and dropoff coordinates.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon
    
    #Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    
    return 2 * R_earth * np.arcsin(np.sqrt(a))


In [15]:
df_train['distance'] = haversine_np(df_train['pickup_latitude'], df_train['pickup_longitude'], df_train['dropoff_latitude'] , df_train['dropoff_longitude'])

In [17]:
def add_airport_dist(dataset):
    """
    Return minumum distance from pickup or dropoff coordinates to each airport.
    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    """
    jfk_coord = (40.639722, -73.778889)
    ewr_coord = (40.6925, -74.168611)
    lga_coord = (40.77725, -73.872611)
    
    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']
    
    pickup_jfk = sphere_dist(pickup_lat, pickup_lon, jfk_coord[0], jfk_coord[1]) 
    dropoff_jfk = sphere_dist(jfk_coord[0], jfk_coord[1], dropoff_lat, dropoff_lon) 
    pickup_ewr = sphere_dist(pickup_lat, pickup_lon, ewr_coord[0], ewr_coord[1])
    dropoff_ewr = sphere_dist(ewr_coord[0], ewr_coord[1], dropoff_lat, dropoff_lon) 
    pickup_lga = sphere_dist(pickup_lat, pickup_lon, lga_coord[0], lga_coord[1]) 
    dropoff_lga = sphere_dist(lga_coord[0], lga_coord[1], dropoff_lat, dropoff_lon) 
    
    dataset['jfk_dist'] = pd.concat([pickup_jfk, dropoff_jfk], axis=1).min(axis=1)
    dataset['ewr_dist'] = pd.concat([pickup_ewr, dropoff_ewr], axis=1).min(axis=1)
    dataset['lga_dist'] = pd.concat([pickup_lga, dropoff_lga], axis=1).min(axis=1)
    
    return dataset

In [19]:
df_train = add_airport_dist(df_train)
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,21.428526,17.650658,7.667005
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,21.386847,17.914444,8.534492
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,20.418768,13.973941,7.6752
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,20.504349,15.008048,8.904018
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,20.529747,17.160455,8.765927


In [24]:
params = {
    # Parameters that we are going to tune.
    'max_depth': 8, #Result of tuning with CV
    'eta':.03, #Result of tuning with CV
    'subsample': 1, #Result of tuning with CV
    'colsample_bytree': 0.8, #Result of tuning with CV
    # Other parameters
    'objective':'reg:linear',
    'eval_metric':'rmse',
    'silent': 0
}


In [27]:
def is_valid(p_lat, p_long, d_lat, d_long):
    bounds = (-74.5, -72.8, 40.5, 41.8)
    if ((p_long >= bounds[0]) & (p_long <= bounds[1]) & (p_lat >= bounds[2]) & (p_lat <= bounds[3])):
        if (d_long >= bounds[0]) & (d_long <= bounds[1]) & (d_lat >= bounds[2]) & (d_lat <= bounds[3]):
            return 0
    return 1

valid_vec = np.vectorize(is_valid)
df_train['invalid'] = valid_vec(df_train['pickup_latitude'], df_train['pickup_longitude'], df_train['dropoff_latitude'], df_train['dropoff_longitude'])
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,21.428526,17.650658,7.667005,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,21.386847,17.914444,8.534492,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,20.418768,13.973941,7.6752,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,20.504349,15.008048,8.904018,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,20.529747,17.160455,8.765927,0


In [28]:
import matplotlib.pyplot as plt
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    pickup_indices = (df.pickup_longitude >= bounds[0]) & (df.pickup_longitude <= bounds[1]) & \
        (df.pickup_latitude >= bounds[2]) & (df.pickup_latitude <= bounds[3])

    dropoff_indices = (df.dropoff_longitude >= bounds[0]) & (df.dropoff_longitude <= bounds[1]) & \
        (df.dropoff_latitude >= bounds[2]) & (df.dropoff_latitude <= bounds[3])

    return pickup_indices & dropoff_indices

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    x = (points_x * (longitude - bounds[0]) / (bounds[1] - bounds[0])).astype('int')
    y = (points_y - points_y * (latitude - bounds[2]) / (bounds[3] - bounds[2])).astype('int')
    return x,y

def remove_points_in_water(df):
    # Create a mask of the New York City with 1 as land and 0 as water
    nyc_mask = plt.imread('../img/nyc_water_mask.png')[:,:,0] > 0.9

    # Remove points outside New York
    df = df[select_within_bounds(df, nyc_bounds)]
    print("After Bounds:", df.shape[0])

    # Map the latitudes and longitudes to the points in the map
    pickup_x, pickup_y = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude, nyc_mask.shape[1],
                                         nyc_mask.shape[0], nyc_bounds)
    dropoff_x, dropoff_y = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude, nyc_mask.shape[1],
                                        nyc_mask.shape[0], nyc_bounds)
    
    pickup_y[pickup_y == 1262] = 1261
    dropoff_y[dropoff_y == 1262] = 1261
    pickup_x[pickup_x == 1242] = 1241
    dropoff_x[dropoff_x == 1242] = 1241

    # Compute the indices where pickup and dropoff locations are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    df = df[indices]
    print("Number of trips in water: ", np.sum(~indices))
    return df

In [29]:
def make_invalid_water(invalid_col):
    if (invalid_col == 1):
        return 1
    else:
        return 2
inv_vec = np.vectorize(make_invalid_water)

def get_water_invalid(df):
    df2 = remove_points_in_water(df)
    df_diff = pd.concat([df, df2])
    print("Concatenated dataframes")
    df_diff = df_diff.drop_duplicates(keep=False)
    print("dropped duplicates")
    df_diff['invalid'] = inv_vec(df_diff.invalid)
    df = pd.concat([df2, df_diff])
    df.reset_index(inplace = True)
    return df

In [30]:
df_train = get_water_invalid(df_train)

After Bounds: 43399655
Number of trips in water:  8977
Concatenated dataframes
dropped duplicates


In [16]:
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,21.428526,17.650658,7.667005,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,21.386847,17.914444,8.534492,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,20.418768,13.973941,7.6752,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,20.504349,15.008048,8.904018,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,20.529747,17.160455,8.765927,0


In [17]:
df_train.drop(columns = ['index'], inplace = True)

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,invalid
0,7.5,-73.978463,40.759072,-73.963577,40.774255,2014.0,9,6,15,1.71964,0
1,6.5,-73.982397,40.770245,-73.97245,40.764062,2012.0,5,1,7,1.122217,0
2,12.9,-74.006507,40.718808,-73.961096,40.760711,2009.0,5,2,20,5.210553,0
3,20.1,-73.999242,40.734165,-73.972597,40.751228,2010.0,7,4,22,3.008668,0
4,4.1,-73.97775,40.746274,-73.97276,40.755768,2010.0,8,2,13,0.626738,0


In [6]:
from sklearn.model_selection import train_test_split
df_train = df_train.sample(frac = 0.2)
y = df_train['fare_amount']
train = df_train.drop(columns=['fare_amount'])

x_train,x_test,y_train,y_test = train_test_split(train,y,random_state=0,test_size=0.01)


In [34]:
df_train.to_hdf('../data/train_airport.h5', 'data')

In [7]:
import xgboost as xgb
def XGBmodel(x_train,x_test,y_train,y_test,params):
    matrix_train = xgb.DMatrix(x_train,label=y_train)
    matrix_test = xgb.DMatrix(x_test,label=y_test)
    model=xgb.train(params=params,
                    dtrain=matrix_train,num_boost_round=5000, 
                    early_stopping_rounds=10,evals=[(matrix_test,'test')])
    return model

model = XGBmodel(x_train,x_test,y_train,y_test,params)

  if getattr(data, 'base', None) is not None and \


XGBoostError: value -1 for Parameter verbosity exceed bound [0,3]

In [None]:
import joblib
joblib.dump(model, '../models/xgboost/baseline_model_v2.dat')

In [3]:
import lightgbm as lgb

params = {
        'learning_rate': 0.65,
        'application': 'regression',
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [11]:
df_train.drop(columns = ['jfk_dist', 'ewr_dist', 'lga_dist'], inplace = True)
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,invalid
8240575,7.0,-74.006715,40.735585,-73.989394,40.757752,2014.0,6,1,1,2.042353,0
8773929,4.9,-74.003285,40.738753,-74.005798,40.727232,2011.0,7,4,8,0.450225,0
42683421,4.1,-73.963983,40.792588,-73.97009,40.784243,2010.0,2,2,13,0.725819,0
31155599,28.0,-73.976958,40.751652,-73.99999,40.673822,2012.0,11,1,21,3.50104,0
29098943,9.5,-74.006027,40.751102,-73.97999,40.740022,2013.0,10,5,21,2.915048,0


In [13]:
from sklearn.model_selection import train_test_split
for i in range(10):
    X = df_train.sample(frac = 1, replace = True)
    y = X['fare_amount']
    X = X.drop(columns = ['fare_amount'])
    print ("Iteration Number = "+ str(i))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_leaf_nodes = 31,
                    num_boost_round=300,
                    early_stopping_rounds=10)
    gbm.save_model('bootstrap_model_new_reduced_features_v'+ str(i) + '.txt')

Iteration Number = 0


TypeError: train() got an unexpected keyword argument 'num_leaf_nodes'

In [20]:
df_train.shape

(43741782, 10)

In [21]:
# For random forest
from sklearn.ensemble import RandomForestRegressor
import pickle
rf = RandomForestRegressor()
y = df_train['fare_amount']
df_train = df_train.drop(columns = ['fare_amount'])
rf.fit(df_train, y)
pickle.dump(rf, open('../models/random_forest.dat', 'wb'))
    

KeyError: 'fare_amount'

In [None]:
df_train = pd.read_hdf('../data/train_airport.h5', nrows = 10000000)
y = df_train['fare_amount']
train = df_train.drop(columns=['fare_amount'])

x_train,x_test,y_train,y_test = train_test_split(train,y,random_state=0,test_size=0.01)

In [None]:
model = XGBmodel(x_train,x_test,y_train,y_test,params)

In [None]:
joblib.dump(model, '../models/xgboost/baseline_model_v2.dat')