In [1]:
import datetime
import pandas as pd
import numpy as np


In [2]:
# Load the raw test set. low_memory=False makes pandas process the file in one
# pass when inferring column dtypes instead of chunk by chunk.
df_test = pd.read_csv("../data/test.csv", low_memory = False)
# Preview the first rows of the raw frame.
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.98585,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.76449,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2


In [3]:
# Remove the passenger_count column; it is not used as a feature below.
df_test = df_test.drop(columns = ['passenger_count'])

In [4]:
def hours_after_midnight(date_time):
    """
    Extract the hour of day from a timestamp string.

    The string is expected to consist of three space-separated tokens
    (date, time, timezone), e.g. ``"2009-01-01 00:01:04 UTC"``.

    Returns the hour as an int, or ``np.nan`` when the string does not have
    exactly three space-separated tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan
    # The hour is the first ':'-separated field of the time token.
    hour_text = tokens[1].split(':')[0]
    return int(hour_text)

def day_of_week(date_time):
    """
    Extract the day of the week from a timestamp string.

    The string is expected to consist of three space-separated tokens
    (date, time, timezone), with the date written as ``YYYY-MM-DD``.

    Returns an int from 1 (Monday) to 7 (Sunday), or ``np.nan`` when the
    string does not have exactly three space-separated tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan
    fields = tokens[0].split('-')
    calendar_day = datetime.date(int(fields[0]), int(fields[1]), int(fields[2]))
    # isoweekday() numbers Monday as 1 through Sunday as 7.
    return calendar_day.isoweekday()

def month(date_time):
    """
    Extract the month number from a timestamp string.

    The string is expected to consist of three space-separated tokens
    (date, time, timezone), with the date written as ``YYYY-MM-DD``.

    Returns the month as an int, or ``np.nan`` when the string does not have
    exactly three space-separated tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan
    date_fields = tokens[0].split('-')
    return int(date_fields[1])

def year(date_time):
    """
    Extract the year from a timestamp string.

    The string is expected to consist of three space-separated tokens
    (date, time, timezone), with the date written as ``YYYY-MM-DD``.

    Returns the year as a float (e.g. ``2009.0``), or ``np.nan`` when the
    string does not have exactly three space-separated tokens.
    """
    tokens = date_time.split(' ')
    if len(tokens) != 3:
        return np.nan
    # Everything before the first '-' of the date token is the year.
    year_text, _, _ = tokens[0].partition('-')
    return float(year_text)

# Element-wise wrappers so the string parsers above can be applied to a whole
# array of timestamps.
# NOTE(review): no `otypes` is given, so np.vectorize derives the output dtype
# from the result for the first element. hours_after_midnight, day_of_week and
# month return an int normally but np.nan for malformed input; a malformed
# timestamp after a well-formed first one may therefore fail to convert.
# Confirm the input never contains malformed timestamps.
hour_of_day_vec = np.vectorize(hours_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

In [6]:
# Keep the raw timestamp strings for feature extraction in a later cell.
time_column = df_test['pickup_datetime'].to_numpy()
# Keep the row identifiers; they are needed again when the submission is built.
keys = df_test['key']
# Leave only the coordinate columns in the frame (modifies df_test in place).
df_test.drop(columns = ['pickup_datetime', 'key'], inplace = True)

In [7]:
# Preview the frame after dropping the timestamp and key columns.
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,-73.972484,40.742743,-73.918937,40.764496
1,-73.98585,40.722826,-73.986301,40.739347
2,-73.988917,40.740142,-73.982769,40.777291
3,-73.977163,40.76449,-73.914474,40.771575
4,-73.948849,40.778003,-73.977678,40.748692


In [8]:
# Derive calendar features from the pickup timestamp strings.
# year is a float column; month, weekday (1 = Monday) and hour are ints.
df_test['year'] = year_vec(time_column)
df_test['month'] = month_vec(time_column)
df_test['weekday'] = day_of_week_vec(time_column)
df_test['hour'] = hour_of_day_vec(time_column)

In [9]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometers between two points.

    Arguments are given in decimal degrees, longitude before latitude for
    each point. Scalars and equal-length array-likes are both accepted; the
    result has the broadcast shape of the inputs.
    """
    earth_radius_km = 6371  # use 3956 instead for miles

    lon1_rad = np.radians(lon1)
    lat1_rad = np.radians(lat1)
    lon2_rad = np.radians(lon2)
    lat2_rad = np.radians(lat2)

    half_dlat = (lat2_rad - lat1_rad) / 2.0
    half_dlon = (lon2_rad - lon1_rad) / 2.0

    # Haversine term: squared half-chord length between the two points.
    a = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2

    central_angle = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * central_angle

def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Haversine distance in kilometers between pickup and dropoff points.

    Arguments are given in decimal degrees, latitude before longitude for
    each point. Scalars and array-likes may be mixed; the result has the
    broadcast shape of the inputs.
    """
    # Earth radius (km)
    R_earth = 6371

    # Work in radians from here on.
    lat_a = np.radians(pickup_lat)
    lon_a = np.radians(pickup_lon)
    lat_b = np.radians(dropoff_lat)
    lon_b = np.radians(dropoff_lon)

    half_dlat = (lat_b - lat_a) / 2.0
    half_dlon = (lon_b - lon_a) / 2.0

    # Haversine term for the two points.
    a = np.sin(half_dlat)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(half_dlon)**2

    return 2 * R_earth * np.arcsin(np.sqrt(a))

In [10]:
# Trip distance feature (pickup to dropoff).
# NOTE(review): haversine_np is declared as (lon1, lat1, lon2, lat2), but this
# call passes latitude before longitude for both points, so the result is not
# the true great-circle distance. Left unchanged because the pre-trained model
# loaded later may have been fitted on a feature computed the same way --
# confirm against the training pipeline before correcting the argument order.
df_test['distance'] = haversine_np(df_test['pickup_latitude'], df_test['pickup_longitude'], df_test['dropoff_latitude'] , df_test['dropoff_longitude'])

In [11]:
# Preview the frame with the calendar and distance features added.
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance
0,-73.972484,40.742743,-73.918937,40.764496,2009.0,1,4,0,5.991612
1,-73.98585,40.722826,-73.986301,40.739347,2009.0,1,4,0,0.509264
2,-73.988917,40.740142,-73.982769,40.777291,2009.0,1,4,0,1.328903
3,-73.977163,40.76449,-73.914474,40.771575,2009.0,1,4,0,6.974103
4,-73.948849,40.778003,-73.977678,40.748692,2009.0,1,4,0,3.329684


In [12]:
def add_airport_dist(dataset):
    """
    Add, per airport, the minimum of the pickup and dropoff distances to it.

    For each airport the distance from the pickup point to the airport and
    from the airport to the dropoff point are computed with ``sphere_dist``;
    the smaller of the two is stored per row.

    Columns written (``dataset`` is modified in place and also returned):
        jfk_dist: John F. Kennedy International Airport
        ewr_dist: Newark Liberty International Airport
        lga_dist: LaGuardia Airport
    """
    # (latitude, longitude) of each airport, keyed by its output column.
    airports = {
        'jfk_dist': (40.639722, -73.778889),
        'ewr_dist': (40.6925, -74.168611),
        'lga_dist': (40.77725, -73.872611),
    }

    start_lat, start_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    end_lat, end_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    for column, (airport_lat, airport_lon) in airports.items():
        from_pickup = sphere_dist(start_lat, start_lon, airport_lat, airport_lon)
        to_dropoff = sphere_dist(airport_lat, airport_lon, end_lat, end_lon)
        # Row-wise minimum of the two candidate distances.
        dataset[column] = pd.concat([from_pickup, to_dropoff], axis=1).min(axis=1)

    return dataset

In [13]:
# Add the jfk_dist / ewr_dist / lga_dist columns, then preview the result.
df_test = add_airport_dist(df_test)
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist
0,-73.972484,40.742743,-73.918937,40.764496,2009.0,1,4,0,5.991612,18.217311,17.447879,4.150934
1,-73.98585,40.722826,-73.986301,40.739347,2009.0,1,4,0,0.509264,19.747365,15.769852,10.46222
2,-73.988917,40.740142,-73.982769,40.777291,2009.0,1,4,0,1.328903,20.934736,16.044417,9.275615
3,-73.977163,40.76449,-73.914474,40.771575,2009.0,1,4,0,6.974103,18.589692,18.009149,3.581169
4,-73.948849,40.778003,-73.977678,40.748692,2009.0,1,4,0,3.329684,20.680915,17.261364,6.419967


In [14]:
def is_valid(p_lat, p_long, d_lat, d_long):
    """
    Flag a trip whose pickup or dropoff lies outside the bounding box.

    Takes scalar pickup/dropoff latitude and longitude. Returns 0 when both
    points are inside the box (edges included) and 1 otherwise, so despite
    the name a return value of 1 means "invalid".
    """
    min_lon, max_lon, min_lat, max_lat = -74.5, -72.8, 40.5, 41.8

    pickup_inside = (min_lon <= p_long <= max_lon) and (min_lat <= p_lat <= max_lat)
    dropoff_inside = (min_lon <= d_long <= max_lon) and (min_lat <= d_lat <= max_lat)

    return 0 if (pickup_inside and dropoff_inside) else 1

# Apply is_valid row by row. The resulting 'invalid' column is 0 for trips
# fully inside the bounding box and 1 for trips with a point outside it.
valid_vec = np.vectorize(is_valid)
df_test['invalid'] = valid_vec(df_test['pickup_latitude'], df_test['pickup_longitude'], df_test['dropoff_latitude'], df_test['dropoff_longitude'])
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,-73.972484,40.742743,-73.918937,40.764496,2009.0,1,4,0,5.991612,18.217311,17.447879,4.150934,0
1,-73.98585,40.722826,-73.986301,40.739347,2009.0,1,4,0,0.509264,19.747365,15.769852,10.46222,0
2,-73.988917,40.740142,-73.982769,40.777291,2009.0,1,4,0,1.328903,20.934736,16.044417,9.275615,0
3,-73.977163,40.76449,-73.914474,40.771575,2009.0,1,4,0,6.974103,18.589692,18.009149,3.581169,0
4,-73.948849,40.778003,-73.977678,40.748692,2009.0,1,4,0,3.329684,20.680915,17.261364,6.419967,0


In [15]:
import matplotlib.pyplot as plt
# Bounding box used by the helpers below, ordered as
# (min longitude, max longitude, min latitude, max latitude).
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    """
    Build a boolean row mask of trips that lie entirely inside ``bounds``.

    ``bounds`` is indexed as (min longitude, max longitude, min latitude,
    max latitude). A row is selected only when both its pickup and its
    dropoff coordinates fall inside the box, edges included.
    """
    west, east = bounds[0], bounds[1]
    south, north = bounds[2], bounds[3]

    pickup_inside = df['pickup_longitude'].between(west, east) & \
        df['pickup_latitude'].between(south, north)
    dropoff_inside = df['dropoff_longitude'].between(west, east) & \
        df['dropoff_latitude'].between(south, north)

    return pickup_inside & dropoff_inside

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    """
    Convert coordinates into integer pixel positions on a mask image.

    ``points_x`` / ``points_y`` are the image width and height in pixels and
    ``bounds`` is (min longitude, max longitude, min latitude, max latitude).
    Longitude grows with x; latitude is flipped so that the maximum latitude
    maps to y = 0. Returns the pair ``(x, y)``, each truncated to int.
    """
    west, east, south, north = bounds[0], bounds[1], bounds[2], bounds[3]

    scaled_lon = points_x * (longitude - west)
    x = (scaled_lon / (east - west)).astype('int')

    scaled_lat = points_y * (latitude - south)
    y = (points_y - scaled_lat / (north - south)).astype('int')

    return x,y

def remove_points_in_water(df):
    """
    Drop trips that start or end outside the bounding box or in water.

    Rows are first restricted to ``nyc_bounds``; each remaining pickup and
    dropoff point is then looked up in the land/water mask image, and only
    rows where both points fall on land are kept. The two row counts are
    printed along the way.

    Returns the filtered frame; the input frame itself is not modified.
    """
    # Land/water lookup image: True marks land, False marks water.
    nyc_mask = plt.imread('../img/nyc_water_mask.png')[:,:,0] > 0.9
    n_rows, n_cols = nyc_mask.shape

    # Remove points outside the bounding box
    df = df[select_within_bounds(df, nyc_bounds)]
    print("After Bounds:", df.shape[0])

    # Map the latitudes and longitudes to pixel positions in the mask
    pickup_x, pickup_y = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude,
                                         n_cols, n_rows, nyc_bounds)
    dropoff_x, dropoff_y = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude,
                                           n_cols, n_rows, nyc_bounds)

    # A point exactly on the maximum-longitude or minimum-latitude edge maps
    # one pixel past the image; fold it back onto the last row/column. The
    # limits come from the loaded mask rather than fixed pixel counts, so the
    # clamp stays correct for a mask of any size.
    pickup_x = pickup_x.clip(upper=n_cols - 1).to_numpy()
    pickup_y = pickup_y.clip(upper=n_rows - 1).to_numpy()
    dropoff_x = dropoff_x.clip(upper=n_cols - 1).to_numpy()
    dropoff_y = dropoff_y.clip(upper=n_rows - 1).to_numpy()

    # True where both the pickup and the dropoff location are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    df = df[indices]
    print("Number of trips in water: ", np.sum(~indices))
    return df

In [16]:
def make_invalid_water(invalid_col):
    """
    Re-code one 'invalid' flag for a row that was filtered out.

    A flag of 1 is kept as 1; any other value becomes 2.
    """
    return 1 if invalid_col == 1 else 2
# Element-wise version for whole columns.
inv_vec = np.vectorize(make_invalid_water)

def get_water_invalid(df):
    """
    Re-code the 'invalid' flag of rows rejected by remove_points_in_water.

    Rows that ``remove_points_in_water`` keeps retain their 'invalid' value.
    Rows it rejects keep a flag of 1 and get 2 for any other value.

    Rows are returned in the same order as the input, and every input row is
    returned exactly once, so values held outside the frame (such as row
    keys) stay positionally aligned and rows with identical contents are not
    lost. The input frame is not modified. As before, the previous index is
    moved into an 'index' column and replaced by a fresh 0..n-1 index.
    """
    # Tag every row with its position so the surviving rows can be identified
    # regardless of the frame's index or of duplicated row contents.
    position_col = '__row_pos__'
    tagged = df.assign(**{position_col: np.arange(len(df))})
    kept = remove_points_in_water(tagged)

    on_land = np.zeros(len(df), dtype=bool)
    on_land[kept[position_col].to_numpy()] = True

    result = df.copy()
    current = result['invalid'].to_numpy()
    # Rejected rows: keep a flag of 1, turn anything else into 2.
    recoded = np.where(current == 1, 1, 2)
    result['invalid'] = np.where(on_land, current, recoded)

    result.reset_index(inplace = True)
    return result

In [17]:
# Carry the row keys through get_water_invalid so each key stays attached to
# its own row while the frame is rebuilt.
df_test['key'] = keys
df_test = get_water_invalid(df_test)
# Take the keys back out in the row order of the returned frame. Predictions
# are later paired with `keys` by position, so `keys` must follow the frame's
# current order rather than the order of the original CSV.
keys = df_test.pop('key')
df_test.head()

After Bounds: 10849338
Number of trips in water:  2311
Concatenated dataframes
dropped duplicates


Unnamed: 0,index,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,0,-73.972484,40.742743,-73.918937,40.764496,2009.0,1,4,0,5.991612,18.217311,17.447879,4.150934,0
1,1,-73.98585,40.722826,-73.986301,40.739347,2009.0,1,4,0,0.509264,19.747365,15.769852,10.46222,0
2,2,-73.988917,40.740142,-73.982769,40.777291,2009.0,1,4,0,1.328903,20.934736,16.044417,9.275615,0
3,3,-73.977163,40.76449,-73.914474,40.771575,2009.0,1,4,0,6.974103,18.589692,18.009149,3.581169,0
4,4,-73.948849,40.778003,-73.977678,40.748692,2009.0,1,4,0,3.329684,20.680915,17.261364,6.419967,0


In [18]:
# Check the frame's row and column counts after get_water_invalid.
df_test.shape

(11084772, 14)

In [19]:
# Remove the 'index' column that reset_index added inside get_water_invalid;
# it is bookkeeping, not a model feature.
df_test.drop(columns = ['index'], inplace = True)
df_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,month,weekday,hour,distance,jfk_dist,ewr_dist,lga_dist,invalid
0,-73.972484,40.742743,-73.918937,40.764496,2009.0,1,4,0,5.991612,18.217311,17.447879,4.150934,0
1,-73.98585,40.722826,-73.986301,40.739347,2009.0,1,4,0,0.509264,19.747365,15.769852,10.46222,0
2,-73.988917,40.740142,-73.982769,40.777291,2009.0,1,4,0,1.328903,20.934736,16.044417,9.275615,0
3,-73.977163,40.76449,-73.914474,40.771575,2009.0,1,4,0,6.974103,18.589692,18.009149,3.581169,0
4,-73.948849,40.778003,-73.977678,40.748692,2009.0,1,4,0,3.329684,20.680915,17.261364,6.419967,0


In [20]:
# Persist the engineered test features. The HDF key is passed by keyword,
# because newer pandas releases deprecate passing it positionally.
df_test.to_hdf('../data/test_airport.h5', key = 'data')

In [21]:
import joblib

In [22]:
# Load the previously trained baseline model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
model = joblib.load('../models/xgboost/baseline_model.dat')



In [26]:
import xgboost as xgb
# Wrap the feature frame in XGBoost's DMatrix container for prediction.
# NOTE(review): presumably the columns must match those the model was trained
# on, in name and order -- confirm against the training notebook.
dtest = xgb.DMatrix(df_test)

In [27]:
# Predict using only the trees up to the best early-stopping iteration.
# NOTE(review): newer XGBoost releases replace `ntree_limit` with
# `iteration_range`; confirm against the installed version before upgrading.
ypred = model.predict(dtest, ntree_limit=model.best_ntree_limit)

In [28]:
# Check that there is one prediction per row.
ypred.shape

(11084772,)

In [29]:
# Build the submission frame: one (key, predicted fare) pair per row.
# NOTE(review): `ypred` is a plain array, so it is matched to `keys` purely by
# position. This is only correct if `keys` is in the same row order as the
# frame the predictions were computed from -- confirm, since that frame is
# rebuilt by get_water_invalid after `keys` is first captured.
df_final = pd.DataFrame({'key':keys, 'fare_amount':ypred})
df_final.shape

(11084772, 2)

In [30]:
# Write the submission file without the frame's index column.
df_final.to_csv(r'../predictions/xgboost.csv', index = False)

In [31]:
from sklearn.model_selection import train_test_split
# Load at most the first 10,000,000 training rows. `nrows` is a read_csv
# option and not a read_hdf parameter; read_hdf limits rows with `start` and
# `stop`, so the cap is expressed as `stop` here.
df_train = pd.read_hdf('../data/train_airport.h5', stop = 10000000)
# Separate the target from the feature columns.
y = df_train['fare_amount']
train = df_train.drop(columns=['fare_amount'])

# Hold out 1% of the rows for early stopping; fixed seed for reproducibility.
x_train,x_test,y_train,y_test = train_test_split(train,y,random_state=0,test_size=0.01)

MemoryError: 

In [None]:
# Preview the loaded training frame.
df_train.head()

In [None]:
import xgboost as xgb
def XGBmodel(x_train,x_test,y_train,y_test,params):
    """
    Train an XGBoost booster with early stopping on a held-out set.

    ``x_train`` / ``y_train`` are the training features and labels;
    ``x_test`` / ``y_test`` form the evaluation set (reported as 'test')
    that drives early stopping. ``params`` is passed through to
    ``xgb.train`` unchanged.

    Training runs for at most 5000 boosting rounds and stops once the
    evaluation metric has not improved for 10 consecutive rounds.
    Returns the object produced by ``xgb.train``.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_test, label=y_test)
    watchlist = [(dvalid, 'test')]

    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        early_stopping_rounds=10,
        evals=watchlist,
    )
    return booster

# Train a new model; this rebinds `model`, replacing the one loaded earlier.
# NOTE(review): `params` is not defined anywhere in the visible notebook, so
# on a fresh kernel this raises NameError unless it is defined elsewhere --
# confirm and define the parameter dict before this call.
model = XGBmodel(x_train,x_test,y_train,y_test,params)