In [1]:
import datetime
import pandas as pd
import numpy as np

def minutes_after_midnight(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        time_arr = time[1].split(':')
        minutes_after_midnight = 60 * int(time_arr[0]) + int(time_arr[1])
        return 2 * np.pi * minutes_after_midnight / 1440
    else:
        return np.nan

def day_of_week(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        date_arr = time[0].split('-')
        date = datetime.date(int(date_arr[0]), int(date_arr[1]), int(date_arr[2]))
        return 2 * np.pi * (date.weekday() + 1) / 7
    else:
        return np.nan

def month(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return 2 * np.pi * int(time[0].split('-')[1]) / 12
    else:
        return np.nan

def year(date_time):
    time = date_time.split(' ')
    if len(time) == 3:
        return float(time[0].split('-')[0])
    else:
        return np.nan

time_of_day_vec = np.vectorize(minutes_after_midnight)
day_of_week_vec = np.vectorize(day_of_week)
month_vec = np.vectorize(month)
year_vec = np.vectorize(year)

def preprocess(df):
    # Drop all null values
    df = df.dropna()

    # Cyclise time and remove key column
    time_column = df['pickup_datetime'].to_numpy()
    df = df.drop(columns=['pickup_datetime', 'key'])

    time_of_day = time_of_day_vec(time_column)
    day_of_week = day_of_week_vec(time_column)
    month = month_vec(time_column)
    year = year_vec(time_column)

    df['sin_time_of_day'] = np.sin(time_of_day)
    df['cos_time_of_day'] = np.cos(time_of_day)
    df['sin_day_of_week'] = np.sin(day_of_week)
    df['cos_day_of_week'] = np.cos(day_of_week)
    df['sin_month'] = np.sin(month)
    df['cos_month'] = np.cos(month)
    df['year'] = year
    return df

In [2]:
df_train = pd.read_csv("data/train.csv", low_memory = False)
df_train.head()


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-10-20 23:26:26.0000003,4.5,2010-10-20 23:26:26 UTC,-73.98691,40.739538,-73.991381,40.745614,2
1,2009-12-30 10:56:00.00000075,4.1,2009-12-30 10:56:00 UTC,-73.961572,40.760283,-73.957438,40.769387,5
2,2012-07-20 11:24:00.00000022,6.1,2012-07-20 11:24:00 UTC,-73.979437,40.746517,-73.984195,40.732117,1
3,2011-05-31 11:29:00.000000136,4.9,2011-05-31 11:29:00 UTC,-73.964097,40.792508000000005,-73.976422,40.785767,1
4,2010-05-25 17:57:00.000000145,6.5,2010-05-25 17:57:00 UTC,-74.003943,40.72567,-73.988915,40.74837,1


In [3]:
df_train = preprocess(df_train)

In [4]:
df_train = df_train[df_train.fare_amount != 'fare_amount']
df_train['pickup_longitude'] = pd.to_numeric(df_train['pickup_longitude'])
df_train['dropoff_longitude'] = pd.to_numeric(df_train['dropoff_longitude'])
df_train['pickup_latitude'] = pd.to_numeric(df_train['pickup_latitude'])
df_train['dropoff_latitude'] = pd.to_numeric(df_train['dropoff_latitude'])

In [5]:
df_train['fare_amount'] = pd.to_numeric(df_train['fare_amount'])
df_train['passenger_count'] = pd.to_numeric(df_train['passenger_count'])
df_train = df_train[df_train['passenger_count'] > 0]
df_train = df_train[df_train['fare_amount'] > 0]

In [6]:
df_train.head()


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,year
0,4.5,-73.98691,40.739538,-73.991381,40.745614,2,-0.147809,0.989016,0.433884,-0.900969,-0.8660254,0.5,2010.0
1,4.1,-73.961572,40.760283,-73.957438,40.769387,5,0.275637,-0.961262,0.433884,-0.900969,-2.449294e-16,1.0,2009.0
2,6.1,-73.979437,40.746517,-73.984195,40.732117,1,0.156434,-0.987688,-0.974928,-0.222521,-0.5,-0.866025,2012.0
3,4.9,-73.964097,40.792508,-73.976422,40.785767,1,0.134851,-0.990866,0.974928,-0.222521,0.5,-0.866025,2011.0
4,6.5,-74.003943,40.72567,-73.988915,40.74837,1,-0.999914,-0.01309,0.974928,-0.222521,0.5,-0.866025,2010.0


In [7]:
df_test = pd.read_csv("data/test.csv", low_memory = False)
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.98585,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.76449,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2


In [8]:
df_test = preprocess(df_test)

In [9]:
df_test.head()


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,year
0,-73.972484,40.742743,-73.918937,40.764496,1,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,2009.0
1,-73.98585,40.722826,-73.986301,40.739347,1,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,2009.0
2,-73.988917,40.740142,-73.982769,40.777291,1,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0
3,-73.977163,40.76449,-73.914474,40.771575,1,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0
4,-73.948849,40.778003,-73.977678,40.748692,2,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0


Our next objective is to remove all the invalid latitude and longitude values from the training and test data. So according to what we discussed, the best way to do this is to assume NYC as a box and remove all values not in the box.

In [11]:
df_train.dtypes

fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
sin_time_of_day      float64
cos_time_of_day      float64
sin_day_of_week      float64
cos_day_of_week      float64
sin_month            float64
cos_month            float64
year                 float64
dtype: object

In [10]:
df_train.to_hdf(r'data/train_mod.h5', 'data')

In [11]:
df_train = pd.read_hdf("data/train_mod.h5", low_memory = False)

In [14]:
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,year
0,4.5,-73.98691,40.739538,-73.991381,40.745614,2,-0.147809,0.989016,0.433884,-0.900969,-0.8660254,0.5,2010.0
1,4.1,-73.961572,40.760283,-73.957438,40.769387,5,0.275637,-0.961262,0.433884,-0.900969,-2.449294e-16,1.0,2009.0
2,6.1,-73.979437,40.746517,-73.984195,40.732117,1,0.156434,-0.987688,-0.974928,-0.222521,-0.5,-0.866025,2012.0
3,4.9,-73.964097,40.792508,-73.976422,40.785767,1,0.134851,-0.990866,0.974928,-0.222521,0.5,-0.866025,2011.0
4,6.5,-74.003943,40.72567,-73.988915,40.74837,1,-0.999914,-0.01309,0.974928,-0.222521,0.5,-0.866025,2010.0


In [50]:
# Points in wata are bad..
import matplotlib.pyplot as plt
nyc_bounds = (-74.5, -72.8, 40.5, 41.8)

def select_within_bounds(df, bounds):
    pickup_indices = (df.pickup_longitude >= bounds[0]) & (df.pickup_longitude <= bounds[1]) & \
        (df.pickup_latitude >= bounds[2]) & (df.pickup_latitude <= bounds[3])

    dropoff_indices = (df.dropoff_longitude >= bounds[0]) & (df.dropoff_longitude <= bounds[1]) & \
        (df.dropoff_latitude >= bounds[2]) & (df.dropoff_latitude <= bounds[3])

    return pickup_indices & dropoff_indices

def map_to_nyc_mask(longitude, latitude, points_x, points_y, bounds):
    x = (points_x * (longitude - bounds[0]) / (bounds[1] - bounds[0])).astype('int')
    y = (points_y - points_y * (latitude - bounds[2]) / (bounds[3] - bounds[2])).astype('int')
    return x,y

def remove_points_in_water(df):
    # Create a mask of the New York City with 1 as land and 0 as water
    nyc_mask = plt.imread('img/nyc_map.png')[:,:,0] > 0.9

    df = df[select_within_bounds(df, nyc_bounds)]
    df = df.reset_index()
    longitude_min = [min(df.pickup_longitude), min(df.dropoff_longitude)]
    latitude_min = [min(df.pickup_latitude), min(df.dropoff_latitude)]
    longitude_max = [max(df.pickup_longitude), max(df.dropoff_longitude)]
    latitude_max = [max(df.pickup_latitude), max(df.dropoff_latitude)]
    
    nyc_edited_bounds = (min(longitude_min), max(longitude_max), min(latitude_min), max(latitude_max))
#     # Map the latitudes and longitudes to the points in the map
    pickup_y, pickup_x = map_to_nyc_mask(df.pickup_longitude, df.pickup_latitude, nyc_mask.shape[1], nyc_mask.shape[0], nyc_edited_bounds)
    dropoff_y, dropoff_x = map_to_nyc_mask(df.dropoff_longitude, df.dropoff_latitude, nyc_mask.shape[1], nyc_mask.shape[0], nyc_edited_bounds)


#     min_len = min([len(pickup_x), len(pickup_y), len(dropoff_x), len(dropoff_y)])
#     for i in range(min_len):
#         if (pickup_x[i] < 0 or pickup_x[i] > max_x):
#             pickup_x_del.append(i)
#         if (dropoff_x[i] < 0 or dropoff_x[i] > max_x):
#             dropoff_x_del.append(i)
#         if (pickup_y[i] < 0 or pickup_y[i] > max_y):
#             pickup_y_del.append(i)
#         if (dropoff_y[i] < 0 or dropoff_y[i] > max_y):
#             dropoff_y_del.append(i)

#     pickup_x = np.delete(pickup_x, pickup_x_del)
#     pickup_y = np.delete(pickup_y, pickup_y_del)
#     dropoff_x = np.delete(dropoff_x, dropoff_x_del)
#     dropoff_y = np.delete(dropoff_y, dropoff_y_del)
    
    
    # Compute the indices where pickup and dropoff locations are on land
    indices = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    
    return df[indices]

In [51]:
df_train = remove_points_in_water(df_train)

AttributeError: 'DataFrame' object has no attribute 'pickup_latutude'

In [47]:
df_train.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,sin_time_of_day,cos_time_of_day,sin_day_of_week,cos_day_of_week,sin_month,cos_month,year
0,0,4.5,-73.98691,40.739538,-73.991381,40.745614,2,-0.147809,0.989016,0.433884,-0.900969,-0.8660254,0.5,2010.0
1,1,4.1,-73.961572,40.760283,-73.957438,40.769387,5,0.275637,-0.961262,0.433884,-0.900969,-2.449294e-16,1.0,2009.0
2,28,16.5,-73.981567,40.743969,-73.957866,40.719673,2,-0.544639,0.838671,0.433884,-0.900969,-1.0,-1.83697e-16,2014.0
3,29,7.7,-73.992193,40.739235,-73.983641,40.756168,1,0.267238,-0.96363,-0.433884,-0.900969,-0.5,-0.8660254,2012.0
4,30,6.1,-73.961588,40.76273,-73.975683,40.765477,1,0.450098,-0.892979,0.433884,-0.900969,0.5,-0.8660254,2012.0
