In [2]:
import datetime
import pandas as pd
import numpy as np

def minutes_after_midnight(date_time):
    """Encode the time of day of a timestamp as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields, for example
        ``"2010-10-20 23:26:26 UTC"``. The second field is read as
        ``HH:MM[:SS]``; seconds are ignored.

    Returns
    -------
    float
        ``2 * pi * minutes / 1440`` where ``minutes`` is the number of whole
        minutes since midnight. ``np.nan`` is returned when ``date_time`` is
        not a string (e.g. NaN/None standing in for a missing value) or does
        not consist of exactly three fields.

    Raises
    ------
    ValueError
        If the hour or minute component is not an integer literal.
    IndexError
        If the time field has no minute component.
    """
    # Non-string values have no ``.split``; report them as NaN like any other
    # unparseable timestamp instead of raising AttributeError.
    if not isinstance(date_time, str):
        return np.nan

    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan

    clock = fields[1].split(':')
    elapsed_minutes = 60 * int(clock[0]) + int(clock[1])
    # 1440 minutes per day -> one full turn of the circle.
    return 2 * np.pi * elapsed_minutes / 1440

def day_of_week(date_time):
    """Encode the weekday of a timestamp as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields, for example
        ``"2010-10-20 23:26:26 UTC"``. The first field is read as
        ``YYYY-MM-DD``.

    Returns
    -------
    float
        ``2 * pi * (weekday + 1) / 7`` where ``weekday`` is
        ``datetime.date.weekday()`` (Monday == 0), so Monday maps to
        ``2*pi/7`` and Sunday to ``2*pi``. ``np.nan`` is returned when
        ``date_time`` is not a string (e.g. NaN/None standing in for a
        missing value) or does not consist of exactly three fields.

    Raises
    ------
    ValueError
        If a date component is not an integer or the date does not exist.
    IndexError
        If the date field has fewer than three ``-``-separated components.
    """
    # Non-string values have no ``.split``; report them as NaN like any other
    # unparseable timestamp instead of raising AttributeError.
    if not isinstance(date_time, str):
        return np.nan

    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan

    ymd = fields[0].split('-')
    calendar_day = datetime.date(int(ymd[0]), int(ymd[1]), int(ymd[2]))
    return 2 * np.pi * (calendar_day.weekday() + 1) / 7

def month(date_time):
    """Encode the calendar month of a timestamp as an angle in radians.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields, for example
        ``"2010-10-20 23:26:26 UTC"``. The month is the second
        ``-``-separated component of the first field.

    Returns
    -------
    float
        ``2 * pi * month / 12`` using the month number as written in the
        string. ``np.nan`` is returned when ``date_time`` is not a string
        (e.g. NaN/None standing in for a missing value) or does not consist
        of exactly three fields.

    Raises
    ------
    ValueError
        If the month component is not an integer literal.
    IndexError
        If the date field has no month component.
    """
    # Non-string values have no ``.split``; report them as NaN like any other
    # unparseable timestamp instead of raising AttributeError.
    if not isinstance(date_time, str):
        return np.nan

    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan

    month_number = int(fields[0].split('-')[1])
    return 2 * np.pi * month_number / 12

def year(date_time):
    """Extract the year of a timestamp as a float.

    Parameters
    ----------
    date_time : str
        Timestamp made of three space-separated fields, for example
        ``"2010-10-20 23:26:26 UTC"``. The year is the first
        ``-``-separated component of the first field.

    Returns
    -------
    float
        The year converted with ``float``. ``np.nan`` is returned when
        ``date_time`` is not a string (e.g. NaN/None standing in for a
        missing value) or does not consist of exactly three fields.

    Raises
    ------
    ValueError
        If the year component cannot be converted to ``float``.
    """
    # Non-string values have no ``.split``; report them as NaN like any other
    # unparseable timestamp instead of raising AttributeError.
    if not isinstance(date_time, str):
        return np.nan

    fields = date_time.split(' ')
    if len(fields) != 3:
        return np.nan

    return float(fields[0].split('-')[0])

# Element-wise wrappers that apply the scalar encoders to whole arrays of
# timestamp strings. ``otypes`` is pinned to float so that the output dtype
# does not depend on the first element and so that empty arrays are accepted
# (without ``otypes`` np.vectorize raises ValueError on size-0 input).
time_of_day_vec = np.vectorize(minutes_after_midnight, otypes=[float])
day_of_week_vec = np.vectorize(day_of_week, otypes=[float])
month_vec = np.vectorize(month, otypes=[float])
year_vec = np.vectorize(year, otypes=[float])

def preprocess(df):
    """Swap the ``pickup_datetime`` column for numeric time features.

    The timestamp strings are run through the vectorized encoders and the
    result is stored as sine/cosine pairs for time of day, day of week and
    month, plus a plain ``year`` column. The input frame is not modified;
    a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``pickup_datetime`` column of timestamp strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``pickup_datetime``, with the seven feature
        columns appended, and with every row that contains a NaN in any
        column removed.
    """
    timestamps = df['pickup_datetime'].to_numpy()
    features = df.drop(columns=['pickup_datetime'])

    # (sine column, cosine column, encoder returning an angle in radians)
    cyclic_specs = (
        ('time_of_day_sin', 'time_of_day_cos', time_of_day_vec),
        ('day_of_week_sin', 'day_of_week_cos', day_of_week_vec),
        ('month_sin', 'month_cos', month_vec),
    )
    for sin_column, cos_column, encoder in cyclic_specs:
        angles = encoder(timestamps)
        features[sin_column] = np.sin(angles)
        features[cos_column] = np.cos(angles)

    features['year'] = year_vec(timestamps)

    # Rows with a NaN anywhere (including unparseable timestamps) are dropped.
    return features.dropna()

In [3]:
# Load the raw training data from the parent directory of the working dir.
# low_memory=False turns off pandas' chunked parsing so that column dtypes
# are inferred from the whole file rather than chunk by chunk.
df = pd.read_csv("../train.csv", low_memory = False)
# Show the first rows as the cell output.
df.head()


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-10-20 23:26:26.0000003,4.5,2010-10-20 23:26:26 UTC,-73.9869,40.7395,-73.9914,40.7456,2
1,2009-12-30 10:56:00.00000075,4.1,2009-12-30 10:56:00 UTC,-73.9616,40.7603,-73.9574,40.7694,5
2,2012-07-20 11:24:00.00000022,6.1,2012-07-20 11:24:00 UTC,-73.9794,40.7465,-73.9842,40.7321,1
3,2011-05-31 11:29:00.000000136,4.9,2011-05-31 11:29:00 UTC,-73.9641,40.7925,-73.9764,40.7858,1
4,2010-05-25 17:57:00.000000145,6.5,2010-05-25 17:57:00 UTC,-74.0039,40.7257,-73.9889,40.7484,1


In [4]:
# Replace pickup_datetime with the engineered time features; preprocess()
# also drops every row that contains a NaN. `df` is rebound, so the raw
# frame is no longer reachable after this cell runs.
df = preprocess(df)

In [5]:
# Preview the processed training frame, including the new sin/cos and year columns.
df.head()


Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day_sin,time_of_day_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,year
0,2010-10-20 23:26:26.0000003,4.5,-73.9869,40.7395,-73.9914,40.7456,2,-0.147809,0.989016,0.433884,-0.900969,-0.8660254,0.5,2010.0
1,2009-12-30 10:56:00.00000075,4.1,-73.9616,40.7603,-73.9574,40.7694,5,0.275637,-0.961262,0.433884,-0.900969,-2.449294e-16,1.0,2009.0
2,2012-07-20 11:24:00.00000022,6.1,-73.9794,40.7465,-73.9842,40.7321,1,0.156434,-0.987688,-0.974928,-0.222521,-0.5,-0.866025,2012.0
3,2011-05-31 11:29:00.000000136,4.9,-73.9641,40.7925,-73.9764,40.7858,1,0.134851,-0.990866,0.974928,-0.222521,0.5,-0.866025,2011.0
4,2010-05-25 17:57:00.000000145,6.5,-74.0039,40.7257,-73.9889,40.7484,1,-0.999914,-0.01309,0.974928,-0.222521,0.5,-0.866025,2010.0


In [6]:
# Write the processed training frame to the current working directory.
# NOTE(review): index=False is not passed, so the DataFrame index is written
# as a leading unnamed column — confirm downstream readers expect that.
df.to_csv("train_mod.csv")

In [10]:
# Load the raw test data; this rebinds `df`, replacing the training frame.
# low_memory=False turns off pandas' chunked parsing so that column dtypes
# are inferred from the whole file rather than chunk by chunk.
df = pd.read_csv("../test.csv", low_memory = False)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.98585,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.76449,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2


In [11]:
# Apply the same feature engineering to the test frame.
# NOTE(review): preprocess() ends with dropna(), so any test row containing a
# missing value is removed — confirm that is acceptable for the test set.
df = preprocess(df)

In [12]:
# Write the processed test frame to the current working directory.
# NOTE(review): index=False is not passed, so the DataFrame index is written
# as a leading unnamed column — confirm downstream readers expect that.
df.to_csv("test_mod.csv")

In [13]:
# Preview the processed test frame, including the new sin/cos and year columns.
df.head()


Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time_of_day_sin,time_of_day_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,year
0,2009-01-01 00:01:04.0000003,-73.972484,40.742743,-73.918937,40.764496,1,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,2009.0
1,2009-01-01 00:01:26.0000001,-73.98585,40.722826,-73.986301,40.739347,1,0.004363,0.99999,-0.433884,-0.900969,0.5,0.866025,2009.0
2,2009-01-01 00:04:42.0000001,-73.988917,40.740142,-73.982769,40.777291,1,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0
3,2009-01-01 00:04:54.0000001,-73.977163,40.76449,-73.914474,40.771575,1,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0
4,2009-01-01 00:04:59.0000004,-73.948849,40.778003,-73.977678,40.748692,2,0.017452,0.999848,-0.433884,-0.900969,0.5,0.866025,2009.0
