In [1]:
import json

import numpy as np, pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw trips. To preprocess the public test set instead, swap in the
# commented-out path below (and the matching output path in the export cell).
df = pd.read_csv("train.csv")
#df = pd.read_csv("test_public.csv")

# Remove rows with missing data: keep only rows whose MISSING_DATA flag is False.
# The explicit .copy() makes the filtered frame independent of the loaded one,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['MISSING_DATA'] == False].copy()

In [3]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [4]:
from datetime import datetime, timezone
def parse_time(x):
    """Split a row's Unix timestamp into calendar features.

    Uses Python's builtin datetime library:
    https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp

    Parameters
    ----------
    x : pandas Series or mapping
        Row-like object indexable by "TIMESTAMP" (seconds since the epoch).
        When called through ``DataFrame.apply(..., axis=1)`` on a one-column
        frame, each x is a single-element Series.

    Returns
    -------
    tuple
        ``(year, month, day, hour, weekday)`` where weekday follows
        ``datetime.weekday()`` (Monday is 0, Sunday is 6).
    """
    # Pin the conversion to UTC. Without an explicit tz, fromtimestamp() uses
    # the local timezone of whatever machine runs the notebook, so the derived
    # hour/day/weekday features would differ between machines.
    dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=timezone.utc)
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# Applied to every POLYLINE string to turn its point count into a trip duration.
def polyline_to_trip_duration(polyline):
    """Derive a trip duration from the number of points in a polyline string.

    The string contains one "[" for the outer list plus one per point, so the
    number of points is ``count("[") - 1`` and the number of gaps between
    consecutive points is one less than that. Each gap counts as 15 time
    units (presumably seconds; confirm against the dataset description).

    Returns 0 for polylines with fewer than two points.
    """
    point_count = polyline.count("[") - 1
    gap_count = point_count - 1
    if gap_count < 0:
        return 0
    return gap_count * 15

# Add the "LEN" column: the trip duration computed from each POLYLINE string,
# i.e. (polyline_length - 1) * 15 with polyline_length = count("[") - 1
# (never below 0).
trip_durations = df["POLYLINE"].apply(polyline_to_trip_duration)
df["LEN"] = trip_durations

# Expand every TIMESTAMP into five calendar columns, one per element of the
# tuple returned by parse_time (year, month, day, hour, weekday).
calendar_columns = ["YR", "MON", "DAY", "HR", "WK"]
calendar_features = df[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
df[calendar_columns] = calendar_features

In [5]:
# Replace missing origin identifiers with 0 so both columns are fully populated.
for origin_col in ("ORIGIN_STAND", "ORIGIN_CALL"):
    df[origin_col] = df[origin_col].fillna(0)

In [5]:
#One hot encoding functions
def encode_a(val):
    """Return 1 when ``val`` equals 'A', otherwise 0."""
    return 1 if val == 'A' else 0

def encode_b(val):
    """Return 1 when ``val`` equals 'B', otherwise 0."""
    return 1 if val == 'B' else 0

def encode_c(val):
    """Return 1 when ``val`` equals 'C', otherwise 0."""
    return 1 if val == 'C' else 0

def encode_year(val):
    """Return 1 when ``val`` equals 2014, otherwise 0."""
    return 1 if val == 2014 else 0

In [6]:
#Actual one hot encoding
# Each entry is (new indicator column, source column, encoder). Columns are
# created in list order, so the resulting frame layout is unchanged.
indicator_specs = [
    ("CALL_TYPE_A", "CALL_TYPE", encode_a),
    ("CALL_TYPE_B", "CALL_TYPE", encode_b),
    ("CALL_TYPE_C", "CALL_TYPE", encode_c),
    ("DAY_TYPE_A", "DAY_TYPE", encode_a),
    ("DAY_TYPE_B", "DAY_TYPE", encode_b),
    ("DAY_TYPE_C", "DAY_TYPE", encode_c),
    ("YR_2014", "YR", encode_year),
]
for indicator_col, source_col, encoder in indicator_specs:
    df[indicator_col] = df[source_col].apply(encoder)

In [7]:
#Polyline (str type) to list
def polyline_to_list(polyline):
    """Parse a POLYLINE string into a list of points.

    Parameters
    ----------
    polyline : str
        Text of the form ``"[[x0,y0],[x1,y1],...]"``; ``"[]"`` denotes a trip
        with no points. The pairs are presumably [longitude, latitude];
        confirm against the dataset description.

    Returns
    -------
    list[list[float]]
        One ``[float, float]`` entry per point, in the original order. Every
        point is included: the previous string-splitting version discarded the
        final point of each trip and returned ``[]`` for single-point trips.
    """
    # The polyline text is a valid JSON array of arrays, so json.loads handles
    # the bracket parsing (including the empty "[]" case) without manual slicing.
    # float() keeps the element type uniform even if a coordinate is written
    # without a decimal point.
    return [[float(value) for value in point] for point in json.loads(polyline)]

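# Disabled because it takes too long to run.
# NOTE: `df_tr` is not defined in this notebook; use `df` if this is re-enabled.
#df_tr["POLYLINE_LIST"] = df_tr["POLYLINE"].apply(polyline_to_list)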

In [10]:
#Clean out unnecessary columns
# MISSING_DATA was only needed for filtering; CALL_TYPE and DAY_TYPE have been
# replaced by their indicator columns.
no_longer_needed = ["MISSING_DATA", "CALL_TYPE", "DAY_TYPE"]
df = df.drop(columns=no_longer_needed)

In [11]:
# Preview the first five rows after feature engineering and column cleanup.
df.head(n=5)

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,YR,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,DAY_TYPE_A,DAY_TYPE_B,DAY_TYPE_C,YR_2014
0,T1,,15.0,20000542,1408039037,2014,8,14,17,3,0,1,0,1,0,0,1
1,T2,,57.0,20000108,1408038611,2014,8,14,17,3,0,1,0,1,0,0,1
2,T3,,15.0,20000370,1408038568,2014,8,14,17,3,0,1,0,1,0,0,1
3,T4,,53.0,20000492,1408039090,2014,8,14,17,3,0,1,0,1,0,0,1
4,T5,,18.0,20000621,1408039177,2014,8,14,17,3,0,1,0,1,0,0,1


In [12]:
# Write the processed frame to disk (the row index is written as the first,
# unnamed column). Use the commented-out path when processing the test set.
#df.to_csv("processed_test.csv")
processed_path = "processed_train.csv"
df.to_csv(processed_path)

In [13]:
# Number of rows in the processed frame.
df.shape[0]

320