In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn

In [2]:
# Flag selecting which CSV this notebook processes:
#   True  -> reads "train.csv" and also runs the LEN / POLYLINE steps guarded by `if train:`
#   False -> reads "test_public.csv"
# NOTE(review): the saved cell outputs (320 rows, no LEN or POLYLINE columns, no
# "Removing ..." message) look like they came from a train = False run — confirm
# and re-run from a fresh kernel.
train = True

In [3]:

# Pick the input file from the `train` flag and load it.
csv_path = "train.csv" if train else "test_public.csv"
df = pd.read_csv(csv_path)

# Keep only the rows whose MISSING_DATA flag is False.
df = df.loc[df["MISSING_DATA"].eq(False)]

In [4]:
# Preview the first rows of the loaded frame (rows flagged MISSING_DATA are already removed).
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,T1,B,,15.0,20000542,1408039037,A,False
1,T2,B,,57.0,20000108,1408038611,A,False
2,T3,B,,15.0,20000370,1408038568,A,False
3,T4,B,,53.0,20000492,1408039090,A,False
4,T5,B,,18.0,20000621,1408039177,A,False


In [5]:
# Number of rows remaining after the MISSING_DATA filter.
len(df)

320

In [6]:
from datetime import datetime, timezone
def parse_time(x):
    """Split a Unix timestamp into calendar components.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by the key "TIMESTAMP" that yields a POSIX
        timestamp in seconds. When used with ``DataFrame.apply(axis=1)`` on a
        one-column frame, this is a one-entry pandas Series for a single row.

    Returns
    -------
    tuple
        ``(year, month, day, hour, weekday)``, where weekday follows
        ``datetime.weekday()`` (Monday == 0 ... Sunday == 6).
    """
    # We are using python's builtin datetime library
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp
    #
    # The timezone is passed explicitly: without `tz`, fromtimestamp() converts
    # to the local timezone of whichever machine runs the notebook, so the
    # DAY/HR/WK features would differ between machines. UTC reproduces the saved
    # outputs (TIMESTAMP 1408039037 -> HR 17, WK 3).
    # NOTE(review): if local wall-clock time at the taxis' location is what the
    # model should see, pass that zone here instead of UTC — confirm intent.
    dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=timezone.utc)
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

def polyline_to_trip_duration(polyline):
    """Convert a POLYLINE string into a trip duration.

    The number of points is taken to be ``polyline.count("[") - 1`` (one "["
    per point plus one for the outer list). The duration is
    ``(points - 1) * 15``, clamped so it is never negative.

    Parameters
    ----------
    polyline : str
        Text of the polyline, e.g. ``"[[x1,y1],[x2,y2]]"``.

    Returns
    -------
    int
        15 times the number of gaps between consecutive points; 0 when the
        polyline holds fewer than two points.
    """
    point_count = polyline.count("[") - 1
    gap_count = point_count - 1
    if gap_count < 0:
        gap_count = 0
    return gap_count * 15

# Training data only: add the "LEN" column derived from POLYLINE. Each value is
# (polyline_length - 1) * 15, where polyline_length = count("[") - 1.
if train:
    df["LEN"] = df["POLYLINE"].apply(polyline_to_trip_duration)

# Run parse_time on every row's TIMESTAMP and spread the five returned values
# over the YR / MON / DAY / HR / WK columns (assigned by position).
time_parts = df[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
df[["YR", "MON", "DAY", "HR", "WK"]] = time_parts

In [7]:
# Replace missing ORIGIN_STAND / ORIGIN_CALL values with 0.
df = df.assign(
    ORIGIN_STAND=df["ORIGIN_STAND"].fillna(0),
    ORIGIN_CALL=df["ORIGIN_CALL"].fillna(0),
)

In [8]:
#One hot encoding functions
def encode_a(val):
    """Return 1 when ``val`` equals 'A', otherwise 0."""
    return 1 if val == 'A' else 0

def encode_b(val):
    """Return 1 when ``val`` equals 'B', otherwise 0."""
    return 1 if val == 'B' else 0

def encode_c(val):
    """Return 1 when ``val`` equals 'C', otherwise 0."""
    return 1 if val == 'C' else 0

def encode_year(val):
    """Return 1 when ``val`` equals 2014, otherwise 0."""
    return 1 if val == 2014 else 0

In [9]:
# Apply the encoders: one 0/1 indicator column per category letter (A, B, C)
# for both CALL_TYPE and DAY_TYPE. New columns are appended in the order listed.
df = df.assign(
    CALL_TYPE_A=df["CALL_TYPE"].apply(encode_a),
    CALL_TYPE_B=df["CALL_TYPE"].apply(encode_b),
    CALL_TYPE_C=df["CALL_TYPE"].apply(encode_c),
    DAY_TYPE_A=df["DAY_TYPE"].apply(encode_a),
    DAY_TYPE_B=df["DAY_TYPE"].apply(encode_b),
    DAY_TYPE_C=df["DAY_TYPE"].apply(encode_c),
)

In [10]:
# Training data only: report and drop every row whose LEN is 0.
if train:
    is_zero_len = df["LEN"] == 0
    zero_count = int(is_zero_len.sum())
    print(f"Removing {zero_count} LEN = 0 datapoints")
    df = df[~is_zero_len]

In [11]:
# Row count after the LEN == 0 filter (that filter only runs when `train` is True).
len(df)

320

In [12]:
if train:
    # Remove outliers: keep only rows whose LEN lies strictly below
    # mean + outlier_threshold * std. Only the upper tail is trimmed.
    outlier_threshold = 3
    trip_len = df["LEN"]
    mean = trip_len.mean()
    std = trip_len.std()
    median = trip_len.median()  # computed but not used by the filter below
    upper_bound = mean + outlier_threshold * std
    df = df[trip_len < upper_bound]

In [13]:
# Row count after the outlier filter (that filter only runs when `train` is True).
len(df)

320

In [14]:
#Polyline (str type) to list
# def polyline_to_list(polyline):
#     all_pos = []
#     polyline = polyline[1:len(polyline)-1].split('],[')[:-1]
#     for pos in polyline:
#         pos = pos.strip('[').strip(']').split(',')
#         pos[0] = float(pos[0])
#         pos[1] = float(pos[1])
#         all_pos.append(pos)
#     return all_pos

# Disabled: applying polyline_to_list to every row takes too long.
#df_tr["POLYLINE_LIST"] = df_tr["POLYLINE"].apply(polyline_to_list)

In [15]:
# Sorted list of the distinct TAXI_ID values present in the frame being processed.
# NOTE(review): the list is built from whichever CSV is loaded, so the same taxi
# can land at a different position for the train and test files — confirm that
# downstream code does not rely on the positions matching.
taxi_ids = sorted(df["TAXI_ID"].unique())

In [16]:
# Replace each raw TAXI_ID with its position in the sorted `taxi_ids` list.
# A dict gives a constant-time lookup per row; `taxi_ids.index` rescans the
# list from the start for every row. `taxi_ids` holds unique values, so the
# dict position equals what `list.index` would return. An id missing from
# `taxi_ids` raises KeyError rather than being mapped silently.
taxi_id_to_index = {taxi_id: position for position, taxi_id in enumerate(taxi_ids)}
df["TAXI_ID"] = df["TAXI_ID"].apply(taxi_id_to_index.__getitem__)

In [17]:
#Clean out unnecessary columns
df = df.drop(columns = ["TIMESTAMP", "MISSING_DATA", "CALL_TYPE", "DAY_TYPE", "YR", "TRIP_ID", "ORIGIN_CALL"])
if train:
    # DataFrame.drop returns a new frame and leaves the original untouched, so
    # the result has to be assigned back; otherwise POLYLINE stays in df and is
    # written to the processed CSV.
    df = df.drop(columns = ["POLYLINE"])

In [18]:
# Preview the first rows after the raw columns have been dropped.
df.head()

Unnamed: 0,ORIGIN_STAND,TAXI_ID,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,DAY_TYPE_A,DAY_TYPE_B,DAY_TYPE_C
0,15.0,190,8,14,17,3,0,1,0,1,0,0
1,57.0,37,8,14,17,3,0,1,0,1,0,0
2,15.0,129,8,14,17,3,0,1,0,1,0,0
3,53.0,171,8,14,17,3,0,1,0,1,0,0
4,18.0,217,8,14,17,3,0,1,0,1,0,0


In [19]:
# Row count of the frame that the next cell writes to CSV.
len(df)

320

In [20]:
# Write the processed frame (index included) to the file matching the `train` flag.
output_path = "processed_train.csv" if train else "processed_test.csv"
df.to_csv(output_path)