# Data Processing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.distributions import Categorical
import torch.optim as optim
import ast
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
import torch.utils.data as Data
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Choose the compute device once: CUDA when PyTorch reports a usable GPU,
# otherwise the CPU.
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda')
device

device(type='cuda')

In [3]:
# Load the raw training set from the working directory; the trailing
# expression displays (rows, columns) as a quick sanity check.
data = pd.read_csv("train.csv")
data.shape

(1710670, 9)

In [4]:
# Work on a copy so the frame held in `data` is not modified by the column
# additions and drops applied to `df` below.
df = data.copy()
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [5]:
def polyline_to_trip_duration(polyline):
    """Derive a trip length from the raw POLYLINE string.

    The string looks like ``"[[lon,lat],[lon,lat],...]"``, so the number of
    ``"["`` characters is one (outer list) plus one per point. Subtracting 2
    therefore gives the number of gaps between consecutive points, and each
    gap contributes 15 to the result (presumably seconds per GPS sample --
    TODO confirm against the dataset description).

    Parameters
    ----------
    polyline : str
        Serialized list of coordinate pairs.

    Returns
    -------
    int
        ``15 * (points - 1)``, or 0 when the polyline has fewer than two points.
    """
    gaps = polyline.count("[") - 2
    if gaps < 0:
        # Empty or malformed polyline: no measurable duration.
        return 0
    return gaps * 15

In [6]:
# Trip length for every row, derived from the serialized POLYLINE string.
df["LEN"] = df["POLYLINE"].map(polyline_to_trip_duration)

In [7]:
def stats(data):
    """Summarize the ``LEN`` column of a frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``LEN`` column.

    Returns
    -------
    tuple
        ``(mean, median, std)`` of ``data["LEN"]`` -- note the order: the
        median comes second.
    """
    durations = data["LEN"]
    return durations.mean(), durations.median(), durations.std()
# Statistics of LEN over the full, unfiltered frame; `mean` and `std` are
# reused later to set the upper outlier cut-off.
mean, median, std = stats(df)
mean, median, std

(716.4264615618442, 600.0, 684.7511617510816)

## Training Data Processing

In [8]:
def parse_time(x, tz=None):
    """Split a row's Unix timestamp into calendar features.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row-like object exposing the epoch seconds under ``x["TIMESTAMP"]``.
    tz : datetime.tzinfo, optional
        Time zone in which to interpret the timestamp. The default ``None``
        keeps the previous behaviour of ``datetime.fromtimestamp``: the
        machine's local time zone, which means the derived day/hour/weekday
        features depend on where the notebook is executed. Pass an explicit
        zone (for example ``datetime.timezone.utc``) to get
        machine-independent results.

    Returns
    -------
    tuple of int
        ``(year, month, day, hour, weekday)`` where weekday is 0 for Monday
        through 6 for Sunday.
    """
    # Python's built-in datetime library does the conversion:
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp
    dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=tz)
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

In [9]:
# parse time from timestamp: parse_time is called once per row (axis=1) and
# result_type="expand" spreads its 5-tuple over the five new columns
df[["YR", "MON", "DAY", "HR", "WK"]] = df[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,YR,MON,DAY,HR,WK
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,2013,6,30,17,6
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,2013,6,30,17,6
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,2013,6,30,17,6
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,2013,6,30,17,6
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,2013,6,30,17,6


In [10]:
# drop columns: TIMESTAMP and POLYLINE have already been turned into the
# MON/DAY/HR/WK and LEN columns; the remaining names listed here are simply
# not carried forward.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError.
df = df.drop(
    columns=[
        "TRIP_ID",
        "ORIGIN_CALL",
        "TIMESTAMP",
        "DAY_TYPE",
        "MISSING_DATA",
        "POLYLINE",
        "YR",
    ]
)

df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_STAND,TAXI_ID,LEN,MON,DAY,HR,WK
0,C,,20000589,330,6,30,17,6
1,B,7.0,20000596,270,6,30,17,6
2,C,,20000320,960,6,30,17,6
3,C,,20000520,630,6,30,17,6
4,C,,20000337,420,6,30,17,6


In [11]:
# one hot encoding: learn the category vocabulary from every column except
# LEN, which is kept aside and re-attached after encoding (see onehot below).
# NOTE(review): the encoder is created with default unknown-category handling;
# confirm that unseen values in new data are handled the way you expect.
df_drop = df.drop(columns=["LEN"])
enc = OneHotEncoder(dtype='int')
enc.fit(df_drop)

def onehot(data, enc):
    """One-hot encode every column except ``LEN`` and append ``LEN`` last.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``LEN`` column plus the columns ``enc`` was fitted on.
    enc : fitted encoder
        Object whose ``transform`` returns a sparse matrix (``.toarray()`` is
        called on the result).

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(len(data), n_encoded + 1)``: the dense one-hot
        block followed by the ``LEN`` values as the final column.
    """
    lens = data['LEN'].to_numpy()
    encoded = enc.transform(data.drop(columns=["LEN"])).toarray()

    # One vectorised concatenation instead of a per-row np.append loop followed
    # by rebuilding the array from a Python list: same values and dtype, far
    # fewer temporary copies, and an empty frame still yields a 2-D result.
    return np.column_stack((encoded, lens))

In [12]:
# remove outliers from the dataset: keep rows whose LEN is above 60 and below
# `outlier_threshold` standard deviations over the mean (`mean` and `std` come
# from the stats(df) call made on the unfiltered frame)
outlier_threshold = 3
df = df[(df['LEN'] > 60) & (df["LEN"] < mean + outlier_threshold * std)]
#buckets = (int(mean + outlier_threshold * std) // 15)
#sns.histplot(data=df,x="LEN",bins = 45)

In [13]:
# Disabled experiment: the sampling code below is wrapped in a string literal,
# so none of it runs; the cell only echoes the string back as its output.
'''sample_size = 100000
df_sample = df.sample(n = sample_size)
df_sample.head()
df_sample.shape'''

'sample_size = 100000\ndf_sample = df.sample(n = sample_size)\ndf_sample.head()\ndf_sample.shape'

In [14]:
# Encode the filtered training frame; onehot() places LEN in the last column.
X = onehot(df, enc)

In [15]:
# Split the encoded matrix: the final column (LEN) becomes y, everything
# before it stays in X. Both right-hand sides are evaluated before either
# name is rebound.
X, y = X[:, :-1], X[:, -1]

In [16]:
# Write X and y into csv files in the working directory. X is written with an
# integer format; y is written without an explicit fmt.
np.savetxt('my_train_X.csv', X, delimiter=',', fmt='%d')
np.savetxt('my_train_y.csv', y, delimiter=',')

## Testing Data Processing

In [17]:
# Load the public test set; test_public_2 keeps an unmodified copy because
# test_public itself is rewritten by the processing cells below.
test_public = pd.read_csv("test_public.csv")
test_public_2 = test_public.copy()
test_public.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,T1,B,,15.0,20000542,1408039037,A,False
1,T2,B,,57.0,20000108,1408038611,A,False
2,T3,B,,15.0,20000370,1408038568,A,False
3,T4,B,,53.0,20000492,1408039090,A,False
4,T5,B,,18.0,20000621,1408039177,A,False


In [18]:
# Same feature engineering as for the training frame: expand the timestamp
# into calendar columns, then discard the columns that are not encoded.
test_public[["YR", "MON", "DAY", "HR", "WK"]] = test_public[["TIMESTAMP"]].apply(
    parse_time, axis=1, result_type="expand"
)
test_public = test_public.drop(
    columns=["TRIP_ID", "ORIGIN_CALL", "TIMESTAMP", "DAY_TYPE", "MISSING_DATA", "YR"]
)
test_public.head()

Unnamed: 0,CALL_TYPE,ORIGIN_STAND,TAXI_ID,MON,DAY,HR,WK
0,B,15.0,20000542,8,14,10,3
1,B,57.0,20000108,8,14,10,3
2,B,15.0,20000370,8,14,10,3
3,B,53.0,20000492,8,14,10,3
4,B,18.0,20000621,8,14,10,3


In [19]:
# onehot() requires a LEN column, so add a placeholder that is stripped again
# right after encoding.
test_public["LEN"] = 0
test_public = onehot(test_public, enc)
# Drop the placeholder, which onehot() always puts in the last column. Slicing
# with -1 instead of a hard-coded feature count keeps this correct if the
# encoder's vocabulary (and therefore the number of columns) changes.
test_public = test_public[:, :-1]

print(test_public.shape)
print(test_public[0])

(320, 589)
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0

In [20]:
# write processed test_public into a csv file (integer format, comma-separated)
np.savetxt('my_test.csv', test_public, delimiter=',', fmt='%d')