# Preprocessing data from the Kaggle New York City Taxi Trip Duration competition

In [1]:
#load data 

import pandas as pd
import pickle
from sklearn.cluster import KMeans
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Relative path of the training CSV that the rest of the notebook works on.
trip_data_file = "data/train.csv"
# Read the whole file into a single DataFrame; later cells add feature columns to it.
full_tripdata = pd.read_csv(filepath_or_buffer=trip_data_file)

In [2]:
# Drop the unwanted identifier columns and show the column list before and after.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the statement form `print x` is a syntax error on Python 3.
print(full_tripdata.columns)
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# full_tripdata, so on a second run 'id' and 'vendor_id' are already gone and a
# plain drop would raise KeyError.
full_tripdata = full_tripdata.drop(columns=['id', 'vendor_id'], errors='ignore')
print(full_tripdata.columns)

Index([u'id', u'vendor_id', u'pickup_datetime', u'dropoff_datetime',
       u'passenger_count', u'pickup_longitude', u'pickup_latitude',
       u'dropoff_longitude', u'dropoff_latitude', u'store_and_fwd_flag',
       u'trip_duration'],
      dtype='object')
Index([u'pickup_datetime', u'dropoff_datetime', u'passenger_count',
       u'pickup_longitude', u'pickup_latitude', u'dropoff_longitude',
       u'dropoff_latitude', u'store_and_fwd_flag', u'trip_duration'],
      dtype='object')


In [3]:
# Time features: parse both timestamp columns, then derive the pickup weekday
# and pickup hour as two new columns.
for col in ("pickup_datetime", "dropoff_datetime"):
    full_tripdata[col] = pd.to_datetime(full_tripdata[col])

# Both derived columns come from the pickup timestamp; weekday is appended
# before hourofday, so the resulting column order is unchanged.
pickup_ts = full_tripdata["pickup_datetime"].dt
full_tripdata["weekday"] = pickup_ts.dayofweek
full_tripdata["hourofday"] = pickup_ts.hour


In [4]:
# Summary statistics of the frame. A notebook only echoes the LAST expression of
# a cell, so describe() must be printed explicitly; as a bare statement its
# result is computed and then silently discarded.
print(full_tripdata.describe())
# Column labels of the frame, shown as the cell's output.
full_tripdata.columns

Index([u'pickup_datetime', u'dropoff_datetime', u'passenger_count',
       u'pickup_longitude', u'pickup_latitude', u'dropoff_longitude',
       u'dropoff_latitude', u'store_and_fwd_flag', u'trip_duration',
       u'weekday', u'hourofday'],
      dtype='object')

In [5]:
# Location features: assign every pickup and dropoff point to a cluster using a
# pre-trained k-means model loaded from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model file that comes from a trusted source.
with open('location_cluster_kmean', 'rb') as model_file:
    # The context manager closes the handle; a bare open(...) passed straight to
    # pickle.load would leave it open.
    model = pickle.load(model_file)

# Select the coordinate columns by name instead of by position (iloc[:, 3:5] and
# iloc[:, 5:7]): positional slices silently pick the wrong columns whenever the
# column order differs, e.g. if 'id'/'vendor_id' have not been dropped yet.
# The (longitude, latitude) order is the same one the positional slices produced.
pickupLocations = full_tripdata[["pickup_longitude", "pickup_latitude"]]
# .values replaces DataFrame.as_matrix(), which is deprecated and no longer
# exists in current pandas releases.
full_tripdata["pickupLoc"] = model.predict(pickupLocations.values)

dropoffLocations = full_tripdata[["dropoff_longitude", "dropoff_latitude"]]
full_tripdata["dropoffLoc"] = model.predict(dropoffLocations.values)

       pickup_longitude  pickup_latitude
count      1.458644e+06     1.458644e+06
mean      -7.397349e+01     4.075092e+01
std        7.090186e-02     3.288119e-02
min       -1.219333e+02     3.435970e+01
25%       -7.399187e+01     4.073735e+01
50%       -7.398174e+01     4.075410e+01
75%       -7.396733e+01     4.076836e+01
max       -6.133553e+01     5.188108e+01
       dropoff_longitude  dropoff_latitude
count       1.458644e+06      1.458644e+06
mean       -7.397342e+01      4.075180e+01
std         7.064327e-02      3.589056e-02
min        -1.219333e+02      3.218114e+01
25%        -7.399133e+01      4.073588e+01
50%        -7.397975e+01      4.075452e+01
75%        -7.396301e+01      4.076981e+01
max        -6.133553e+01      4.392103e+01


In [6]:
# Build the training frame by removing the raw timestamp and coordinate columns
# from full_tripdata; full_tripdata itself is left untouched.
raw_columns = [
    'pickup_datetime', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
training = full_tripdata.drop(raw_columns, axis=1)

In [10]:
# Cast the discrete feature columns to pandas' 'category' dtype, one column at
# a time, in place on the training frame.
for column in ('weekday', 'hourofday', 'pickupLoc', 'dropoffLoc', 'store_and_fwd_flag'):
    training[column] = training[column].astype('category')

In [11]:
# Display the column labels of the training frame (echoed as the cell's output).
training.columns

Index([u'passenger_count', u'store_and_fwd_flag', u'trip_duration', u'weekday',
       u'hourofday', u'pickupLoc', u'dropoffLoc'],
      dtype='object')

In [12]:
# Separate the prediction target from the features: Y_triptime is the
# 'trip_duration' column, X is every other column of the training frame.
Y_triptime = training["trip_duration"]
X = training.drop(columns=["trip_duration"])

In [13]:
# Convert the categorical data to a one-hot encoding.
# NOTE(review): X is rebound to the encoded frame, so the un-encoded feature
# frame is no longer reachable under this name after the cell has run.
X = pd.get_dummies(X)

In [14]:
# Dump the prepared data: the feature frame X and the target series Y_triptime
# are pickled to two separate files in the working directory.
# Each file is opened in a context manager so it is flushed and closed as soon
# as the dump finishes; a bare open(...) passed straight to pickle.dump would
# leave the handle unclosed.
with open("X_Kaggle", "wb") as features_file:
    pickle.dump(X, features_file)
with open("Y_Kaggle_triptime", "wb") as target_file:
    pickle.dump(Y_triptime, target_file)