# Preprocessing data from the Kaggle New York Taxi Trip Duration competition

In [1]:
#load data 

import pandas as pd
import pickle
from sklearn.cluster import KMeans
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Path of the training CSV, relative to the notebook's working directory.
trip_data_file = "data/train.csv"

# Only the first MAX_ROWS records of the file are read.
MAX_ROWS = 100000
full_tripdata = pd.read_csv(trip_data_file, nrows=MAX_ROWS)

In [2]:
# Drop the columns that are not used further on. The column labels are printed
# before and after so the effect of the drop is visible in the cell output.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
# NOTE: full_tripdata is rebound here, so running this cell a second time
# raises KeyError (the labels are already gone); re-run the load cell first.
print(full_tripdata.columns)
full_tripdata = full_tripdata.drop(columns=['id','vendor_id','store_and_fwd_flag','passenger_count'])
print(full_tripdata.columns)

Index([u'id', u'vendor_id', u'pickup_datetime', u'dropoff_datetime',
       u'passenger_count', u'pickup_longitude', u'pickup_latitude',
       u'dropoff_longitude', u'dropoff_latitude', u'store_and_fwd_flag',
       u'trip_duration'],
      dtype='object')
Index([u'pickup_datetime', u'dropoff_datetime', u'pickup_longitude',
       u'pickup_latitude', u'dropoff_longitude', u'dropoff_latitude',
       u'trip_duration'],
      dtype='object')


In [3]:
# Time features: parse both timestamp columns into datetimes, then derive the
# day of the week and the hour of the day from the pickup time.
for ts_col in ("pickup_datetime", "dropoff_datetime"):
    full_tripdata[ts_col] = pd.to_datetime(full_tripdata[ts_col])

pickup_ts = full_tripdata["pickup_datetime"]
full_tripdata["weekday"] = pickup_ts.dt.dayofweek
full_tripdata["hourofday"] = pickup_ts.dt.hour


In [4]:
# Summary statistics of the frame. They are printed explicitly because a cell
# only renders the value of its last expression; as a bare expression in the
# middle of the cell the describe() result was computed and then discarded.
print(full_tripdata.describe())
# Column labels, rendered as the cell's output value.
full_tripdata.columns

Index([u'pickup_datetime', u'dropoff_datetime', u'pickup_longitude',
       u'pickup_latitude', u'dropoff_longitude', u'dropoff_latitude',
       u'trip_duration', u'weekday', u'hourofday'],
      dtype='object')

In [5]:
# Location features: label every pickup and drop-off coordinate with the
# cluster id predicted by a pre-trained k-means model loaded from disk.
# NOTE(security): pickle.load can execute arbitrary code, so only load a
# model file that comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open('location_cluster_kmean', 'rb') as model_file:
    model = pickle.load(model_file)

# Coordinates are selected by column name instead of by position, so the
# lookup does not depend on where the columns sit in the frame. The order
# (longitude, latitude) is the one the positional slices 2:4 and 4:6 produced.
# NOTE(review): the model is assumed to have been fit on (longitude, latitude)
# pairs in this order -- confirm against the notebook that trained it.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
pickupLocations = full_tripdata[['pickup_longitude', 'pickup_latitude']]
full_tripdata["pickupLoc"] = model.predict(pickupLocations.values)

dropoffLocations = full_tripdata[['dropoff_longitude', 'dropoff_latitude']]
full_tripdata["dropoffLoc"] = model.predict(dropoffLocations.values)

In [6]:
# Build the training frame by removing the raw timestamp and coordinate
# columns; full_tripdata itself is left unchanged.
raw_columns = [
    'pickup_datetime', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
training = full_tripdata.drop(columns=raw_columns)

In [7]:
# Cast the four discrete feature columns to the pandas 'category' dtype.
for feature in ('weekday', 'hourofday', 'pickupLoc', 'dropoffLoc'):
    training[feature] = training[feature].astype('category')

In [8]:
# Display the column labels of the training frame as the cell's output.
training.columns

Index([u'trip_duration', u'weekday', u'hourofday', u'pickupLoc',
       u'dropoffLoc'],
      dtype='object')

In [9]:
# Split the training frame into features (X) and target (Y_triptime).
# The target is selected by its label rather than by position, so the split
# stays correct even if the column order of `training` changes; with
# trip_duration as the first column this yields the same X and Y_triptime as
# the positional slices iloc[:, 1:] and iloc[:, 0].
X = training.drop(columns=['trip_duration'])
Y_triptime = training['trip_duration']


In [10]:
# One-hot encode the features: pd.get_dummies replaces each category-typed
# column with one indicator column per category value.
# NOTE: X is rebound, so after this cell X refers to the encoded frame.
X = pd.get_dummies(X)

In [11]:
# Dump the prepared data: pickle the feature matrix and the target to disk.
# Each file is opened in a context manager so it is flushed and closed as soon
# as the dump finishes, instead of leaving an open handle to be cleaned up
# whenever the interpreter gets around to it.
with open("X_Kaggle", "wb") as features_file:
    pickle.dump(X, features_file)

with open("Y_Kaggle_triptime", "wb") as target_file:
    pickle.dump(Y_triptime, target_file)