In [23]:
#load data 

import pandas as pd
import pickle
from sklearn.cluster import KMeans
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

trip_data_file = "data/trip_data_4.csv"
trip_fare_file = "data/trip_fare_4.csv"

trips = pd.read_csv(trip_data_file, nrows=100000)
fares = pd.read_csv(trip_fare_file, nrows=100000)

full_tripdata = pd.merge(left=trips,right=fares, how='left', left_on=['medallion',' hack_license',' vendor_id',' pickup_datetime'], right_on=['medallion',' hack_license',' vendor_id',' pickup_datetime'])
full_tripdata = full_tripdata.drop_duplicates()

In [24]:
# Drop some un-wanted columns
full_tripdata = full_tripdata.drop(columns=['medallion',' hack_license',' vendor_id',' rate_code',' store_and_fwd_flag'])

In [25]:
#prepare time related column , weekday and hourofday
full_tripdata[" pickup_datetime"] = pd.to_datetime(full_tripdata[" pickup_datetime"])
full_tripdata[" dropoff_datetime"] = pd.to_datetime(full_tripdata[" dropoff_datetime"])

full_tripdata["weekday"] = full_tripdata[" pickup_datetime"].dt.dayofweek
full_tripdata["hourofday"] = full_tripdata[" pickup_datetime"].dt.hour


In [None]:
full_tripdata.describe()

In [26]:
#remove outliers
measuredata = pd.concat([full_tripdata[' trip_time_in_secs'], 
                         full_tripdata[' pickup_longitude'],
                         full_tripdata[' pickup_latitude'],
                         full_tripdata[' dropoff_longitude'],
                         full_tripdata[' fare_amount'],
                         full_tripdata[' tip_amount'],
                        ],axis=1)

#remove data out of two standard deviation
full_tripdata = full_tripdata[(np.abs(stats.zscore(measuredata)) < 2).all(axis=1)]

#full_tripdata = full_tripdata.loc[((full_tripdata[' pickup_longitude'] < -72.0) & (full_tripdata[' pickup_latitude'] > 30.0)) 
                                  # | ((full_tripdata[' dropoff_latitude'] < -72.0) & (full_tripdata[' dropoff_latitude'] > 30.0))]

In [21]:
full_tripdata.describe()
full_tripdata.columns

Index([u' pickup_datetime', u' dropoff_datetime', u' passenger_count',
       u' trip_time_in_secs', u' trip_distance', u' pickup_longitude',
       u' pickup_latitude', u' dropoff_longitude', u' dropoff_latitude',
       u' payment_type', u' fare_amount', u' surcharge', u' mta_tax',
       u' tip_amount', u' tolls_amount', u' total_amount', u'weekday',
       u'hourofday', u'pickupLoc', u'dropoffLoc'],
      dtype='object')

In [27]:
#prepare location, use pre-trained k-mean model
model = pickle.load(open('location_cluster_kmean', 'rb'))
pickupLocations = full_tripdata.iloc[:,5:7]
#print pickupLocations.describe()
full_tripdata["pickupLoc"] = model.predict(pickupLocations.as_matrix())

dropoffLocations = full_tripdata.iloc[:,7:9]
full_tripdata["dropoffLoc"] = model.predict(dropoffLocations.as_matrix())
#print dropoffLocations.describe()

In [28]:
#drop all un-wanted columns
training = full_tripdata.drop(columns=[' pickup_datetime',
                                       ' dropoff_datetime',
                                       ' trip_time_in_secs',
                                       ' trip_distance',
                                       ' dropoff_longitude',
                                       ' dropoff_latitude',
                                       ' surcharge',
                                       ' mta_tax',
                                       ' tolls_amount',
                                       ' total_amount',
                                       ' passenger_count',
                                       ' payment_type',
                                       ' pickup_longitude',
                                       ' pickup_latitude'])

training.describe()

Unnamed: 0,fare_amount,tip_amount,weekday,hourofday,pickupLoc,dropoffLoc
count,91196.0,91196.0,91196.0,91196.0,91196.0,91196.0
mean,10.362697,1.056253,4.625334,10.267084,23.604687,24.214055
std,5.385798,1.290242,0.49455,9.30189,14.062651,14.017428
min,2.5,0.0,0.0,0.0,0.0,0.0
25%,6.5,0.0,4.0,1.0,11.0,12.0
50%,9.0,0.9,5.0,7.0,23.0,24.0
75%,13.0,1.9,5.0,22.0,37.0,38.0
max,31.5,5.8,6.0,23.0,49.0,49.0


In [29]:
#column data type change 
training['weekday'] = training['weekday'].astype('category')
training['hourofday'] = training['hourofday'].astype('category')
training['pickupLoc'] = training['pickupLoc'].astype('category')
training['dropoffLoc'] = training['dropoffLoc'].astype('category')

In [30]:
#training and result data split
X = training.iloc[:,2:]
Y_fare = training.iloc[:,0]
Y_tip = training.iloc[:,1]

In [31]:
#convert category data to one-hot embedding
X = pd.get_dummies(X)

In [32]:
#dump the prepaired data

pickle.dump(X,open("X","wb"))
pickle.dump(Y_fare,open("Y_fare","wb"))
pickle.dump(X,open("Y_tip","wb"))