This notebook is intended to assess and clean the full_train dataset by removing trips with missing or invalid data.

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
# Widen pandas' display limits so long cell values and up to 50 columns are
# shown when a frame is displayed.
# The option names are fully qualified on purpose: set_option treats a bare
# name as a pattern, and 'max_columns' is ambiguous on pandas versions that
# also define 'styler.render.max_columns', which makes the call raise
# OptionError.
pd.set_option("display.max_colwidth", 1000000)
pd.set_option("display.max_columns", 50)

In [2]:
# Load the parsed training set from the working directory.
# NOTE(review): a later displayed row includes an 'Unnamed: 0' column, which
# suggests this CSV was written together with its index - confirm whether
# that column is wanted.
TRAIN_CSV = "full_train_parsed.csv"
train = pd.read_csv(TRAIN_CSV)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 22 columns):
id                     1458644 non-null object
vendor_id              1458644 non-null int64
pickup_datetime        1458644 non-null object
dropoff_datetime       1458644 non-null object
passenger_count        1458644 non-null int64
pickup_longitude       1458644 non-null float64
pickup_latitude        1458644 non-null float64
dropoff_longitude      1458644 non-null float64
dropoff_latitude       1458644 non-null float64
store_and_fwd_flag     1458644 non-null object
trip_duration          1458644 non-null int64
pickup_date            1458644 non-null object
date                   1458644 non-null object
maximum temperature    1458644 non-null int64
minimum temperature    1458644 non-null int64
average temperature    1458644 non-null float64
precipitation          1458644 non-null object
snow fall              1458644 non-null object
snow depth             1458644 non-null o

Calculate the great circle distance (in meters) between the pickup location and the dropoff location. Create speed columns (in km/h).

In [4]:
# Great-circle distance in metres between the pickup and dropoff points.
# The four coordinate columns are iterated in lockstep instead of using
# DataFrame.apply(axis=1): apply materialises a Series for every row, which is
# needlessly slow on a frame this size. The same geopy call is made for every
# row, so the computed values are unchanged.
train['great_circle_distance'] = [
    great_circle((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).meters
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        train['pickup_latitude'], train['pickup_longitude'],
        train['dropoff_latitude'], train['dropoff_longitude'])
]

# Speeds are distance / time scaled by 3.6 (m/s -> km/h).
# NOTE(review): assumes 'distance' is in metres and 'trip_duration' /
# 'duration' are in seconds - confirm against the upstream parsing step.
# Observed trip length over observed trip time.
train['speed'] = train['distance'] / train['trip_duration'] * 3.6
# 'distance' over 'duration' (presumably the fastest-route estimate).
train['speed_fast_route'] = train['distance'] / train['duration'] * 3.6
# Straight-line (great-circle) length over observed trip time.
train['speed_great_circle'] = train['great_circle_distance'] / train['trip_duration'] * 3.6


Remove pickup_date since it's the same day as weather report date.

In [5]:
# Drop the 'pickup_date' column from the working frame.
train = train.drop(columns=['pickup_date'])

Remove a row where the fastest route information is missing. 

In [6]:
# Display the rows whose 'distance' value is missing.
missing_route = train['distance'].isna()
train.loc[missing_route]

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth,distance,duration,route_cooridnates,great_circle_distance,speed,speed_fast_route,speed_great_circle
482448,id2767091,1,2016-04-29 02:46:41,2016-04-29 02:48:38,1,-73.993896,40.751396,-73.993866,40.751396,N,117,2016-04-29,58,45,51.5,0.05,0.0,0,,,,2.571391,,,0.07912


In [7]:
# Keep only the rows that have a 'distance' value, then report the new size.
train = train.dropna(subset=['distance'])
print(train.shape)

(1458643, 25)


### Define criteria to clean the data

#### 1. Great circle distance must be larger than zero:


     great_circle_distance>0


#### 2. Great circle speed is in the range of 0km/h to 120km/h. I think an average speed higher than 120km/h is almost impossible for a taxi.


     0<speed_great_circle<120
     
     
#### 3. Trip duration must be larger than zero.


     trip_duration>0
 
 
#### 4. Remove any route where the fastest-route distance is smaller than the great circle distance, which is impossible. In fact, I think a measurement error of less than 25 meters is still acceptable.


     distance>=great_circle_distance-25
     
     
#### 5. From 2 and 4, we can infer the criteria for the speed if the driver takes the fastest route.


     speed>=speed_great_circle-90/trip_duration and speed+90/trip_duration<120
     
     
#### 6. Extreme trips should be removed: long duration with low speed, and short duration with high speed.


    Remove speed_great_circle<5 and trip_duration>3600*5
    Remove speed_great_circle>80 and trip_duration<30
    
    
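#### 7. Finally, we should remove trips whose pickup or dropoff location lies far outside the New York area (for example, a trip that occurs in San Francisco).


    -80<pickup_longitude<-68
    -80<dropoff_longitude<-68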
     

In [8]:
# Each rule maps the current frame to a boolean "keep" mask. The rules are
# applied one after another, in this order, each on the result of the
# previous one, and the remaining shape is printed after every step.
cleaning_rules = [
    # 1. Great circle distance must be positive.
    lambda df: df['great_circle_distance'] > 0,
    # 2. Great circle speed strictly between 0 and 120.
    lambda df: (df['speed_great_circle'] > 0) & (df['speed_great_circle'] < 120),
    # 3. Trip duration must be positive.
    lambda df: df['trip_duration'] > 0,
    # 4. Route distance may undercut the great circle distance by at most 25.
    lambda df: df['distance'] >= df['great_circle_distance'] - 25,
    # 5. The same tolerance expressed on speeds (25 * 3.6 = 90), plus the cap.
    lambda df: (
        (df['speed'] >= df['speed_great_circle'] - 90 / df['trip_duration'])
        & (df['speed'] + 90 / df['trip_duration'] < 120)
    ),
    # 6a. Drop slow trips (speed < 5) lasting more than five hours.
    lambda df: ~((df['speed_great_circle'] < 5) & (df['trip_duration'] > 3600 * 5)),
    # 6b. Drop fast trips (speed > 80) lasting under thirty seconds.
    lambda df: ~((df['speed_great_circle'] > 80) & (df['trip_duration'] < 30)),
    # 7. Pickup, then dropoff, longitude strictly between -80 and -68.
    lambda df: (df['pickup_longitude'] > -80) & (df['pickup_longitude'] < -68),
    lambda df: (df['dropoff_longitude'] > -80) & (df['dropoff_longitude'] < -68),
]

for keep_rows in cleaning_rules:
    train = train[keep_rows(train)]
    print(train.shape)

(1452746, 25)
(1452610, 25)
(1452610, 25)
(1452549, 25)
(1451532, 25)
(1449464, 25)
(1449433, 25)
(1449431, 25)
(1449431, 25)


In [14]:
# Replace the 'T' marker in the weather columns with a small number and make
# the columns numeric.
# NOTE(review): 'T' presumably denotes a trace amount in the weather report -
# confirm against the data source.
# Why this differs from a plain element-wise substitution:
#   * the substitution alone leaves each column as object dtype mixing strings
#     with the float 0.001; pd.to_numeric gives a real numeric column (and
#     raises if some other non-numeric token is present, instead of letting it
#     pass through silently);
#   * DataFrame.assign builds a new frame, so nothing is assigned into the
#     filtered slice produced by the earlier row selections;
#   * one loop replaces three copy-pasted statements.
TRACE_AMOUNT = 0.001
weather_columns = ['snow fall', 'precipitation', 'snow depth']
train = train.assign(**{
    column: pd.to_numeric(
        train[column].map(lambda value: TRACE_AMOUNT if value == 'T' else value)
    )
    for column in weather_columns
})

In [16]:
# Write the cleaned frame to disk; index=False leaves the row index out.
CLEAN_TRAIN_CSV = "train_full_parsed_clean2.csv"
train.to_csv(CLEAN_TRAIN_CSV, index=False)