In [258]:
#Imports
import pandas as pd
import numpy as np
import glob as glob
import datetime

In [259]:
#Reading taxi files
path = r'sampled_data/df_trips_sample_nullsdropped.parquet'
files = glob.glob(path)
files.sort()
df_trips = pd.concat([pd.read_parquet(fp) for fp in files])

In [260]:
df_trips.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
15971142,65f93c35817734e3299be976d4f3b376bdd91416,561fbb8af15689fe33fb11b96905cfb95d11ac44700c1a...,08/12/2017 06:45:00 PM,08/12/2017 07:15:00 PM,1965.0,18.4,17031980000.0,17031080000.0,76.0,8.0,...,6.0,52.0,Cash,Chicago Carriage Cab Corp,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)
167943,7a592eba4fe74de911db4b33878eab697730ddc2,1025e3aec9a251a432a6bca2bbe7a95bc032b369fd5be1...,01/04/2017 12:30:00 PM,01/04/2017 12:45:00 PM,610.0,1.55,17031080000.0,17031840000.0,8.0,8.0,...,0.0,8.0,Cash,Chicago Carriage Cab Corp,41.890922,-87.618868,POINT (-87.6188683546 41.8909220259),41.904935,-87.649907,POINT (-87.6499072264 41.9049353016)
14807345,25e63440cd1d3596401101ffde933ec7318acfe3,e81aeae39261dd72f248f92204fd28e77269f6bcf5d5c8...,07/26/2017 07:45:00 PM,07/26/2017 07:45:00 PM,298.0,0.99,17031840000.0,17031080000.0,32.0,8.0,...,1.0,11.5,Credit Card,City Service,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.892042,-87.631864,POINT (-87.6318639497 41.8920421365)
3157769,a578a81d70af56cbbe627df7acc970cfd955333e,bb870d48c69eb329b88cb373037a1ee2d1c43f2d67a4b5...,02/18/2017 02:00:00 PM,02/18/2017 02:15:00 PM,805.0,1.46,17031320000.0,17031080000.0,32.0,8.0,...,1.0,9.75,Cash,Nova Taxi Affiliation Llc,41.877406,-87.621972,POINT (-87.6219716519 41.8774061234),41.898332,-87.620763,POINT (-87.6207628651 41.8983317935)
24803407,5499349d0b89bf47f44990d6380b038a48c56bc1,f86da6500edc020c8e0fef6be42bd747e0c7316b59e32f...,12/27/2017 06:00:00 PM,12/27/2017 06:00:00 PM,388.0,0.7,17031840000.0,17031080000.0,32.0,8.0,...,0.0,8.25,Credit Card,City Service,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)


In [261]:
#check null values 
#should be null, because the null values are sampled in Notebook 1. 
cols_dropped = df_trips.dropna(inplace=False)
print("Original frame has",len(df_trips),"rows.")
print("Original frame has",len(df_trips)-len(cols_dropped),"rows with null values.")

Original frame has 2528734 rows.
Original frame has 0 rows with null values.


In [262]:
#Min and Max Trip Start and end
print("Min start: ",min(df_trips['Trip Start Timestamp']))
print("Min end: ",min(df_trips['Trip End Timestamp']))
print("Max start: ",max(df_trips['Trip Start Timestamp']))
print("Max end: ",max(df_trips['Trip Start Timestamp']))

Min start:  01/01/2017 01:00:00 AM
Min end:  01/01/2017 01:00:00 AM
Max start:  12/31/2017 12:45:00 PM
Max end:  12/31/2017 12:45:00 PM


In [263]:
#Convert trip start and end to datetime
df_trips['Trip Start Timestamp']=pd.to_datetime(df_trips['Trip Start Timestamp'],format = '%m/%d/%Y %I:%M:%S %p')
df_trips['Trip End Timestamp']=pd.to_datetime(df_trips['Trip End Timestamp'],format = '%m/%d/%Y %I:%M:%S %p')

In [264]:
#Adding month and year columns for temp
def addCols(df):
    df_trips['DROPOFF MONTH NAME'] = df_trips['Trip End Timestamp'].dt.month_name()
    df_trips['DROPOFF YEAR'] = df_trips['Trip End Timestamp'].dt.year
    return df

df_trips = addCols(df_trips)

In [265]:
df_trips = df_trips.loc[df_trips['DROPOFF MONTH NAME']!='December']
df_trips = df_trips.loc[df_trips['DROPOFF YEAR']==2017]
print("Max start is now: ",max(df_trips['Trip Start Timestamp']))
print("Max end is now: ",max(df_trips['Trip End Timestamp']))

#Dropping month name again
df_trips.drop(columns='DROPOFF MONTH NAME', inplace = True)
df_trips.drop(columns='DROPOFF YEAR', inplace = True)

#We already see within the print, that there are too short trips - we have to check this

Max start is now:  2017-11-30 23:45:00
Max end is now:  2017-11-30 23:45:00


In [266]:
df_trips.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
15971142,65f93c35817734e3299be976d4f3b376bdd91416,561fbb8af15689fe33fb11b96905cfb95d11ac44700c1a...,2017-08-12 18:45:00,2017-08-12 19:15:00,1965.0,18.4,17031980000.0,17031080000.0,76.0,8.0,...,6.0,52.0,Cash,Chicago Carriage Cab Corp,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)
167943,7a592eba4fe74de911db4b33878eab697730ddc2,1025e3aec9a251a432a6bca2bbe7a95bc032b369fd5be1...,2017-01-04 12:30:00,2017-01-04 12:45:00,610.0,1.55,17031080000.0,17031840000.0,8.0,8.0,...,0.0,8.0,Cash,Chicago Carriage Cab Corp,41.890922,-87.618868,POINT (-87.6188683546 41.8909220259),41.904935,-87.649907,POINT (-87.6499072264 41.9049353016)
14807345,25e63440cd1d3596401101ffde933ec7318acfe3,e81aeae39261dd72f248f92204fd28e77269f6bcf5d5c8...,2017-07-26 19:45:00,2017-07-26 19:45:00,298.0,0.99,17031840000.0,17031080000.0,32.0,8.0,...,1.0,11.5,Credit Card,City Service,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.892042,-87.631864,POINT (-87.6318639497 41.8920421365)
3157769,a578a81d70af56cbbe627df7acc970cfd955333e,bb870d48c69eb329b88cb373037a1ee2d1c43f2d67a4b5...,2017-02-18 14:00:00,2017-02-18 14:15:00,805.0,1.46,17031320000.0,17031080000.0,32.0,8.0,...,1.0,9.75,Cash,Nova Taxi Affiliation Llc,41.877406,-87.621972,POINT (-87.6219716519 41.8774061234),41.898332,-87.620763,POINT (-87.6207628651 41.8983317935)
18992750,546c9371344e877441cd1466296bb60bb418b588,26b43fecf9e9479444973797e89a74f559183f1cc1abf0...,2017-09-28 09:00:00,2017-09-28 09:15:00,1332.0,1.59,17031080000.0,17031080000.0,8.0,8.0,...,0.0,12.0,Cash,Nova Taxi Affiliation Llc,41.890922,-87.618868,POINT (-87.6188683546 41.8909220259),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)


### clean trip seconds

In [277]:
#Get min and max durations
print("Min duration in seconds: ",min(df_trips['Trip Seconds']))
print("Max duration in seconds: ",max(df_trips['Trip Seconds']))

Min duration in seconds:  180.0
Max duration in seconds:  4800.0


In [268]:
#Set limits for durations based on quantiles
durLimit_duration  = [np.percentile(df_trips['Trip Seconds'], 8.0), np.percentile(df_trips['Trip Seconds'], 99.7)]

#Min trip: 2 min
durLimit_duration

[180.0, 4800.0]

In [269]:
#Limit trip_duration based on durLimit_miles
df_trips = df_trips[(df_trips['Trip Seconds'] >= durLimit_duration[0] ) & (df_trips['Trip Seconds'] <= durLimit_duration[1]) ]

### clean miles

In [276]:
print("Min trip miles: ",min(df_trips['Trip Miles']))
print("Max trip miles: ",max(df_trips['Trip Miles']))

Min trip miles:  0.3
Max trip miles:  20.9


In [271]:
#Set limits for miles based on quantiles
durLimit_miles  = [np.percentile(df_trips['Trip Miles'], 13.0), np.percentile(df_trips['Trip Miles'], 99.7)]

#Min miules: 0.3 ~ 500 meter 
durLimit_miles

[0.3, 20.9]

In [272]:
#Limit trip_duration based on durLimit_miles
df_trips = df_trips[(df_trips['Trip Miles'] >= durLimit_miles[0] ) & (df_trips['Trip Miles'] <= durLimit_miles[1]) ]

In [278]:
df_trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1917471 entries, 15971142 to 22504342
Data columns (total 23 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   Trip ID                     object        
 1   Taxi ID                     object        
 2   Trip Start Timestamp        datetime64[ns]
 3   Trip End Timestamp          datetime64[ns]
 4   Trip Seconds                float64       
 5   Trip Miles                  float64       
 6   Pickup Census Tract         float64       
 7   Dropoff Census Tract        float64       
 8   Pickup Community Area       float64       
 9   Dropoff Community Area      float64       
 10  Fare                        float64       
 11  Tips                        float64       
 12  Tolls                       float64       
 13  Extras                      float64       
 14  Trip Total                  float64       
 15  Payment Type                object        
 16  Company   

### check trip totals

In [283]:
print("Min trip total: ",min(df_trips['Trip Total']))
print("Max trip total: ",max(df_trips['Trip Total']))

Min trip total:  0.0
Max trip total:  8062.52


In [300]:
#Set limits for trip totals based on quantiles
durLimit_tt  = [np.percentile(df_trips['Trip Total'], 0.004), np.percentile(df_trips['Trip Total'], 99.9)]

#Min miules: 0.3 ~ 500 meter 
durLimit_tt

[3.25, 68.9]

In [302]:
#Limit trip_duration based on durLimit_miles
df_trips = df_trips[(df_trips['Trip Total'] >= durLimit_miles[0] ) & (df_trips['Trip Total'] <= durLimit_miles[1]) ]

In [275]:
##HOW CAN THIS BE?

print("Max start is now: ",max(df_trips['Trip Start Timestamp']))
print("Max end is now: ",max(df_trips['Trip End Timestamp']))

##to be continued....

Max start is now:  2017-11-30 23:45:00
Max end is now:  2017-11-30 23:45:00


In [304]:
df_trips['Trip Total']

167943       8.00
14807345    11.50
3157769      9.75
18992750    12.00
15539240     6.00
            ...  
8507069      7.00
1335934     12.25
9731064      5.00
11889045     6.75
15262000     8.75
Name: Trip Total, Length: 1612882, dtype: float64

In [305]:
df_trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1612882 entries, 167943 to 15262000
Data columns (total 23 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   Trip ID                     1612882 non-null  object        
 1   Taxi ID                     1612882 non-null  object        
 2   Trip Start Timestamp        1612882 non-null  datetime64[ns]
 3   Trip End Timestamp          1612882 non-null  datetime64[ns]
 4   Trip Seconds                1612882 non-null  float64       
 5   Trip Miles                  1612882 non-null  float64       
 6   Pickup Census Tract         1612882 non-null  float64       
 7   Dropoff Census Tract        1612882 non-null  float64       
 8   Pickup Community Area       1612882 non-null  float64       
 9   Dropoff Community Area      1612882 non-null  float64       
 10  Fare                        1612882 non-null  float64       
 11  Tips              