In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
import h3
from datetime import date
import pyarrow.parquet as pq
import pyarrow as pa
#from fastparquet import write
#from parquet

# Data Cleaning

In [2]:
#adding and dropping columns
df = pd.read_parquet("../data/df_sample_29_06_2021.parquet")
df["PU_Centroid"] = gpd.points_from_xy(df["Pickup Centroid Longitude"], df["Pickup Centroid Latitude"])
df["DO_Centroid"] = gpd.points_from_xy(df["Dropoff Centroid Longitude"], df["Dropoff Centroid Latitude"])                                                   
df = df.drop(columns = ["Pickup Centroid Location","Dropoff Centroid  Location"])
df["Trip Start Timestamp"] = pd.to_datetime(df["Trip Start Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
df["Trip End Timestamp"] = pd.to_datetime(df["Trip End Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
df.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Dropoff Centroid Latitude,Dropoff Centroid Longitude,PU_Centroid,DO_Centroid
18066654,e37d3366d808cd182b90ca9c9679af1baccd0de1,51482afe455eeface5c7492f4dc7638fd2c3a7e10f9174...,2017-09-14 15:45:00,2017-09-14 17:00:00,5138.0,20.14,17031980000.0,17031320000.0,76.0,32.0,...,4.0,73.2,Credit Card,Chicago Carriage Cab Corp,41.979071,-87.90304,41.884987,-87.620993,POINT (-87.90304 41.97907),POINT (-87.62099 41.88499)
13657913,432494f6ae1a858d7ff8883a12d234c1986178df,16c0058a9dec4071ca1a3eedf10a184abdf86cde941d12...,2017-07-10 18:15:00,2017-07-10 18:15:00,480.0,0.6,17031840000.0,17031320000.0,32.0,32.0,...,0.0,7.5,Credit Card,Northwest Management LLC,41.880994,-87.632746,41.877406,-87.621972,POINT (-87.63275 41.88099),POINT (-87.62197 41.87741)
22216829,7e6ad6a2a05963b4266033ed32235db75393ae53,c1c6cbcd951b1a905b12d6941346e94f3923eafe53a185...,2017-11-14 05:45:00,2017-11-14 05:45:00,0.0,0.2,17031980000.0,17031980000.0,76.0,76.0,...,0.0,3.5,Cash,Taxi Affiliation Service Yellow,41.979071,-87.90304,41.979071,-87.90304,POINT (-87.90304 41.97907),POINT (-87.90304 41.97907)
20008152,b9f3e2f8f6e6d133174acd85d43cb250f82c3c97,38d9d0618cde52a194ab0d7c6617fcd4554809f368f1b5...,2017-10-12 21:15:00,2017-10-12 21:30:00,1020.0,3.7,,,8.0,24.0,...,0.0,16.5,Credit Card,Medallion Leasin,41.899602,-87.633308,41.901207,-87.676356,POINT (-87.63331 41.89960),POINT (-87.67636 41.90121)
19672098,4f35f75eeb8ea7f9668f56e51bd11bd8931f58a8,036b8cc6d5f37747d91884dfb7fa3220836b8e556c8359...,2017-10-08 09:30:00,2017-10-08 09:45:00,767.0,8.54,17031080000.0,17031410000.0,8.0,41.0,...,1.0,24.0,Cash,Patriot Taxi Dba Peace Taxi Associat,41.892508,-87.626215,41.790506,-87.583144,POINT (-87.62621 41.89251),POINT (-87.58314 41.79051)


In [3]:
#deleting null values 
df = df[df['Trip Seconds'].notna()]
df = df[df['Trip Miles'].notna()]
df = df[df['Trip Total'].notna()]
df = df[df['Pickup Centroid Latitude'].notna()]
df = df[df['Pickup Centroid Longitude'].notna()]
df = df[df['Dropoff Centroid Latitude'].notna()]
df = df[df['Dropoff Centroid Longitude'].notna()]

In [4]:
df_filtered = df.nsmallest(int(df.index.size * 0.999), "Trip Total", keep='first')
total_filter = df_filtered["Trip Total"].max()
min_total = df_filtered["Trip Total"].min()
print("Max Total:",total_filter)
print("Min Total:",min_total)

df_filtered = df.nsmallest(int(df.index.size * 0.999), "Trip Seconds", keep='first')
seconds_filter = df_filtered["Trip Seconds"].max()
min_seconds = df_filtered["Trip Seconds"].min()
print("Max Seconds:",seconds_filter)
print("Min Seconds:",min_seconds)

df = df.copy()[(df["Trip Total"] <= total_filter)&
                   (df["Trip Seconds"]<=seconds_filter)&
                   (df["Trip Seconds"]> 60)]
print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

Max Total: 86.7
Min Total: 0.0
Max Seconds: 6035.0
Min Seconds: 0.0
Kept 1.0 percent of data


In [5]:
#kick out values under 2$
print("0$ Total Trips:",df[df["Trip Total"]==0].index.size)
print("0$ Fare Trips:",df[df["Fare"]==0].index.size)


df = df.copy()[(df["Trip Total"] >= 2)&
                   (df["Fare"]>=2)]

print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))
print("New min Fare: ",min(df["Fare"]))
print("New max Fare: ",max(df["Fare"]))

print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

0$ Total Trips: 334
0$ Fare Trips: 402
New min Total:  2.0
New max Total:  86.7
New min Fare:  2.0
New max Fare:  86.6
Kept 1.0 percent of data


In [6]:
print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))

New min Total:  2.0
New max Total:  86.7


In [7]:
#For understanding 
df[["Trip Total","Tolls","Tips","Extras","Fare"]].sample(10)

Unnamed: 0,Trip Total,Tolls,Tips,Extras,Fare
5284700,8.75,,3.0,0.0,5.75
18257387,8.25,,2.0,0.0,6.25
7955149,9.5,0.0,1.0,1.0,7.5
6142449,8.0,0.0,0.0,1.0,7.0
4361769,11.0,0.0,2.0,0.0,8.5
17058524,18.11,,2.36,0.0,15.75
9483718,12.0,0.0,2.0,0.0,10.0
8445036,35.3,0.0,7.05,0.0,28.25
20358143,8.75,0.0,1.0,0.0,7.25
10299407,68.7,,11.45,5.5,51.75


In [8]:
#Kick out all values of December and 2018
df['start_month'] = df['Trip Start Timestamp'].dt.month
df['start_year'] = df['Trip Start Timestamp'].dt.year
df['end_month'] = df['Trip End Timestamp'].dt.month
df['end_year'] = df['Trip End Timestamp'].dt.year

df = df[(df['start_year'] == 2017 ) & (df['end_year'] == 2017) ]
df = df[(df['start_month'] !=12 ) & (df['end_month'] !=12) ]
print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

#Drop columns again
df.drop(columns=['start_month', 'start_year', 'end_month' , 'end_year'], inplace = True)

print()
#Min and Max Trip Start and end
print("New min start: ",min(df['Trip Start Timestamp']))
print("New min end: ",min(df['Trip End Timestamp']))
print("New min start: ",max(df['Trip Start Timestamp']))
print("New min end: ",max(df['Trip End Timestamp']))

Kept 1.0 percent of data

New min start:  2017-01-01 00:00:00
New min end:  2017-01-01 00:00:00
New min start:  2017-11-30 23:45:00
New min end:  2017-11-30 23:45:00


In [9]:
#Check null values in payment type and company
pay_verifier = df['Payment Type'].dropna()
print("Number of null values within column payment type: ",len(df)-len(pay_verifier))

com_verifier = df['Company'].dropna()
print("Number of null values within column payment type: ",len(df)-len(com_verifier))
print()
print("if 0, we don't have to drop something.")

Number of null values within column payment type:  0
Number of null values within column payment type:  0

if 0, we don't have to drop something.


In [10]:
#Miles have to be checked with the geo data

# H3 Conversion

In [11]:
#Converting Geo-Points to H3

def h3_conversion(value,h3_level):
    if isinstance(value,shapely.geometry.point.Point):
        return h3.geo_to_h3(value.y, value.x, h3_level)
    else:
        return np.nan

df["PU_H3"] = df.apply(lambda x: h3_conversion(x["PU_Centroid"],8),axis=1)
df["DO_H3"] = df.apply(lambda x: h3_conversion(x["DO_Centroid"],8),axis=1)
df["PU_H3"] = df["PU_H3"].replace("0",np.nan)
df["DO_H3"] = df["DO_H3"].replace("0",np.nan)
df

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Dropoff Centroid Latitude,Dropoff Centroid Longitude,PU_Centroid,DO_Centroid,PU_H3,DO_H3
18066654,e37d3366d808cd182b90ca9c9679af1baccd0de1,51482afe455eeface5c7492f4dc7638fd2c3a7e10f9174...,2017-09-14 15:45:00,2017-09-14 17:00:00,5138.0,20.14,1.703198e+10,1.703132e+10,76.0,32.0,...,Credit Card,Chicago Carriage Cab Corp,41.979071,-87.903040,41.884987,-87.620993,POINT (-87.90304 41.97907),POINT (-87.62099 41.88499),88275934edfffff,882664c1e3fffff
13657913,432494f6ae1a858d7ff8883a12d234c1986178df,16c0058a9dec4071ca1a3eedf10a184abdf86cde941d12...,2017-07-10 18:15:00,2017-07-10 18:15:00,480.0,0.60,1.703184e+10,1.703132e+10,32.0,32.0,...,Credit Card,Northwest Management LLC,41.880994,-87.632746,41.877406,-87.621972,POINT (-87.63275 41.88099),POINT (-87.62197 41.87741),882664c1a9fffff,882664c1abfffff
20008152,b9f3e2f8f6e6d133174acd85d43cb250f82c3c97,38d9d0618cde52a194ab0d7c6617fcd4554809f368f1b5...,2017-10-12 21:15:00,2017-10-12 21:30:00,1020.0,3.70,,,8.0,24.0,...,Credit Card,Medallion Leasin,41.899602,-87.633308,41.901207,-87.676356,POINT (-87.63331 41.89960),POINT (-87.67636 41.90121),882664c1edfffff,882664cac3fffff
19672098,4f35f75eeb8ea7f9668f56e51bd11bd8931f58a8,036b8cc6d5f37747d91884dfb7fa3220836b8e556c8359...,2017-10-08 09:30:00,2017-10-08 09:45:00,767.0,8.54,1.703108e+10,1.703141e+10,8.0,41.0,...,Cash,Patriot Taxi Dba Peace Taxi Associat,41.892508,-87.626215,41.790506,-87.583144,POINT (-87.62621 41.89251),POINT (-87.58314 41.79051),882664c1e1fffff,882664cc59fffff
11554657,c40ac743da81b580578d6e4485859ed75c2e0aad,c0591d33660deb744cef729db6457d5a0924498c9ed3bb...,2017-06-10 17:15:00,2017-06-10 18:00:00,2622.0,17.79,1.703108e+10,1.703198e+10,8.0,76.0,...,Credit Card,City Service,41.899156,-87.626211,41.979071,-87.903040,POINT (-87.62621 41.89916),POINT (-87.90304 41.97907),882664c1e1fffff,88275934edfffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13464502,380e343a545db460834f2c5c41f27ef205bab525,3fcd4c899db0f7f4d7dbd3b4b6370dcaa4b888f57db00f...,2017-07-07 16:15:00,2017-07-07 16:30:00,753.0,4.10,1.703104e+10,1.703106e+10,4.0,6.0,...,Credit Card,Flash Cab,41.966834,-87.684018,41.938391,-87.638575,POINT (-87.68402 41.96683),POINT (-87.63857 41.93839),882664d8b1fffff,882664c10dfffff
7045710,d4157f8b51a990a274dacd1ee3dc24dd6d1ab6ea,4e1cc47be197da1762ea9877e4b007495e43e8233dd584...,2017-04-11 16:45:00,2017-04-11 17:15:00,1685.0,10.98,1.703184e+10,1.703198e+10,32.0,56.0,...,Credit Card,Sun Taxi,41.880994,-87.632746,41.785999,-87.750934,POINT (-87.63275 41.88099),POINT (-87.75093 41.78600),882664c1a9fffff,8826645219fffff
6433674,3ad13d10e78807f798c35dc69fefeee4348abc43,919ad1eb29141ee529f65193c1347a616239d89cd8c5e1...,2017-04-03 18:45:00,2017-04-03 18:45:00,472.0,0.80,1.703132e+10,1.703128e+10,32.0,28.0,...,Cash,Yellow Cab,41.884987,-87.620993,41.885300,-87.642808,POINT (-87.62099 41.88499),POINT (-87.64281 41.88530),882664c1e3fffff,882664c1adfffff
20068656,00654e9f7dd1551943cc880fef5a262c191e4c6a,a53d5e8132e54e4852ae9bafbcdbb34eba8db5b5f5c685...,2017-10-13 17:45:00,2017-10-13 18:00:00,960.0,1.60,1.703108e+10,1.703128e+10,8.0,28.0,...,Cash,City Service,41.890922,-87.618868,41.879255,-87.642649,POINT (-87.61887 41.89092),POINT (-87.64265 41.87926),882664c1e3fffff,882664c1adfffff


# Data Preparation

In [16]:
def addDateCols(added_word, used_datetime,df_name): 

    #added word: Put your indivuword word to the string 

    #Month and weekday name
    df_name[added_word+'_MONTH_NAME'] = df_name[used_datetime].dt.month_name()
    df_name[added_word+'_WEEKDAY_NAME'] = df_name[used_datetime].dt.day_name()
    
    #Add week 
    df_name[added_word+"_WEEK"] = np.ceil(df_name[used_datetime].dt.dayofyear/7).astype(int)
    
    #Date
    df_name[added_word+"_DATE"] = df_name[used_datetime].dt.date
    
    #Year
    #df_name[added_word+'_YEAR'] = df_name[used_datetime].dt.year
    #df_name[added_word+'_YEAR'] = df_name[added_word+'_YEAR'].astype(str)

    #Month numeric
    df_name[added_word+'_MONTH'] = df_name[used_datetime].dt.month
    df_name[added_word+'_MONTH'] = pd.to_numeric(df_name[added_word+'_MONTH'])
    
    #Weekday numeric
    df_name[added_word+'_WEEKDAY'] = df_name[used_datetime].dt.dayofweek
    df_name[added_word+'_WEEKDAY'] = pd.to_numeric(df_name[added_word+'_WEEKDAY'])

    #Day numeric
    df_name[added_word+'_HOUR'] = df_name[used_datetime].dt.hour
    df_name[added_word+'_HOUR'] = pd.to_numeric(df_name[added_word+'_HOUR'])+1
    
    #Day of week
    #df_name[added_word+'_HOUR_OF_WEEK'] = (df_name[added_word+'_WEEKDAY']*24) + df_name[added_word+'_HOUR']
    
    return df_name

In [17]:
#Adding datetime columns
df_prepared = addDateCols("PU","Trip Start Timestamp",df)
df_prepared = addDateCols("DO","Trip Start Timestamp",df_prepared)

In [18]:
df_prepared.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,PU_MONTH,PU_WEEKDAY,PU_HOUR,DO_MONTH_NAME,DO_WEEKDAY_NAME,DO_WEEK,DO_DATE,DO_MONTH,DO_WEEKDAY,DO_HOUR
18066654,e37d3366d808cd182b90ca9c9679af1baccd0de1,51482afe455eeface5c7492f4dc7638fd2c3a7e10f9174...,2017-09-14 15:45:00,2017-09-14 17:00:00,5138.0,20.14,17031980000.0,17031320000.0,76.0,32.0,...,9,3,16,September,Thursday,37,2017-09-14,9,3,16
13657913,432494f6ae1a858d7ff8883a12d234c1986178df,16c0058a9dec4071ca1a3eedf10a184abdf86cde941d12...,2017-07-10 18:15:00,2017-07-10 18:15:00,480.0,0.6,17031840000.0,17031320000.0,32.0,32.0,...,7,0,19,July,Monday,28,2017-07-10,7,0,19
20008152,b9f3e2f8f6e6d133174acd85d43cb250f82c3c97,38d9d0618cde52a194ab0d7c6617fcd4554809f368f1b5...,2017-10-12 21:15:00,2017-10-12 21:30:00,1020.0,3.7,,,8.0,24.0,...,10,3,22,October,Thursday,41,2017-10-12,10,3,22
19672098,4f35f75eeb8ea7f9668f56e51bd11bd8931f58a8,036b8cc6d5f37747d91884dfb7fa3220836b8e556c8359...,2017-10-08 09:30:00,2017-10-08 09:45:00,767.0,8.54,17031080000.0,17031410000.0,8.0,41.0,...,10,6,10,October,Sunday,41,2017-10-08,10,6,10
11554657,c40ac743da81b580578d6e4485859ed75c2e0aad,c0591d33660deb744cef729db6457d5a0924498c9ed3bb...,2017-06-10 17:15:00,2017-06-10 18:00:00,2622.0,17.79,17031080000.0,17031980000.0,8.0,76.0,...,6,5,18,June,Saturday,23,2017-06-10,6,5,18


## Option 1: Parrow:

In [19]:
#Write parquet of cleaned frame
df_compatible = df_prepared.drop(columns=["PU_Centroid","DO_Centroid"])
frame = pa.Table.from_pandas(df_compatible)
pq.write_table(frame, '../data/df_cleaned_{}.parquet'.format(date.today().strftime("%d_%m_%Y")))