In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
import h3
from datetime import date
import pyarrow.parquet as pq
import pyarrow as pa
#from fastparquet import write
#from parquet

# Data Cleaning

In [2]:
df = pd.read_parquet("../data/df_sample_27_05_2021.parquet")
df["PU_Centroid"] = gpd.points_from_xy(df["Pickup Centroid Longitude"], df["Pickup Centroid Latitude"])
df["DO_Centroid"] = gpd.points_from_xy(df["Dropoff Centroid Longitude"], df["Dropoff Centroid Latitude"])                                                   
#df = df.drop(columns = ["Pickup Centroid Latitude","Pickup Centroid Longitude","Dropoff Centroid Latitude","Dropoff Centroid Longitude"])
df["Trip Start Timestamp"] = pd.to_datetime(df["Trip Start Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
df["Trip End Timestamp"] = pd.to_datetime(df["Trip End Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
df.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location,PU_Centroid,DO_Centroid
11871910,aeb44747d9cd5c4e810ed2c12631362d827ae29a,bb4ff740f6b2ffbe9aa3900f462c265eae88dadab4a095...,2017-06-14 13:15:00,2017-06-14 13:30:00,300.0,0.8,17031080000.0,17031080000.0,8.0,8.0,...,Cash,Top Cab Affiliation,41.892042,-87.631864,POINT (-87.6318639497 41.8920421365),41.899156,-87.626211,POINT (-87.6262105324 41.8991556134),POINT (-87.63186 41.89204),POINT (-87.62621 41.89916)
4278691,f3150e8061916e2cec7dad53f038f9d7cac01101,8e856e16163a85cee43d1ca81190e3b369b5a3724fe1c1...,2017-03-06 14:45:00,2017-03-06 15:00:00,1020.0,9.5,,,,,...,Cash,Chicago Independents,,,,,,,POINT (nan nan),POINT (nan nan)
962707,3bc4f08fcf057a6c2bd8b7865c1978abca3616ec,1721f8d69b5f831e0c16c51510786ca43aaa367d23bec8...,2017-01-16 11:45:00,2017-01-16 12:00:00,392.0,2.2,,,,,...,Credit Card,303 Taxi,,,,,,,POINT (nan nan),POINT (nan nan)
21548285,7235674388b12e378a2bc20ad7fb565ef6a9d317,61abb47c5869d156b0658c031a16ce139beb682fc30b5b...,2017-11-04 00:00:00,2017-11-04 00:15:00,1260.0,4.7,,,7.0,24.0,...,Cash,City Service,41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),41.901207,-87.676356,POINT (-87.6763559892 41.90120699410001),POINT (-87.64949 41.92269),POINT (-87.67636 41.90121)
16198848,4ab70436ec0c42ae288afb510213264fd3c6e5c3,ea73ddfdd7cafa796ab1e254ffe3240fb6936e6f390c27...,2017-08-16 12:45:00,2017-08-16 13:00:00,423.0,1.1,17031080000.0,17031320000.0,8.0,32.0,...,Credit Card,Blue Diamond,41.895033,-87.619711,POINT (-87.6197106717 41.8950334495),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),POINT (-87.61971 41.89503),POINT (-87.62099 41.88499)


In [3]:
df_filtered = df.nsmallest(int(df.index.size * 0.999), "Trip Total", keep='first')
total_filter = df_filtered["Trip Total"].max()
min_total = df_filtered["Trip Total"].min()
print("Max Total:",total_filter)
print("Min Total:",min_total)

df_filtered = df.nsmallest(int(df.index.size * 0.999), "Trip Seconds", keep='first')
seconds_filter = df_filtered["Trip Seconds"].max()
min_seconds = df_filtered["Trip Seconds"].min()
print("Max Seconds:",seconds_filter)
print("Min Seconds:",min_seconds)

df = df.copy()[(df["Trip Total"] <= total_filter)&
                   (df["Trip Seconds"]<=seconds_filter)&
                   (df["Trip Seconds"]> 60)]
print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

Max Total: 124.5
Min Total: 0.0
Max Seconds: 7063.0
Min Seconds: 0.0
Kept 1.0 percent of data


In [4]:
#kick out values under 2$
print("0$ Total Trips:",df[df["Trip Total"]==0].index.size)
print("0$ Fare Trips:",df[df["Fare"]==0].index.size)


df = df.copy()[(df["Trip Total"] >= 2)&
                   (df["Fare"]>=2)]

print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))
print("New min Fare: ",min(df["Fare"]))
print("New max Fare: ",max(df["Fare"]))

print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

0$ Total Trips: 350
0$ Fare Trips: 414
New min Total:  2.0
New max Total:  124.5
New min Fare:  2.0
New max Fare:  124.5
Kept 1.0 percent of data


In [5]:
print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))

New min Total:  2.0
New max Total:  124.5


In [6]:
#For understanding 
df[["Trip Total","Tolls","Tips","Extras","Fare"]].sample(10)

Unnamed: 0,Trip Total,Tolls,Tips,Extras,Fare
14012220,11.5,0.0,3.0,2.0,6.5
23068385,6.25,0.0,0.0,1.0,5.25
16356841,25.5,,4.25,1.0,20.25
7216181,14.0,0.0,0.0,0.0,14.0
22006880,50.5,0.0,0.0,5.0,45.5
1858049,57.5,,11.5,0.0,46.0
20058469,15.75,0.0,3.0,0.0,12.25
23880078,12.2,,0.0,0.0,12.2
21574921,6.25,0.0,0.0,1.0,5.25
13121944,12.75,0.0,3.0,1.5,7.75


In [7]:
#Kick out all values of December and 2018
df['start_month'] = df['Trip Start Timestamp'].dt.month
df['start_year'] = df['Trip Start Timestamp'].dt.year
df['end_month'] = df['Trip End Timestamp'].dt.month
df['end_year'] = df['Trip End Timestamp'].dt.year

df = df[(df['start_year'] == 2017 ) & (df['end_year'] == 2017) ]
df = df[(df['start_month'] !=12 ) & (df['end_month'] !=12) ]
print("Kept",np.round(df.index.size / df.index.size,4),"percent of data")

#Drop columns again
df.drop(columns=['start_month', 'start_year', 'end_month' , 'end_year'], inplace = True)

print()
#Min and Max Trip Start and end
print("New min start: ",min(df['Trip Start Timestamp']))
print("New min end: ",min(df['Trip End Timestamp']))
print("New min start: ",max(df['Trip Start Timestamp']))
print("New min end: ",max(df['Trip End Timestamp']))

Kept 1.0 percent of data

New min start:  2017-01-01 00:00:00
New min end:  2017-01-01 00:00:00
New min start:  2017-11-30 23:45:00
New min end:  2017-11-30 23:45:00


In [8]:
#Check null values in payment type and company
pay_verifier = df['Payment Type'].dropna()
print("Number of null values within column payment type: ",len(df)-len(pay_verifier))

com_verifier = df['Company'].dropna()
print("Number of null values within column payment type: ",len(df)-len(com_verifier))
print()
print("if 0, we don't have to drop something.")

Number of null values within column payment type:  0
Number of null values within column payment type:  0

if 0, we don't have to drop something.


In [9]:
#Miles have to be checked with the geo data

In [10]:
#Converting Geo-Points to H3

def h3_conversion(value,h3_level):
    if isinstance(value,shapely.geometry.point.Point):
        return h3.geo_to_h3(value.y, value.x, h3_level)
    else:
        return np.nan

df["PU_H3"] = df.apply(lambda x: h3_conversion(x["PU_Centroid"],7),axis=1)
df["DO_H3"] = df.apply(lambda x: h3_conversion(x["DO_Centroid"],7),axis=1)
df

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location,PU_Centroid,DO_Centroid,PU_H3,DO_H3
11871910,aeb44747d9cd5c4e810ed2c12631362d827ae29a,bb4ff740f6b2ffbe9aa3900f462c265eae88dadab4a095...,2017-06-14 13:15:00,2017-06-14 13:30:00,300.0,0.80,1.703108e+10,1.703108e+10,8.0,8.0,...,41.892042,-87.631864,POINT (-87.6318639497 41.8920421365),41.899156,-87.626211,POINT (-87.6262105324 41.8991556134),POINT (-87.63186 41.89204),POINT (-87.62621 41.89916),872664c1effffff,872664c1effffff
4278691,f3150e8061916e2cec7dad53f038f9d7cac01101,8e856e16163a85cee43d1ca81190e3b369b5a3724fe1c1...,2017-03-06 14:45:00,2017-03-06 15:00:00,1020.0,9.50,,,,,...,,,,,,,POINT (nan nan),POINT (nan nan),0,0
962707,3bc4f08fcf057a6c2bd8b7865c1978abca3616ec,1721f8d69b5f831e0c16c51510786ca43aaa367d23bec8...,2017-01-16 11:45:00,2017-01-16 12:00:00,392.0,2.20,,,,,...,,,,,,,POINT (nan nan),POINT (nan nan),0,0
21548285,7235674388b12e378a2bc20ad7fb565ef6a9d317,61abb47c5869d156b0658c031a16ce139beb682fc30b5b...,2017-11-04 00:00:00,2017-11-04 00:15:00,1260.0,4.70,,,7.0,24.0,...,41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),41.901207,-87.676356,POINT (-87.6763559892 41.90120699410001),POINT (-87.64949 41.92269),POINT (-87.67636 41.90121),872664c13ffffff,872664cacffffff
16198848,4ab70436ec0c42ae288afb510213264fd3c6e5c3,ea73ddfdd7cafa796ab1e254ffe3240fb6936e6f390c27...,2017-08-16 12:45:00,2017-08-16 13:00:00,423.0,1.10,1.703108e+10,1.703132e+10,8.0,32.0,...,41.895033,-87.619711,POINT (-87.6197106717 41.8950334495),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),POINT (-87.61971 41.89503),POINT (-87.62099 41.88499),872664c1effffff,872664c1effffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6902014,67f0a67269ba222aa08d9c7cec5730ffd2b570a0,e64c125ab06441db8101d7c98eec723e18c3dcd1cf39f8...,2017-04-09 12:45:00,2017-04-09 13:00:00,840.0,4.04,,,32.0,24.0,...,41.878866,-87.625192,POINT (-87.6251921424 41.8788655841),41.901207,-87.676356,POINT (-87.6763559892 41.90120699410001),POINT (-87.62519 41.87887),POINT (-87.67636 41.90121),872664c1affffff,872664cacffffff
16779234,5a9fe40b3dadf17c2c0714344682be4cd45a82ae,2afd9a6a0e05f93e05e3bb2c6ce3b1b9ac73f06bd9b28f...,2017-08-24 21:00:00,2017-08-24 21:00:00,286.0,0.90,1.703132e+10,1.703108e+10,32.0,8.0,...,41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),41.895033,-87.619711,POINT (-87.6197106717 41.8950334495),POINT (-87.62099 41.88499),POINT (-87.61971 41.89503),872664c1effffff,872664c1effffff
20592229,210724145d77c2f3108ea3aa03b4094ad46435f1,14315f2687c6c0382d182c751fe40fc66439c80338ed56...,2017-10-20 21:00:00,2017-10-20 21:15:00,300.0,0.60,1.703108e+10,1.703108e+10,8.0,8.0,...,41.892042,-87.631864,POINT (-87.6318639497 41.8920421365),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809),POINT (-87.63186 41.89204),POINT (-87.62621 41.89251),872664c1effffff,872664c1effffff
5974540,c704df2d858ce87d80b50bfc833ebfcb10748cd4,3524a1997de20528251b755cf6080b101c4a4ac9d3abfa...,2017-03-28 13:45:00,2017-03-28 14:00:00,580.0,1.29,1.703128e+10,1.703132e+10,28.0,32.0,...,41.879255,-87.642649,POINT (-87.642648998 41.8792550844),41.870607,-87.622173,POINT (-87.6221729369 41.8706073724),POINT (-87.64265 41.87926),POINT (-87.62217 41.87061),872664c1affffff,872664c1affffff


# Data Preparation

In [2]:
def addDateCols(added_word, used_datetime,df_name): 

    #added word: Put your indivuword word to the string 

    #Month and weekday name
    df_name[added_word+'_MONTH_NAME'] = df_name[used_datetime].dt.month_name()
    df_name[added_word+'_WEEKDAY_NAME'] = df_name[used_datetime].dt.day_name()
    
    #Year
    df_name[added_word+'_YEAR'] = df_name[used_datetime].dt.year
    df_name[added_word+'_YEAR'] = df_name[added_word+'_YEAR'].astype(str)

    #Month numeric
    df_name[added_word+'_MONTH'] = df_name[used_datetime].dt.month
    df_name[added_word+'_MONTH'] = pd.to_numeric(df_name[added_word+'_MONTH'])
    
    #Weekday numeric
    df_name[added_word+'_WEEKDAY'] = df_name[used_datetime].dt.dayofweek
    df_name[added_word+'_WEEKDAY'] = pd.to_numeric(df_name[added_word+'_WEEKDAY'])

    #Day numeric
    df_name[added_word+'_HOUR'] = df_name[used_datetime].dt.hour
    df_name[added_word+'_HOUR'] = pd.to_numeric(df_name[added_word+'_HOUR'])+1
    
    #Day of week
    df_name[added_word+'_HOUR_OF_WEEK'] = (df_name[added_word+'_WEEKDAY']*24) + df_name[added_word+'_HOUR']
    
    #Fifth of the day
    df_name[added_word+'_FOUR_HOURLY'] = np.where(df_name['PU_HOUR'].isin([1,2,3,4,5]),'1/5', -1)
    df_name[added_word+'_FOUR_HOURLY'] = np.where(df_name['PU_HOUR'].isin([6,7,8,9,10]),'2/5', df_name[added_word+'_FOUR_HOURLY'])
    df_name[added_word+'_FOUR_HOURLY'] = np.where(df_name['PU_HOUR'].isin([11,12,13,14,15]),'3/5', df_name[added_word+'_FOUR_HOURLY'])
    df_name[added_word+'_FOUR_HOURLY'] = np.where(df_name['PU_HOUR'].isin([16,17,18,19,20]),'4/5', df_name[added_word+'_FOUR_HOURLY'])
    df_name[added_word+'_FOUR_HOURLY'] = np.where(df_name['PU_HOUR'].isin([21,22,23,24,25]),'5/5', df_name[added_word+'_FOUR_HOURLY'])
    
    return df_name


In [3]:
#Adding datetime columns
df_prepared = addDateCols("PU","Trip Start Timestamp",df)

NameError: name 'df' is not defined

In [12]:
df_prepared.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,PU_Centroid,DO_Centroid,PU_MONTH_NAME,PU_WEEKDAY_NAME,PU_YEAR,PU_MONTH,PU_WEEKDAY,PU_HOUR,PU_HOUR_OF_WEEK,PU_FOUR_HOURLY
11871910,aeb44747d9cd5c4e810ed2c12631362d827ae29a,bb4ff740f6b2ffbe9aa3900f462c265eae88dadab4a095...,2017-06-14 13:15:00,2017-06-14 13:30:00,300.0,0.8,17031080000.0,17031080000.0,8.0,8.0,...,POINT (41.89204 41.89204),POINT (41.89916 41.89916),June,Wednesday,2017,6,2,14,62,3/5
4278691,f3150e8061916e2cec7dad53f038f9d7cac01101,8e856e16163a85cee43d1ca81190e3b369b5a3724fe1c1...,2017-03-06 14:45:00,2017-03-06 15:00:00,1020.0,9.5,,,,,...,POINT (nan nan),POINT (nan nan),March,Monday,2017,3,0,15,15,3/5
962707,3bc4f08fcf057a6c2bd8b7865c1978abca3616ec,1721f8d69b5f831e0c16c51510786ca43aaa367d23bec8...,2017-01-16 11:45:00,2017-01-16 12:00:00,392.0,2.2,,,,,...,POINT (nan nan),POINT (nan nan),January,Monday,2017,1,0,12,12,3/5
21548285,7235674388b12e378a2bc20ad7fb565ef6a9d317,61abb47c5869d156b0658c031a16ce139beb682fc30b5b...,2017-11-04 00:00:00,2017-11-04 00:15:00,1260.0,4.7,,,7.0,24.0,...,POINT (41.92269 41.92269),POINT (41.90121 41.90121),November,Saturday,2017,11,5,1,121,1/5
16198848,4ab70436ec0c42ae288afb510213264fd3c6e5c3,ea73ddfdd7cafa796ab1e254ffe3240fb6936e6f390c27...,2017-08-16 12:45:00,2017-08-16 13:00:00,423.0,1.1,17031080000.0,17031320000.0,8.0,32.0,...,POINT (41.89503 41.89503),POINT (41.88499 41.88499),August,Wednesday,2017,8,2,13,61,3/5


## Option 1: Parrow:

In [11]:
#Write parquet of cleaned frame
df_compatible = df.drop(columns=["PU_Centroid","DO_Centroid"])
frame = pa.Table.from_pandas(df_compatible)
pq.write_table(frame, '../data/df_cleaned_{}.parquet'.format(date.today().strftime("%d_%m_%Y")))