In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
import h3
from datetime import date
import pyarrow.parquet as pq
import pyarrow as pa
from fastparquet import write

In [2]:
sample = pd.read_parquet("../data/df_sample_22_05_2021.parquet")
sample = sample.drop(columns = ["Pickup Centroid Latitude","Pickup Centroid Longitude","Dropoff Centroid Latitude","Dropoff Centroid Longitude"])
sample["Trip Start Timestamp"] = pd.to_datetime(sample["Trip Start Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
sample["Trip End Timestamp"] = pd.to_datetime(sample["Trip End Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
sample.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Company,Pickup Centroid Location,Dropoff Centroid Location
5402408,55e9f8e9d439cbe136ba82622ede4be417b4e760,c1365fc5d9c26f99b42911754818c3402e38e0094c19e4...,2017-03-20 17:45:00,2017-03-20 18:00:00,666.0,13.4,,,76.0,,33.5,0.0,,4.0,37.5,Cash,Flash Cab,POINT (-87.913624596 41.9802643146),
18781727,f2a95893fef0e4f7141d0927f219d65839ec3f97,e203f043bc67df3a0c247621faa4524d93d89a49b02457...,2017-09-25 09:00:00,2017-09-25 09:00:00,714.0,1.11,17031280000.0,17031320000.0,28.0,32.0,8.0,0.0,0.0,0.0,8.0,Cash,Chicago Carriage Cab Corp,POINT (-87.6428084655 41.8853000224),POINT (-87.6209929134 41.8849871918)
24964556,fa1da8624195cca1e67e7e24d49be3f5a6c66a38,e67eb932b76835a3010b0a51a8d0624d72714fee077c59...,2017-12-31 06:00:00,2017-12-31 06:15:00,300.0,1.5,,,24.0,7.0,7.0,4.0,0.0,0.0,11.0,Credit Card,Taxi Affiliation Services,POINT (-87.6763559892 41.90120699410001),POINT (-87.6494887289 41.9226862843)
19602125,9a3c7914c71fdb5fc3bf1794f171b712aa81b901,7b885f3c3b55b58bb3785c7b6876f9d5bf3d8a3e74486b...,2017-10-07 04:15:00,2017-10-07 04:30:00,480.0,0.0,,,28.0,7.0,12.0,0.0,0.0,0.0,12.0,Cash,Taxi Affiliation Services,POINT (-87.6635175498 41.874005383),POINT (-87.6494887289 41.9226862843)
215156,509af19342d154339c072d227c3bb648c3ad3ab2,0bd46f4637ddcdab1350b60cb03cbd9281279a7bb4a049...,2017-01-05 08:00:00,2017-01-05 08:15:00,900.0,5.6,17031060000.0,17031840000.0,6.0,32.0,17.0,0.0,0.0,0.0,17.0,Cash,City Service,POINT (-87.640698076 41.9431550855),POINT (-87.6327464887 41.8809944707)


In [3]:
sample_filtered = sample.nsmallest(int(sample.index.size * 0.999), "Trip Total", keep='first')
total_filter = sample_filtered["Trip Total"].max()
min_total = sample_filtered["Trip Total"].min()
print("Max Total:",total_filter)
print("Min Total:",min_total)

sample_filtered = sample.nsmallest(int(sample.index.size * 0.999), "Trip Seconds", keep='first')
seconds_filter = sample_filtered["Trip Seconds"].max()
min_seconds = sample_filtered["Trip Seconds"].min()
print("Max Seconds:",seconds_filter)
print("Min Seconds:",min_seconds)

df = sample.copy()[(sample["Trip Total"] <= total_filter)&
                   (sample["Trip Seconds"]<=seconds_filter)&
                   (sample["Trip Seconds"]> 60)]
print("Kept",np.round(df.index.size / sample.index.size,4),"percent of data")


Max Total: 125.5
Min Total: 0.0
Max Seconds: 6960.0
Min Seconds: 0.0
Kept 0.9424 percent of data


In [4]:
#kick out values under 2$
print("0$ Total Trips:",df[df["Trip Total"]==0].index.size)
print("0$ Fare Trips:",df[df["Fare"]==0].index.size)


df = df.copy()[(df["Trip Total"] >= 2)&
                   (sample["Fare"]>=2)]

print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))
print("New min Fare: ",min(df["Fare"]))
print("New max Fare: ",max(df["Fare"]))

print("Kept",np.round(df.index.size / sample.index.size,4),"percent of data")

0$ Total Trips: 310
0$ Fare Trips: 389


  df = df.copy()[(df["Trip Total"] >= 2)&


New min Total:  2.0
New max Total:  125.5
New min Fare:  2.0
New max Fare:  125.5
Kept 0.932 percent of data


In [5]:
print("New min Total: ",min(df["Trip Total"]))
print("New max Total: ",max(df["Trip Total"]))

New min Total:  2.0
New max Total:  125.5


In [6]:
#For understanding 
df[["Trip Total","Tolls","Tips","Extras","Fare"]].sample(10)

Unnamed: 0,Trip Total,Tolls,Tips,Extras,Fare
21550929,7.5,0.0,0.0,1.0,6.5
7358385,13.25,,1.0,1.0,11.25
9160874,60.6,0.0,10.1,4.0,46.0
17873346,43.0,0.0,0.0,6.0,37.0
1375520,10.0,,0.0,0.0,10.0
15056630,5.75,0.0,1.0,0.0,4.75
14694106,9.0,0.0,0.0,1.0,8.0
2529639,10.0,0.0,0.0,0.0,10.0
11780200,8.5,0.0,2.0,0.0,6.0
24375085,12.75,0.0,2.0,0.0,10.25


In [7]:
#Kick out all values of December and 2018
df['start_month'] = df['Trip Start Timestamp'].dt.month
df['start_year'] = df['Trip Start Timestamp'].dt.year
df['end_month'] = df['Trip End Timestamp'].dt.month
df['end_year'] = df['Trip End Timestamp'].dt.year

df_trips = df[(df['start_year'] == 2017 ) | (df['end_year'] == 2017) ]
df = df[(df['start_month'] !=12 ) | (df['end_month'] !=12) ]
print("Kept",np.round(df.index.size / sample.index.size,4),"percent of data")
#Drop columns again
df.drop(columns=['start_month', 'start_year', 'end_month' , 'end_year'], inplace = True)

print()
#Min and Max Trip Start and end
print("New min start: ",min(df['Trip Start Timestamp']))
print("New min end: ",min(df['Trip End Timestamp']))
print("New min start: ",max(df['Trip Start Timestamp']))
print("New min end: ",max(df['Trip End Timestamp']))

Kept 0.8659 percent of data

New min start:  2017-01-01 00:00:00
New min end:  2017-01-01 00:00:00
New min start:  2017-12-31 23:45:00
New min end:  2018-01-01 00:30:00


In [None]:
#Converting String Points to GDF

def load_wkt(value):
    if isinstance(value,str):
        return wkt.loads(value)
    else:
        return np.nan


df["PU_Centroid"] = df["Pickup Centroid Location"].apply(load_wkt)
df["DO_Centroid"] =  df["Dropoff Centroid  Location"].apply(load_wkt)

# Geopandas GeoDataFrame
#gdf = gpd.GeoDataFrame(df, geometry='PU_Centroid')
#gdf.drop(columns=["Pickup Centroid Location","Dropoff Centroid  Location"])

#gdf.head()

In [9]:
#Converting Geo-Points to H3

def h3_conversion(value,h3_level):
    if isinstance(value,shapely.geometry.point.Point):
        return h3.geo_to_h3(value.y, value.x, h3_level)
    else:
        return np.nan

df["PU_H3"] = df.apply(lambda x: h3_conversion(x["Pickup Centroid Location"],7),axis=1)
df["DO_H3"] = df.apply(lambda x: h3_conversion(x["Dropoff Centroid  Location"],7),axis=1)

In [10]:
#Check null values in payment type and company
pay_verifier = df['Payment Type'].dropna()
print("Number of null values within column payment type: ",len(df)-len(pay_verifier))

com_verifier = df['Company'].dropna()
print("Number of null values within column payment type: ",len(df)-len(com_verifier))
print()
print("if 0, we don't have to drop something.")

Number of null values within column payment type:  0
Number of null values within column payment type:  0

if 0, we don't have to drop something.


In [11]:
#Miles have to be checked with the geo data

## Option 1: Parrow:

In [27]:
#Write parquet of cleaned frame
frame = pa.Table.from_pandas(df)
pq.write_table(frame, '../data/df_cleaned_sample_{}.parquet'.format(date.today().strftime("%d_%m_%Y")))

ArrowInvalid: ('Could not convert POINT (-87.91362459600001 41.9802643146) with type Point: did not recognize Python value type when inferring an Arrow data type', 'Conversion failed for column PU_Centroid with type object')

## Option 2: Fastparquet

In [34]:
write(df,"test.parq")

AttributeError: 'str' object has no attribute 'reset_index'