In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
import h3

import datetime

In [2]:
sample = pd.read_parquet("df_trips_sample.parquet")
sample = sample.drop(columns = ["Pickup Centroid Latitude","Pickup Centroid Longitude","Dropoff Centroid Latitude","Dropoff Centroid Longitude"])
sample["Trip Start Timestamp"] = pd.to_datetime(sample["Trip Start Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
sample["Trip End Timestamp"] = pd.to_datetime(sample["Trip End Timestamp"],format = '%m/%d/%Y %I:%M:%S %p')
sample.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Company,Pickup Centroid Location,Dropoff Centroid Location
24357273,06bd2062d4da87baf5e6c60645adf4018ff5a1f5,3c0dce2f97f9acc211c5a886cff53caf700e8ba821ae26...,2017-12-18 11:15:00,2017-12-18 12:00:00,3000.0,11.2,,,60.0,3.0,37.5,0.0,0.0,0.0,37.5,Cash,Chicago Independents,POINT (-87.6487879519 41.8361501547),POINT (-87.6558787862 41.96581197)
23594347,636eae5de0817802f3befa320b660551e245f0af,dcfc039460c91afe6099789b8f3b8ef854815c6345723e...,2017-12-06 20:15:00,2017-12-06 20:30:00,420.0,1.1,17031840000.0,17031080000.0,32.0,8.0,6.75,2.0,0.0,0.0,8.75,Credit Card,Taxi Affiliation Services,POINT (-87.6327464887 41.8809944707),POINT (-87.6288741572 41.8920726347)
14171117,a7c5166b39b3eb990f6aa84e43b5064f43c3cc79,ca85563c7c46f07258ff388e8a43a1502115d1824dd00b...,2017-07-17 23:30:00,2017-07-17 23:45:00,900.0,5.66,,,,,17.25,0.0,0.0,1.0,18.25,Cash,Taxi Affiliation Service Yellow,,
13794016,25c415d2f38913f8a5c789e7a7002a12e1a13e75,f2701208e420d05c34de4131a15848a786b13422bf5865...,2017-07-12 17:15:00,2017-07-12 18:15:00,3420.0,21.0,17031840000.0,17031980000.0,33.0,76.0,51.5,0.0,0.0,1.5,53.0,Cash,Top Cab Affiliation,POINT (-87.6241352979 41.84924675450001),POINT (-87.9030396611 41.9790708201)
12304106,0a67be58ffe7778950205e6dc105e4a57ba58192,c81a672f30763423ddc7abac2b2e20a2f1ea045088ef77...,2017-06-20 10:45:00,2017-06-20 10:45:00,19.0,0.0,17031080000.0,17031080000.0,8.0,8.0,3.25,0.0,,0.0,3.25,Cash,Checker Taxi,POINT (-87.6262149064 41.8925077809),POINT (-87.6262149064 41.8925077809)


In [3]:
sample_filtered = sample.nsmallest(int(sample.index.size * 0.999), "Trip Total", keep='first')
total_filter = sample_filtered["Trip Total"].max()
print("Max Total:",total_filter)

sample_filtered = sample.nsmallest(int(sample.index.size * 0.999), "Trip Seconds", keep='first')
seconds_filter = sample_filtered["Trip Seconds"].max()
print("Max Seconds:",seconds_filter)

df = sample.copy()[(sample["Trip Total"] <= total_filter)&
                   (sample["Trip Seconds"]<=seconds_filter)&
                   (sample["Trip Seconds"]> 60)]
print("Kept",np.round(df.index.size / sample.index.size,4),"percent of data")


Max Total: 124.5
Max Seconds: 6900.0
Kept 0.9422 percent of data


In [4]:
#Should be checked
print("0$ Total Trips:",df[df["Trip Total"]==0].index.size)
print("0$ Fare Trips:",df[df["Trip Total"]==0].index.size)

0$ Total Trips: 167
0$ Fare Trips: 167


In [5]:
#For understanding 
df[["Trip Total","Tolls","Tips","Extras","Fare"]].sample(10)

Unnamed: 0,Trip Total,Tolls,Tips,Extras,Fare
9659508,11.75,0.0,0.0,0.0,11.75
9867657,5.25,0.0,0.0,0.0,5.25
19274984,4.5,0.0,0.0,0.0,4.5
24735892,5.5,0.0,0.0,0.0,5.5
8493685,6.5,0.0,0.0,0.0,6.5
16991285,6.5,,0.0,0.0,6.5
17612418,11.0,0.0,5.0,0.0,5.5
6542905,8.5,0.0,0.0,0.0,8.0
18239256,7.0,,0.0,1.0,6.0
21348931,5.0,0.0,0.0,0.0,5.0


In [6]:
#Converting String Points to GDF

def load_wkt(value):
    if isinstance(value,str):
        return wkt.loads(value)
    else:
        return np.nan


df['PU_Centroid'] = df["Pickup Centroid Location"].apply(load_wkt)
df["DO_Centroid"] =  df["Dropoff Centroid  Location"].apply(load_wkt)

# Geopandas GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='PU_Centroid')
gdf.drop(columns=["Pickup Centroid Location","Dropoff Centroid  Location"])

gdf.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Tips,Tolls,Extras,Trip Total,Payment Type,Company,Pickup Centroid Location,Dropoff Centroid Location,PU_Centroid,DO_Centroid
24357273,06bd2062d4da87baf5e6c60645adf4018ff5a1f5,3c0dce2f97f9acc211c5a886cff53caf700e8ba821ae26...,2017-12-18 11:15:00,2017-12-18 12:00:00,3000.0,11.2,,,60.0,3.0,...,0.0,0.0,0.0,37.5,Cash,Chicago Independents,POINT (-87.6487879519 41.8361501547),POINT (-87.6558787862 41.96581197),POINT (-87.64879 41.83615),POINT (-87.65587878620001 41.96581197)
23594347,636eae5de0817802f3befa320b660551e245f0af,dcfc039460c91afe6099789b8f3b8ef854815c6345723e...,2017-12-06 20:15:00,2017-12-06 20:30:00,420.0,1.1,17031840000.0,17031080000.0,32.0,8.0,...,2.0,0.0,0.0,8.75,Credit Card,Taxi Affiliation Services,POINT (-87.6327464887 41.8809944707),POINT (-87.6288741572 41.8920726347),POINT (-87.63275 41.88099),POINT (-87.6288741572 41.8920726347)
14171117,a7c5166b39b3eb990f6aa84e43b5064f43c3cc79,ca85563c7c46f07258ff388e8a43a1502115d1824dd00b...,2017-07-17 23:30:00,2017-07-17 23:45:00,900.0,5.66,,,,,...,0.0,0.0,1.0,18.25,Cash,Taxi Affiliation Service Yellow,,,,
13794016,25c415d2f38913f8a5c789e7a7002a12e1a13e75,f2701208e420d05c34de4131a15848a786b13422bf5865...,2017-07-12 17:15:00,2017-07-12 18:15:00,3420.0,21.0,17031840000.0,17031980000.0,33.0,76.0,...,0.0,0.0,1.5,53.0,Cash,Top Cab Affiliation,POINT (-87.6241352979 41.84924675450001),POINT (-87.9030396611 41.9790708201),POINT (-87.62414 41.84925),POINT (-87.9030396611 41.9790708201)
1739241,edd3e4c4cb0a1096d23b4c4134595c6b4ba2bc14,cc5330b266a2b3e042e5bc50b1dadb4f4e03db62f8cbd1...,2017-01-27 23:30:00,2017-01-27 23:30:00,120.0,0.7,17031840000.0,17031320000.0,32.0,32.0,...,2.0,0.0,1.0,7.75,Credit Card,Taxi Affiliation Services,POINT (-87.6327464887 41.8809944707),POINT (-87.6219716519 41.8774061234),POINT (-87.63275 41.88099),POINT (-87.6219716519 41.8774061234)


In [7]:
#Converting Geo-Points to H3

def h3_conversion(value,h3_level):
    if isinstance(value,shapely.geometry.point.Point):
        return h3.geo_to_h3(value.y, value.x, h3_level)
    else:
        return np.nan

gdf["PU_H3"] = gdf.apply(lambda x: h3_conversion(x["PU_Centroid"],7),axis=1)
gdf["DO_H3"] = gdf.apply(lambda x: h3_conversion(x["DO_Centroid"],7),axis=1)

In [8]:
#check null values 
#should be null, because the null values are sampled in Notebook 1. 
cols_dropped = df.dropna(inplace=False)
print("Original frame has",len(df),"rows.")
print("Original frame has",len(df)-len(cols_dropped),"rows with null values.")


Original frame has 2354348 rows.
Original frame has 1129369 rows with null values.


In [9]:
#Min and Max Trip Start and end
print("Min start: ",min(df['Trip Start Timestamp']))
print("Min end: ",min(df['Trip End Timestamp']))
print("Max start: ",max(df['Trip Start Timestamp']))
print("Max end: ",max(df['Trip Start Timestamp']))

Min start:  2017-01-01 00:00:00
Min end:  2017-01-01 00:00:00
Max start:  2017-12-31 23:45:00
Max end:  2017-12-31 23:45:00
