In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
import h3
from datetime import date
import pyarrow.parquet as pq
import pyarrow as pa
#from fastparquet import write
#from parquet

# Data Cleaning

In [2]:
# Load the sampled trip records.
df = pd.read_parquet("../data/df_sample_21_06_2021.parquet")

# Build point geometries from the centroid coordinates (x = longitude, y = latitude).
pickup_points = gpd.points_from_xy(df["Pickup Centroid Longitude"], df["Pickup Centroid Latitude"])
dropoff_points = gpd.points_from_xy(df["Dropoff Centroid Longitude"], df["Dropoff Centroid Latitude"])
df["PU_Centroid"] = pickup_points
df["DO_Centroid"] = dropoff_points

# Drop the two location columns.
# NOTE(review): "Dropoff Centroid  Location" is spelled with two spaces here —
# presumably this matches the column name in the source data; verify.
df = df.drop(columns=["Pickup Centroid Location", "Dropoff Centroid  Location"])

# Parse the 12-hour-clock timestamp strings into datetimes.
for stamp_col in ("Trip Start Timestamp", "Trip End Timestamp"):
    df[stamp_col] = pd.to_datetime(df[stamp_col], format='%m/%d/%Y %I:%M:%S %p')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/df_sample_27_06_2021.parquet'

In [None]:
# Keep only trips for which duration, distance and total are all present.
required_columns = ['Trip Seconds', 'Trip Miles', 'Trip Total']
df = df.dropna(subset=required_columns)

In [None]:
# Trim extreme outliers: keep trips whose total and duration are both no larger
# than the maximum found among the lowest 99.9 % of rows for that column, and
# drop trips that last one minute or less.
KEEP_SHARE = 0.999       # share of rows (per column) treated as non-outliers
MIN_TRIP_SECONDS = 60    # trips must last strictly longer than this

# Remember the row count *before* filtering; computing the ratio after `df`
# has been reassigned would always yield 1.0.
rows_before = len(df)
n_keep = int(rows_before * KEEP_SHARE)

# Only the column itself is needed to find the cut-off values.
smallest_totals = df["Trip Total"].nsmallest(n_keep, keep='first')
total_filter = smallest_totals.max()
min_total = smallest_totals.min()
print("Max Total:",total_filter)
print("Min Total:",min_total)

smallest_seconds = df["Trip Seconds"].nsmallest(n_keep, keep='first')
seconds_filter = smallest_seconds.max()
min_seconds = smallest_seconds.min()
print("Max Seconds:",seconds_filter)
print("Min Seconds:",min_seconds)

# Filter first, then copy, so only the surviving rows are duplicated.
df = df[(df["Trip Total"] <= total_filter)&
        (df["Trip Seconds"] <= seconds_filter)&
        (df["Trip Seconds"] > MIN_TRIP_SECONDS)].copy()

kept_pct = 100 * len(df) / rows_before if rows_before else 0.0
print("Kept",np.round(kept_pct,2),"percent of data")

In [None]:
# Remove implausibly cheap trips: both the trip total and the fare must be
# at least 2 $.
MIN_AMOUNT = 2

# Row count before filtering, needed for the "Kept" report at the end
# (dividing the filtered size by itself would always print 1.0).
rows_before = len(df)

print("0$ Total Trips:",(df["Trip Total"]==0).sum())
print("0$ Fare Trips:",(df["Fare"]==0).sum())

# Filter first, then copy, so only the surviving rows are duplicated.
df = df[(df["Trip Total"] >= MIN_AMOUNT)&
        (df["Fare"] >= MIN_AMOUNT)].copy()

print("New min Total: ",df["Trip Total"].min())
print("New max Total: ",df["Trip Total"].max())
print("New min Fare: ",df["Fare"].min())
print("New max Fare: ",df["Fare"].max())

kept_pct = 100 * len(df) / rows_before if rows_before else 0.0
print("Kept",np.round(kept_pct,2),"percent of data")

In [None]:
# Report the range of the trip totals that remain.
for label, reducer in (("New min Total: ", min), ("New max Total: ", max)):
    print(label, reducer(df["Trip Total"]))

In [None]:
# Show a random handful of rows with the trip total next to the other amount columns.
fare_columns = ["Trip Total", "Tolls", "Tips", "Extras", "Fare"]
df[fare_columns].sample(n=10)

In [None]:
# Keep only trips that both start and end in 2017 and in a month other than December.
# Row count before filtering, needed for the "Kept" report
# (dividing the filtered size by itself would always print 1.0).
rows_before = len(df)

trip_start = df['Trip Start Timestamp']
trip_end = df['Trip End Timestamp']

# Boolean masks replace the temporary year/month helper columns, so nothing
# has to be added to the frame and dropped again afterwards.
in_2017 = (trip_start.dt.year == 2017) & (trip_end.dt.year == 2017)
not_december = (trip_start.dt.month != 12) & (trip_end.dt.month != 12)
df = df[in_2017 & not_december].copy()

kept_pct = 100 * len(df) / rows_before if rows_before else 0.0
print("Kept",np.round(kept_pct,2),"percent of data")

print()
#Min and Max Trip Start and end
print("New min start: ",df['Trip Start Timestamp'].min())
print("New min end: ",df['Trip End Timestamp'].min())
print("New max start: ",df['Trip Start Timestamp'].max())
print("New max end: ",df['Trip End Timestamp'].max())

In [None]:
# Count missing values in the "Payment Type" and "Company" columns.
payment_nulls = df['Payment Type'].isna().sum()
print("Number of null values within column payment type: ",payment_nulls)

# The second message must name the column that is actually counted here.
company_nulls = df['Company'].isna().sum()
print("Number of null values within column company: ",company_nulls)
print()
print("if 0, we don't have to drop something.")

In [None]:
#TODO: Trip Miles still have to be cross-checked against the geo data

# H3 Conversion

In [None]:
#Converting Geo-Points to H3

def h3_conversion(value,h3_level):
    """Return the H3 cell index that contains a shapely point.

    Parameters
    ----------
    value : object
        Expected to be a shapely ``Point`` whose ``x`` is the longitude and
        whose ``y`` is the latitude. Anything else is treated as missing.
    h3_level : int
        H3 resolution handed to the indexing call.

    Returns
    -------
    The index returned by h3 for the point, or ``np.nan`` when ``value`` is
    not a point, is an empty point, or has a NaN coordinate.
    """
    if not isinstance(value,shapely.geometry.point.Point) or value.is_empty:
        return np.nan

    lat, lng = value.y, value.x
    # Missing coordinates cannot be indexed; report them as missing directly
    # instead of passing NaN on to h3.
    if np.isnan(lat) or np.isnan(lng):
        return np.nan

    # h3-py 3.x calls this geo_to_h3; 4.x renamed it to latlng_to_cell.
    to_cell = getattr(h3, "geo_to_h3", None) or h3.latlng_to_cell
    return to_cell(lat, lng, h3_level)

# Index the pickup and drop-off centroids with H3.
H3_RESOLUTION = 8

# Apply over the single geometry column rather than row-wise over the whole
# frame: only the centroid value is needed for each conversion.
df["PU_H3"] = df["PU_Centroid"].apply(h3_conversion, h3_level=H3_RESOLUTION)
df["DO_H3"] = df["DO_Centroid"].apply(h3_conversion, h3_level=H3_RESOLUTION)

# Treat the index "0" as missing.
# NOTE(review): presumably "0" is what h3 returns for coordinates it cannot
# index — TODO confirm.
df["PU_H3"] = df["PU_H3"].replace("0",np.nan)
df["DO_H3"] = df["DO_H3"].replace("0",np.nan)

# Preview only the first rows instead of rendering the whole frame.
df.head()

# Data Preparation

In [None]:
def addDateCols(added_word, used_datetime, df_name):
    """Attach calendar columns derived from one datetime column.

    The frame is modified in place and is also returned. Every new column
    name is ``added_word`` followed by one of these suffixes:

    * ``_MONTH_NAME``   - month name
    * ``_WEEKDAY_NAME`` - weekday name
    * ``_DATE``         - calendar date (``dt.date``)
    * ``_MONTH``        - month number
    * ``_WEEKDAY``      - weekday number (``dt.dayofweek``)
    * ``_HOUR``         - ``dt.hour`` shifted by +1

    added_word: prefix put in front of each new column name
    used_datetime: name of the datetime column to read from
    df_name: DataFrame that receives the new columns
    """
    # (suffix, extractor) pairs, listed in the order the columns are appended.
    extractors = (
        ('_MONTH_NAME', lambda stamps: stamps.dt.month_name()),
        ('_WEEKDAY_NAME', lambda stamps: stamps.dt.day_name()),
        ("_DATE", lambda stamps: stamps.dt.date),
        ('_MONTH', lambda stamps: pd.to_numeric(stamps.dt.month)),
        ('_WEEKDAY', lambda stamps: pd.to_numeric(stamps.dt.dayofweek)),
        # The hour is offset by one, so it runs 1..24 rather than 0..23.
        ('_HOUR', lambda stamps: pd.to_numeric(stamps.dt.hour) + 1),
    )

    for suffix, extract in extractors:
        df_name[added_word + suffix] = extract(df_name[used_datetime])

    return df_name

In [None]:
#Adding datetime columns: PU_* from the trip start, DO_* from the trip end
df_prepared = addDateCols("PU","Trip Start Timestamp",df)
# The drop-off columns must be derived from the end timestamp; using the start
# timestamp here would just duplicate the PU_* values.
df_prepared = addDateCols("DO","Trip End Timestamp",df_prepared)

In [None]:
# Preview the first rows of the prepared frame.
df_prepared.head(n=5)

## Option 1: PyArrow

In [None]:
# Persist the cleaned frame as parquet, with today's date (dd_mm_YYYY) in the file name.
today_stamp = date.today().strftime("%d_%m_%Y")
output_path = '../data/df_cleaned_{}.parquet'.format(today_stamp)

# The two centroid geometry columns are left out of the written table.
df_compatible = df_prepared.drop(columns=["PU_Centroid","DO_Centroid"])
frame = pa.Table.from_pandas(df_compatible)
pq.write_table(frame, output_path)