In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [8]:
# Path of the input Parquet file (relative to the notebook's working directory).
parquet_file = "all_fhv_tripdata_filtered.parquet"
# Reader handle on the same file; later cells query it for row / row-group counts.
parquet_reader = pq.ParquetFile(parquet_file)

In [9]:
# Read the whole Parquet file into an Arrow table, then convert it to a pandas
# DataFrame; `df` is the frame every later cell cleans and checks.
table = pq.read_table(parquet_file)
df = table.to_pandas()


In [10]:
# Row count and row-group count as reported by the Parquet reader.
num_rows = parquet_reader.metadata.num_rows
num_row_groups = parquet_reader.num_row_groups
# Row count of the DataFrame actually loaded in memory, for comparison with num_rows.
df.shape[0]

49111129

In [11]:
# Quick overview of the raw data: size and per-column dtypes.
print("Number of rows:", num_rows)
print("Number of cols:", len(df.columns))
print("Data types:", df.dtypes, sep="\n")

Number of rows: 49111129
Number of cols: 4
Data types:
pickup_datetime     datetime64[us]
dropOff_datetime    datetime64[us]
PUlocationID               float64
DOlocationID               float64
dtype: object


In [12]:
# Location IDs are identifiers, not quantities, so store them as categoricals.
category_columns = ['PUlocationID', 'DOlocationID']
# DataFrame.astype('category') converts each selected column independently.
df[category_columns] = df[category_columns].astype('category')

In [13]:
# Numeric
# Column labels whose dtype is int64, float64 or datetime64.
# NOTE(review): the selection also includes datetime columns, and it runs after
# PUlocationID / DOlocationID were converted to category, so those two columns
# are no longer matched here — confirm this is the intended set.
numeric_columns = df.select_dtypes(['int64', 'float64', 'datetime64']).columns

# Data Cleaning

Drop duplicates

In [14]:
# Remove rows that are exact duplicates across all columns, modifying df in place.
df.drop_duplicates(inplace=True)

In [15]:
# (rows, columns) after removing duplicates.
df.shape

(47277519, 4)

Drop rows where dropoff time is before pickup time

In [16]:
# A trip cannot end before it begins: remove rows whose drop-off precedes the pickup.
df.drop(
    index=df.index[df['dropOff_datetime'].lt(df['pickup_datetime'])],
    inplace=True,
)

In [17]:
# (rows, columns) after removing trips that end before they start.
df.shape

(47277518, 4)

Drop rows where pickup time is before 2021

In [18]:
# Keep only trips picked up on or after 1 Jan 2021.
df.drop(
    index=df.index[df['pickup_datetime'].lt(pd.to_datetime('2021-01-01'))],
    inplace=True,
)

In [19]:
# (rows, columns) after removing pickups before 2021.
df.shape

(47277518, 4)

Drop rows where dropoff time is after 31st Mar 2024

In [20]:
# Remove trips whose drop-off falls after 31 Mar 2024.  The comparison is
# inclusive (>=): a drop-off at exactly 2024-04-01 00:00:00 is already past
# 31 March, and a strict `>` would let that instant slip through.
df.drop(
    index=df.index[df['dropOff_datetime'].ge(pd.to_datetime('2024-04-01'))],
    inplace=True,
)

In [21]:
# (rows, columns) after removing late drop-offs.
df.shape

(47276810, 4)

Create passenger count column

In [22]:
# Add a constant passenger_count of 1 to every row.
# NOTE(review): presumably a placeholder because the loaded data has no
# passenger column (only the two datetimes and two location IDs) — confirm.
df["passenger_count"] = 1

In [23]:
# (rows, columns) after adding passenger_count; the column count grows by one.
df.shape

(47276810, 5)

Drop rows where trip duration is > 5hrs

In [24]:
# Upper bound on a plausible trip duration; longer trips are removed.
time_diff = pd.Timedelta(hours=5)
df.drop(
    index=df.index[df['dropOff_datetime'].sub(df['pickup_datetime']).gt(time_diff)],
    inplace=True,
)

In [25]:
# (rows, columns) after removing trips longer than five hours.
df.shape

(47152203, 5)

In [26]:
# Preview the first rows; missing location IDs show up as NaN.
df.head()

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count
0,2021-01-01 00:27:00,2021-01-01 00:44:00,,,1
1,2021-01-01 00:50:00,2021-01-01 01:07:00,,,1
2,2021-01-01 00:01:00,2021-01-01 01:51:00,,,1
3,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,1
4,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,1


Drop all rows where both PU and DO location are NaN

In [40]:
# Discard trips that have neither a pickup nor a drop-off location recorded;
# rows with only one of the two missing are kept.
df.drop(
    index=df.index[df[['DOlocationID', 'PUlocationID']].isna().all(axis=1)],
    inplace=True,
)

In [41]:
# Persist the cleaned frame; index=False leaves the DataFrame index out of the
# file, so the non-contiguous labels left by the row drops are not written.
df.to_parquet("all_fhv_tripdata_cleaned.parquet", engine='pyarrow', index=False)

### Logical integrity tests
1. Dropoff before pickup
2. Pickup date before 2021
3. Dropoff date > 31st Mar 2024
4. Duration of taxi ride > 5hrs
5. PU location ID not an allowed value
6. DO location ID not an allowed value



In [28]:
# Test 1: no trip may end before it starts (an empty frame means the check passes).
df.loc[df['dropOff_datetime'].lt(df['pickup_datetime'])]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count


In [29]:
# Test 2: no pickup may predate 2021 (an empty frame means the check passes).
df.loc[df['pickup_datetime'].lt(pd.to_datetime('2021-01-01'))]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count


In [30]:
# Test 2 (counts): per-column non-null counts of the offending rows; all zeros expected.
df.loc[df['pickup_datetime'].lt(pd.to_datetime('2021-01-01'))].count()

pickup_datetime     0
dropOff_datetime    0
PUlocationID        0
DOlocationID        0
passenger_count     0
dtype: int64

In [31]:
# Test 3: no drop-off may fall after 31 Mar 2024 (an empty frame means the
# check passes).  The bound is inclusive (>=) so that a drop-off at exactly
# 2024-04-01 00:00:00 is also reported as a violation.
df[df['dropOff_datetime'] >= pd.to_datetime('2024-04-01')]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count


In [32]:
# Summary statistics of any drop-offs after 31 Mar 2024; every count should be 0.
# Inclusive bound (>=) so the 2024-04-01 00:00:00 instant counts as a violation too.
late_dropoff = df[df['dropOff_datetime'] >= pd.to_datetime('2024-04-01')]
late_dropoff.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropOff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,


In [33]:
# Test 4: no ride may last longer than five hours (an empty frame means the check passes).
time_diff = pd.Timedelta(hours=5)
df.loc[df['dropOff_datetime'].sub(df['pickup_datetime']).gt(time_diff)]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count


In [34]:
# Summary statistics of any rides longer than `time_diff`; every count should be 0.
# The filtered rows are copied explicitly so that adding the duration column
# writes to an independent frame rather than to a slice of df (which would
# raise SettingWithCopyWarning).
long_rides = df[df['dropOff_datetime'] - df['pickup_datetime'] > time_diff].copy()
long_rides['time_diff'] = long_rides['dropOff_datetime'] - long_rides['pickup_datetime']
long_rides.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropOff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,
time_diff,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [35]:
# Test 5: every non-missing pickup location ID must appear in the LocationID
# column of the taxi zone lookup file (an empty frame means the check passes).
taxi_lookup = pd.read_csv('taxi_zone_lookup.csv')
# Set of allowed IDs, used for the membership checks.
taxi_zones = set(taxi_lookup['LocationID'])
# Rows whose pickup ID is neither an allowed value nor missing.
df[~(df['PUlocationID'].isin(taxi_zones) | df['PUlocationID'].isna())]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count


In [36]:
# Test 6: every non-missing drop-off location ID must be an allowed value
# (relies on taxi_zones built in the previous cell; empty frame means pass).
df[~(df['DOlocationID'].isin(taxi_zones) | df['DOlocationID'].isna()) ]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count
