In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
parquet_file = "all_fhv_tripdata_filtered.parquet"
parquet_reader = pq.ParquetFile(parquet_file)

In [3]:
table = pq.read_table(parquet_file)
df = table.to_pandas()


In [4]:
num_rows = parquet_reader.metadata.num_rows
num_row_groups = parquet_reader.num_row_groups
df.shape[0]

51908257

In [5]:
print("Number of rows:", num_rows)
print("Number of cols:", df.shape[1])
print("Data types:") 
print(df.dtypes)

Number of rows: 51908257
Number of cols: 4
Data types:
pickup_datetime     datetime64[us]
dropOff_datetime    datetime64[us]
PUlocationID               float64
DOlocationID               float64
dtype: object


In [6]:
category_columns = ['PUlocationID', 'DOlocationID']
for column in category_columns:
    df[column] = df[column].astype('category')

In [7]:
# Numeric
numeric_columns = df.select_dtypes(['int64', 'float64', 'datetime64']).columns

# Data Cleaning

Drop duplicates

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(49988945, 4)

Drop rows where pickup time before dropoff

In [10]:
df.drop(df[df['dropOff_datetime']<df['pickup_datetime']].index, inplace=True)

In [11]:
df.shape

(49988944, 4)

Drop rows where pickup time is before 2021

In [12]:
df.drop(df[df['pickup_datetime']<pd.to_datetime('2021-01-01')].index, inplace=True)

In [13]:
df.shape

(49988944, 4)

Drop rows where dropoff time is after 31st Mar 2024

In [14]:
df.drop(df[df['dropOff_datetime']>pd.to_datetime('2024-04-01')].index, inplace=True)

In [15]:
df.shape

(47276810, 4)

Create passenger count column

In [16]:
df["passenger_count"] = 1

In [17]:
df.shape

(47276810, 5)

Drop rows where trip duration is > 5hrs

In [18]:
time_diff = pd.Timedelta(hours=5)
df.drop(df[df['dropOff_datetime']-df['pickup_datetime'] > time_diff].index, inplace=True)

In [19]:
df.shape

(47152203, 5)

In [20]:
df.head()

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count
0,2021-01-01 00:27:00,2021-01-01 00:44:00,,,1
1,2021-01-01 00:50:00,2021-01-01 01:07:00,,,1
2,2021-01-01 00:01:00,2021-01-01 01:51:00,,,1
3,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,1
4,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,1


Drop all columns where both PU and DO location is NaN

In [21]:
df.drop(df[df['DOlocationID'].isna() &df['PUlocationID'].isna()].index, inplace=True)

In [22]:
df["passenger_count"] = df["passenger_count"].astype('int64')

In [23]:
col_names = {
       'PUlocationID':"pickup_loc", 'DOlocationID':"dropoff_loc", "dropOff_datetime": "dropoff_datetime"
}
df.rename(columns = col_names, inplace=True)
df = df[["pickup_datetime","dropoff_datetime","pickup_loc","dropoff_loc", "passenger_count"]]

In [24]:
df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count
3,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,1
4,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,1
5,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,1
6,2021-01-01 00:18:12,2021-01-01 00:30:04,,91.0,1
7,2021-01-01 00:36:15,2021-01-01 00:45:08,,39.0,1


In [25]:
df.to_parquet("all_fhv_tripdata_cleaned.parquet", engine='pyarrow', index=False)

### Logical integrity tests
1. dropoff not before pickup
2. Pickup date before 2021
3. Dropoff date > 31st Mar 2024
4. Duration of taxi ride > 5hrs
5. PU location ID not an allowed value
6. DO location ID not an allowed value



In [26]:
# Test 1: 
df[df['dropoff_datetime']<df['pickup_datetime']]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [27]:
# Test 2: 
df[df['pickup_datetime']<pd.to_datetime('2021-01-01')]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [28]:
df[df['pickup_datetime']<pd.to_datetime('2021-01-01')].count()

pickup_datetime     0
dropoff_datetime    0
pickup_loc          0
dropoff_loc         0
passenger_count     0
dtype: int64

In [29]:
# Test 3: 
df[df['dropoff_datetime']>pd.to_datetime('2024-04-01')]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [30]:
late_dropoff = df[df['dropoff_datetime']>pd.to_datetime('2024-04-01')]
late_dropoff.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,


In [31]:
# Test 4:
time_diff = pd.Timedelta(hours=5)
df[df['dropoff_datetime']-df['pickup_datetime'] > time_diff]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [32]:
long_rides = df[df['dropoff_datetime']-df['pickup_datetime'] > time_diff]
long_rides['time_diff'] = long_rides['dropoff_datetime']-long_rides['pickup_datetime']
long_rides.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,
time_diff,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [33]:
# Test 7:
taxi_lookup = pd.read_csv('taxi_zone_lookup.csv')
taxi_zones = set(taxi_lookup['LocationID'])
df[~(df['PUlocationID'].isin(taxi_zones) | df['PUlocationID'].isna())]

KeyError: 'PUlocationID'

In [None]:
df[~(df['DOlocationID'].isin(taxi_zones) | df['DOlocationID'].isna()) ]

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count
