In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
# Path of the FHVHV trip-data Parquet file, relative to the working directory.
parquet_file = "all_fhvhv_tripdata.parquet"
# Reader handle for the file; later cells use it to query the metadata and to
# read selected row groups instead of loading the whole table in one go.
parquet_reader = pq.ParquetFile(parquet_file)

In [3]:
# Size of the dataset as reported by the Parquet reader: number of row groups
# in the file and total number of rows recorded in its metadata.
num_row_groups = parquet_reader.num_row_groups
num_rows = parquet_reader.metadata.num_rows
print(num_rows, num_row_groups, sep="\n")

679806621
669


In [4]:
# The FHVHV file is too large to load into memory in one go, so it is
# processed in N_CHUNKS contiguous batches of row groups. Each batch is cleaned
# on its own and written to a separate Parquet file.
N_CHUNKS = 4
row_groups = np.linspace(0, num_row_groups, N_CHUNKS + 1)
row_groups = [int(num) for num in row_groups]

# Cleaning parameters; none of them change between batches.
category_columns = ['PULocationID', 'DOLocationID']
earliest_pickup = pd.to_datetime('2021-01-01')
latest_dropoff = pd.to_datetime('2024-04-01')
time_diff = pd.Timedelta(hours=5)  # longest trip duration that is kept


def _drop_rows(frame, bad_rows, message):
    """Return `frame` without the rows flagged True in `bad_rows`.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to filter.
    bad_rows : pd.Series of bool
        Mask aligned with `frame`; True marks a row to remove.
    message : str
        Label printed in front of the shape of the filtered frame.

    Returns
    -------
    pd.DataFrame
        The rows of `frame` for which `bad_rows` is False.
    """
    kept = frame[~bad_rows]
    print(message, kept.shape)
    return kept


for idx, (first_row, last_row) in enumerate(zip(row_groups[:-1], row_groups[1:])):
    # Row groups [first_row, last_row) form this batch.
    table = parquet_reader.read_row_groups(list(range(first_row, last_row)))
    df = table.to_pandas()

    print(f"Row group start {first_row} end {last_row}")
    print("Num rows:", df.shape[0])
    print("Data types:")
    print(df.dtypes)

    for column in category_columns:
        df[column] = df[column].astype('category')

    # Drop duplicates. NOTE: duplicates are only detected within a batch,
    # not across batches.
    df = df.drop_duplicates()
    print("After dropping duplicates", df.shape)

    # Dropoff time before pickup time
    df = _drop_rows(
        df,
        df['dropoff_datetime'] < df['pickup_datetime'],
        "After dropping pickup after dropoff",
    )

    # Pickup before 2021
    df = _drop_rows(
        df,
        df['pickup_datetime'] < earliest_pickup,
        "After dropping early pickup",
    )

    # Dropoff after Mar 2024 (strictly later than 2024-04-01 00:00)
    df = _drop_rows(
        df,
        df['dropoff_datetime'] > latest_dropoff,
        "After dropping late dropoff",
    )

    # Passenger count column: every trip is recorded with a count of 1.
    # `assign` returns a new frame, so no value is written into a filtered slice.
    df = df.assign(passenger_count=1)
    print("Create passenger count", df.shape)

    # Trips too long
    df = _drop_rows(
        df,
        df['dropoff_datetime'] - df['pickup_datetime'] > time_diff,
        "Remove long trips",
    )

    # Pickup and dropoff location both missing
    df = _drop_rows(
        df,
        df['DOLocationID'].isna() & df['PULocationID'].isna(),
        "Drop NaN pickup and dropoff",
    )

    df.to_parquet(f"all_fhvhv_tripdata_cleaned_{idx}.parquet", engine='pyarrow', index=False)

    print(f"df {idx} written to file")



Row group start 0 end 167
Num rows: 167979341
Data types:
pickup_datetime     datetime64[us]
dropoff_datetime    datetime64[us]
PULocationID                 int64
DOLocationID                 int64
dtype: object
After dropping duplicates (167978460, 4)
After dropping pickup after dropoff (167970902, 4)
After dropping early pickup (167970902, 4)
After dropping early pickup (167970902, 4)
Create passenger count (167970902, 5)
Remove long trips (167969299, 5)
Drop NaN pickup and dropoff (167969299, 5)
df 0 written to file
Row group start 167 end 334
Num rows: 170364169
Data types:
pickup_datetime     datetime64[us]
dropoff_datetime    datetime64[us]
PULocationID                 int64
DOLocationID                 int64
dtype: object
After dropping duplicates (170361806, 4)
After dropping pickup after dropoff (170361806, 4)
After dropping early pickup (170361806, 4)
After dropping early pickup (170361806, 4)
Create passenger count (170361806, 5)
Remove long trips (170358206, 5)
Drop NaN pic

Drop rows where trip duration is > 5hrs

Drop all rows where both PU and DO location are NaN

### Logical integrity tests
Each test below selects the rows that violate a rule, so an empty result means the check passed.
1. Dropoff before pickup
2. Pickup date before 2021
3. Dropoff date > 31st Mar 2024
4. Duration of taxi ride > 5hrs
5. PU location ID not an allowed value
6. DO location ID not an allowed value



In [7]:
# Test 1: rows whose dropoff timestamp precedes the pickup timestamp.
# An empty frame means no such rows are present.
df.loc[df['dropoff_datetime'].lt(df['pickup_datetime'])]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [8]:
# Test 2: rows picked up before 2021-01-01 00:00.
# An empty frame means no such rows are present.
df.loc[df['pickup_datetime'].lt(pd.to_datetime('2021-01-01'))]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [9]:
# Per-column count of non-null values among rows picked up before 2021-01-01;
# all zeros confirms that the Test 2 selection is empty.
df.loc[df['pickup_datetime'].lt(pd.to_datetime('2021-01-01'))].count()

pickup_datetime     0
dropoff_datetime    0
PULocationID        0
DOLocationID        0
passenger_count     0
dtype: int64

In [10]:
# Test 3: rows dropped off later than 2024-04-01 00:00.
# An empty frame means no such rows are present.
df.loc[df['dropoff_datetime'].gt(pd.to_datetime('2024-04-01'))]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [11]:
# Summary statistics for rows dropped off later than 2024-04-01 00:00
# (the same selection as Test 3), one row per column.
late_dropoff = df.loc[df['dropoff_datetime'].gt(pd.to_datetime('2024-04-01'))]
late_dropoff.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,


In [12]:
# Test 4 (item 4 of the checklist above): rides lasting longer than five hours.
# An empty frame means no such rows are present.
time_diff = pd.Timedelta(hours=5)
df.loc[df['dropoff_datetime'].sub(df['pickup_datetime']).gt(time_diff)]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [13]:
# Summary statistics for rides longer than `time_diff`, including the ride
# duration itself as an extra `time_diff` column.
# The selection is copied explicitly so that adding the column modifies an
# independent frame rather than a slice of `df` (avoids SettingWithCopyWarning).
long_rides = df[df['dropoff_datetime'] - df['pickup_datetime'] > time_diff].copy()
long_rides['time_diff'] = long_rides['dropoff_datetime'] - long_rides['pickup_datetime']
long_rides.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,
time_diff,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [14]:
# Test 5 (item 5 of the checklist above): rows whose pickup location ID is
# present but not listed in taxi_zone_lookup.csv. Missing IDs are not reported.
taxi_lookup = pd.read_csv('taxi_zone_lookup.csv')
taxi_zones = set(taxi_lookup['LocationID'])
df[df['PULocationID'].notna() & ~df['PULocationID'].isin(taxi_zones)]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [15]:
# Test 6 (item 6 of the checklist above): rows whose dropoff location ID is
# present but not contained in `taxi_zones`. Missing IDs are not reported.
df[df['DOLocationID'].notna() & ~df['DOLocationID'].isin(taxi_zones)]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count
