In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
parquet_file = "all_fhvhv_tripdata.parquet"
parquet_reader = pq.ParquetFile(parquet_file)

In [4]:
num_rows = parquet_reader.metadata.num_rows
num_row_groups = parquet_reader.num_row_groups
# df.shape[0]
print(num_rows)
print(num_row_groups)

679806621
669


In [None]:
# Because FHVHV data is so big it'll have to be split into 100 row group 
row_groups = np.linspace(0, num_row_groups, 5)
row_groups = [int(num) for num in row_groups]

first_row = 0

for group in row_groups[1:]:
    

In [3]:
table = pq.read_table(parquet_file)
df = table.to_pandas()


KeyboardInterrupt: 

In [None]:
num_rows = parquet_reader.metadata.num_rows
num_row_groups = parquet_reader.num_row_groups
df.shape[0]

49111129

In [None]:
print("Number of rows:", num_rows)
print("Number of cols:", df.shape[1])
print("Data types:") 
print(df.dtypes)

Number of rows: 49111129
Number of cols: 4
Data types:
pickup_datetime     datetime64[us]
dropOff_datetime    datetime64[us]
PUlocationID               float64
DOlocationID               float64
dtype: object


In [None]:
category_columns = ['PULocationID', 'DOLocationID']
for column in category_columns:
    df[column] = df[column].astype('category')

In [None]:
# Numeric
numeric_columns = df.select_dtypes(['int64', 'float64', 'datetime64']).columns

# Data Cleaning

Drop duplicates

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

(47277519, 4)

Drop rows where pickup time before dropoff

In [None]:
df.drop(df[df['dropoff_datetime']<df['pickup_datetime']].index, inplace=True)

In [None]:
df.shape

(47277518, 4)

Drop rows where pickup time is before 2021

In [None]:
df.drop(df[df['pickup_datetime']<pd.to_datetime('2021-01-01')].index, inplace=True)

In [None]:
df.shape

(47277518, 4)

Drop rows where dropoff time is after 31st Mar 2024

In [None]:
df.drop(df[df['dropoff_datetime']>pd.to_datetime('2024-04-01')].index, inplace=True)

In [None]:
df.shape

(47276810, 4)

Create passenger count column

In [None]:
df["passenger_count"] = 1

In [None]:
df.shape

(47276810, 5)

Drop rows where trip duration is > 5hrs

In [None]:
time_diff = pd.Timedelta(hours=5)
df.drop(df[df['dropoff_datetime']-df['pickup_datetime'] > time_diff].index, inplace=True)

In [None]:
df.shape

(47152203, 5)

In [None]:
df.head()

Unnamed: 0,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,passenger_count
0,2021-01-01 00:27:00,2021-01-01 00:44:00,,,1
1,2021-01-01 00:50:00,2021-01-01 01:07:00,,,1
2,2021-01-01 00:01:00,2021-01-01 01:51:00,,,1
3,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,1
4,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,1


Drop all columns where both PU and DO location is NaN

In [None]:
df.drop(df[df['DOLocationID'].isna() &df['PULocationID'].isna()].index, inplace=True)

In [None]:
df.to_parquet("all_fhvhv_tripdata_cleaned.parquet", engine='pyarrow', index=False)

### Logical integrity tests
1. dropoff not before pickup
2. Pickup date before 2021
3. Dropoff date > 31st Mar 2024
4. Duration of taxi ride > 5hrs
5. PU location ID not an allowed value
6. DO location ID not an allowed value



In [None]:
# Test 1: 
df[df['dropoff_datetime']<df['pickup_datetime']]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID
29449058,2021-11-07 01:40:31,2021-11-07 01:07:44,258,209
29449063,2021-11-07 01:58:09,2021-11-07 01:46:35,223,265
29449065,2021-11-07 01:47:47,2021-11-07 01:17:00,148,68
29449072,2021-11-07 01:40:55,2021-11-07 01:10:12,158,146
29449084,2021-11-07 01:52:38,2021-11-07 01:02:13,100,158
...,...,...,...,...
29504060,2021-11-07 01:56:34,2021-11-07 01:05:09,211,33
29504068,2021-11-07 01:53:12,2021-11-07 01:03:40,79,249
29504072,2021-11-07 01:48:47,2021-11-07 01:05:16,87,255
29504074,2021-11-07 01:48:46,2021-11-07 01:20:35,61,75


In [None]:
# Test 2: 
df[df['pickup_datetime']<pd.to_datetime('2021-01-01')]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID


In [None]:
df[df['pickup_datetime']<pd.to_datetime('2021-01-01')].count()

pickup_datetime     0
dropoff_datetime    0
PULocationID        0
DOLocationID        0
dtype: int64

In [None]:
# Test 3: 
df[df['dropoff_datetime']>pd.to_datetime('2024-04-01')]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID


In [None]:
late_dropoff = df[df['dropoff_datetime']>pd.to_datetime('2024-04-01')]
late_dropoff.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max
pickup_datetime,0,NaT,NaT,NaT,NaT,NaT,NaT
dropoff_datetime,0,NaT,NaT,NaT,NaT,NaT,NaT


In [None]:
# Test 5:
time_diff = pd.Timedelta(hours=5)
df[df['dropoff_datetime']-df['pickup_datetime'] > time_diff]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID
222737,2021-01-01 13:56:33,2021-01-01 19:42:10,213,18
299230,2021-01-01 17:00:39,2021-01-01 22:22:44,92,265
512478,2021-01-02 12:26:35,2021-01-02 17:30:12,132,265
576758,2021-01-02 15:30:35,2021-01-02 21:50:02,241,265
654809,2021-01-02 19:06:59,2021-01-03 00:44:06,220,259
...,...,...,...,...
137122394,2024-03-20 12:53:03,2024-03-20 17:58:44,42,146
137212560,2024-03-20 15:19:02,2024-03-20 21:22:12,220,220
137591817,2024-03-27 08:54:23,2024-03-27 14:48:02,89,95
138133014,2024-03-28 03:19:47,2024-03-28 09:04:21,186,91


In [None]:
long_rides = df[df['dropoff_datetime']-df['pickup_datetime'] > time_diff]
long_rides['time_diff'] = long_rides['dropoff_datetime']-long_rides['pickup_datetime']
long_rides.describe().T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  long_rides['time_diff'] = long_rides['dropoff_datetime']-long_rides['pickup_datetime']


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,1606,2022-08-27 00:20:29.123910,2021-01-01 13:56:33,2022-04-04 12:38:55.250000,2022-07-21 14:10:27,2023-04-13 17:44:52.500000,2024-03-28 11:52:02,
dropoff_datetime,1606,2022-08-27 07:22:11.336239,2021-01-01 19:42:10,2022-04-04 20:48:31.250000,2022-07-21 21:19:10.500000,2023-04-14 00:32:36.750000,2024-03-28 17:26:55,
time_diff,1606,0 days 07:01:42.212328,0 days 05:00:08,0 days 05:38:05.750000,0 days 06:41:25,0 days 08:04:24.750000,1 days 17:05:18,0 days 02:00:03.536886


In [None]:
# Test 6:
taxi_lookup = pd.read_csv('taxi_zone_lookup.csv')
taxi_zones = set(taxi_lookup['LocationID'])
df[~(df['PULocationID'].isin(taxi_zones) | df['PULocationID'].isna())]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID


In [None]:
# Test 7:
df[~(df['DOLocationID'].isin(taxi_zones) | df['DOLocationID'].isna()) ]

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID
