In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [52]:
# Path of the parquet file this notebook loads.
# NOTE(review): the export cell further down writes to this same filename, so a
# re-run reads the already-cleaned output rather than the raw data — confirm the
# intended raw input path.
parquet_file = "all_green_tripdata_cleaned.parquet"
# Reader handle used below for file-level metadata (row and row-group counts).
parquet_reader = pq.ParquetFile(parquet_file)

In [53]:
# Read the whole parquet file into an Arrow table, then convert it to a pandas DataFrame.
table = pq.read_table(parquet_file)
df = table.to_pandas()


In [54]:
# Row and row-group counts as recorded in the parquet file metadata.
file_metadata = parquet_reader.metadata
num_rows = file_metadata.num_rows
num_row_groups = parquet_reader.num_row_groups
# Row count of the in-memory frame, shown for comparison with the metadata count.
len(df)

2266461

In [55]:
# Summarise the loaded data: row count (taken from the parquet metadata),
# column count of the DataFrame, and the dtype of each column.
column_count = len(df.columns)
print("Number of rows:", num_rows)
print("Number of cols:", column_count)
print("Data types:")
print(df.dtypes)

Number of rows: 2266461
Number of cols: 5
Data types:
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
dtype: object


In [6]:
# Store the pickup/dropoff location-ID columns with the pandas 'category' dtype.
category_columns = ['PULocationID', 'DOLocationID']
df = df.astype({column: 'category' for column in category_columns})

In [7]:
# Names of the columns whose dtype is int64, float64 or datetime64.
# Despite the variable name, this selection also includes the datetime columns;
# columns held as 'category' are not selected.
numeric_columns = df.select_dtypes(include=['int64', 'float64', 'datetime64']).columns

# Data Cleaning

Drop duplicates

In [8]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [12]:
# (rows, columns) remaining at this point in the cleaning sequence.
# NOTE(review): this cell's execution count ([12]) is higher than the next drop
# cell's ([11]), so the displayed shape may already include that later step —
# re-run the notebook in order to confirm.
df.shape

(2856867, 5)

Drop rows where dropoff time is before pickup time

In [11]:
# Remove, in place, every trip whose dropoff timestamp is earlier than its pickup timestamp.
ends_before_start = df['lpep_dropoff_datetime'].lt(df['lpep_pickup_datetime'])
df.drop(index=df.index[ends_before_start.to_numpy()], inplace=True)

In [14]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2856867, 5)

Drop rows where pickup time is before 2021

In [16]:
# Remove, in place, every trip picked up before 2021-01-01 00:00:00.
earliest_pickup = pd.Timestamp('2021-01-01')
picked_up_too_early = df['lpep_pickup_datetime'].lt(earliest_pickup)
df.drop(index=df.index[picked_up_too_early.to_numpy()], inplace=True)

In [17]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2856824, 5)

Drop rows where dropoff time is after 31st Mar 2024

In [19]:
# Remove, in place, every trip dropped off later than 2024-04-01 00:00:00.
# The comparison is strict, so a dropoff at exactly midnight on 1 Apr 2024 is kept.
latest_dropoff = pd.Timestamp('2024-04-01')
dropped_off_too_late = df['lpep_dropoff_datetime'].gt(latest_dropoff)
df.drop(index=df.index[dropped_off_too_late.to_numpy()], inplace=True)

In [20]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2856819, 5)

Drop rows where passenger count is missing

In [21]:
# Display the rows that have no passenger count recorded.
df.loc[df['passenger_count'].isna()]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
40471,2021-01-01 00:29:00,2021-01-01 00:34:00,193,193,
40472,2021-01-01 00:52:00,2021-01-01 01:09:00,35,181,
40473,2021-01-01 00:18:00,2021-01-01 00:38:00,174,69,
40474,2021-01-01 00:29:00,2021-01-01 00:47:00,61,256,
40475,2021-01-01 00:54:00,2021-01-01 01:17:00,76,108,
...,...,...,...,...,...
2863797,2024-03-31 21:19:00,2024-03-31 21:30:00,25,61,
2863798,2024-03-31 22:30:00,2024-03-31 22:35:00,41,42,
2863799,2024-03-31 22:43:00,2024-03-31 22:48:00,223,7,
2863800,2024-03-31 22:48:00,2024-03-31 23:12:00,42,249,


In [22]:
# Remove, in place, every row whose passenger_count is missing.
# dropna(subset=...) does this directly, without first materialising a filtered
# copy of the frame just to read its index labels.
df.dropna(subset=['passenger_count'], inplace=True)

In [23]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2290002, 5)

Drop rows where trip duration is > 5hrs

In [25]:
# Remove, in place, every trip that lasts longer than five hours.
time_diff = pd.Timedelta(hours=5)
trip_duration = df['lpep_dropoff_datetime'].sub(df['lpep_pickup_datetime'])
lasts_too_long = trip_duration.gt(time_diff)
df.drop(index=df.index[lasts_too_long.to_numpy()], inplace=True)

In [26]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2279524, 5)

Drop rows where passenger count is 0

In [27]:
# Remove, in place, every trip recorded with zero passengers.
has_no_passengers = df['passenger_count'].eq(0)
df.drop(index=df.index[has_no_passengers.to_numpy()], inplace=True)


In [28]:
# (rows, columns) remaining after the preceding drop step.
df.shape

(2266853, 5)

Drop rows where passenger count is >6

In [29]:
# Remove, in place, every trip recorded with more than six passengers.
has_too_many_passengers = df['passenger_count'].gt(6)
df.drop(index=df.index[has_too_many_passengers.to_numpy()], inplace=True)

In [30]:
# (rows, columns) remaining after all of the cleaning steps above.
df.shape

(2266461, 5)

In [31]:
# Preview the first rows of the cleaned frame; gaps in the index labels are rows
# removed by the in-place drops above.
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1.0
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1.0
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1.0
4,2021-01-01 00:16:36,2021-01-01 00:16:40,265,265,3.0
6,2021-01-01 00:19:14,2021-01-01 00:19:21,265,265,1.0


In [33]:
# Persist the cleaned frame as parquet (pyarrow engine) without the pandas index.
# NOTE(review): this is the same filename that `parquet_file` points at in the
# load cell, so the export overwrites the notebook's own input — confirm this is intended.
df.to_parquet("all_green_tripdata_cleaned.parquet", engine='pyarrow', index=False)

### Logical integrity tests
Each test selects the rows that violate a rule; an empty result means the check passes.
1. Dropoff before pickup
2. Passenger count negative
3. Passenger count greater than 6
4. Pickup date before 2021
5. Dropoff date after 31st Mar 2024
6. Duration of taxi ride > 5hrs
7. PU location ID not an allowed value
8. DO location ID not an allowed value



In [34]:
# Test 1: select rows whose dropoff precedes pickup (an empty frame means the test passes).
df.loc[df['lpep_dropoff_datetime'].lt(df['lpep_pickup_datetime'])]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [35]:
# Test 2: select rows with a negative passenger count.
# NOTE(review): the cleaning step also drops zero-passenger rows, which this
# strict `< 0` check would not catch — confirm whether zero should be tested too.
df.loc[df['passenger_count'].lt(0)]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [36]:
# Test 3: select rows with more than six passengers.
df.loc[df['passenger_count'].gt(6)]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [37]:
# Test 4: select rows picked up before 2021-01-01 00:00:00.
df.loc[df['lpep_pickup_datetime'].lt(pd.Timestamp('2021-01-01'))]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [38]:
# Test 5: select rows dropped off later than 2024-04-01 00:00:00.
df.loc[df['lpep_dropoff_datetime'].gt(pd.Timestamp('2024-04-01'))]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [39]:
# Summary statistics for rows dropped off after the cut-off, one statistic per
# column after transposing; the counts are all zero when no such rows remain.
dropoff_cutoff = pd.Timestamp('2024-04-01')
late_dropoff = df.loc[df['lpep_dropoff_datetime'].gt(dropoff_cutoff)]
late_dropoff.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
lpep_pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
lpep_dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,


### Logical integrity tests
Each test selects the rows that violate a rule; an empty result means the check passes.
1. Dropoff before pickup
2. Passenger count negative
3. Passenger count greater than 6
4. Pickup date before 2021
5. Dropoff date after 31st Mar 2024
6. Duration of taxi ride > 5hrs
7. PU location ID not an allowed value
8. DO location ID not an allowed value



In [40]:
# Test 1: select rows whose dropoff precedes pickup (an empty frame means the test passes).
dropoff_precedes_pickup = df['lpep_dropoff_datetime'].lt(df['lpep_pickup_datetime'])
df.loc[dropoff_precedes_pickup]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [41]:
# Test 2: select rows with a negative passenger count.
negative_passengers = df['passenger_count'].lt(0)
df.loc[negative_passengers]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [42]:
# Test 3: select rows with more than six passengers.
over_passenger_cap = df['passenger_count'].gt(6)
df.loc[over_passenger_cap]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [43]:
# Raw passenger_count values of any rows above the six-passenger cap (empty array when none).
df.loc[df['passenger_count'].gt(6), 'passenger_count'].to_numpy()

array([], dtype=float64)

In [44]:
# Test 4: select rows picked up before 2021-01-01 00:00:00.
pickup_before_window = df['lpep_pickup_datetime'].lt(pd.Timestamp('2021-01-01'))
df.loc[pickup_before_window]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [45]:
# Per-column count of non-null values among rows picked up before 2021-01-01
# (all zeros when no such rows remain).
df.loc[df['lpep_pickup_datetime'].lt(pd.Timestamp('2021-01-01'))].count()

lpep_pickup_datetime     0
lpep_dropoff_datetime    0
PULocationID             0
DOLocationID             0
passenger_count          0
dtype: int64

In [46]:
# Test 5: select rows dropped off later than 2024-04-01 00:00:00.
dropoff_after_window = df['lpep_dropoff_datetime'].gt(pd.Timestamp('2024-04-01'))
df.loc[dropoff_after_window]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [47]:
# Summary statistics for rows dropped off after the cut-off, transposed so each
# column of the data becomes a row; counts are zero when no such rows remain.
is_late_dropoff = df['lpep_dropoff_datetime'].gt(pd.Timestamp('2024-04-01'))
late_dropoff = df.loc[is_late_dropoff]
late_dropoff.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
lpep_pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
lpep_dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,


In [48]:
# Test 6: select rows whose trip lasts longer than five hours.
time_diff = pd.Timedelta(hours=5)
df.loc[df['lpep_dropoff_datetime'].sub(df['lpep_pickup_datetime']).gt(time_diff)]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [49]:
# Summary statistics for trips longer than `time_diff` (set in the Test 6 cell),
# including the trip duration itself as an extra `time_diff` column.
# The duration column is added with .assign(), which returns a new frame; assigning
# a column directly onto the filtered selection of `df` raises SettingWithCopyWarning.
long_rides = df.loc[
    df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime'] > time_diff
].assign(
    time_diff=lambda trips: trips['lpep_dropoff_datetime'] - trips['lpep_pickup_datetime']
)
long_rides.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
lpep_pickup_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
lpep_dropoff_datetime,0.0,NaT,NaT,NaT,NaT,NaT,NaT,
passenger_count,0.0,,,,,,,
time_diff,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [50]:
# Test 7: select rows whose pickup location ID is absent from the zone lookup CSV.
taxi_lookup = pd.read_csv('taxi_zone_lookup.csv')
taxi_zones = set(taxi_lookup['LocationID'].tolist())
pickup_zone_known = df['PULocationID'].isin(taxi_zones)
df.loc[~pickup_zone_known]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [51]:
# Test 8: select rows whose dropoff location ID is absent from the zone lookup set.
dropoff_zone_known = df['DOLocationID'].isin(taxi_zones)
df.loc[~dropoff_zone_known]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
