In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [36]:
# Path of the input Parquet file (a relative path, resolved against the working directory).
parquet_file = "all_green_tripdata_filtered.parquet"
# Handle on the file; later cells read the row and row-group counts from it.
parquet_reader = pq.ParquetFile(parquet_file)

In [37]:
# Read the whole Parquet file into an Arrow table, then convert it to a pandas DataFrame.
table = pq.read_table(parquet_file)
df = table.to_pandas()


In [38]:
# Row and row-group counts as reported by the Parquet file itself.
num_row_groups = parquet_reader.num_row_groups
num_rows = parquet_reader.metadata.num_rows
# Row count of the loaded frame, shown for comparison with the metadata value.
len(df)

2981276

In [39]:
# Print the size and the column dtypes of the loaded data.
print(f"Number of rows: {num_rows}")
print(f"Number of cols: {len(df.columns)}")
print("Data types:")
print(df.dtypes)

Number of rows: 2981276
Number of cols: 5
Data types:
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
dtype: object


In [40]:
# Convert the pickup/dropoff location ID columns to the pandas 'category' dtype.
category_columns = ['PULocationID', 'DOLocationID']
df = df.astype({name: 'category' for name in category_columns})

In [41]:
# Names of the int64, float64 and datetime64 columns
# (note: this selects the datetime columns as well as the numeric ones).
selected_dtypes = ['int64', 'float64', 'datetime64']
numeric_columns = df.select_dtypes(include=selected_dtypes).columns

# Data Cleaning

Drop duplicates

In [42]:
# Remove rows that are identical across all columns, keeping the first occurrence.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [43]:
# (rows, columns) after removing duplicate rows.
df.shape

(2973905, 5)

Drop rows where dropoff time is before pickup time

In [44]:
# Remove trips whose recorded dropoff is earlier than their pickup.
ends_before_start = df['lpep_dropoff_datetime'] < df['lpep_pickup_datetime']
df.drop(index=df.index[ends_before_start], inplace=True)

In [45]:
# (rows, columns) after removing trips that end before they start.
df.shape

(2973902, 5)

Drop rows where pickup time is before 2021

In [46]:
# Remove trips that started before 1 January 2021.
earliest_pickup = pd.Timestamp('2021-01-01')
starts_too_early = df['lpep_pickup_datetime'] < earliest_pickup
df.drop(index=df.index[starts_too_early], inplace=True)

In [47]:
# (rows, columns) after removing trips picked up before 2021.
df.shape

(2973857, 5)

Drop rows where dropoff time is after 31st May 2024 (i.e. later than 2024-06-01 00:00)

In [48]:
# Remove trips whose dropoff is later than 2024-06-01 00:00.
latest_dropoff = pd.Timestamp('2024-06-01')
ends_too_late = df['lpep_dropoff_datetime'] > latest_dropoff
df.drop(index=df.index[ends_too_late], inplace=True)

In [49]:
# (rows, columns) after removing trips dropped off after the 2024-06-01 cutoff.
df.shape

(2973838, 5)

Drop rows where passenger count is missing

In [50]:
# Inspect the rows that have no passenger count recorded.
df.loc[df['passenger_count'].isna()]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
40471,2021-01-01 00:29:00,2021-01-01 00:34:00,193,193,
40472,2021-01-01 00:52:00,2021-01-01 01:09:00,35,181,
40473,2021-01-01 00:18:00,2021-01-01 00:38:00,174,69,
40474,2021-01-01 00:29:00,2021-01-01 00:47:00,61,256,
40475,2021-01-01 00:54:00,2021-01-01 01:17:00,76,108,
...,...,...,...,...,...
2981271,2024-05-31 22:20:00,2024-05-31 22:34:00,166,239,
2981272,2024-05-31 22:08:00,2024-05-31 22:22:00,149,89,
2981273,2024-05-31 22:24:00,2024-05-31 22:46:00,89,97,
2981274,2024-05-31 23:27:14,2024-05-31 23:52:15,80,61,


In [51]:
# Remove the rows that have no passenger count recorded.
df.dropna(subset=['passenger_count'], inplace=True)

In [52]:
# (rows, columns) after removing rows with a missing passenger count.
df.shape

(2403141, 5)

Drop rows where trip duration is > 5hrs

In [53]:
# Remove trips that lasted longer than five hours.
time_diff = pd.Timedelta(hours=5)
trip_duration = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
df.drop(index=df.index[trip_duration > time_diff], inplace=True)

In [54]:
# (rows, columns) after removing trips longer than five hours.
df.shape

(2392224, 5)

Drop rows where passenger count is 0

In [55]:
# Remove trips recorded with zero passengers.
no_passengers = df['passenger_count'].eq(0)
df.drop(index=df.index[no_passengers], inplace=True)


In [56]:
# (rows, columns) after removing zero-passenger trips.
df.shape

(2378576, 5)

Drop rows where passenger count is >6

In [57]:
# Remove trips recorded with more than six passengers.
too_many_passengers = df['passenger_count'].gt(6)
df.drop(index=df.index[too_many_passengers], inplace=True)

In [58]:
# (rows, columns) after removing trips with more than six passengers.
df.shape

(2378160, 5)

In [59]:
# Preview the first rows of the frame after all the row filters above.
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1.0
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1.0
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1.0
4,2021-01-01 00:16:36,2021-01-01 00:16:40,265,265,3.0
6,2021-01-01 00:19:14,2021-01-01 00:19:21,265,265,1.0


In [60]:
# Rows with a missing passenger count were dropped above, so the column can be
# stored as a 64-bit integer instead of a float.
df = df.astype({"passenger_count": 'int64'})

In [61]:
# Map the source column names to shorter ones, then select the columns in the
# order they should have in the output.
col_names = {
    'lpep_pickup_datetime': "pickup_datetime",
    'lpep_dropoff_datetime': "dropoff_datetime",
    'PULocationID': "pickup_loc",
    'DOLocationID': "dropoff_loc",
}
output_columns = ["pickup_datetime", "dropoff_datetime", "pickup_loc", "dropoff_loc", "passenger_count"]
df = df.rename(columns=col_names)[output_columns]

In [62]:
# Write the cleaned frame to Parquet with the pyarrow engine, omitting the index.
# NOTE(review): this file is written before the integrity checks further down
# are run; consider running those checks first.
df.to_parquet("all_green_tripdata_cleaned.parquet", engine='pyarrow', index=False)

### Logical integrity tests
Each test selects the rows that violate a rule, so every result below should be empty.
1. Dropoff is not before pickup
2. Passenger count is not negative
3. Passenger count is not greater than 6
4. Pickup date is not before 2021
5. Dropoff date is not after 31st May 2024
6. Duration of taxi ride is not longer than 5 hrs
7. PU location ID is an allowed value
8. DO location ID is an allowed value



In [63]:
# Test 1: no trip may end before it starts.
dropoff_before_pickup = df[df['dropoff_datetime']<df['pickup_datetime']]
# Fail loudly on a violation instead of relying on someone noticing a non-empty table.
assert dropoff_before_pickup.empty, f"{len(dropoff_before_pickup)} rows have dropoff before pickup"
# Still display the (empty) selection as the cell output.
dropoff_before_pickup

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [64]:
# Test 2: passenger count must not be negative.
# NOTE(review): the cleaning step also removes zero-passenger rows; this check
# only covers negative values.
negative_passengers = df[df['passenger_count']<0]
# Fail loudly on a violation instead of relying on someone noticing a non-empty table.
assert negative_passengers.empty, f"{len(negative_passengers)} rows have a negative passenger count"
# Still display the (empty) selection as the cell output.
negative_passengers

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [65]:
# Test 3: passenger count must not exceed 6.
over_six_passengers = df[df['passenger_count']>6]
# Fail loudly on a violation instead of relying on someone noticing a non-empty table.
assert over_six_passengers.empty, f"{len(over_six_passengers)} rows have more than 6 passengers"
# Still display the (empty) selection as the cell output.
over_six_passengers

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [66]:
# Test 4: no pickup may be earlier than 1 January 2021.
pickup_before_2021 = df[df['pickup_datetime']<pd.to_datetime('2021-01-01')]
# Fail loudly on a violation instead of relying on someone noticing a non-empty table.
assert pickup_before_2021.empty, f"{len(pickup_before_2021)} rows have a pickup before 2021"
# Still display the (empty) selection as the cell output.
pickup_before_2021

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [71]:
# Test 5: no dropoff may be later than 2024-06-01 00:00.
dropoff_after_cutoff = df[df['dropoff_datetime']>pd.to_datetime('2024-06-01')]
# Fail loudly on a violation instead of relying on someone noticing a non-empty table.
assert dropoff_after_cutoff.empty, f"{len(dropoff_after_cutoff)} rows have a dropoff after 2024-06-01"
# Still display the (empty) selection as the cell output.
dropoff_after_cutoff

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [68]:
# Summary statistics for trips whose dropoff is later than 2024-04-01 00:00,
# transposed so that each column of the data becomes a row.
april_start = pd.Timestamp('2024-04-01')
late_dropoff = df.loc[df['dropoff_datetime'] > april_start]
late_dropoff.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
pickup_datetime,111696.0,2024-05-01 18:01:42.393478,2024-03-31 23:38:13,2024-04-16 17:14:27.500000,2024-05-02 07:47:22,2024-05-16 16:34:37.500000,2024-05-31 23:54:55,
dropoff_datetime,111696.0,2024-05-01 18:16:24.527879,2024-04-01 00:02:03,2024-04-16 17:30:34.750000,2024-05-02 08:00:05.500000,2024-05-16 16:53:16,2024-05-31 23:58:32,
passenger_count,111696.0,1.324228,1.0,1.0,1.0,1.0,6.0,0.958184
