# NYC taxi data cleaning

## Load packages and NYC taxi data from January 2021.

Load packages.

In [1]:
import datetime
import os
from pathlib import Path

import pandas as pd

Load NYC taxi data from January 2021.

In [2]:
# Directory that holds the raw and the prepared NYC taxi parquet files.
# Set the NYC_TAXI_DATA_DIR environment variable to run this notebook on
# another machine; when it is unset the original location is used unchanged.
DATA_PATH = Path(
    os.environ.get(
        "NYC_TAXI_DATA_DIR",
        "/home/fmerino/Documents/data-engineering-zoomcamp-2024/01-docker-terraform/02-docker-sql/data-nyc-taxi",
    )
)

In [3]:
# Read the yellow-taxi trip records for January 2021 from the data directory.
raw_trips_file = DATA_PATH / "yellow_tripdata_2021-01.parquet"
nyc_taxi = pd.read_parquet(raw_trips_file)

# Bare expression so that Jupyter renders a preview of the loaded frame.
nyc_taxi

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.00,3.00,0.5,0.00,0.00,0.3,11.80,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.20,1.0,N,238,151,2,3.00,0.50,0.5,0.00,0.00,0.3,4.30,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.00,0.50,0.5,8.65,0.00,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.00,0.50,0.5,6.05,0.00,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.50,0.50,0.5,4.06,0.00,0.3,24.36,2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369764,2,2021-01-31 23:03:00,2021-01-31 23:33:00,,8.89,,,229,181,0,27.78,0.00,0.5,7.46,0.00,0.3,38.54,,
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,32.58,0.00,0.5,0.00,6.12,0.3,39.50,,
1369766,2,2021-01-31 23:25:00,2021-01-31 23:38:00,,6.26,,,74,137,0,16.85,0.00,0.5,3.90,0.00,0.3,24.05,,
1369767,6,2021-01-31 23:01:06,2021-02-01 00:02:03,,19.70,,,265,188,0,53.68,0.00,0.5,0.00,0.00,0.3,54.48,,


In [4]:
# Show the dtype pandas assigned to each column of the loaded frame.
nyc_taxi.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

Discard `store_and_fwd_flag` details because of its lack of relevance.

In [5]:
# Remove the store-and-forward flag column from the frame in place.
nyc_taxi.drop(columns=["store_and_fwd_flag"], inplace=True)

Check number of unique values per column/attribute and identify potential categorical values.

In [6]:
# Print the number of distinct values of every column. For columns with fewer
# than 10 distinct values the values themselves are listed too, since those
# columns are candidates for categorical attributes.
# Note: nunique() does not count NaN while unique() lists it, so the printed
# list can contain one more entry than the reported count.
for column in nyc_taxi.columns:
    values = nyc_taxi[column]
    distinct_count = values.nunique()
    if distinct_count >= 10:
        print(f"Column {column} includes {distinct_count} unique values.")
        continue
    print(f"Column {column} includes {distinct_count} unique values ({values.unique()}).")

Column VendorID includes 3 unique values ([1 2 6]).
Column tpep_pickup_datetime includes 939020 unique values.
Column tpep_dropoff_datetime includes 935992 unique values.
Column passenger_count includes 9 unique values ([ 1.  0.  2.  3.  5.  4.  6.  8.  7. nan]).
Column trip_distance includes 3787 unique values.
Column RatecodeID includes 7 unique values ([ 1.  2.  4.  5.  3. 99.  6. nan]).
Column PULocationID includes 258 unique values.
Column DOLocationID includes 260 unique values.
Column payment_type includes 5 unique values ([2 1 4 3 0]).
Column fare_amount includes 6017 unique values.
Column extra includes 91 unique values.
Column mta_tax includes 3 unique values ([ 0.5  0.  -0.5]).
Column tip_amount includes 2155 unique values.
Column tolls_amount includes 307 unique values.
Column improvement_surcharge includes 3 unique values ([ 0.3  0.  -0.3]).
Column total_amount includes 8321 unique values.
Column congestion_surcharge includes 5 unique values ([ 2.5   0.   -2.5   0.75  3.  

In [7]:
# Summary statistics per column, used to spot suspicious value ranges
# before any cleaning is applied.
nyc_taxi.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,1369769.0,1369769,1369769,1271417.0,1369769.0,1271417.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1369769.0,1271417.0,5.0
mean,1.721725,2021-01-17 01:48:30.539849,2021-01-17 02:02:25.240572,1.411508,4.631983,1.035081,165.2474,161.4957,1.188578,12.09663,0.9705133,0.4930412,1.918098,0.2477473,0.2969412,17.4744,2.239047,0.0
min,1.0,2008-12-31 23:05:14,2008-12-31 23:07:22,0.0,0.0,1.0,1.0,1.0,0.0,-490.0,-5.5,-0.5,-100.0,-31.12,-0.3,-492.8,-2.5,0.0
25%,1.0,2021-01-09 13:42:49,2021-01-09 13:55:05,1.0,1.0,1.0,124.0,107.0,1.0,6.0,0.0,0.5,0.0,0.0,0.3,10.8,2.5,0.0
50%,2.0,2021-01-16 20:10:32,2021-01-16 20:24:06,1.0,1.7,1.0,162.0,162.0,1.0,8.5,0.0,0.5,1.86,0.0,0.3,13.8,2.5,0.0
75%,2.0,2021-01-24 15:32:45,2021-01-24 15:44:59,1.0,3.02,1.0,236.0,236.0,1.0,13.5,2.5,0.5,2.75,0.0,0.3,19.12,2.5,0.0
max,6.0,2021-02-22 16:52:16,2021-02-22 16:56:15,8.0,263163.3,99.0,265.0,265.0,4.0,6960.5,8.25,0.5,1140.44,811.75,0.3,7661.28,3.0,0.0
std,0.5925347,,,1.059831,393.9037,0.599483,67.83854,72.10795,0.5776546,12.91337,1.231258,0.07632059,2.597151,1.672761,0.04222168,14.69342,0.7989435,0.0


Compute delta time (time elapsed between pickup and dropoff).

In [8]:
# Trip duration as a timedelta: drop-off timestamp minus pickup timestamp.
nyc_taxi["dt"] = nyc_taxi["tpep_dropoff_datetime"].sub(nyc_taxi["tpep_pickup_datetime"])

In [9]:
# Average speed = trip distance / trip duration expressed in hours.
# Dividing the timedelta column by a one-hour Timedelta gives the duration as
# a plain float number of hours.
trip_hours = nyc_taxi["dt"] / pd.Timedelta(hours=1)
nyc_taxi["avg_speed"] = nyc_taxi["trip_distance"] / trip_hours

In [10]:
# Columns inspected when checking value ranges after the cleaning steps.
relevant_cols = [
    # When the trip happened and how long it lasted.
    "tpep_pickup_datetime", "tpep_dropoff_datetime", "dt",
    # How far and how fast.
    "trip_distance", "avg_speed",
    # Where it started and ended.
    "PULocationID", "DOLocationID",
    # Who was on board and what was charged.
    "passenger_count", "total_amount",
]

Reorder columns/attributes based on their relevance.

In [11]:
# Target column order: timing and trip geometry first, then rate and passenger
# details, then the monetary amounts, and finally payment and vendor codes.
ordered_columns = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "dt",
    "trip_distance",
    "avg_speed",
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
    "passenger_count",
    "total_amount",
    "fare_amount",
    "tip_amount",
    "tolls_amount",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "congestion_surcharge",
    "airport_fee",
    "payment_type",
    "VendorID",
]

# Selecting with a list reorders the columns; .copy() makes the result an
# independent frame so the in-place edits that follow act on it alone.
nyc_taxi = nyc_taxi[ordered_columns].copy()

## Discard trips considered bad data.

Note that we cannot discuss with the business experts how to identify bad data and, therefore, our ability to do so is limited.
Below, we propose several rules for flagging bad data, based on our limited understanding of this sector.

- Discard trips outside the analyzed time period (January 2021).

In [12]:
# Keep only trips that fall inside January 2021.
period_start = datetime.datetime(year=2021, month=1, day=1)
period_end = datetime.datetime(year=2021, month=2, day=1)

pickup = nyc_taxi["tpep_pickup_datetime"]
dropoff = nyc_taxi["tpep_dropoff_datetime"]

# A pickup at exactly midnight on 1 February already belongs to February, so
# the upper bound on pickups is exclusive (>=). A drop-off at exactly that
# instant still closes a January trip and is therefore accepted (>).
outside_period = (
    (pickup < period_start)
    | (pickup >= period_end)
    | (dropoff < period_start)
    | (dropoff > period_end)
)
nyc_taxi.drop(nyc_taxi[outside_period].index, inplace=True)

- Discard trips with invalid `VendorID` values.

In [13]:
# Rows whose VendorID is 6 are treated as invalid and removed in place.
vendor_is_invalid = nyc_taxi["VendorID"].eq(6)
nyc_taxi.drop(nyc_taxi.loc[vendor_is_invalid].index, inplace=True)

- Discard trips with invalid `RatecodeID` values and convert this column/attribute to `int64`.

In [14]:
# A RatecodeID that is missing or equal to 99 is treated as invalid.
ratecode = nyc_taxi["RatecodeID"]
ratecode_is_invalid = ratecode.isna() | ratecode.eq(99.0)
nyc_taxi.drop(nyc_taxi.loc[ratecode_is_invalid].index, inplace=True)

In [15]:
# With the missing values removed, RatecodeID can be stored as an integer.
nyc_taxi = nyc_taxi.astype({"RatecodeID": "int64"})

- By law, a maximum of 4 passengers are allowed in standard NYC taxis. A child under 7 is allowed to sit on a passenger's lap in the rear seat in addition to the passenger limit. Therefore, discard trips with more than 5 passengers. Also, discard trips with no passengers.

In [16]:
# Remove trips whose passenger count is missing, zero, or above 5.
# Comparisons against NaN evaluate to False, so without the explicit isna()
# term rows with a missing count would survive this filter and the int64 cast
# of passenger_count would then fail on them.
passengers = nyc_taxi["passenger_count"]
invalid_passenger_count = passengers.isna() | (passengers > 5) | (passengers == 0)
nyc_taxi.drop(nyc_taxi[invalid_passenger_count].index, inplace=True)

In [17]:
# Store the passenger count as an integer now that invalid rows are gone.
nyc_taxi = nyc_taxi.astype({"passenger_count": "int64"})

- Discard trips with zero or negative distance.

In [18]:
# A trip must cover a strictly positive distance to be kept.
distance_not_positive = nyc_taxi["trip_distance"].le(0)
nyc_taxi.drop(nyc_taxi.loc[distance_not_positive].index, inplace=True)

- Discard trips with a negligible duration (lower than 1 minute).

In [19]:
# Trips lasting less than one minute are removed; comparing the timedelta
# column directly with a one-minute Timedelta also catches negative durations.
shorter_than_a_minute = nyc_taxi["dt"] < pd.Timedelta(minutes=1)
nyc_taxi.drop(nyc_taxi.loc[shorter_than_a_minute].index, inplace=True)

- Discard trips with a negative average speed (i.e., the trip distance or duration is negative).

In [20]:
# Remove any row whose computed average speed is below zero.
speed_is_negative = nyc_taxi["avg_speed"].lt(0)
nyc_taxi.drop(nyc_taxi.loc[speed_is_negative].index, inplace=True)

- Discard trips from or to outside NYC with an average speed higher than 75 mph (max freeway speed limit in the surrounding states).

In [21]:
# Location IDs above 263 are treated as lying outside NYC. A trip that starts
# or ends there is removed when its average speed exceeds 75.
touches_outside_nyc = nyc_taxi["PULocationID"].gt(263) | nyc_taxi["DOLocationID"].gt(263)
too_fast_outside_nyc = nyc_taxi["avg_speed"].gt(75) & touches_outside_nyc
nyc_taxi.drop(nyc_taxi.loc[too_fast_outside_nyc].index, inplace=True)

- Discard trips within NYC with an average speed higher than 50 mph (max speed limit in NYC).

In [22]:
# A trip is within NYC when both its pickup and drop-off location IDs are
# below 264; such a trip is removed when its average speed exceeds 50.
entirely_within_nyc = nyc_taxi["PULocationID"].lt(264) & nyc_taxi["DOLocationID"].lt(264)
too_fast_within_nyc = nyc_taxi["avg_speed"].gt(50) & entirely_within_nyc
nyc_taxi.drop(nyc_taxi.loc[too_fast_within_nyc].index, inplace=True)

- Discard trips taking more than 1 hour at an average speed lower than 3 mph, as it is assumed these slow trips cannot even be associated with traffic jams, even in NYC.

In [23]:
# Remove trips that last longer than one hour while averaging less than 3.
lasts_over_an_hour = nyc_taxi["dt"] > pd.Timedelta(hours=1)
crawls = nyc_taxi["avg_speed"].lt(3)
nyc_taxi.drop(nyc_taxi.loc[lasts_over_an_hour & crawls].index, inplace=True)

Check value ranges for the most relevant columns/attributes.

In [24]:
# Summary statistics of the key columns after all filters have been applied.
nyc_taxi.loc[:, relevant_cols].describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,dt,trip_distance,avg_speed,PULocationID,DOLocationID,passenger_count,total_amount
count,1192568,1192568,1192568,1192568.0,1192568.0,1192568.0,1192568.0,1192568.0,1192568.0
mean,2021-01-17 03:15:39.289621,2021-01-17 03:26:31.888103,0 days 00:10:52.598480,2.647733,13.24183,166.6691,163.6726,1.349718,16.33649
min,2021-01-01 00:00:04,2021-01-01 00:02:49,0 days 00:01:00,0.01,0.01304821,1.0,1.0,1.0,-176.42
25%,2021-01-09 14:25:05,2021-01-09 14:35:12.250000,0 days 00:05:32,1.0,9.571765,132.0,112.0,1.0,10.7
50%,2021-01-16 22:42:49,2021-01-16 22:55:13.500000,0 days 00:08:47,1.65,11.80328,162.0,162.0,1.0,13.5
75%,2021-01-24 16:04:21.500000,2021-01-24 16:15:21.250000,0 days 00:13:43,2.8,15.0,236.0,236.0,1.0,17.8
max,2021-01-31 23:52:20,2021-02-01 00:00:00,0 days 10:16:36,427.7,74.72062,265.0,265.0,5.0,7661.28
std,,,0 days 00:08:12.601154,3.395515,6.047447,66.37592,71.26245,0.8299882,13.61957


In [25]:
# Sanity check: select the trips that still report zero passengers and
# summarise them; an all-zero count row means none are left.
nyc_taxi.loc[nyc_taxi["passenger_count"] == 0, relevant_cols].describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,dt,trip_distance,avg_speed,PULocationID,DOLocationID,passenger_count,total_amount
count,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
mean,NaT,NaT,NaT,,,,,,
min,NaT,NaT,NaT,,,,,,
25%,NaT,NaT,NaT,,,,,,
50%,NaT,NaT,NaT,,,,,,
75%,NaT,NaT,NaT,,,,,,
max,NaT,NaT,NaT,,,,,,
std,,,NaT,,,,,,


Reset index after data processing.

In [26]:
# Renumber the rows from 0 and discard the old index labels, which have gaps
# after the row removals above.
nyc_taxi = nyc_taxi.reset_index(drop=True)

## Save processed data on disk.

Save processed NYC taxi data from January 2021 on disk (PARQUET format, as the original data).

In [27]:
# Write the cleaned trips next to the raw file, again in parquet format.
prepared_trips_file = DATA_PATH / "yellow_tripdata_2021-01_prepared.parquet"
nyc_taxi.to_parquet(prepared_trips_file)