# NYC Yellow Taxi — Mini EDA (100 k trips, Jan 2019)

### Importing libraries and reading processed data

In [129]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Cleaned trip table read from the processed-data folder.
path = Path("..", "data", "processed", "taxi_clean.parquet")

# Code-like columns that are treated as categorical rather than numeric.
cols_to_cat = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
    'payment_type',
    'passenger_count',
]

# pipe.py already performs this conversion, but (per the original author's note)
# the parquet save/load round trip keeps the category dtype only for
# 'store_and_fwd_flag', so the cast is re-applied right after loading.
df = pd.read_parquet(path).astype({col: 'category' for col in cols_to_cat})

### Quick shape

```text
The data was cleaned with the following filters:
    ['trip_distance'] > 0
    ['passenger_count'] != 0
    ['fare_amount'] > 0
    ['tpep_pickup_datetime'] >= '2019-01-01'
    ['trip_duration_min'] < 186
```

In [130]:
# (rows, columns) of the cleaned frame after the filters listed above.
df.shape

(96914, 20)

In [131]:
# Number of columns per dtype.
# NOTE(review): 'category' is listed once per column in the output, presumably because
# each categorical column carries its own set of categories and so counts as a distinct
# dtype; df.dtypes.astype(str).value_counts() would collapse them into a single row.
df.dtypes.value_counts()

float64           11
datetime64[ns]     2
category           1
category           1
category           1
category           1
category           1
category           1
category           1
Name: count, dtype: int64

### Main descriptives

In [132]:
# Preview the first 10 trips to sanity-check column contents.
df.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,trip_duration_min,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tip_percent,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-01 00:46:40,2019-01-01 00:53:20,6.666667,1,1.5,1,N,151,239,1,7.0,0.5,0.5,1.65,23.571429,0.0,0.3,9.95,
1,1,2019-01-01 00:59:47,2019-01-01 01:18:59,19.2,1,2.6,1,N,239,246,1,14.0,0.5,0.5,1.0,7.142857,0.0,0.3,16.3,
2,1,2019-01-01 00:21:28,2019-01-01 00:28:37,7.15,1,1.3,1,N,163,229,1,6.5,0.5,0.5,1.25,19.230769,0.0,0.3,9.05,
3,1,2019-01-01 00:32:01,2019-01-01 00:45:39,13.633333,1,3.7,1,N,229,7,1,13.5,0.5,0.5,3.7,27.407407,0.0,0.3,18.5,
4,1,2019-01-01 00:57:32,2019-01-01 01:09:32,12.0,2,2.1,1,N,141,234,1,10.0,0.5,0.5,1.7,17.0,0.0,0.3,13.0,
5,1,2019-01-01 00:24:04,2019-01-01 00:47:06,23.033333,2,2.8,1,N,246,162,1,15.0,0.5,0.5,3.25,21.666667,0.0,0.3,19.55,
6,1,2019-01-01 00:21:59,2019-01-01 00:28:24,6.416667,1,0.7,1,N,238,151,1,5.5,0.5,0.5,1.7,30.909091,0.0,0.3,8.5,
7,1,2019-01-01 00:45:21,2019-01-01 01:31:05,45.733333,1,8.7,1,N,163,25,1,34.5,0.5,0.5,7.15,20.724638,0.0,0.3,42.95,
8,1,2019-01-01 00:43:19,2019-01-01 01:07:42,24.383333,1,6.3,1,N,224,25,1,21.5,0.5,0.5,5.7,26.511628,0.0,0.3,28.5,
9,1,2019-01-01 00:58:24,2019-01-01 01:15:18,16.9,1,2.7,1,N,141,234,1,13.0,0.5,0.5,1.0,7.692308,0.0,0.3,15.3,


In [133]:
# Summary statistics for the datetime and float columns; the columns cast to
# category in the loading cell do not appear in this default summary.
df.describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_duration_min,trip_distance,fare_amount,extra,mta_tax,tip_amount,tip_percent,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
count,96914,96914,96914.0,96914.0,96914.0,96914.0,96914.0,96914.0,96914.0,96914.0,96914.0,96914.0,0.0
mean,2019-01-01 05:26:59.549280768,2019-01-01 05:39:32.696287488,12.55245,3.489183,13.472577,0.311192,0.497039,1.719715,13.625489,0.32339,0.299991,16.635396,
min,2019-01-01 00:00:00,2019-01-01 00:01:33,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.31,
25%,2019-01-01 01:42:56,2019-01-01 01:57:20,5.916667,1.12,6.5,0.0,0.5,0.0,0.0,0.0,0.3,8.3,
50%,2019-01-01 03:42:11,2019-01-01 03:54:58.500000,10.2,2.05,9.5,0.5,0.5,1.15,16.129032,0.0,0.3,12.0,
75%,2019-01-01 09:56:45,2019-01-01 10:06:53,16.983333,4.2,16.0,0.5,0.5,2.35,22.888889,0.0,0.3,19.55,
max,2019-01-01 14:02:42,2019-01-01 14:55:19,185.983333,128.73,450.0,17.5,0.5,100.0,1818.181818,75.0,0.3,453.44,
std,,,8.980256,3.979922,11.740042,0.261739,0.038366,2.571724,18.633275,1.583831,0.001669,14.056709,


In [134]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96914 entries, 0 to 96913
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               96914 non-null  category      
 1   tpep_pickup_datetime   96914 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  96914 non-null  datetime64[ns]
 3   trip_duration_min      96914 non-null  float64       
 4   passenger_count        96914 non-null  category      
 5   trip_distance          96914 non-null  float64       
 6   RatecodeID             96914 non-null  category      
 7   store_and_fwd_flag     96914 non-null  category      
 8   PULocationID           96914 non-null  category      
 9   DOLocationID           96914 non-null  category      
 10  payment_type           96914 non-null  category      
 11  fare_amount            96914 non-null  float64       
 12  extra                  96914 non-null  float64       
 13  m

In [135]:
# Missing values per column. Left as the cell's last expression so Jupyter
# displays the Series itself instead of routing it through print().
# NOTE(review): in the recorded output congestion_surcharge is null for every row
# (96914 of 96914) — consider dropping the column upstream.
df.isna().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
trip_duration_min            0
passenger_count              0
trip_distance                0
RatecodeID                   0
store_and_fwd_flag           0
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tip_percent                  0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     96914
dtype: int64


### Aggregations

```text
Revenue by location
```

In [136]:
# Pre-aggregated "Total Amount" per PULocationID, read from the processed-data
# folder and displayed as-is.
path_rev = Path("..", "data", "processed", "revenue_by_loc.parquet")
rev = pd.read_parquet(path_rev)
rev

Unnamed: 0_level_0,Total Amount
PULocationID,Unnamed: 1_level_1
132,115149.53
48,71080.43
79,67726.07
138,52127.01
230,50804.67
...,...
73,9.80
187,8.30
57,5.80
23,5.30


```text
Average check by hour
```

In [137]:
# Pre-aggregated mean "Total Amount" per pickup hour, read from the
# processed-data folder and displayed as-is.
path_by_hour = Path("..", "data", "processed", "check_by_hour.parquet")
by_hour = pd.read_parquet(path_by_hour)
by_hour

Unnamed: 0_level_0,Total Amount
Hour,Unnamed: 1_level_1
0,16.059378
1,16.335993
2,16.313336
3,16.029889
4,16.983514
5,19.012112
6,19.982956
7,19.510838
8,19.82294
9,17.699316


![Total amount by PULocationID barplot](../plots/revenue_top10.png)

#### The highest-revenue pickup zones (PULocationID) are 132, 48 and 79

![Mean check lineplot](../plots/check_by_hour.png)

#### The average check starts rising at 4:00 (from about 16 to about 17), peaks at about 20 between 6:00 and 8:00, and declines from there

![Duration boxplot](../plots/duration_distribution.png)

#### Duration Distribution Summary

- **Median trip duration** is around **10 minutes**.
- The middle 50% of rides (the IQR) fall within roughly **6–17 minutes**, confirming that typical taxi trips are short.
- A large number of **outliers** extend beyond 30 minutes, with extreme values reaching **180+ minutes**.
- These long trips may indicate:
  - Rides to/from airports or outer boroughs,
  - Unusual traffic conditions,
  - Potential data errors.