In [1]:
# Loading in all of my necessary libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import pyarrow.parquet as pq
pd.set_option('display.max_columns', None)

# Reading in  Data

## Taxi zones lookup table

In [2]:
# Reading in the Taxi Zones Lookup table and taking a look at it
Taxi_Zones = pd.read_csv("taxi_zone_lookup.csv")
Taxi_Zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,,


## Yellow taxi dataset

In [3]:
# Reading in my Yellow Taxi Cab dataset and inspecting its shape
Taxi_Data = pd.read_parquet("yellow_tripdata_2020-01.parquet")
print(Taxi_Data.shape)

(6405008, 19)


# Data cleaning

## Using Location ID to find borough and zone

In [4]:
# Build lookup dictionaries
borough_lookup = Taxi_Zones.set_index("LocationID")["Borough"].to_dict()
zone_lookup    = Taxi_Zones.set_index("LocationID")["Zone"].to_dict()

# Map directly into Taxi_Data
Taxi_Data["PU_Borough"] = Taxi_Data["PULocationID"].map(borough_lookup)
Taxi_Data["PU_Zone"]    = Taxi_Data["PULocationID"].map(zone_lookup)
Taxi_Data["DO_Borough"] = Taxi_Data["DOLocationID"].map(borough_lookup)
Taxi_Data["DO_Zone"]    = Taxi_Data["DOLocationID"].map(zone_lookup)

# Optional: reorder so new cols sit after IDs
cols = list(Taxi_Data.columns)
for loc, boro, zone in [
    ("PULocationID", "PU_Borough", "PU_Zone"),
    ("DOLocationID", "DO_Borough", "DO_Zone")
]:
    idx = cols.index(loc)
    cols = cols[:idx+1] + [boro, zone] + [c for c in cols[idx+1:] if c not in [boro, zone]]

Taxi_Data = Taxi_Data[cols]

### Inspecting columns and rows

In [5]:
# Taking a Look at the dataset to see if my columpns updated correctly
Taxi_Data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,PU_Borough,PU_Zone,DOLocationID,DO_Borough,DO_Zone,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2020-01-01 00:28:15,2020-01-01 00:33:03,1.0,1.2,1.0,N,238,Manhattan,Upper West Side North,239,Manhattan,Upper West Side South,1,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5,
1,1,2020-01-01 00:35:39,2020-01-01 00:43:04,1.0,1.2,1.0,N,239,Manhattan,Upper West Side South,238,Manhattan,Upper West Side North,1,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5,
2,1,2020-01-01 00:47:41,2020-01-01 00:53:52,1.0,0.6,1.0,N,238,Manhattan,Upper West Side North,238,Manhattan,Upper West Side North,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5,
3,1,2020-01-01 00:55:23,2020-01-01 01:00:14,1.0,0.8,1.0,N,238,Manhattan,Upper West Side North,151,Manhattan,Manhattan Valley,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0,
4,2,2020-01-01 00:01:58,2020-01-01 00:04:16,1.0,0.0,1.0,N,193,Queens,Queensbridge/Ravenswood,193,Queens,Queensbridge/Ravenswood,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,0.0,


## Inspecting null values

In [6]:
# Count of the null values seen in each column of the dataset
Taxi_Data.isnull().sum()

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count            65441
trip_distance                  0
RatecodeID                 65441
store_and_fwd_flag         65441
PULocationID                   0
PU_Borough                  3160
PU_Zone                    43958
DOLocationID                   0
DO_Borough                 15100
DO_Zone                    39782
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge       65441
airport_fee              6405008
dtype: int64

In [7]:
# Count rows with Y vs N in store_and_fwd_flag
print(Taxi_Data['store_and_fwd_flag'].value_counts(dropna=False))

store_and_fwd_flag
N       6271447
Y         68120
None      65441
Name: count, dtype: int64


###  NAN's  from 'store_and_fwd_flag' 

In [8]:
# Creating a df of all the rows with missing values in its "store_and_fwd_column"
missing_fwd = Taxi_Data[Taxi_Data['store_and_fwd_flag'].isna()]

# Count how many of these also have passenger_count or RatecodeID missing
print("Total rows with store_and_fwd_flag NaN:", missing_fwd.shape[0])
print("... with passenger_count NaN:", missing_fwd['passenger_count'].isna().sum())
print("... with RatecodeID NaN:", missing_fwd['RatecodeID'].isna().sum())

# Rows where all 3 are missing
all_three_missing = missing_fwd[
    missing_fwd['passenger_count'].isna() & missing_fwd['RatecodeID'].isna()
]
print("Rows with all three missing:", all_three_missing.shape[0])

print("Rows dropped:", Taxi_Data['store_and_fwd_flag'].isna().sum())

# Drop rows where passenger_count, RatecodeID, and store_and_fwd_flag are all NaN
Taxi_Data = Taxi_Data.dropna(subset=['passenger_count', 'RatecodeID', 'store_and_fwd_flag'])

Total rows with store_and_fwd_flag NaN: 65441
... with passenger_count NaN: 65441
... with RatecodeID NaN: 65441
Rows with all three missing: 65441
Rows dropped: 65441


### NA's from Location ID

In [9]:
# Location ID's 264 and 265 are special ID's
# 264's Borough is "unknown" and its zone is "NaN"
# 265's borough is NaN and its zone is "Outside of NYC"
special_ids = [264, 265]

summary = []

# Tabling the amount of rows with 264 o4 265 as its DO or PU ID 
# Checking if Nan's in Borough or Zone are due to its location ID being 264 or 265
for loc_id in special_ids:
    # PU stats
    pu_rows = Taxi_Data[Taxi_Data['PULocationID'] == loc_id]
    pu_total = len(pu_rows)
    pu_missing_borough = pu_rows['PU_Borough'].isna().sum()
    pu_missing_zone = pu_rows['PU_Zone'].isna().sum()
    
    # DO stats
    do_rows = Taxi_Data[Taxi_Data['DOLocationID'] == loc_id]
    do_total = len(do_rows)
    do_missing_borough = do_rows['DO_Borough'].isna().sum()
    do_missing_zone = do_rows['DO_Zone'].isna().sum()
    
    summary.append({
        "LocationID": loc_id,
        "PU_Count": pu_total,
        "PU_Borough_NaN": pu_missing_borough,
        "PU_Zone_NaN": pu_missing_zone,
        "DO_Count": do_total,
        "DO_Borough_NaN": do_missing_borough,
        "DO_Zone_NaN": do_missing_zone
    })

summary_df = pd.DataFrame(summary)
summary_df


Unnamed: 0,LocationID,PU_Count,PU_Borough_NaN,PU_Zone_NaN,DO_Count,DO_Borough_NaN,DO_Zone_NaN
0,264,43779,0,43779,39678,0,39678
1,265,3090,3090,0,14858,14858,0


In [10]:
# Counting the amount of rows with Location ID of 264
# Removing these rows, and printing the new shape of the dataset
Taxi_Data = Taxi_Data[
    (Taxi_Data['PULocationID'] != 264) & (Taxi_Data['DOLocationID'] != 264)
]
count_264 = Taxi_Data[
    (Taxi_Data['PULocationID'] == 264) | (Taxi_Data['DOLocationID'] == 264)
].shape[0]
print("Rows with PU or DO LocationID = 264:", count_264)
print("The New Shape of the dataset is now: ", Taxi_Data.shape)

Rows with PU or DO LocationID = 264: 0
The New Shape of the dataset is now:  (6284006, 23)


### Removing the 'aiport_fee' column

In [27]:
# Drop airport_fee column
Taxi_Data = Taxi_Data.drop(columns=['airport_fee'])

print("Remaining columns:", Taxi_Data.shape[1])


Remaining columns: 22


## Signs of invalid data

### Trips with passenger count of 0

In [11]:
# DF of rows with passenger count == 0
zero_passengers = Taxi_Data[Taxi_Data['passenger_count'] == 0]
print("Rows with passenger_count == 0:", zero_passengers.shape[0])

# Seeing how invalid these rows could be based of of the number of NAN's in these rows
na_counts_zero_passengers = zero_passengers.isna().sum()

print(na_counts_zero_passengers)

Rows with passenger_count == 0: 113085
VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count               0
trip_distance                 0
RatecodeID                    0
store_and_fwd_flag            0
PULocationID                  0
PU_Borough                   83
PU_Zone                       0
DOLocationID                  0
DO_Borough                  298
DO_Zone                       0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge          0
airport_fee              113085
dtype: int64


In [12]:
# Drop rows where passenger_count is 0
Taxi_Data = Taxi_Data[Taxi_Data['passenger_count'] != 0]
print("Remaining rows:", Taxi_Data.shape[0])

Remaining rows: 6170921


### Trips with a later pickup than drop off 

In [13]:
# Rows in which the pickup time was later than the Dropoff Time
# Keep only rows where dropoff is after pickup

# Count rows where dropoff is before pickup
invalid_time_rows = (Taxi_Data['tpep_dropoff_datetime'] < Taxi_Data['tpep_pickup_datetime']).sum()

print("Number of rows with dropoff before pickup:", invalid_time_rows)

Taxi_Data = Taxi_Data[
    Taxi_Data['tpep_dropoff_datetime'] >= Taxi_Data['tpep_pickup_datetime']
]

Number of rows with dropoff before pickup: 1


###  Trips with negative distance

In [15]:
# Count negative trip distances
negative_count = (Taxi_Data['trip_distance'] < 0).sum()
print("Number of trips with negative distance:", negative_count)

# Filter for negative trip distances
negative_trips = Taxi_Data[Taxi_Data['trip_distance'] < 0]

# Taking a look at these rows
negative_trips

Taxi_Data = Taxi_Data[Taxi_Data['trip_distance'] >= 0]

Number of trips with negative distance: 13


### Trips with signifigantly large distances

In [17]:
# Count trips with distance > 100 miles
extra_long_count = (Taxi_Data['trip_distance'] > 100).sum()

# Selecting all the rows with signifigantly large distances
extra_long_trips = Taxi_Data[Taxi_Data['trip_distance'] > 100]
print("Number of trips with distance > 100 miles:", extra_long_count)

# Taking a look at rows with large distances
extra_long_trips


Number of trips with distance > 100 miles: 20


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,PU_Borough,PU_Zone,DOLocationID,DO_Borough,DO_Zone,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
74730,2,2020-01-01 10:45:44,2020-01-01 12:24:33,1.0,100.15,1.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,253.0,0.0,0.5,5.0,22.74,0.3,281.54,0.0,
486968,2,2020-01-03 21:49:03,2020-01-03 23:40:33,2.0,110.51,5.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,370.0,0.0,0.0,0.0,0.0,0.3,370.3,0.0,
565751,2,2020-01-04 12:52:02,2020-01-04 17:23:52,4.0,259.22,5.0,N,140,Manhattan,Lenox Hill East,265,,Outside of NYC,2,575.0,0.0,0.0,0.0,0.0,0.3,577.8,2.5,
771402,1,2020-01-05 13:45:52,2020-01-05 16:34:09,1.0,168.4,5.0,N,92,Queens,Flushing,265,,Outside of NYC,2,575.0,0.0,0.0,0.0,21.12,0.3,596.42,0.0,
1251389,2,2020-01-08 07:36:13,2020-01-08 10:19:15,1.0,124.26,5.0,N,219,Queens,Springfield Gardens South,265,,Outside of NYC,1,270.0,0.0,0.0,0.0,0.0,0.3,270.3,0.0,
1439691,2,2020-01-08 23:03:27,2020-01-09 00:56:39,1.0,110.55,4.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,479.5,0.5,0.5,0.0,0.0,0.3,480.8,0.0,
1442784,1,2020-01-09 00:27:13,2020-01-09 03:37:32,1.0,211.7,5.0,N,161,Manhattan,Midtown Center,265,,Outside of NYC,1,550.0,0.0,0.0,70.0,0.0,0.3,620.3,0.0,
1444167,2,2020-01-09 00:26:08,2020-01-09 02:22:51,1.0,102.59,5.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,300.0,0.0,0.0,5.0,48.99,0.3,354.29,0.0,
1924522,2,2020-01-11 06:34:14,2020-01-11 08:33:46,4.0,111.21,5.0,N,132,Queens,JFK Airport,265,,Outside of NYC,2,400.0,0.0,0.0,0.0,23.99,0.3,424.29,0.0,
2154680,2,2020-01-12 09:19:23,2020-01-12 13:49:59,4.0,262.88,5.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,600.0,0.0,0.0,0.0,55.55,0.3,658.35,2.5,


### Trips with negative fares

In [18]:
# Selecting all the rows with negative fares to maybe take a look at them
negative_fares = Taxi_Data[Taxi_Data['fare_amount'] < 0]

# Number of rows with negative fares
print("Rows with negative fares:", negative_fares.shape[0])

# Drop rows where fare_amount is negative
Taxi_Data = Taxi_Data[Taxi_Data['fare_amount'] >= 0]

Rows with negative fares: 18790


### Trips with signifigantly large fares

In [19]:
# Getting the sum of all rows with fares more than $500
high_fares_count = (Taxi_Data['fare_amount'] > 500).sum()
print("Number of trips with fare_amount > 500:", high_fares_count)

# Selecting these rows
high_fares = Taxi_Data[Taxi_Data['fare_amount'] > 500]

# Taking a look at these rows
high_fares

Number of trips with fare_amount > 500: 15


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,PU_Borough,PU_Zone,DOLocationID,DO_Borough,DO_Zone,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
471401,2,2020-01-03 19:29:51,2020-01-05 12:28:15,1.0,8.27,1.0,N,186,Manhattan,Penn Station/Madison Sq West,152,Manhattan,Manhattanville,2,1238.0,1.0,0.5,0.0,0.0,0.3,1242.3,2.5,
565751,2,2020-01-04 12:52:02,2020-01-04 17:23:52,4.0,259.22,5.0,N,140,Manhattan,Lenox Hill East,265,,Outside of NYC,2,575.0,0.0,0.0,0.0,0.0,0.3,577.8,2.5,
771402,1,2020-01-05 13:45:52,2020-01-05 16:34:09,1.0,168.4,5.0,N,92,Queens,Flushing,265,,Outside of NYC,2,575.0,0.0,0.0,0.0,21.12,0.3,596.42,0.0,
1024290,2,2020-01-06 22:10:47,2020-01-06 22:10:58,1.0,0.0,5.0,N,33,Brooklyn,Brooklyn Heights,33,Brooklyn,Brooklyn Heights,1,800.0,0.0,0.0,240.09,0.0,0.3,1040.39,0.0,
1352625,1,2020-01-08 16:19:09,2020-01-08 16:19:34,1.0,0.0,6.0,N,132,Queens,JFK Airport,132,Queens,JFK Airport,4,655.35,0.0,0.5,0.0,0.0,0.3,656.15,0.0,
1442784,1,2020-01-09 00:27:13,2020-01-09 03:37:32,1.0,211.7,5.0,N,161,Manhattan,Midtown Center,265,,Outside of NYC,1,550.0,0.0,0.0,70.0,0.0,0.3,620.3,0.0,
2154680,2,2020-01-12 09:19:23,2020-01-12 13:49:59,4.0,262.88,5.0,N,132,Queens,JFK Airport,265,,Outside of NYC,1,600.0,0.0,0.0,0.0,55.55,0.3,658.35,2.5,
2449406,1,2020-01-13 18:18:30,2020-01-13 19:07:31,1.0,16.8,2.0,N,132,Queens,JFK Airport,161,Manhattan,Midtown Center,3,520.02,13.8,0.0,0.0,0.0,0.3,534.12,2.5,
3356299,2,2020-01-17 19:26:15,2020-01-17 23:13:11,1.0,154.45,4.0,N,262,Manhattan,Yorkville East,265,,Outside of NYC,2,765.5,1.0,0.5,0.0,0.0,0.3,769.8,2.5,
3813213,2,2020-01-20 11:31:59,2020-01-20 16:08:25,2.0,241.64,1.0,N,48,Manhattan,Clinton East,265,,Outside of NYC,2,614.0,0.0,0.5,0.0,0.0,0.3,617.3,2.5,


### Trips with a large amount of passengers

In [20]:
# Filter rows where passenger_count > 6
too_many_passengers = Taxi_Data[Taxi_Data['passenger_count'] > 6]

# Number of rows
print("Rows with passenger_count > 6:", too_many_passengers.shape[0])

# Taking a look at these rows
too_many_passengers

Rows with passenger_count > 6: 45


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,PU_Borough,PU_Zone,DOLocationID,DO_Borough,DO_Zone,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
33432,2,2020-01-01 02:25:50,2020-01-01 02:26:47,8.0,1.21,5.0,N,140,Manhattan,Lenox Hill East,262,Manhattan,Yorkville East,2,88.88,0.0,0.0,0.0,0.0,0.3,91.68,2.5,
42270,2,2020-01-01 03:01:46,2020-01-01 03:01:49,8.0,0.0,5.0,N,48,Manhattan,Clinton East,48,Manhattan,Clinton East,1,8.3,0.0,0.5,1.82,0.0,0.3,10.92,0.0,
50523,1,2020-01-01 03:53:38,2020-01-01 03:58:57,8.0,1.0,1.0,N,231,Manhattan,TriBeCa/Civic Center,79,Manhattan,East Village,1,5.5,3.0,0.5,1.0,0.0,0.3,10.3,2.5,
54229,2,2020-01-01 04:35:19,2020-01-01 04:35:23,8.0,0.0,5.0,N,265,,Outside of NYC,265,,Outside of NYC,1,80.0,0.0,0.5,10.88,0.0,0.3,91.68,0.0,
89415,1,2020-01-01 13:40:06,2020-01-01 13:59:08,9.0,7.9,2.0,Y,82,Queens,Elmhurst,114,Manhattan,Greenwich Village South,2,52.0,2.5,0.5,0.0,6.12,0.3,61.42,2.5,
299943,2,2020-01-02 19:56:07,2020-01-02 19:56:11,9.0,0.0,5.0,N,229,Manhattan,Sutton Place/Turtle Bay North,229,Manhattan,Sutton Place/Turtle Bay North,1,95.0,0.0,0.0,19.06,0.0,0.3,114.36,0.0,
535234,2,2020-01-04 07:29:31,2020-01-04 07:29:35,9.0,0.0,5.0,N,1,EWR,Newark Airport,1,EWR,Newark Airport,1,95.0,0.0,0.5,0.0,0.0,0.3,95.8,0.0,
554292,2,2020-01-04 10:16:23,2020-01-04 10:59:07,7.0,19.49,5.0,N,132,Queens,JFK Airport,158,Manhattan,Meatpacking/West Village West,1,70.0,0.0,0.0,19.73,6.12,0.3,98.65,2.5,
624299,2,2020-01-04 17:03:45,2020-01-04 17:12:06,7.0,3.25,5.0,N,145,Queens,Long Island City/Hunters Point,79,Manhattan,East Village,2,72.0,0.0,0.5,0.0,6.12,0.3,81.42,2.5,
718483,2,2020-01-05 04:38:08,2020-01-05 04:38:11,7.0,0.0,5.0,N,265,,Outside of NYC,265,,Outside of NYC,1,75.0,0.0,0.5,0.0,0.0,0.3,75.8,0.0,


# Further Data Inspection

## Data types

In [41]:
# Or more detailed summary
print(Taxi_Data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6152117 entries, 0 to 6339566
Data columns (total 22 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   PU_Borough             object        
 9   PU_Zone                object        
 10  DOLocationID           int64         
 11  DO_Borough             object        
 12  DO_Zone                object        
 13  payment_type           int64         
 14  fare_amount            float64       
 15  extra                  float64       
 16  mta_tax                float64       
 17  tip_amount             float64       
 18  tolls_amount           floa

## Inspecting the number of unique boroughs and zones

In [30]:
# Count unique categories in boroughs and zones
print("Unique PU_Borough:", Taxi_Data['PU_Borough'].nunique())
print("Unique DO_Borough:", Taxi_Data['DO_Borough'].nunique())
print("Unique PU_Zone:", Taxi_Data['PU_Zone'].nunique())
print("Unique DO_Zone:", Taxi_Data['DO_Zone'].nunique())


Unique PU_Borough: 6
Unique DO_Borough: 6
Unique PU_Zone: 258
Unique DO_Zone: 259


### Inspecting most common pickup and drop off zones

In [32]:
# Pickup zones - full list sorted by frequency
pu_counts = Taxi_Data['PU_Zone'].value_counts()
print("Pickup Zone Counts:\n", pu_counts)

# Dropoff zones - full list sorted by frequency
do_counts = Taxi_Data['DO_Zone'].value_counts()
print("Dropoff Zone Counts:\n", do_counts)

Pickup Zone Counts:
 PU_Zone
Upper East Side South        286005
Midtown Center               275488
Upper East Side North        266387
Midtown East                 229987
Times Sq/Theatre District    222803
                              ...  
Freshkills Park                   2
New Dorp/Midland Beach            1
West Brighton                     1
Broad Channel                     1
Rossville/Woodrow                 1
Name: count, Length: 258, dtype: int64
Dropoff Zone Counts:
 DO_Zone
Upper East Side North                            282457
Upper East Side South                            257301
Midtown Center                                   243178
Murray Hill                                      191574
Midtown East                                     189286
                                                  ...  
Rossville/Woodrow                                    30
Crotona Park                                         29
Freshkills Park                                      15
Go

### Counting the pickup and drop offs for each zone

In [33]:
# Pickup zone counts
pu_counts_df = Taxi_Data['PU_Zone'].value_counts().reset_index()
pu_counts_df.columns = ['PU_Zone', 'PU_Count']

# Dropoff zone counts
do_counts_df = Taxi_Data['DO_Zone'].value_counts().reset_index()
do_counts_df.columns = ['DO_Zone', 'DO_Count']

# Show first few rows
print(pu_counts_df.head())
print(do_counts_df.head())


                     PU_Zone  PU_Count
0      Upper East Side South    286005
1             Midtown Center    275488
2      Upper East Side North    266387
3               Midtown East    229987
4  Times Sq/Theatre District    222803
                 DO_Zone  DO_Count
0  Upper East Side North    282457
1  Upper East Side South    257301
2         Midtown Center    243178
3            Murray Hill    191574
4           Midtown East    189286


###  Top 150 zones

In [39]:
# Pickup zone counts
pu_counts = Taxi_Data['PU_Zone'].value_counts().reset_index()
pu_counts.columns = ['Zone', 'PU_Count']

# Dropoff zone counts
do_counts = Taxi_Data['DO_Zone'].value_counts().reset_index()
do_counts.columns = ['Zone', 'DO_Count']

# Merge pickup + dropoff counts into one table
zone_counts = pd.merge(pu_counts, do_counts, on='Zone', how='outer').fillna(0)

# Add total appearances
zone_counts['Total_Count'] = zone_counts['PU_Count'] + zone_counts['DO_Count']

# Sort by total
zone_counts = zone_counts.sort_values(by='Total_Count', ascending=False)

print(zone_counts.head(150))

                          Zone  PU_Count  DO_Count  Total_Count
2        Upper East Side North  266387.0    282457     548844.0
0        Upper East Side South  286005.0    257301     543306.0
1               Midtown Center  275488.0    243178     518666.0
3                 Midtown East  229987.0    189286     419273.0
4    Times Sq/Theatre District  222803.0    188606     411409.0
..                         ...       ...       ...          ...
127      Soundview/Castle Hill     233.0       960       1193.0
185                  Woodhaven      96.0      1089       1185.0
150                 Highbridge     155.0      1022       1177.0
164           Bensonhurst West     126.0      1045       1171.0
136                 Mount Hope     189.0       969       1158.0

[150 rows x 4 columns]


### Zones with less than 2000 trips, and between 2000 and 10000

In [40]:
# Zones with < 2000 total trips
low_zones = zone_counts[zone_counts['Total_Count'] < 2000]
print("Zones with < 2000 trips:", low_zones.shape[0])

# Zones between 2000 and 10000 total trips
mid_zones = zone_counts[(zone_counts['Total_Count'] >= 2000) & 
                        (zone_counts['Total_Count'] <= 10000)]
print("Zones with 2000–10000 trips:", mid_zones.shape[0])

Zones with < 2000 trips: 132
Zones with 2000–10000 trips: 48
