# MTA NYC Subway Dataset Evaluation 2025

Andrew Chung, hc893

In [47]:
import pandas as pd
import numpy as np

# Line-Specific Datasets

The project will focus on **weekday, peak hours** data from **Jan-Feb 2025**.

NOTE: I have decided to remove the W and Rockaway Shuttle Lines due to missing data.

In [48]:
# normalise shuttle and J/Z line labels
def impute(data,
           gc_shuttle = 'S 42nd',
           fk_shuttle = 'S Fkln',
           jz = 'J'):
  """
  Return a copy of `data` with selected 'line' labels rewritten.

  Parameters
  ----------
  data : DataFrame containing a 'line' column.
  gc_shuttle : label in `data` that is rewritten to 'SG'.
  fk_shuttle : label in `data` that is rewritten to 'SF'.
  jz : label in `data` that is rewritten to 'JZ' (merge J/Z).

  Returns
  -------
  A new DataFrame; the frame passed in is not modified.
  """
  relabelled = data.copy()
  # The rewrites run one after another, in this order.
  for old_label, new_label in ((gc_shuttle, 'SG'), (fk_shuttle, 'SF'), (jz, 'JZ')):
    relabelled.loc[relabelled['line'] == old_label, 'line'] = new_label
  return relabelled

### Customer Journey

In [49]:
# customer journey
# Load the peak-period rows (W and S Rock excluded) and discard the columns
# that are not aggregated below.
customer_journey = (
  pd.read_csv("MTA_Subway_Customer_Journey-Focused_Metrics__Beginning_2025_20250324.csv")
    .query("period == \"peak\" & line not in ['W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['division', 'period'])
)
# Relabel the lines, then collapse to one row per line.
customer_journey = (
  impute(customer_journey)
    .groupby('line', as_index = False)
    .agg({
      'num_passengers': 'sum', # sum over 2 months
      'additional platform time': 'mean',
      'additional train time': 'mean',
      'over_five_mins_perc': 'mean'
    })
)

### Wait Assessment

In [50]:
# wait assessment
# Keep day_type 1, peak-period rows; H, W and S Rock are excluded.
wait_assessment = (
  pd.read_csv("MTA_Subway_Wait_Assessment__Beginning_2025_20250324.csv")
    .query("day_type == 1 and period == \"peak\" & line not in ['H', 'W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['division', 'day_type', 'period'])
)
# This file is relabelled from 'GS' / 'FS'; the result is one mean value per line.
wait_assessment = (
  impute(wait_assessment, gc_shuttle = 'GS', fk_shuttle = 'FS')
    .groupby('line', as_index = False)
    .agg({'wait assessment': 'mean'})
)

### Service Delivered

In [51]:
# service delivered
# Keep day_type 1 rows; H, W and S Rock are excluded.
service_delivered = (
  pd.read_csv("MTA_Subway_Service_Delivered__Beginning_2025_20250324.csv")
    .query("day_type == 1 & line not in ['H', 'W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['division', 'day_type'])
)
# This file is relabelled from 'GS' / 'FS'; the result is one mean value per line.
service_delivered = (
  impute(service_delivered, gc_shuttle = 'GS', fk_shuttle = 'FS')
    .groupby('line', as_index = False)
    .agg({'service delivered': 'mean'})
)

### Major Incidents

In [52]:
# major incidents
# Keep day_type 1 rows and discard any row with a missing field.
major_incidents = (
  pd.read_csv("MTA_Subway_Major_Incidents__Beginning_2025_20250324.csv")
    .query("day_type == 1")
    .dropna()
    .reset_index(drop = True)
)

### Terminal On-Time Performance

In [53]:
# terminal on time
# Keep day_type 1 rows; W and S Rock are excluded.
terminal_ontime = (
  pd.read_csv("MTA_Subway_Terminal_On-Time_Performance__Beginning_2025_20250324.csv")
    .query("day_type == 1 & line not in ['W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['division', 'day_type'])
)
# Relabel with impute's default labels, then take one mean value per line.
terminal_ontime = (
  impute(terminal_ontime)
    .groupby('line', as_index = False)
    .agg({'terminal_on_time_performance': 'mean'})
)

### Trains Delayed

In [54]:
# trains delayed
# Keep day_type 1 rows; W and S Rock are excluded. Rows are not aggregated here.
trains_delayed = (
  pd.read_csv("MTA_Subway_Trains_Delayed__Beginning_2025_20250324.csv")
    .query("day_type == 1 & line not in ['W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['division', 'day_type'])
)
# NOTE(review): only gc_shuttle is overridden, so fk_shuttle keeps impute's default
# ('S Fkln'), while the other loaders that pass 'GS' also pass fk_shuttle = 'FS'.
# Confirm which label this file actually uses.
trains_delayed = impute(trains_delayed, gc_shuttle = 'GS')

### 4-5 Minute Late Arrivals

In [55]:
# 4-5 minute late arrivals
# Keep day_type 1 rows; SI, W and S Rock are excluded.
late_arrivals = (
  pd.read_csv("MTA_Subway_4_to_5_Minute_Late_Arriving_Trains__Beginning_2025_20250324.csv")
    .query("day_type == 1 & line not in ['SI', 'W', 'S Rock']")
    .reset_index(drop = True)
    .drop(columns = ['Division', 'day_type'])
)
# Restrict to the two listed month values before relabelling the lines.
late_arrivals = impute(
  late_arrivals
    .loc[lambda frame: frame['month'].isin(['2025-01-01', '2025-02-01'])]
    .reset_index(drop = True)
)
late_arrivals.loc[late_arrivals['line'] == 'NW', 'line'] = 'N' # NW -> N
# One mean value per line.
late_arrivals = (
  late_arrivals
    .groupby('line', as_index = False)
    .agg({'Percent Late': 'mean'})
)

### Aggregate Line-Specific Data

In [86]:
from functools import reduce

# lines in the subway system: numbered lines first, then lettered lines
subway_lines = np.array(
  [str(number) for number in range(1, 8)] +
  ["SG","A","B","C","D","E","F","G","JZ","L","M","N","Q","R","SF"]
)

# initialize dataset, assign lines and divisions
# The first 8 entries ('1'-'7' and 'SG') are division A; every later entry is division B.
line_data = pd.DataFrame({
  'line': subway_lines,
  'division': np.where(np.arange(len(subway_lines)) < 8, "A", "B")
})

datasets = [
  line_data,
  customer_journey,
  wait_assessment,
  service_delivered,
  terminal_ontime,
  late_arrivals
]

# Every frame must expose the join key; the message reports the frame's position in the list.
for position, frame in enumerate(datasets):
  assert 'line' in frame.columns, "line does not exist in {}".format(position)

# Join Datasets
# Left joins, folded left to right, so every row of the initial line table is kept.
line_data = reduce(
  lambda joined, incoming: joined.merge(incoming, on = 'line', how = 'left'),
  datasets
)
line_data

Unnamed: 0,line,division,num_passengers,additional platform time,additional train time,over_five_mins_perc,wait assessment,service delivered,terminal_on_time_performance,Percent Late
0,1,A,11835657.0,0.851951,0.559153,0.086699,0.763432,0.982663,0.82285,0.037705
1,2,A,7686449.5,1.1443,0.559926,0.137993,0.664238,0.939205,0.715958,0.042829
2,3,A,5991257.0,0.796375,0.551399,0.102888,0.701703,0.948494,0.811624,0.035561
3,4,A,8645926.8,0.882173,0.530361,0.120674,0.678916,0.96076,0.809313,0.030185
4,5,A,7388072.6,0.998782,0.496228,0.124685,0.647009,0.914606,0.784967,0.035657
5,6,A,12414366.0,1.135244,0.593191,0.115997,0.709513,0.959814,0.84181,0.032783
6,7,A,10437489.5,1.075596,0.542185,0.09621,0.687133,0.925503,0.920091,0.039703
7,SG,A,1408022.46,0.384641,-0.186734,0.001125,0.984829,0.998733,0.999078,0.0
8,A,B,9342499.0,1.14663,0.416928,0.136582,0.659597,0.953511,0.817995,0.033255
9,B,B,5887042.8,2.070564,1.134214,0.231646,0.593112,0.880769,0.638448,0.056799


### Integrate Major Incidents and Delay Data

In [87]:
# major incidents: indicator variables
# I will group the incidents into 2 types
## 1. Infrastructural -- signal malfunction, subway car, track, stations and structural
## 2. Personal/civil: Persons on trackbed/police/medical, other
incidents = major_incidents['category'].unique()

# Categories absent from this mapping receive a missing class and are left out of the pivot.
major_incidents['class'] = major_incidents['category'].map({
  **dict.fromkeys(
    ['Signals', 'Subway Car', 'Track', 'Stations and Structure'],
    'Infrastructure'
  ),
  **dict.fromkeys(
    ['Persons on Trackbed/Police/Medical', 'Other'],
    'Non-Infrastructure'
  )
})
# One row per line, one column per class, each cell the summed 'count'.
# Line/class pairs with no rows become 0, and both columns are stored as nullable integers.
incident_data = (
  major_incidents
    .pivot_table(index = 'line', columns = 'class', values = 'count', aggfunc = 'sum')
    .rename(columns = {
      'Infrastructure': 'infra_critical',
      'Non-Infrastructure': 'noninfra_critical'
    })
    .reset_index()
    .fillna(0)
    .astype({'infra_critical': 'Int64', 'noninfra_critical': 'Int64'})
)
incident_data

class,line,infra_critical,noninfra_critical
0,1,4,1
1,2,4,4
2,3,3,2
3,4,8,1
4,5,2,4
5,6,1,11
6,7,3,2
7,A,1,1
8,B,2,5
9,C,2,0


In [88]:
# Delays: in similar fashion, except the reports are already categorized.
# These events have not spurred major incidents but have nonetheless slowed service.
## 1. Infrastructural: Crew Availability, Infra/Equipment, Operating Conditions, Planned ROW work
## 2. Non-Infrastructural: Police & Medical, External Factors
delays = trains_delayed['reporting_category'].unique()

# Reporting categories absent from this mapping receive a missing class and are left out of the pivot.
trains_delayed['class'] = trains_delayed['reporting_category'].map({
  **dict.fromkeys(
    ['Crew Availability', 'Infrastructure & Equipment', 'Operating Conditions', 'Planned ROW Work'],
    'Infrastructure'
  ),
  **dict.fromkeys(
    ['External Factors', 'Police & Medical'],
    'Non-Infrastructure'
  )
})
# One row per line, one column per class, each cell the summed 'delays'.
# Line/class pairs with no rows become 0, and both columns are stored as nullable integers.
delay_data = (
  trains_delayed
    .pivot_table(index = 'line', columns = 'class', values = 'delays', aggfunc = 'sum')
    .rename(columns = {
      'Infrastructure': 'infra_noncritical',
      'Non-Infrastructure': 'noninfra_noncritical'
    })
    .reset_index()
    .fillna(0)
    .astype({'infra_noncritical': 'Int64', 'noninfra_noncritical': 'Int64'})
)
delay_data

class,line,infra_noncritical,noninfra_noncritical
0,1,2282,1049
1,2,2187,1570
2,3,1279,992
3,4,1645,1259
4,5,1407,1476
5,6,1834,1775
6,7,1448,627
7,A,2149,718
8,B,2303,894
9,C,1761,731


Merge Major Incidents and Train Delays Data

In [89]:
# Attach the per-line incident and delay counts to the line-level metrics.
count_columns = [
  'infra_critical', 'noninfra_critical',
  'infra_noncritical', 'noninfra_noncritical'
]
line_data = (
  line_data
    # Drop counts left over from an earlier run of this cell, so that re-running it
    # does not create suffixed duplicate columns.
    .drop(columns = count_columns, errors = 'ignore')
    .merge(incident_data, on = 'line', how = 'left')
    .merge(delay_data, on = 'line', how = 'left')
    # note there is no existing data for major incidents in shuttle services.
    # Only the count columns are zero-filled: a missing value in any other metric
    # stays missing instead of being written out as a measured 0.
    .fillna(dict.fromkeys(count_columns, 0))
)
line_data

Unnamed: 0,line,division,num_passengers,additional platform time,additional train time,over_five_mins_perc,wait assessment,service delivered,terminal_on_time_performance,Percent Late,infra_critical,noninfra_critical,infra_noncritical,noninfra_noncritical
0,1,A,11835657.0,0.851951,0.559153,0.086699,0.763432,0.982663,0.82285,0.037705,4,1,2282,1049
1,2,A,7686449.5,1.1443,0.559926,0.137993,0.664238,0.939205,0.715958,0.042829,4,4,2187,1570
2,3,A,5991257.0,0.796375,0.551399,0.102888,0.701703,0.948494,0.811624,0.035561,3,2,1279,992
3,4,A,8645926.8,0.882173,0.530361,0.120674,0.678916,0.96076,0.809313,0.030185,8,1,1645,1259
4,5,A,7388072.6,0.998782,0.496228,0.124685,0.647009,0.914606,0.784967,0.035657,2,4,1407,1476
5,6,A,12414366.0,1.135244,0.593191,0.115997,0.709513,0.959814,0.84181,0.032783,1,11,1834,1775
6,7,A,10437489.5,1.075596,0.542185,0.09621,0.687133,0.925503,0.920091,0.039703,3,2,1448,627
7,SG,A,1408022.46,0.384641,-0.186734,0.001125,0.984829,0.998733,0.999078,0.0,0,0,3,16
8,A,B,9342499.0,1.14663,0.416928,0.136582,0.659597,0.953511,0.817995,0.033255,1,1,2149,718
9,B,B,5887042.8,2.070564,1.134214,0.231646,0.593112,0.880769,0.638448,0.056799,2,5,2303,894


### Save Lines Data

In [90]:
# Write the merged line-level table to disk; index = False leaves the row index out of the file.
line_data.to_csv(path_or_buf = 'MTA_Subway_Line_Data_2025.csv', index = False)

## Import Station-Specific Data

Station Data are measured hourly and are very large, requiring careful pruning.

### Hourly Ridership

In [94]:
# Load the hourly file, keep the subway rows, and parse the timestamp column.
# NOTE(review): the recorded output of this cell echoes the read_csv and to_datetime
# statements, which looks like pandas warnings were raised here -- confirm, and consider
# passing explicit dtypes / a timestamp format.
hourly_ridership = (
  pd.read_csv("MTA_Subway_Hourly_Ridership__Beginning_2025_20250324.csv")
    .query("transit_mode == \'subway\'")
    .reset_index(drop = True)
    .assign(transit_timestamp = lambda frame: pd.to_datetime(frame['transit_timestamp']))
)

  hourly_ridership = pd.read_csv("MTA_Subway_Hourly_Ridership__Beginning_2025_20250324.csv").query(
  hourly_ridership['transit_timestamp'] = pd.to_datetime(hourly_ridership['transit_timestamp'])


I need to confine the dataset to AM and PM rush hours (peak hours), which the MTA defines as:
- AM Peak: 6:30am-9:30am
- PM Peak: 3:30pm-8:00pm

Additionally, remove irrelevant columns (e.g. station ID, transit method, payment method (MetroCard/OMNY)) and group/aggregate data by hour and station.

In [95]:
# define peak time blocks
start_time_am, end_time_am, start_time_pm, end_time_pm = (
  pd.to_datetime(clock).time()
  for clock in ('06:30:00', '09:30:00', '15:30:00', '20:00:00')
)

# Month filter first (Jan-Feb), then the peak windows, then one row per
# (timestamp, station complex).
# between() includes both endpoints, so a row stamped exactly 20:00:00 is kept.
# NOTE(review): if each timestamp labels the start of an hourly bucket, the 20:00 bucket
# falls after the PM peak -- confirm the intended boundary handling.
hourly_ridership = (
  hourly_ridership
    .loc[lambda frame: frame['transit_timestamp'].dt.month < 3]
    .loc[lambda frame: (
      frame['transit_timestamp'].dt.time.between(start_time_am, end_time_am) |
      frame['transit_timestamp'].dt.time.between(start_time_pm, end_time_pm)
    )]
    .sort_values(by = 'transit_timestamp')
    .reset_index(drop = True)
    .groupby(['transit_timestamp', 'station_complex'], as_index = False)
    .agg({
      'borough': lambda x: x.mode()[0], # stations do not span different boroughs
      'ridership': 'sum'
    })
)

Additional filtering: remove weekends/holidays. Notable holidays in January-February:

- New Year's Day (1/1)
- MLK Day (1/20)
- Presidents' Day (2/17)

In [96]:
# remove weekends
hourly_ridership = hourly_ridership[hourly_ridership['transit_timestamp'].dt.weekday < 5] # 5,6 are Sat/Sun
# remove holidays
# Compare midnight-normalised timestamps against the holiday timestamps, so both sides
# are datetime64 values. Going through `.dt.date` would yield datetime.date objects,
# which never match pandas Timestamps inside isin(), leaving every holiday row in place.
holiday_dates = pd.to_datetime(['2025-01-01', '2025-01-20', '2025-02-17'])
hourly_ridership = hourly_ridership[
  ~hourly_ridership['transit_timestamp'].dt.normalize().isin(holiday_dates)
]

### Extract Lines

I will use a regular expression (regex) to extract the set of subway lines for each station.

In [None]:
import re

def extract_lines(text):
  """
  Extract the subway line codes listed in parentheses in a station-complex name.

  Every parenthesised group in `text` is split on commas and each piece is stripped
  of surrounding whitespace. Pieces of one or two characters are kept; longer pieces
  and empty pieces are discarded.

  Parameters
  ----------
  text : str, e.g. "14 St (F,M,1,2,3)/6 Av (L)".

  Returns
  -------
  str : the kept pieces joined by ',' in order of appearance, or '' when `text`
  has no parenthesised group or none of its pieces qualifies.
  """
  groups = re.findall(r'\((.*?)\)', text)
  tokens = [token.strip() for group in groups for token in group.split(',')]
  # sometimes, station names in parentheses get thrown in the mix
  return ','.join(token for token in tokens if 0 < len(token) <= 2)

In [109]:
# Derive the comma-separated line codes for each row from its station complex name.
hourly_ridership['lines'] = hourly_ridership['station_complex'].map(extract_lines)

Save as CSV

In [110]:
# Write the summarized ridership table to disk; index = False leaves the row index out of the file.
hourly_ridership.to_csv(path_or_buf = "MTA_Subway_Ridership_Summarized.csv", index = False)