In [1]:
import pandas as pd
import os

In [2]:
# load data from bronze
# The path is assembled with os.path.join so the separators match the host OS.
# It is relative to the kernel's working directory, which is assumed to be a
# sibling of the data/ folder -- TODO confirm against the project layout.
filepath = os.path.join(
    '..', 'data', 'bronze', 'L_Station_Entries_Daily_Totals_20250201.parquet'
)
if not os.path.isfile(filepath):
    # Fail early and report the resolved location, so a wrong working directory
    # or a renamed extract is obvious from the message alone.
    raise FileNotFoundError(
        f'Bronze file not found: {os.path.abspath(filepath)} '
        f'(working directory: {os.getcwd()})'
    )

# source column name -> column name used by the rest of the notebook
col_names = {
    'station_id': 'StationID',
    'stationname': 'StationName',
    'date': 'Date',
    'daytype': 'DayType',
    'rides': 'Rides'
}
df = pd.read_parquet(filepath).rename(columns=col_names)
df['Date'] = pd.to_datetime(df.Date)

# quick sanity report: size, dtypes, per-column null counts, first rows
print(f'Loaded {len(df)/1e6}M rows')
print(df.dtypes)
print(f'Missing value counts: {df.isna().sum(axis=0)}')
display(df.head())

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\bronze\\L_Station_Entries_Daily_Totals_20250201.parquet'

In [35]:
# change the station IDs to INT and drop the 4, from parent station, per documentation here, page 24:
# https://www.transitchicago.com/assets/1/6/cta_Train_Tracker_API_Developer_Guide_and_Documentation.pdf
#
# The 4 is removed arithmetically, and only from IDs that still carry it, so
# re-running this cell leaves already-converted IDs untouched instead of
# stripping another digit from them.
# NOTE(review): the 40000-49999 parent-station range is taken from the guide
# linked above -- confirm it still holds for the current extract.
PARENT_ID_MIN = 40_000
PARENT_ID_MAX = 49_999

station_ids = df['StationID'].astype(int)
is_parent_id = station_ids.between(PARENT_ID_MIN, PARENT_ID_MAX)
is_converted = station_ids.between(0, PARENT_ID_MAX - PARENT_ID_MIN)

# Anything outside both ranges is not a parent-station ID; stop rather than
# silently chopping a digit off it.
unexpected_ids = station_ids[~(is_parent_id | is_converted)]
if not unexpected_ids.empty:
    raise ValueError(
        'Unexpected StationID values (first 10): '
        f'{sorted(unexpected_ids.unique().tolist())[:10]}'
    )

# subtracting the base drops the 4 and any leading zeroes (40280 -> 280)
df['StationID'] = station_ids.mask(is_parent_id, station_ids - PARENT_ID_MIN)

display(df.head())

Unnamed: 0,StationID,StationName,Date,DayType,Rides
0,1280,Jefferson Park,2017-12-22,W,6104
1,1000,Cermak-Chinatown,2017-12-18,W,3636
2,280,Central-Lake,2017-12-02,A,1270
3,140,Dempster-Skokie,2017-12-19,W,1759
4,690,Dempster,2017-12-03,U,499


In [36]:
# Derive calendar helper columns from Date.
# Each entry maps the new column name to the values computed from the
# datetime accessor; insertion order is the order the columns are added.
date_parts = df['Date'].dt
derived_date_cols = {
    'Year': date_parts.year,
    'Month': date_parts.month,
    'Day': date_parts.day,
    'YearMonth': date_parts.to_period('M'),
    'MonthAbb': date_parts.strftime('%b'),
    'DayOfWeek': date_parts.day_name(),
}
for col_name, col_values in derived_date_cols.items():
    df[col_name] = col_values

# show the first rows with the new columns
display(df.head())

Unnamed: 0,StationID,StationName,Date,DayType,Rides,Year,Month,Day,YearMonth,MonthAbb,DayOfWeek
0,1280,Jefferson Park,2017-12-22,W,6104,2017,12,22,2017-12,Dec,Friday
1,1000,Cermak-Chinatown,2017-12-18,W,3636,2017,12,18,2017-12,Dec,Monday
2,280,Central-Lake,2017-12-02,A,1270,2017,12,2,2017-12,Dec,Saturday
3,140,Dempster-Skokie,2017-12-19,W,1759,2017,12,19,2017-12,Dec,Tuesday
4,690,Dempster,2017-12-03,U,499,2017,12,3,2017-12,Dec,Sunday


In [37]:
# drop no longer needed columns
# (only the columns listed here are kept in the saved fact table)
select_cols = [
    'StationID',
    'Date',
    'Rides',
    'Year',
    'Month',
    'Day',
    'YearMonth',
    'MonthAbb',
    'DayOfWeek',
    'DayType'
]
df = df[select_cols]

# save to file
# The silver directory is addressed relative to the working directory rather
# than through an absolute path under one user's profile, so the notebook runs
# on any checkout. This assumes the kernel is started from a folder that is a
# sibling of data/ -- TODO confirm against the project layout.
# The trailing '' keeps a separator at the end of `filepath`, so it is still a
# plain string that can be prefixed to a file name.
filepath = os.path.join('..', 'data', 'silver', '')
filename = 'FactStationEntries.parquet'
os.makedirs(filepath, exist_ok=True)  # a fresh checkout may not have the folder yet
df.to_parquet(os.path.join(filepath, filename))

# verify
os.listdir(filepath)

['DimParkRide.parquet',
 'DimRailLine.parquet',
 'DimRailStation.parquet',
 'FactStationEntries.parquet']