In [1]:
%run functions.ipynb

In [2]:
import geopandas as gpd
import pandas as pd
import os

In [3]:
# Schema: maps the column names found in the source KML onto the names used
# by the rest of this notebook.
col_names = {
    'Name': 'StationName',
    'Description': 'HTMLMetaData',
    'geometry': 'geometry' # don't mess with this - core part of gpd geodataframe
}

# Load the stations dataframe.
# The path is assembled with os.path.join so it uses the host OS's separator;
# the previous hard-coded backslashes only resolved on Windows. It is still
# relative, so the working directory must be the notebook's own folder.
filepath = os.path.join('..', 'data', 'bronze', 'CTA_RailStations.kml')
gdf = gpd.read_file(filepath).rename(columns=col_names)

display(gdf.head())

Unnamed: 0,StationName,HTMLMetaData,geometry
0,54th/Cermak,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.7567 41.85178 0)
1,Oakton-Skokie,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.74723 42.02625 0)
2,Pulaski,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72433 41.85386 0)
3,Logan Square,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.70855 41.92974 0)
4,Irving Park,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72918 41.95295 0)


In [4]:
# Metadata fields of interest in the HTML column.
# NOTE(review): this list is not passed to extract_metadata below; presumably
# the helper loaded via %run reads it as a notebook global - confirm against
# functions.ipynb before removing or renaming it.
metadata_items = ['Station ID', 'Rail Line', 'Address', 'ADA', 'Park and Ride']

# Run the extraction helper over every row's HTML blob and keep the result
# in a new column next to the raw HTML.
html_column = gdf['HTMLMetaData']
gdf['ExtractedMetaData'] = html_column.apply(extract_metadata)

display(gdf.head())


Unnamed: 0,StationName,HTMLMetaData,geometry,ExtractedMetaData
0,54th/Cermak,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.7567 41.85178 0),"[580, Pink, 2134 S. 54th Avenue, ADA Accessibl..."
1,Oakton-Skokie,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.74723 42.02625 0),"[1680, Yellow Line, 4802 W Oakton, ADA Accessi..."
2,Pulaski,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72433 41.85386 0),"[150, Pink, 2005 S. Pulaski Road, ADA Accessib..."
3,Logan Square,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.70855 41.92974 0),"[1020, Blue Line, 2620 N. Milwaukee Av, ADA Ac..."
4,Irving Park,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72918 41.95295 0),"[550, Blue Line, 4131 W. Irving Park Road, Not..."


In [5]:
# Spread the per-row metadata list into one named column per position.
# The order of new_columns must line up with the order of the values inside
# each ExtractedMetaData list.
new_columns = [
    'StationID',
    'RailLine',
    'StationAddress',
    'ADAAccessible',
    'ParkAndRide',
]

metadata_col = gdf['ExtractedMetaData']
for position, column_name in enumerate(new_columns):
    # .str.get(n) pulls the n-th element out of every row's list
    gdf[column_name] = metadata_col.str.get(position)

display(gdf.head())


Unnamed: 0,StationName,HTMLMetaData,geometry,ExtractedMetaData,StationID,RailLine,StationAddress,ADAAccessible,ParkAndRide
0,54th/Cermak,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.7567 41.85178 0),"[580, Pink, 2134 S. 54th Avenue, ADA Accessibl...",580,Pink,2134 S. 54th Avenue,ADA Accessible,Yes
1,Oakton-Skokie,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.74723 42.02625 0),"[1680, Yellow Line, 4802 W Oakton, ADA Accessi...",1680,Yellow Line,4802 W Oakton,ADA Accessible,No
2,Pulaski,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72433 41.85386 0),"[150, Pink, 2005 S. Pulaski Road, ADA Accessib...",150,Pink,2005 S. Pulaski Road,ADA Accessible,No
3,Logan Square,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.70855 41.92974 0),"[1020, Blue Line, 2620 N. Milwaukee Av, ADA Ac...",1020,Blue Line,2620 N. Milwaukee Av,ADA Accessible,No
4,Irving Park,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...",POINT Z (-87.72918 41.95295 0),"[550, Blue Line, 4131 W. Irving Park Road, Not...",550,Blue Line,4131 W. Irving Park Road,Not ADA Accessible,No


In [6]:
# Keep only the columns listed below; the raw HTML and the intermediate
# ExtractedMetaData list are dropped by not selecting them.
select_cols = [
    'StationID',
    'StationName',
    'geometry',
    'RailLine',
    'StationAddress',
    'ADAAccessible',
    'ParkAndRide'
]
gdf = gdf[select_cols]

# Save to file.
# The directory is built with os.path.join so it uses the host OS's separator
# (the previous hard-coded backslashes only resolved on Windows). The trailing
# '' keeps a trailing separator, so `filepath` is unchanged on Windows.
filepath = os.path.join('..', 'data', 'silver', '')
filename = 'DimRailStation.parquet'

# Create the output directory if it is missing so the write does not fail on
# a fresh checkout.
os.makedirs(filepath, exist_ok=True)
gdf.to_parquet(os.path.join(filepath, filename))

# verify: list the output directory as the cell's displayed result
os.listdir(filepath)

['DimParkRide.parquet', 'DimRailLine.parquet', 'DimRailStation.parquet']