In [37]:
%run functions.ipynb

In [38]:
import geopandas as gpd
import pandas as pd
import os

In [39]:
# schema: rename the columns read from the KML file to the names used in this notebook
col_names = {
    'Name': 'LineName',
    'Description': 'HTMLMetaData',
    'geometry': 'geometry' # don't mess with this - core part of gpd geodataframe
}

# load rail lines geodataframe
# The path is assembled with os.path.join so the notebook is not tied to the
# Windows '\\' separator; on Windows this produces the same string as before.
filepath = os.path.join('..', 'data', 'bronze', 'CTA_RailLines.kml')
gdf = gpd.read_file(filepath).rename(columns=col_names)

display(gdf.head())

Unnamed: 0,LineName,HTMLMetaData,geometry
0,Yellow Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.6729 42.01907 0, -87.6..."
1,Orange Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70433 41.80436 0, -87...."
2,Blue Line (O'Hare),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70855 41.92974 0, -87...."
3,Blue Line (Forest Park),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.68844 41.87554 0, -87...."
4,Pink Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.66943 41.88525 0, -87...."


In [40]:
# metadata fields to extract from the html column
# NOTE(review): this list is not passed to extract_metadata below; presumably the
# helper brought in by `%run functions.ipynb` reads it as a global - confirm
metadata_items = ['DESCRIPTIO', 'TYPE', 'LEGEND', 'BRANCH', 'OWL']

# run the helper over every row's HTML blob and keep the result as a new column
extracted = gdf['HTMLMetaData'].apply(extract_metadata)
gdf['ExtractedMetaData'] = extracted

display(gdf.head())


Unnamed: 0,LineName,HTMLMetaData,geometry,ExtractedMetaData
0,Yellow Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.6729 42.01907 0, -87.6...","[Oakton-Skokie to Howard, Elevated or at Grade..."
1,Orange Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70433 41.80436 0, -87....","[Pulaski-Midway to Kedzie-Midway, Elevated or ..."
2,Blue Line (O'Hare),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70855 41.92974 0, -87....","[Belmont-O'Hare to Logan Square, Subway, BL, B..."
3,Blue Line (Forest Park),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.68844 41.87554 0, -87....","[Kedzie-Homan to Western-Congress, Elevated or..."
4,Pink Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.66943 41.88525 0, -87....","[Paulina Junction to Polk, Elevated or at Grad..."


In [41]:
# explode the metadata out into columns
new_columns = ['Description', 'Type', 'Legend', 'Branch', 'NightOwlService']

# element `position` of each ExtractedMetaData entry becomes the column named
# new_columns[position]; all columns are attached in one assign call
metadata_col = gdf['ExtractedMetaData']
gdf = gdf.assign(**{
    name: metadata_col.str[position]
    for position, name in enumerate(new_columns)
})

display(gdf.head())


Unnamed: 0,LineName,HTMLMetaData,geometry,ExtractedMetaData,Description,Type,Legend,Branch,NightOwlService
0,Yellow Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.6729 42.01907 0, -87.6...","[Oakton-Skokie to Howard, Elevated or at Grade...",Oakton-Skokie to Howard,Elevated or at Grade,YL,Yellow Line,No
1,Orange Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70433 41.80436 0, -87....","[Pulaski-Midway to Kedzie-Midway, Elevated or ...",Pulaski-Midway to Kedzie-Midway,Elevated or at Grade,OR,Orange Line,No
2,Blue Line (O'Hare),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.70855 41.92974 0, -87....","[Belmont-O'Hare to Logan Square, Subway, BL, B...",Belmont-O'Hare to Logan Square,Subway,BL,Blue Line O'Hare,Yes
3,Blue Line (Forest Park),"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.68844 41.87554 0, -87....","[Kedzie-Homan to Western-Congress, Elevated or...",Kedzie-Homan to Western-Congress,Elevated or at Grade,BL,Blue Line Forest Park,Yes
4,Pink Line,"<html xmlns:fo=""http://www.w3.org/1999/XSL/For...","MULTILINESTRING Z ((-87.66943 41.88525 0, -87....","[Paulina Junction to Polk, Elevated or at Grad...",Paulina Junction to Polk,Elevated or at Grade,PK,Pink Line,No


In [42]:
# keep only the columns that go into the silver table
# NOTE(review): 'NightOwlService' is populated earlier in this notebook but is not
# listed here, so it is not written to the parquet file - confirm this is intended
select_cols = [
    'LineName',
    'geometry',
    'Description',
    'Type',
    'Legend',
    'Branch'
]
gdf = gdf[select_cols]

# save to file
# os.path.join keeps the path portable across operating systems; the trailing ''
# keeps the trailing separator that the previous hard-coded literal ended with.
filepath = os.path.join('..', 'data', 'silver', '')
filename = 'DimRailLine.parquet'
# create the output folder if it is missing so the write does not fail
os.makedirs(filepath, exist_ok=True)
gdf.to_parquet(os.path.join(filepath, filename))

# verify: list the output folder so the written file shows up in the cell output
os.listdir(filepath)

['DimRailLine.parquet', 'DimRailStation.parquet']