To do & possible extensions:
 - Clean up preprocessing & data cleaning steps as they're a mess
 - Implement interactive map so that user can select specific year/month/hour
 - ML to predict future traffic
 - Use the API to fetch future traffic data automatically instead of reading it from a local JSON file
 - Get accurate daily average for each month instead of merely dividing by 30
 - English translations for lines, stations (scrape from OSM?) (nvm, there is an English version of the map in the site...)
 - Analyze the data using graph-theoretic functions; so far the graph is only constructed in NetworkX, and the visualization turned out not to need NetworkX

In [1]:
# import packages

import requests
import json
import re
import pandas as pd
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
import lxml.html
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import itertools
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Load the two JSON exports from Seoul Data.
# Both files are opened with 'utf-8-sig', which drops a leading byte-order mark if one is present.
# station_data holds the station records
# traffic_data holds the hourly traffic records

with open('서울시 역사마스터 정보.json', encoding = 'utf-8-sig') as station_file:
    station_data = json.load(station_file)

with open('서울시 지하철 호선별 역별 시간대별 승하차 인원 정보.json', encoding = 'utf-8-sig') as traffic_file:
    traffic_data = json.load(traffic_file)

In [3]:
# Build one pandas DataFrame per dataset from the value stored under each file's 'DATA' key

station_records = station_data['DATA']
station_df = pd.DataFrame(station_records)

traffic_records = traffic_data['DATA']
traffic_df = pd.DataFrame(traffic_records)

In [4]:
# Column guide for traffic_df:
#   'hr_n_get_on_nope' / 'hr_n_get_off_nope': monthly total of people getting on / off between hr_n and hr_n+1
#   'sttn': station name
#   'use_mm': month the data covers (yyyymm, e.g. 202503)
#   'sbwy_rout_ln_nm': line connected to the station (for stations on several lines, only one is shown)

# 'job_ymd' is the date the data was posted, which is not needed; ignore it if the column is absent

traffic_df = traffic_df.drop(columns = ['job_ymd'], errors = 'ignore')

# trains generally run from around 05:00 to 00:30, so drop the 01:00 ~ 05:00 columns (all "off" columns first, then all "on" columns)

remove_timeframes = [f'hr_{i}_get_{direction}_nope' for direction in ('off', 'on') for i in range(1, 5)]
traffic_df = traffic_df.drop(columns = remove_timeframes)

# shorten the remaining hourly column names: 'hr_7_get_on_nope' -> '7_on'
# the three identifier columns are taken out of the rename list first

traffic_df_rename_cols = list(traffic_df.columns)
for identifier_col in ('use_mm', 'sttn', 'sbwy_rout_ln_nm'):
    traffic_df_rename_cols.remove(identifier_col)

hourly_col_pattern = re.compile(r'hr_(\d+)_get_(off|on)_nope')
traffic_df_rename_cols_dict = {
    col_name: hourly_col_pattern.sub(r'\1_\2', col_name)
    for col_name in traffic_df_rename_cols
}

traffic_df = traffic_df.rename(columns = traffic_df_rename_cols_dict)

In [5]:
# Normalize line names so they match the map shown here: http://www.seoulmetro.co.kr/kr/cyberStation.do
# Each key is a name found in the traffic data, each value is the line name it is folded into

traffic_line_aliases = {
    '9호선2단계': '9호선',
    '9호선2~3단계': '9호선',
    '경의선': '경의중앙선',
    '중앙선': '경의중앙선',
    '과천선': '4호선',
    '경부선': '1호선',
    '경원선': '1호선',
    '경인선': '1호선',
    '장항선': '1호선',
    '수인선': '수인분당선',
    '분당선': '수인분당선',
    '안산선': '4호선',
    '일산선': '3호선',
    '공항철도 1호선': '공항철도1호선',
}

traffic_df['sbwy_rout_ln_nm'] = traffic_df['sbwy_rout_ln_nm'].replace(traffic_line_aliases)

In [6]:
# Keep the distinct (already normalized) line names from the traffic data; later cells filter against them

relevant_lines = pd.unique(traffic_df['sbwy_rout_ln_nm'])

In [7]:
# Transfer stations appear several times in station_df, once per connected line.
# Normalize the route column, then keep only the rows whose route is one of the relevant lines.

station_route_aliases = {
    '9호선(연장)': '9호선',
    '진접선': '4호선',
    '7호선(인천)': '7호선',
    '별내선': '8호선',
    '경의선': '경의중앙선',
    '중앙선': '경의중앙선',
    '과천선': '4호선',
    '경부선': '1호선',
    '경원선': '1호선',
    '경인선': '1호선',
    '장항선': '1호선',
    '수인선': '수인분당선',
    '분당선': '수인분당선',
    '안산선': '4호선',
    '일산선': '3호선',
}
station_df['route'] = station_df['route'].replace(station_route_aliases)

# .copy() makes the filtered frame independent of the original, so the column
# assignments below (and in later cells) do not trigger SettingWithCopyWarning
# or write to a temporary slice

station_df = station_df[station_df['route'].isin(relevant_lines)].copy()

# change coordinate data into numeric

station_df[['lot', 'lat']] = station_df[['lot', 'lat']].astype(float)

In [8]:
# Scrape the rendered page from http://www.seoulmetro.co.kr/kr/cyberStation.do with headless Firefox

opts = Options()
opts.add_argument('--headless')

browser = Firefox(options = opts)
try:
    browser.get('http://www.seoulmetro.co.kr/kr/cyberStation.do')

    # page_source is read right after get() returns
    html = browser.page_source
finally:
    # always shut the browser down, even if loading fails; otherwise the headless Firefox process is left running
    browser.quit()

tree = lxml.html.fromstring(html)

In [9]:
# Inspecting the page shows the subway map is the div element with class "subway-map"; take the first match
# (raises IndexError if the page contains no such div)

subway_map_divs = tree.xpath('//div[@class = "subway-map"]')
subway_map = subway_map_divs[0]

In [10]:
# Station (node) info lives in the g element with class "label-group"; take the first match
# NOTE(review): an XPath starting with '//' is evaluated from the document root, so this searches the
# whole page rather than only the descendants of subway_map — confirm whether './/g[...]' was intended

label_groups = subway_map.xpath('//g[@class = "label-group"]')
sttn_elements = label_groups[0]

In [11]:
# Station ids carry an S prefix (eg S0339) and line ids an L prefix (eg L4-3).
# For every text element:
#   - the class attribute lists the associated station and line ids (eg S0008 L1-GA S0339 L4-3 S1030 LB)
#   - the attribute read here as 'lineid' lists the lines without the L (eg 1-GA 4-3 B)
# Transfer stations have more than one station id, so station names (not ids) are used as dict keys

sttn_id_dict = {}
sttn_line_dict = {}

# Lines 1 ~ 9 are written as '{n + 1}-{n}' (eg Line 9 is 10-9); keep only the trailing digit
numbered_line_pattern = re.compile(r'\d+-(\d)')
# four characters following an S
station_id_pattern = re.compile(r'S(\S{4})')

for text_element in sttn_elements.cssselect('text'):
    # the name is spread over one tspan element per word, so glue the pieces together
    name_parts = [tspan.text.strip() for tspan in text_element.cssselect('tspan')]
    station_name = ''.join(name_parts)

    # line ids: normalize, then split on spaces (transfer stations list several)
    raw_line_ids = text_element.attrib['lineid']
    line_ids = numbered_line_pattern.sub(r'\1', raw_line_ids).split(' ')

    # station ids: every S-prefixed token in the class attribute, without the S
    station_ids = station_id_pattern.findall(text_element.attrib['class'])

    sttn_id_dict[station_name] = station_ids
    sttn_line_dict[station_name] = line_ids

In [12]:
# Line (edge) info lives in the g elements with class "line"; each path inside is one segment between two stations.
# The stroke attribute of each g element gives the line's color.
# line_dict maps (station id 1, station id 2) -> line id; line_color_dict maps line id -> color

line_dict = {}
line_color_dict = {}

# Lines 1 ~ 9 are written as '{n + 1}-{n}'; keep only the trailing digit
numbered_line_pattern = re.compile(r'\d+-(\d)')
# class attribute looks like 'path (line id) P(station 1 id)(station 2 id) P(station 2 id)(station 1 id)'
# (eg path L1-GA P00080009 P00090008)
segment_pattern = re.compile(r'L(\S+) P(\S{4})(\S{4})')

for line_group in subway_map.xpath('//g[@class = "line"]'):
    stroke_color = line_group.attrib['stroke']
    path_elements = line_group.cssselect('path')

    # the group's line id is taken from its first path element
    first_path_class = path_elements[0].attrib['class']
    group_line_id = re.findall(pattern = r'L(\S+)', string = first_path_class)[0]
    group_line_id = numbered_line_pattern.sub(r'\1', group_line_id)
    line_color_dict[group_line_id] = stroke_color

    for path_element in path_elements:
        segment = segment_pattern.search(path_element.attrib['class'])
        if segment is None:
            # some paths only help the drawing and carry no station pair (eg 'path L1-GA'); skip them
            continue

        raw_line_id, station_id1, station_id2 = segment.groups()
        line_dict[(station_id1, station_id2)] = numbered_line_pattern.sub(r'\1', raw_line_id)

In [13]:
# Seoul Station (서울역) is drawn twice on the subway map, so one copy gets lost during scraping;
# set its ids, lines and adjoining segments by hand

sttn_id_dict['서울역'] = ['1251', '9005', '0150', '0426', '4201']
sttn_line_dict['서울역'] = ['K', '1', '4', 'A']
line_dict.update({
    ('0151', '0150'): '1',
    ('0150', '1002'): '1',
    ('0425', '0426'): '4',
    ('0426', '0427'): '4',
    ('4202', '4201'): 'A',
})

# Two different stations share a name in two cases, so each of the following gets its own key:
#   - the underground Sinchon Station, stored as Sinchon_Underground (신촌_지하)
#   - the Yangpyeong Station (양평역) in Seoul, stored as Yangpyeong_Seoul (양평_서울)

sttn_id_dict.update({'신촌_지하': ['0240'], '양평_서울': ['2523']})
sttn_line_dict.update({'신촌_지하': ['2'], '양평_서울': ['5']})

In [15]:
# Replace the short line ids in sttn_line_dict, line_dict and line_color_dict with the actual line names.
# Ids without an entry in line_name_dict are left as they are.

numbered_line_names = {str(number): f'{number}호선' for number in range(1, 10)}
named_line_names = {'B': '수인분당선', 'G': '경춘선', 'KK': '경강선', 'K': '경의중앙선', 'A': '공항철도1호선', 'SH': '서해선', 'SL': '신림선', 'W': '우이신설선'}
line_name_dict = {**numbered_line_names, **named_line_names}

sttn_line_dict = {
    station: [line_name_dict.get(line_id, line_id) for line_id in line_ids]
    for station, line_ids in sttn_line_dict.items()
}

line_dict = {
    station_pair: line_name_dict.get(line_id, line_id)
    for station_pair, line_id in line_dict.items()
}

line_color_dict = {
    line_name_dict.get(line_id, line_id): color
    for line_id, color in line_color_dict.items()
}

In [16]:
# Keep only the relevant lines for each station in sttn_line_dict and drop stations left with no line.
# A set is built once so each membership test is a hash lookup instead of a scan of the relevant_lines array.

relevant_line_set = set(relevant_lines)

filtered_sttn_line_dict = {}
for station, lines in sttn_line_dict.items():
    kept_lines = [line for line in lines if line in relevant_line_set]
    if kept_lines:
        filtered_sttn_line_dict[station] = kept_lines
sttn_line_dict = filtered_sttn_line_dict

# Keep sttn_id_dict in step: only stations that survived in sttn_line_dict remain

sttn_id_dict = {
    station: station_ids
    for station, station_ids in sttn_id_dict.items()
    if station in sttn_line_dict
}

In [17]:
# Align station names in station_df with the names in sttn_id_dict/sttn_line_dict (i.e. with the scraped subway map)
# First strip parenthesized suffixes such as '(...)' from the names

station_df['bldn_nm'] = station_df['bldn_nm'].str.replace(r'\(\S+\)', '', regex = True)

# rename Yangpyeong, Sinchon stations for station_df

station_df.loc[station_df['bldn_id'] == '2523', 'bldn_nm'] = '양평_서울'
station_df.loc[(station_df['bldn_nm'] == '신촌') & (station_df['route'] == '2호선'), 'bldn_nm'] = '신촌_지하'

# rename Yangpyeong, Sinchon stations for traffic_df

traffic_df.loc[(traffic_df['sttn'] == '양평') & (traffic_df['sbwy_rout_ln_nm'] == '5호선'), 'sttn'] = '양평_서울'
traffic_df.loc[(traffic_df['sttn'] == '신촌') & (traffic_df['sbwy_rout_ln_nm'] == '2호선'), 'sttn'] = '신촌_지하'

# additional name cleaning (as of 2025-04-29); pop() raises KeyError if the scraped map no longer uses these names

sttn_id_dict['4.19민주묘지'] = sttn_id_dict.pop('4·19민주묘지')
sttn_id_dict['관악산'] = sttn_id_dict.pop('관악산(서울대)')
sttn_line_dict['4.19민주묘지'] = sttn_line_dict.pop('4·19민주묘지')
sttn_line_dict['관악산'] = sttn_line_dict.pop('관악산(서울대)')

clean_station_df_names = {'남동인더스파크': '인더스파크남동', '당고개': '불암산', '동대문역사문화공원': '문화공원동대문역사', '뚝섬유원지': '자양', '신대방삼거리': '삼거리신대방', '원곡': '시우', '이수': '총신대입구', '지제': '평택지제', '초성리': '청산', '화전': '한국항공대'}
station_df['bldn_nm'] = station_df['bldn_nm'].replace(clean_station_df_names)

# do the same for traffic_df
# the traffic mapping is built as a new dict so that the extra entries do not leak into clean_station_df_names

traffic_df['sttn'] = traffic_df['sttn'].str.replace(r'\(\S+\)', '', regex = True)
clean_traffic_df_names = {
    **clean_station_df_names,
    '동두천 중앙': '동두천중앙',
    '쌍용동': '쌍용',
    '인천국제공항': '인천공항1터미널',
}
traffic_df['sttn'] = traffic_df['sttn'].replace(clean_traffic_df_names)

# stations on the subway map that never appear in traffic_df (outside the scope of data collection) are deleted

s1 = set(sttn_id_dict.keys())
s2 = set(traffic_df['sttn'])
stations_without_traffic = s1 - s2

for name in stations_without_traffic:
    del sttn_id_dict[name]
    del sttn_line_dict[name]

# filter station_df once for all removed names instead of once per name
station_df = station_df[~station_df['bldn_nm'].isin(stations_without_traffic)]

In [18]:
# Keep only segments and colors that belong to relevant lines.
# Sets are built once so that every membership test below is a hash lookup rather than a scan.

relevant_line_set = set(relevant_lines)

line_dict = {
    station_pair: line_name
    for station_pair, line_name in line_dict.items()
    if line_name in relevant_line_set
}

line_color_dict = {
    line_name: color
    for line_name, color in line_color_dict.items()
    if line_name in relevant_line_set
}

# filter again: some stations on relevant lines were excluded from data collection,
# so drop every segment with an endpoint id that no remaining station owns

relevant_ids = [station_id for id_list in sttn_id_dict.values() for station_id in id_list]
relevant_id_set = set(relevant_ids)

line_dict = {
    station_pair: line_name
    for station_pair, line_name in line_dict.items()
    if all(station_id in relevant_id_set for station_id in station_pair)
}

In [19]:
# new_line_color_dict maps each segment (station pair) to its color, instead of line -> color.
# A line without an entry in line_color_dict keeps the line name itself as the value.

new_line_color_dict = {}
for station_pair, line_name in line_dict.items():
    new_line_color_dict[station_pair] = line_color_dict.get(line_name, line_name)

In [20]:
# going back to station_df

# turn transfer stations into one station, take average of coordinates for position, assign all connected lines to route (after completing project, realized could have simply deleted everything except bldn_nm and coordinates as station_df turned out to be mostly unnecessary)

def custom_agg_function(x):
    """Collapse the values of *x* into one set, wrapped in a single-element list.

    Used as a groupby aggregation so that each group ends up with the set of
    distinct values it contained.

    Args:
        x: any iterable of hashable values (a pandas Series when called by groupby).

    Returns:
        A list holding exactly one set with the distinct values of *x*.
    """
    distinct_values = set(x)
    return [distinct_values]

# One row per station name: keep the first bldn_id, collect all routes, and average the coordinates
station_df_agg_functions = {
    'bldn_id': 'first',
    'route': custom_agg_function,
    'lot': 'mean',
    'lat': 'mean',
}

stations_by_name = station_df.groupby(['bldn_nm'])
station_df = stations_by_name.aggregate(station_df_agg_functions).reset_index()

In [21]:
# Replace the original ids in station_df with the ids collected from the subway map, for compatibility
# with line_dict when creating the graph (turned out to be an unnecessary step).
# A station name missing from sttn_id_dict raises KeyError.

map_ids = [sttn_id_dict[station_name] for station_name in station_df['bldn_nm']]
station_df['ids'] = pd.Series(map_ids, index = station_df.index)
station_df = station_df.drop(columns = 'bldn_id')

In [22]:
# Dictionary of station coordinates, {station name: (lat, lot)}, to add as node attributes.
# The coordinates are collected in a single pass over station_df instead of filtering the
# whole frame twice for every station.

station_coords = {}
for station_name, lat, lot in zip(station_df['bldn_nm'], station_df['lat'].to_numpy(), station_df['lot'].to_numpy()):
    # setdefault keeps the first row for a name, should a name ever appear more than once
    station_coords.setdefault(station_name, (lat, lot))

# a station missing from station_df raises KeyError here
sttn_pos_dict = {station_name: station_coords[station_name] for station_name in sttn_id_dict}

In [23]:
# Re-key both edge dictionaries from (station_id, station_id) to (station_name, station_name).
#
# A reverse lookup {station id: station name} is built once. Compared with scanning sttn_id_dict
# for every edge, this
#   - resolves each endpoint independently (the former if/elif skipped the second endpoint
#     whenever the first one matched the same station), and
#   - raises KeyError for an unknown id instead of silently reusing the name found for a previous edge.

id_to_station_name = {}
for station_name, station_ids in sttn_id_dict.items():
    for station_id in station_ids:
        # if an id is listed under several names, the last name wins
        id_to_station_name[station_id] = station_name

# {(station_id, station_id): line} -> {(station_name, station_name): line}

line_dict = {
    (id_to_station_name[first_id], id_to_station_name[second_id]): line
    for (first_id, second_id), line in line_dict.items()
}

# {(station_id, station_id): color} -> {(station_name, station_name): color}

new_line_color_dict = {
    (id_to_station_name[first_id], id_to_station_name[second_id]): color
    for (first_id, second_id), color in new_line_color_dict.items()
}

In [24]:
# Create the base graph: one node per station name, one edge per segment
# (per the original note the 'lines' node attribute is not really needed, but it is still set here)

base_seoul_subway_graph = nx.Graph()
base_seoul_subway_graph.add_nodes_from(sttn_id_dict)

node_attributes = (
    ('ids', sttn_id_dict),
    ('lines', sttn_line_dict),
    ('pos', sttn_pos_dict),
)
for attribute_name, attribute_values in node_attributes:
    nx.set_node_attributes(base_seoul_subway_graph, attribute_values, attribute_name)

# add lines (edges), then attach the line name and color of each segment

base_seoul_subway_graph.add_edges_from(line_dict)

edge_attributes = (
    ('line', line_dict),
    ('color', new_line_color_dict),
)
for attribute_name, attribute_values in edge_attributes:
    nx.set_edge_attributes(base_seoul_subway_graph, attribute_values, attribute_name)

In [25]:
# Group traffic_df by station and month. This merges the rows of a transfer station into one
# (as was done for station_df): hourly columns are summed, line names are collected via custom_agg_function.

traffic_df_agg_functions = dict.fromkeys(traffic_df_rename_cols_dict.values(), 'sum')
traffic_df_agg_functions['sbwy_rout_ln_nm'] = custom_agg_function

traffic_by_station_month = traffic_df.groupby(['sttn', 'use_mm'])
traffic_df = traffic_by_station_month.agg(traffic_df_agg_functions).reset_index()

In [26]:
# Add two columns per hour interval: total (on + off) and net (on - off),
# each divided by 30 as a rough daily average and converted to int.
# Hours 1 ~ 4 were dropped earlier, so only hour 0 and hours 5 ~ 23 are processed.

service_hours = [hr for hr in range(24) if not 1 <= hr <= 4]

for hr in service_hours:
    boarding = traffic_df[f'{hr}_on']
    alighting = traffic_df[f'{hr}_off']
    traffic_df[f'{hr}_total'] = ((boarding + alighting) / 30).astype(int)
    traffic_df[f'{hr}_net'] = ((boarding - alighting) / 30).astype(int)

In [27]:
# One sub-frame of traffic_df per hour interval, keyed 'hr_{n}_df'
# (hour 0 and hours 5 ~ 23; hours 1 ~ 4 have no columns)

identifier_cols = ['sttn', 'use_mm', 'sbwy_rout_ln_nm']

traffic_dict = {
    f'hr_{hr}_df': traffic_df[identifier_cols + [f'{hr}_on', f'{hr}_off', f'{hr}_total', f'{hr}_net']]
    for hr in range(24)
    if not 1 <= hr <= 4
}

In [28]:
# Group the segments by line (Line 1, etc...) for easier visualization:
# {line name: {(station, station): line name}}, with an empty dict for a line that has no segments

meta_line_dict = {
    line_name: {
        station_pair: segment_line
        for station_pair, segment_line in line_dict.items()
        if segment_line == line_name
    }
    for line_name in relevant_lines
}

In [29]:
# testing visualization: pick one hour interval and one month

hour = 17
month = '202502'

# every column name is derived from `hour`, so changing the value above is enough
total_col = f'{hour}_total'
net_col = f'{hour}_net'

in_test_month = traffic_df['use_mm'] == month

test_traffic_subset = traffic_df.loc[in_test_month, ['sttn', 'sbwy_rout_ln_nm', f'{hour}_on', f'{hour}_off', total_col, net_col]]
test_stations = traffic_df.loc[in_test_month, 'sttn'].values
test_station_subset = station_df[station_df['bldn_nm'].isin(test_stations)]

test_subgraph = base_seoul_subway_graph.subgraph(test_stations)

# {station: value} with one plain scalar per station (rather than a one-row Series), built in a single pass
net_dict = dict(zip(test_traffic_subset['sttn'], test_traffic_subset[net_col]))
total_dict = dict(zip(test_traffic_subset['sttn'], test_traffic_subset[total_col]))

nx.set_node_attributes(test_subgraph, net_dict, 'net')
nx.set_node_attributes(test_subgraph, total_dict, 'total')

# join station names with their traffic figures; rows follow the order of test_station_subset

test_df = pd.merge(test_station_subset['bldn_nm'], test_traffic_subset[[total_col, net_col, 'sttn']], left_on = 'bldn_nm', right_on = 'sttn')

# hover text: Plotly breaks lines on '<br>', not on '\n'

test_df['text'] = test_df['bldn_nm'] + '<br>Total: ' + test_df[total_col].astype(str) + '<br>Net: ' + test_df[net_col].astype(str)

In [30]:
# generate sample visualization, template for future maps (frame in animation)

# column names follow `hour` instead of being hard-coded
total_col = f'{hour}_total'
net_col = f'{hour}_net'

# {station name: (lon, lat)}, built once so the segment loop does not filter station_df for every endpoint
unique_stations = station_df.drop_duplicates(subset = 'bldn_nm')
station_coords = {
    station_name: (lon, lat)
    for station_name, lon, lat in zip(unique_stations['bldn_nm'], unique_stations['lot'], unique_stations['lat'])
}

# Coordinates and traffic figures joined on the station name, so marker position, size and color
# are guaranteed to describe the same station. Rows follow the order of test_station_subset,
# which is also the row order of test_df (used for the hover text).
plot_df = pd.merge(
    test_station_subset[['bldn_nm', 'lot', 'lat']],
    test_traffic_subset[['sttn', total_col, net_col]],
    left_on = 'bldn_nm',
    right_on = 'sttn'
)

# same scaling for both marker layers: the busiest station gets an area matching a 30 px marker
marker_sizeref = plot_df[total_col].max() / 30 ** 2

fig_test = go.Figure()

# one trace per line; each segment contributes (start, end, NaN), the NaN breaking the path between segments
for line, segments in meta_line_dict.items():
    line_lons = np.empty(3 * len(segments))
    line_lats = np.empty(3 * len(segments))
    for i, (start_station, end_station) in enumerate(segments):
        start_lon, start_lat = station_coords[start_station]
        end_lon, end_lat = station_coords[end_station]
        line_lons[3 * i: 3 * i + 3] = (start_lon, end_lon, np.nan)
        line_lats[3 * i: 3 * i + 3] = (start_lat, end_lat, np.nan)

    fig_test.add_trace(go.Scattermap(
        lon = line_lons,
        lat = line_lats,
        name = line,
        mode = 'lines',
        line = dict(width = 3, color = line_color_dict[line])
    ))

# black, slightly larger markers underneath act as an outline for the colored markers

fig_test.add_trace(go.Scattermap(
    lon = plot_df['lot'],
    lat = plot_df['lat'],
    mode = 'markers',
    showlegend = False,
    hoverinfo = 'skip',
    marker = dict(
        size = plot_df[total_col] * 1.1,
        sizemode = 'area',
        sizeref = marker_sizeref,
        color = 'black',
        opacity = 1.0
    )
))

# colored markers: area encodes total traffic, color encodes net traffic (diverging scale centered on 0)

fig_test.add_trace(go.Scattermap(
    lon = plot_df['lot'],
    lat = plot_df['lat'],
    showlegend = False,
    text = test_df['text'],
    hoverinfo = 'text',
    mode = 'markers',
    marker = dict(
        size = plot_df[total_col],
        sizemode = 'area',
        sizeref = marker_sizeref,
        colorscale = px.colors.diverging.RdBu_r,
        color = plot_df[net_col],
        showscale = False,
        cmid = 0)
    ))

# center the map on the mean position of the plotted stations

fig_test.update_layout(
    map = dict(center = dict(
        lat = np.mean(plot_df['lat']),
        lon = np.mean(plot_df['lot'])),
        zoom = 10,
        style = 'streets'
    ))

fig_test.show()