In [1]:
'''
outputs:
  ./network_nodes.json (pd.DataFrame with cols: 'station_id', 'station_name_ENG', 'lat', 'lng')

  ./ridership.npy (np.ndarray)              shape: (M, N, H, 7, 2)
  ./urban_features.npy (np.ndarray)         shape: (M, N, D)
  ./adjacency.npy (np.ndarray)              shape: (N, N)
  ./spatial_embeddings.npy (np.ndarray)     shape: (N, D)

Where:
  M: Months (2017-01 to 2020-12)
  N: Stations
  D: Any number of dimensions
  H: Timesteps per day: ('00~06', '06~10', '10~16', '16~21', '21~00')
'''

"\noutputs:\n  ./network_nodes.json (pd.DataFrame with cols: 'station_id', 'station_name_ENG', 'lat', 'lng')\n\n  ./ridership.npy (np.ndarray)              shape: (M, N, H, 7, 2)\n  ./urban_features.npy (np.ndarray)         shape: (M, N, D)\n  ./adjacency.npy (np.ndarray)              shape: (N, N)\n  ./spatial_embeddings.npy (np.ndarray)     shape: (N, D)\n\nWhere:\n  M: Months (2017-01 to 2020-12)\n  N: Stations\n  D: Any number of dimensions\n  H: Timesteps per day: ('00~06', '06~10', '10~16', '16~21', '21~00')\n"

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from datetime import datetime as dt

def __load_network(nodes_path: str, adjacency_path: str):
  """Load the station table and the adjacency matrix of the network.

  Args:
    nodes_path: JSON file readable by ``pd.read_json``; must provide the
      columns 'station_id', 'lat' and 'lng'.
    adjacency_path: ``.npy`` file holding the adjacency matrix.

  Returns:
    Tuple ``(nodes, adjacency)``: a GeoDataFrame indexed by 'station_id'
    whose point geometry is built from (lng, lat) in EPSG:4326, and the
    array loaded from ``adjacency_path``.

  Raises:
    ValueError: raised by pandas when 'station_id' contains duplicates.
  """
  station_table = pd.read_json(nodes_path).set_index('station_id', verify_integrity=True)
  # x is the longitude and y the latitude, as expected for EPSG:4326 points.
  station_points = gpd.points_from_xy(x=station_table.lng, y=station_table.lat)
  geo_nodes = gpd.GeoDataFrame(station_table, geometry=station_points, crs='EPSG:4326')
  adjacency_matrix = np.load(adjacency_path)
  return (geo_nodes, adjacency_matrix)

def __load_hourly_ridership(path: str):
  """Read the ridership feather file and key it by (date, time, station_id).

  Args:
    path: location of the feather file.

  Returns:
    DataFrame whose MultiIndex levels are 'date', 'time' and 'station_id'.

  Raises:
    ValueError: raised by pandas when that key combination is not unique.
  """
  key_columns = ['date', 'time', 'station_id']
  return pd.read_feather(path).set_index(key_columns, verify_integrity=True)

def __load_poi(path: str):
  """Read the POI feather file and key it by (date, station_id).

  Args:
    path: location of the feather file.

  Returns:
    DataFrame whose MultiIndex levels are 'date' and 'station_id'.

  Raises:
    ValueError: raised by pandas when a (date, station_id) pair repeats.
  """
  poi_table = pd.read_feather(path)
  poi_table = poi_table.set_index(['date', 'station_id'], verify_integrity=True)
  return poi_table

def __load_building_use(path: str):
  """Read the building-use feather file and key it by (date, station_id).

  Args:
    path: location of the feather file.

  Returns:
    DataFrame whose MultiIndex levels are 'date' and 'station_id'.

  Raises:
    ValueError: raised by pandas when a (date, station_id) pair repeats.
  """
  building_use = pd.read_feather(path)
  return building_use.set_index(keys=['date', 'station_id'], verify_integrity=True)

def __load_lte(path: str):
  """Read the LTE feather file and collapse its 'time' level by summation.

  Args:
    path: location of the feather file; must provide the columns 'date',
      'time' and 'station_id'.

  Returns:
    DataFrame indexed by ('date', 'station_id') where every remaining
    column is the sum over all 'time' entries of that pair.

  Raises:
    ValueError: raised by pandas when a (date, time, station_id) triple repeats.
  """
  # Uniqueness is checked on the full key before the 'time' level is summed away.
  lte_by_slot = pd.read_feather(path).set_index(
    ['date', 'time', 'station_id'], verify_integrity=True
  )
  per_day = lte_by_slot.groupby(level=['date', 'station_id'])
  return per_day.sum()

In [3]:
# Labels of the intra-day time buckets; their count is the H axis of ridership.npy.
daily_ts = [
  '00~06',
  '06~10',
  '10~16',
  '16~21',
  '21~00',
]
# Day-of-week codes 0..6, matched against pandas' ``dt.dayofweek`` in filter_df.
weekdays = range(0, 7)
# Calendar years kept in the outputs (2017 through 2020 inclusive).
years = list(range(2017, 2021))
# Calendar months 1..12.
months = range(1, 12 + 1)
# Root directory: inputs are read from <data_path>/datasets, outputs are written to <data_path>.
data_path = '.'

def __filter_stations(df1: pd.DataFrame, df2: pd.DataFrame, df3: pd.DataFrame, df4: pd.DataFrame, df5: pd.DataFrame):
  """Return the station ids that occur in every one of the five frames.

  Each frame must carry an index level named 'station_id'.

  Returns:
    A ``set`` with the ids common to all five indexes.
  """
  common_ids = set(df1.index.unique('station_id'))
  for frame in (df2, df3, df4, df5):
    common_ids = common_ids & set(frame.index.unique('station_id'))
  return common_ids

def filter_df(df: pd.DataFrame, st_ids: set, reduce_sum: bool, time=False):
  """Aggregate a dated, per-station frame onto a complete monthly grid.

  Rows are restricted to the stations in ``st_ids`` and to the calendar
  years listed in the module-level ``years``. They are grouped by
  (year, month, station_id), or additionally by ('time', weekday) when
  ``time`` is true, then reduced and re-indexed onto the full cartesian
  product of the grouping keys, so every combination is present in a
  fixed, sorted order.

  Args:
    df: frame whose index has the levels 'date' and 'station_id' (plus
      'time' when ``time`` is true); 'date' must be parseable by
      ``pd.to_datetime``.
    st_ids: station ids to keep.
    reduce_sum: reduce each group with ``sum`` when true, ``mean`` otherwise.
    time: also group by the 'time' level and by the day of week.

  Returns:
    DataFrame indexed by the cartesian product of ``years`` x ``months`` x
    sorted(``st_ids``) [x ``daily_ts`` x ``weekdays``]; combinations
    without any source row are filled with 0.
  """
  station_order = sorted(st_ids)
  flat = df[df.index.get_level_values('station_id').isin(st_ids)].reset_index()
  dates = pd.to_datetime(flat['date'])
  # Build every calendar key from the parsed dates and attach them with a
  # single assign(); writing new columns into a boolean-filtered slice is
  # what triggers pandas' SettingWithCopyWarning.
  calendar_keys = {'year': dates.dt.year, 'month': dates.dt.month}
  if time:
    calendar_keys['weekday'] = dates.dt.dayofweek
  flat = flat.drop(columns='date').assign(**calendar_keys)
  flat = flat[flat['year'].isin(years)]
  if time:
    group_by = ['year', 'month', 'station_id', 'time', 'weekday']
    levels = [years, months, station_order, daily_ts, weekdays]
  else:
    group_by = ['year', 'month', 'station_id']
    levels = [years, months, station_order]
  fixed_idx = pd.MultiIndex.from_product(levels, names=group_by)
  grouped = flat.groupby(by=group_by)
  reduced = grouped.sum() if reduce_sum else grouped.mean()
  # The `copy` keyword is omitted on purpose: it does not affect the values,
  # and it is being phased out of pandas.
  return reduced.reindex(fixed_idx, fill_value=0)

def _load_spatial_embeddings(path: str, station_ids):
  """Read per-station embedding vectors from a whitespace-separated text file.

  The second field of the first line is read as the embedding dimension D.
  Every following non-blank line is ``<station_id> <v_1> ... <v_D>``.

  Args:
    path: location of the text file.
    station_ids: ids of the stations to keep.

  Returns:
    float32 array of shape (len(station_ids), D) whose rows follow the
    ascending order of the station ids.

  Raises:
    AssertionError: when a kept station id appears twice, or when some
      requested station has no embedding line.
    ValueError: when a line cannot be parsed or has the wrong length.
  """
  wanted = set(station_ids)  # constant-time membership test per line
  # The context manager closes the handle even if parsing fails below.
  with open(path, mode='r') as f:
    lines = f.readlines()
  D = int(lines[0].split()[1])
  st_to_se = dict()
  for line in lines[1:]:
    # split() without an argument tolerates repeated or trailing blanks
    # and the end-of-line character.
    fields = line.split()
    if not fields:
      continue  # ignore blank lines (e.g. a trailing newline at EOF)
    st_id = int(fields[0])
    if st_id in wanted:
      assert st_id not in st_to_se
      st_to_se[st_id] = [float(value) for value in fields[1:]]
  assert len(st_to_se) == len(wanted)
  SE = np.zeros(shape=(len(wanted), D), dtype=np.float32)
  for i, st_id in enumerate(sorted(st_to_se)):
    SE[i] = st_to_se[st_id]
  return SE

def filter_data():
  """Build the aligned output files from the raw datasets.

  Only stations present in all five inputs (network nodes, ridership, POI,
  building use, LTE) are kept, always in ascending 'station_id' order.
  Files written under ``data_path``:
    network_nodes.json     station_id, station_name_ENG, lat, lng
    urban_features.npy     (M, N, F) monthly means of POI, building-use and LTE columns
    ridership.npy          (M, N, H, 7, 2) monthly means per time bucket and weekday
    adjacency.npy          (N, N) sub-matrix restricted to the kept stations
    spatial_embeddings.npy (N, D) rows taken from ./SE.txt

  Raises:
    ValueError: when no station is common to all five datasets.
    AssertionError: when an intermediate shape does not match the expected grid.
  """
  subway, adj = __load_network(data_path + '/datasets/subway/network_nodes.json', data_path + '/datasets/subway/network_adjacency.npy')
  ridership = __load_hourly_ridership(data_path + '/datasets/subway/ridership_hourly.ftr')
  poi = __load_poi(data_path + '/datasets/poi/poi.ftr')
  bu = __load_building_use(data_path + '/datasets/building_use/building_use.ftr')
  lte = __load_lte(data_path + '/datasets/lte/lte.ftr')
  st_ids = __filter_stations(subway, ridership, poi, bu, lte)
  if not st_ids:
    # Fail before anything is written instead of hitting a division by zero later.
    raise ValueError('no station_id is present in all five datasets')
  station_order = sorted(st_ids)
  N = len(station_order)
  # subway nodes
  # Chained calls return new frames, so nothing is sorted in place on a filtered slice.
  subway = subway[subway.index.get_level_values('station_id').isin(st_ids)].sort_index().reset_index()
  subway[['station_id', 'station_name_ENG', 'lat', 'lng']].to_json(data_path + '/network_nodes.json')
  del subway
  # urban features
  poi = filter_df(poi, st_ids, False)
  bu = filter_df(bu, st_ids, False)
  lte = filter_df(lte, st_ids, False)
  uf = pd.concat([poi, bu, lte], axis=1, join='inner') # (MxN, F)
  del poi
  del bu
  del lte
  MxN, F = uf.shape
  M, remainder = divmod(MxN, N)
  assert remainder == 0
  assert M == 12 * len(years)
  uf = uf.to_numpy().reshape((M, N, F)) # t: Month
  np.save(data_path + '/urban_features.npy', uf)
  del uf
  # ridership
  ridership = filter_df(ridership, st_ids, False, True) # (MxNxHx7, 2)
  H = len(daily_ts)
  assert ridership.shape == (M*N*H*7, 2)
  ridership = ridership.to_numpy().reshape((M, N, H, 7, 2))
  np.save(data_path + '/ridership.npy', ridership)
  del ridership
  # adjacency
  # Station ids are used directly as row/column positions of the matrix.
  adj = adj[station_order][:,station_order]
  np.save(data_path + '/adjacency.npy', adj)
  del adj
  # spatial embeddings
  # NOTE: this path is relative to the working directory, not to data_path.
  SE = _load_spatial_embeddings('./SE.txt', station_order)
  np.save(data_path + '/spatial_embeddings.npy', SE)

# Run the preprocessing pipeline defined above once; it writes its output files under data_path.
filter_data()