# POI Preprocessing

**Data Source:** 

https://www.localdata.go.kr/devcenter/dataDown.do?menuNo=20001 

(지역 다운로드 (Local download) -> [Metropolitan_Area|Province] -> XML)

**Input:** 

  ./[Metropolitan_Area|Province]/*_XML.zip

  ../subway/network_nodes.json  *(Preprocessed subway network)*


**Output:** 

  ./poi.ftr

In [1]:
import glob
from tqdm import notebook

# NOTE: lines starting with "!" are IPython shell escapes, so this cell only runs inside a notebook.
# lxml is imported by the XML -> CSV conversion cell.
!pip install lxml
# presumably an optional geometry backend for geopandas — TODO confirm it is still needed
!pip install pygeos

# Extract every downloaded archive into a directory named after the archive itself.
for z_path in notebook.tqdm(glob.glob('./*/*.zip')):
  # strip the 4-character ".zip" suffix to get the target directory
  dir_path = z_path[:-4]
  # -n: never overwrite already extracted files, -q: quiet.
  # NOTE(review): "-I EUC-KR" looks like the distribution-patched unzip option that sets the
  # character set of the archived file names — verify it is available on the target system.
  !unzip -nq -I EUC-KR {z_path} -d {dir_path}



  0%|          | 0/5 [00:00<?, ?it/s]

**Convert to csv**

In [2]:
import glob
from lxml import etree
from os import mkdir
from os.path import isfile, dirname, basename, isdir
import csv

# Give every extracted "*_XML" directory a sibling "*_CSV" directory (created only once).
for xml_dir in glob.glob('./*/*_XML'):
  csv_dir = '/'.join((dirname(xml_dir), basename(xml_dir).replace('XML', 'CSV')))
  if isdir(csv_dir):
    continue  # already present, e.g. from a previous run of this cell
  mkdir(csv_dir)

# Convert every extracted XML file into a CSV file with the same columns.
# Files that already have a CSV counterpart are skipped, so the cell can be re-run safely.
for f_path_xml in notebook.tqdm(glob.glob('./*/*/*.xml')):
  xml_dir = dirname(f_path_xml)
  dir_csv = dirname(xml_dir) + '/' + basename(xml_dir).replace('XML', 'CSV')
  # Swap only the trailing ".xml" extension: str.replace('xml', 'csv') would also
  # rewrite any other "xml" occurring inside the file name.
  f_path_csv = dir_csv + '/' + basename(f_path_xml)[:-len('.xml')] + '.csv'
  if isfile(f_path_csv):
    continue
  doc = etree.parse(f_path_xml)
  root = doc.getroot()
  # NOTE(review): assumes root[0] is the header element, whose first child lists the
  # columns and whose second child holds integer paging fields such as totalCount.
  header = root[0]
  row_cnt = {r.tag: int(r.text) for r in header[1]}['totalCount']
  if row_cnt == 0:  # skip empty files
    continue
  header = [e.tag for e in header[0]]
  rows = root[1][0]
  assert len(rows) == row_cnt, f'{f_path_xml}: expected {row_cnt} rows, found {len(rows)}'
  # Explicit UTF-8 so the Korean text is written identically on every platform and
  # matches the default encoding pandas.read_csv uses when the files are loaded later.
  with open(f_path_csv, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(header)
    for row in rows:
      row = {e.tag: e.text for e in row}
      # .get(): a row that omits a column yields an empty cell instead of a KeyError
      # (csv.writer writes None as an empty string)
      writer.writerow([row.get(e) for e in header])

  0%|          | 0/950 [00:00<?, ?it/s]

**Get data**

In [3]:
def load_subway():
  """Load the preprocessed subway stations as point geometries.

  Reads '../subway/network_nodes.json', builds one point per row from its
  `lng`/`lat` columns (EPSG:4326) and reprojects the points to `kor_crs`.

  Returns:
    GeoDataFrame indexed by `station_id` whose only column is `geometry`.
  """
  stations = pd.read_json('../subway/network_nodes.json')
  points = gpd.points_from_xy(x=stations.lng, y=stations.lat)
  stations = gpd.GeoDataFrame(stations, geometry=points, crs='EPSG:4326')
  # reproject to the Korean CRS so that distances are expressed in its units
  stations = stations.to_crs(kor_crs)
  return stations[['station_id', 'geometry']].set_index('station_id')

In [4]:
import pandas as pd
import geopandas as gpd
from collections import OrderedDict
import datetime
from datetime import datetime as dt

# Study period (both ends inclusive), kept as YYYYMMDD integers and as `date` objects.
i_min_date, i_max_date = 20170101, 20201231
min_date, max_date = (
  dt.strptime(str(bound), '%Y%m%d').date() for bound in (i_min_date, i_max_date)
)
# Sentinel dates: the earliest / latest day a pandas Timestamp can represent.
min_valid_date, max_valid_date = pd.Timestamp.min.date(), pd.Timestamp.max.date()

# Columns read from each CSV file.
load_cols = [
  'bplcNm',      # business name
  'opnSvcNm',    # POI type
  'apvPermYmd',  # business start date
  'dcbYmd',      # business closure date
  'x',           # location (korean crs)
  'y',
]

# Columns kept in the frame returned for each file.
selected_cols = [
  'poi_id',
  'opnSvcNm',
  'apvPermYmd',
  'dcbYmd',
]

# Korean coordinate reference system, as a PROJ string.
kor_crs = ' '.join([
  '+proj=tmerc',
  '+lat_0=38',
  '+lon_0=127.0028902777778',
  '+k=1',
  '+x_0=200000',
  '+y_0=500000',
  '+ellps=bessel',
  '+units=m',
  '+no_defs',
  '+towgs84=-115.80,474.99,674.11,1.16,-2.31,-1.63,6.43',
])

# Subway stations used for every distance check.
net_nodes = load_subway()
# Accumulators filled as a side effect of load_df():
poi_to_stations = {}  # poi_id -> ids of the stations adjacent to that POI
poi_types = set()     # every POI type (opnSvcNm value) encountered

def str_to_date(date: str):
  """Convert a YYYYMMDD digit string to a date, clamping values outside the study period.

  Values before `i_min_date` become `min_valid_date` and values after
  `i_max_date` become `max_valid_date`; neither is parsed, since only
  their position relative to the study period matters. Values inside the
  period are parsed with the '%Y%m%d' format.

  Args:
    date: string that `int()` can convert (the caller passes all-digit strings).

  Returns:
    A `datetime.date`.
  """
  as_int = int(date)
  if i_min_date <= as_int <= i_max_date:
    return pd.to_datetime(date, format='%Y%m%d').date()
  return min_valid_date if as_int < i_min_date else max_valid_date

def _ymd_or_default(raw, default):
  """Parse one raw YYYYMMDD cell with str_to_date; return `default` if it is missing or malformed."""
  # Missing cells are not strings (the CSV is read with dtype=str, so they stay NaN).
  if not (isinstance(raw, str) and raw.isdigit()):
    return default
  try:
    return str_to_date(raw)
  except ValueError:
    # All digits but not a real calendar date (e.g. month 13): treat it like any
    # other malformed value instead of aborting the whole load.
    return default


def load_df(path: str):
  """Load one POI CSV file and keep only the POIs relevant to the study.

  A POI is kept when it has coordinates, lies within 500 (CRS units) of at
  least one subway station in `net_nodes`, and was active at some point
  between `min_date` and `max_date`. Missing or malformed start dates are
  replaced by `min_valid_date`, missing or malformed closure dates by
  `max_valid_date`.

  Side effects: adds the file's POI types to `poi_types` and, for every POI
  not seen before, stores in `poi_to_stations` the ids of all stations
  within 500 of it.

  Args:
    path: path of a CSV file containing at least the columns in `load_cols`.

  Returns:
    DataFrame with the columns in `selected_cols`, one row per unique POI.
  """
  df = pd.read_csv(path, dtype=str, engine='c', usecols=load_cols, quotechar='"', index_col=False)
  # Drop rows without coordinates. The explicit copy keeps the column assignments
  # below from writing into a slice of the frame returned by read_csv.
  df = df[df['x'].notna() & df['y'].notna()].copy()
  df['x'] = df['x'].astype(float)
  df['y'] = df['y'].astype(float)
  # convert to GeoDataFrame
  df = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(x=df.x, y=df.y), crs=kor_crs, copy=False
  )
  # keep significant POIs only (500m or less from at least one subway station);
  # unmatched rows get a NaN distance, which the comparison below rejects as well
  df: gpd.GeoDataFrame = df.sjoin_nearest(net_nodes, how='left', max_distance=501, distance_col='nearest_dist')
  df = df[df['nearest_dist'] <= 500].copy()
  # Fix date columns. Assign whole columns instead of calling fillna(inplace=True) on a
  # column selection: that chained form is deprecated and does nothing under Copy-on-Write.
  df['apvPermYmd'] = [_ymd_or_default(raw, min_valid_date) for raw in df['apvPermYmd']]
  df['dcbYmd'] = [_ymd_or_default(raw, max_valid_date) for raw in df['dcbYmd']]
  # filter by date (poi must be active in the relevant time period range)
  df = df[(df['apvPermYmd'] <= max_date) & (df['dcbYmd'] >= min_date)].copy()
  # poi 'unique ID'
  df['poi_id'] = list(zip(df['x'], df['y'], df['opnSvcNm'], df['bplcNm']))
  df = df.drop_duplicates(subset='poi_id', ignore_index=True)
  poi_types.update(df['opnSvcNm'].unique())
  # Map POIs to their adjacent stations. Distances are computed only for POIs that are
  # not mapped yet, which avoids building a full POI x station distance matrix.
  for poi_id, geom in zip(df['poi_id'], df.geometry):
    if poi_id in poi_to_stations:
      continue
    dist: pd.Series = net_nodes.distance(geom)
    poi_to_stations[poi_id] = list(dist[dist <= 500].index)
  return df[selected_cols]
  
# Load every converted CSV file and stack the filtered POIs into a single frame.
csv_paths = glob.glob('./*/*/*.csv')
df: pd.DataFrame = pd.concat(
  [load_df(f_path) for f_path in notebook.tqdm(csv_paths)],
  ignore_index=True,
  copy=False,
)
print(df.shape)
df.head(5)

  0%|          | 0/901 [00:00<?, ?it/s]

(887184, 4)


Unnamed: 0,poi_id,opnSvcNm,apvPermYmd,dcbYmd
0,"(263363.74960516, 484877.145433762, 병원, 강남병원)",병원,1677-09-21,2262-04-11
1,"(263367.468807365, 485228.657869581, 의원, 박영준이비...",의원,1677-09-21,2262-04-11
2,"(263206.939573518, 484886.270109221, 의원, 아름다운성...",의원,2019-12-17,2262-04-11
3,"(263333.344644982, 484873.235152707, 의원, 맑은정신건...",의원,2019-12-11,2262-04-11
4,"(263258.305781379, 484735.765755389, 의원, 조도연정형...",의원,1677-09-21,2262-04-11


In [5]:
# Quick sanity check: the first 15 POI -> stations mappings, all POI types, and the frame's columns.
print('\nPOI IDS TO STATIONS:')
preview = list(poi_to_stations.items())[:15]
for poi_id, stations in preview:
  print(f'{poi_id}: {stations}')
print(f'\nPOI TYPES: LEN: {len(poi_types)}\n{poi_types}')
print('\nDF COLUMNS:')
print(df.columns.values)


POI IDS TO STATIONS:
(263363.74960516, 484877.145433762, '병원', '강남병원'): [88]
(263367.468807365, 485228.657869581, '의원', '박영준이비인후과의원'): [88]
(263206.939573518, 484886.270109221, '의원', '아름다운성형외과의원'): [88]
(263333.344644982, 484873.235152707, '의원', '맑은정신건강의학과의원'): [88]
(263258.305781379, 484735.765755389, '의원', '조도연정형외과의원'): [88]
(263333.344644982, 484873.235152707, '의원', '변미숙산부인과의원'): [88]
(263258.305781379, 484735.765755389, '의원', '연세하루치과의원'): [88]
(263141.943193102, 484821.289677657, '의원', '햇살따뜻한의원'): [88]
(263284.0, 484649.0, '의원', '중화당한의원'): [88]
(262963.840066719, 479705.968241861, '의원', '춘천시신동면보건지소'): [78]
(248958.043055216, 481428.278229766, '의원', '춘천시남산면서천보건진료소'): [66]
(263206.939573518, 484886.270109221, '의원', '아름다운산부인과의원'): [88]
(263333.344644982, 484873.235152707, '의원', '강남S치과의원'): [88]
(263688.019874403, 484406.437950693, '의원', '삼기당한의원'): [88]
(263644.806778994, 484353.027957445, '의원', '올리브치과의원'): [88]

POI TYPES: LEN: 179
{'사료제조업', '목재수입유통업', '건설폐기물처리업', '식육포장처리업', '부속의료기관'

In [6]:
def process_poi(df: pd.DataFrame):
  """Count, per day and per station, the active adjacent POIs of each type.

  Every POI contributes 1 to the column of its type for each day between its
  start and closure dates (clamped to `min_date`..`max_date`) and for each
  station listed for it in `poi_to_stations`.

  Args:
    df: frame whose rows unpack, in order, to
      (poi_id, poi_type, start_date, close_date).

  Returns:
    DataFrame with one column per entry of `poi_types`, indexed by a
    (date, station_id) MultiIndex in order of first appearance.
  """
  row_of = OrderedDict()  # (date, station_id) -> position of that row in every column list
  counts = {poi_type: [] for poi_type in poi_types}
  records = notebook.tqdm(df.itertuples(index=False), total=len(df))
  for poi_id, poi_type, start_date, close_date in records:
    first_day = max(start_date, min_date)
    last_day = min(close_date, max_date)
    for stamp in pd.date_range(start=first_day, end=last_day):
      day = stamp.date()
      for station_id in poi_to_stations[poi_id]:
        key = (day, station_id)
        position = row_of.get(key)
        if position is not None:
          counts[poi_type][position] += 1
          continue
        # first time this (date, station) pair is seen: append a row to every column
        row_of[key] = len(counts[poi_type])
        for other_type, column in counts.items():
          column.append(1 if other_type == poi_type else 0)
  index = pd.MultiIndex.from_tuples(row_of.keys(), names=['date', 'station_id'])
  return pd.DataFrame(counts, index=index, copy=False)

# Aggregate the POIs per (date, station), then write the result to 'poi.ftr'.
df = process_poi(df)
print(df.shape)
# Order rows by (date, station_id) and turn both index levels into regular columns
# before exporting.
df = df.sort_index().reset_index(drop=False)
df.to_feather('poi.ftr')
df.head(15)

  0%|          | 0/887184 [00:00<?, ?it/s]

(759562, 179)


  df.reset_index(inplace=True, drop=False)


Unnamed: 0,date,station_id,사료제조업,목재수입유통업,건설폐기물처리업,식육포장처리업,부속의료기관,영화수입업,동물용의료용구판매업,일반도시가스업체,...,방문판매업,전화권유판매업,목욕장업,민방위대피시설,영화제작업,계량기수입업,대규모점포,인터넷컴퓨터게임시설제공업,자동차야영장업,지하수영향조사기관
0,2017-01-01,0,0,0,0,1,0,0,0,0,...,9,0,0,4,2,0,0,20,0,0
1,2017-01-01,1,0,0,1,5,0,0,0,0,...,13,4,3,3,0,0,1,8,0,0
2,2017-01-01,2,23,2,1,3,0,0,0,0,...,67,12,5,5,2,0,2,21,0,0
3,2017-01-01,3,7,4,0,2,0,1,0,0,...,138,57,1,2,16,0,3,15,0,1
4,2017-01-01,4,1,0,1,0,0,0,0,0,...,11,0,4,7,3,0,3,15,0,0
5,2017-01-01,5,1,0,1,0,0,0,0,0,...,19,1,3,8,1,0,1,33,0,0
6,2017-01-01,6,0,0,0,0,0,1,0,0,...,5,1,2,8,7,0,2,5,0,0
7,2017-01-01,7,0,0,0,0,0,0,0,0,...,8,0,1,6,0,0,0,2,0,0
8,2017-01-01,8,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,2017-01-01,9,2,0,0,2,0,0,0,0,...,18,5,2,14,0,0,3,4,0,0
