# Census Data Preprocessing

### Data Source : 통계지리정보서비스(https://sgis.kostat.go.kr/)

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import re

In [2]:
# Sub-folders of the grid directory, one per downloaded grid bundle.
# Only real directories are kept, so stray files such as macOS '.DS_Store'
# are skipped whether or not they are present.
grid_root = 'data/CENSUS/Grid'
flderlist = [
    name for name in os.listdir(grid_root)
    if os.path.isdir(os.path.join(grid_root, name))
]

# Grid resolution to collect (1 km cells). The same suffix selects the
# matching shapefiles here and the matching census text files later on.
unit = '1K'

# Read the shapefile for the requested unit from every folder and
# concatenate once at the end; concatenating inside the loop would copy
# all previously read rows on every iteration.
frames = []
for f in flderlist:
    folder = os.path.join(grid_root, f)
    # Plain suffix match: a regex would need the '.' escaped to avoid
    # matching arbitrary characters before 'shp'.
    matches = [name for name in os.listdir(folder) if name.endswith(unit + '.shp')]
    if not matches:
        raise FileNotFoundError(
            'No shapefile ending in ' + unit + '.shp found in ' + folder
        )
    filename = matches[0]
    frames.append(gpd.read_file(os.path.join(folder, filename)))

if frames:
    grid = pd.concat(frames, axis=0, ignore_index=True)
    if grid.crs is None:
        # Files without CRS metadata fall back to EPSG:5179, the CRS this
        # notebook declares for the grid.
        grid = grid.set_crs('epsg:5179')
else:
    # No grid folders at all: keep an empty, correctly shaped frame.
    grid = gpd.GeoDataFrame(columns=['GRID_' + unit + '_CD', 'geometry'], crs='epsg:5179')
print(grid.shape)

(39423, 2)


In [3]:
# Collect the population (census) tables.
# Every text file in data/CENSUS whose name ends with '<unit>.txt' is read
# as a '^'-separated table of (YEAR, GRID, TYPE, COUNT) rows and reshaped
# to one row per GRID with one column per TYPE.
census_root = 'data/CENSUS'
censuslist = os.listdir(census_root)

tables = []
for name in censuslist:
    # Plain suffix match: a regex would need the '.' escaped to avoid
    # matching arbitrary characters before 'txt'.
    if not name.endswith(unit + '.txt'):
        continue
    df_new = pd.read_table(os.path.join(census_root, name), sep='^', header=None)
    df_new.columns = ['YEAR', 'GRID', 'TYPE', 'COUNT']
    df_new = df_new[['GRID', 'TYPE', 'COUNT']]
    # Long -> wide: move TYPE from the row index into the columns.
    df_new = df_new.set_index(['GRID', 'TYPE']).stack().unstack(level=1)
    # Bring GRID back as a column and drop the leftover 'COUNT' index level.
    df_new = df_new.reset_index(level=['GRID']).reset_index(drop=True)
    df_new.index.name = None
    tables.append(df_new)

if not tables:
    raise FileNotFoundError(
        'No census file ending in ' + unit + '.txt found in ' + census_root
    )

# Single concatenation instead of growing the frame inside the loop.
df = pd.concat(tables, ignore_index=True)

# Index by grid code and clear the axis names left over from the reshape.
df = df.set_index('GRID')
df.index.name = None
df.columns.name = None
print(df.shape)

(30437, 66)


### 격자데이터와 지역데이터 병합
: 대전, 세종지역만 포함하는 격자 처리

In [4]:
# Read the administrative boundaries and bring both layers into the same
# CRS (EPSG:5181) so the intersection test compares like with like.
whole_area_adm = gpd.read_file('data/whole_adm_대전세종.gpkg')
whole_area_adm = whole_area_adm.to_crs(epsg=5181)
grid = grid.to_crs(epsg=5181)

# For every grid cell, record whether it intersects at least one
# administrative polygon. A boolean mask is built directly, so nothing
# depends on the position of a scratch column or on the index labels
# of `grid`.
intersects_area = np.asarray(
    [
        whole_area_adm.geometry.intersects(cell).any()
        for cell in tqdm(grid.geometry)
    ],
    dtype=bool,
)

# Keep only the intersecting cells, renumber the rows, and save the result.
grid = grid.loc[intersects_area].reset_index(drop=True)
grid.to_file('data/CENSUS/grid.gpkg', index=False)

100%|██████████| 39423/39423 [00:11<00:00, 3449.85it/s]


### 인구데이터와 결합

In [5]:
# Index the grid by its cell-code column so it can be joined with the
# census table on the index. The column name is derived from `unit`,
# the same way it is built when the grid is first assembled, so changing
# the unit does not leave a stale hard-coded '1K' here.
grid = grid.set_index('GRID_' + unit + '_CD')
grid.index.name = None

In [6]:
# Attach the census columns to the grid cells.
# This is an index-on-index left join: every grid row is kept, and cells
# without a matching census row get missing values.
census = grid.join(df, how='left')
census.shape

(1149, 67)

In [7]:
# Write the joined grid + census layer to disk, keeping the index
# (the grid cell code) in the output file.
census.to_file(filename='data/CENSUS/census.gpkg', index=True)