# Census data Preprocessing

### Data Source : 통계지리정보서비스(https://sgis.kostat.go.kr/)

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import re

In [10]:
flderlist = os.listdir('data/CENSUS/Grid')
flderlist.remove('.DS_Store') # Grid folder list

# Unit(100M) 단위 격자 데이터 수집
unit = '100M'

grid = gpd.GeoDataFrame(columns = ['GRID_'+unit+'_','geometry'], crs='epsg:5179')
for f in tqdm(flderlist):
    filelist = os.listdir('data/CENSUS/Grid/'+f)
    idx = np.where([re.search(unit+'.shp$', i) for i in filelist])[0][0]
    filename = filelist[idx]
    gdf = gpd.read_file('data/CENSUS/Grid/'+f+'/'+filename)
    grid = pd.concat([grid, gdf], axis=0, ignore_index=True)
print(grid.shape)

100%|██████████| 4/4 [01:40<00:00, 25.21s/it]

(3785474, 2)





In [11]:
# 인구데이터 수집
df = pd.DataFrame()
censuslist = os.listdir('data/CENSUS')
idxs = np.where([re.search(unit+'.txt$', i) for i in censuslist])[0]
for i in idxs:
    df_new = pd.read_table('data/CENSUS/'+censuslist[i], sep='^', header=None)
    df_new.columns = ['YEAR','GRID','TYPE','COUNT']
    df_new = df_new[['GRID','TYPE','COUNT']]
    df_new = df_new.set_index(['GRID','TYPE']).stack().unstack(level=1)
    df_new = df_new.reset_index(level=['GRID']).reset_index(drop=True)
    df_new.index.name = None
    df = pd.concat([df, df_new], ignore_index=True)

df = df.set_index('GRID')
df.index.name = None
df.columns.name = None
print(df.shape)

(410420, 3)


### 격자데이터와 지역데이터 병합
: 대전, 세종지역만 포함하는 격자 처리

In [12]:
# read gpkg
whole_area_adm = gpd.read_file('data/whole_adm_대전세종.gpkg')
whole_area_adm = whole_area_adm.to_crs(epsg=5179)
# grid = grid.to_crs(epsg=5181)
grid['INTERSECTED'] = 0
print('crs changed')

for i in tqdm(range(len(grid))):
    if whole_area_adm.geometry.intersects(grid.geometry[i]).any():
        grid.iat[i, 2] = 1

grid = grid.loc[grid.INTERSECTED == 1,:]
grid = grid[grid.columns.drop('INTERSECTED')].reset_index(drop=True)
grid = grid.to_crs(epsg=5181)
grid.to_file('data/CENSUS/grid_100M.gpkg', index=False)

crs changed


100%|██████████| 3785474/3785474 [22:17<00:00, 2829.69it/s]


대전, 세종 각각 격자화

In [23]:
# read gpkg
grid = gpd.read_file('data/CENSUS/grid_100M.gpkg')
DJ_area_adm = whole_area_adm.loc[whole_area_adm.시도 == '대전', :]
DJ_area_adm = DJ_area_adm.to_crs(epsg=5181)
grid_DJ = grid.to_crs(epsg=5181)
grid_DJ['INTERSECTED'] = 0

for i in tqdm(range(len(grid_DJ))):
    if DJ_area_adm.geometry.intersects(grid_DJ.geometry[i]).any():
        grid_DJ.iat[i, 2] = 1

grid_DJ = grid_DJ.loc[grid_DJ.INTERSECTED == 1,:]
grid_DJ = grid_DJ[grid_DJ.columns.drop('INTERSECTED')].reset_index(drop=True)
grid_DJ.to_file('data/CENSUS/grid_DJ_100M.gpkg', index=False)

# read gpkg
SJ_area_adm = whole_area_adm.loc[whole_area_adm.시도 == '세종', :]
SJ_area_adm = SJ_area_adm.to_crs(epsg=5181)
grid_SJ = grid.to_crs(epsg=5181)
grid_SJ['INTERSECTED'] = 0

for i in tqdm(range(len(grid_SJ))):
    if SJ_area_adm.geometry.intersects(grid_SJ.geometry[i]).any():
        grid_SJ.iat[i, 2] = 1

grid_SJ = grid_SJ.loc[grid_SJ.INTERSECTED == 1,:]
grid_SJ = grid_SJ[grid_SJ.columns.drop('INTERSECTED')].reset_index(drop=True)
grid_SJ.to_file('data/CENSUS/grid_SJ_100M.gpkg', index=False)

100%|██████████| 102032/102032 [00:46<00:00, 2182.59it/s]
100%|██████████| 102032/102032 [00:29<00:00, 3459.50it/s]


인구데이터와 결합

In [15]:
# Join
census = grid.join(df)
census.shape

(102032, 4)

In [16]:
# save
census.to_file('data/CENSUS/census_100M.gpkg', index=True)

In [27]:
census

Unnamed: 0,geometry,to_in_001,to_in_007,to_in_008
다바848000,"POLYGON ((229763.414 299957.480, 229762.899 30...",,,
다바849000,"POLYGON ((229863.454 299957.995, 229862.938 30...",,,
다바850000,"POLYGON ((229963.493 299958.511, 229962.978 30...",,,
다바851000,"POLYGON ((230063.533 299959.027, 230063.017 30...",,,
다바852000,"POLYGON ((230163.572 299959.542, 230163.057 30...",,,
...,...,...,...,...
라바000286,"POLYGON ((244821.634 328647.621, 244821.115 32...",,,
라바001286,"POLYGON ((244921.675 328648.140, 244921.156 32...",,,
라바002286,"POLYGON ((245021.716 328648.658, 245021.197 32...",,,
라바000287,"POLYGON ((244821.115 328747.662, 244820.596 32...",,,


In [None]:
DJ = gpd.read_file('data/CENSUS/')