# 침수 API용 데이터 구축

In [62]:
import pandas as pd 
import glob
from tqdm import tqdm
import dask.dataframe as dd

def make_sido_code(row):
    return str(row["시군구코드"])[:2]

### (1) 건축물대장 기본개요

In [64]:
%%time

# 건축물대장 기본개요
columns = ["관리건축물대장PK","관리상위건축물대장PK","대장구분코드","대장구분코드명","대장종류코드","대장종류코드명", \
            "대지위치","도로명대지위치","건물명","시군구코드","법정동코드","대지구분코드","번","지","특수지명","블록","로트","외필지수", \
            "새주소도로코드","새주소법정동코드","새주소지상지하코드","새주소본번","새주소부번","지역코드","지구코드","구역코드", \
            "지역코드명","지구코드명","구역코드명","생성일자"]

basis = dd.read_csv("data/rawdata/building/기본개요/mart_djy_01.txt", sep = "\|", engine='python', dtype=str, \
                    on_bad_lines='skip', keep_default_na=False, encoding = "cp949", header=None, names=columns)
print(f"건축물대장 기본개요의 분할된 개수는 {basis.npartitions}개입니다.")

busan_basis = basis[["관리건축물대장PK", "관리상위건축물대장PK", "시군구코드"]]
busan_basis["시도코드"] = busan_basis.apply(make_sido_code, axis=1, meta=object)
busan_basis = busan_basis[busan_basis["시도코드"] == "26"].drop(["시군구코드"], axis=1)

# preprocessing
busan_basis["관리건축물대장PK"] = busan_basis["관리건축물대장PK"].map(lambda x: x.strip())
busan_basis["관리상위건축물대장PK"] = busan_basis["관리상위건축물대장PK"].map(lambda x: x.strip())
busan_basis = busan_basis.drop("시도코드", axis=1)

# convert dask to pandas
busan_basis = busan_basis.compute()

# save csv
busan_basis.to_csv("data/refined-data/busan-기본개요.csv", encoding="utf-8", index=False)

건축물대장 기본개요의 분할된 개수는 80개입니다.


### (2) 건축물대장 주택가격

In [63]:
%%time

# 건축물대장 주택가격

# names=columns
# columns = ["관리건축물대장PK","대장구분코드","대장구분코드명","대장종류코드","대장종류코드명","대지위치","도로명대지위치", \
#             "건물명","시군구코드","법정동코드","대지구분코드","번","지","특수지명","블록","로트","외필지수","새주소도로코드", \
#             "새주소지상지하코드","새주소본번","새주소부번","기준일자","주택가격","생성일자"]

price = dd.read_csv("data/rawdata/building/주택가격/mart_djy_08.txt", sep="\|", dtype=str, encoding='cp949',
                        on_bad_lines="skip", header=None, engine='python')

# the number of partitions
print(f"건축물대장의 주택가격이 분할된 파일 개수는 {price.npartitions}개입니다.")

# save partitions
# price.to_csv("data/rawdata/building/주택가격/csvs", encoding="utf-8")

price_df = price[[0,8,23,24]]
price_df.columns = ["관리상위건축물대장PK","시군구코드","주택가격","생성일자"]
price_df["시도코드"] = price_df.apply(make_sido_code, axis=1, meta=object)
price_df = price_df.loc[price_df["시도코드"] == "26"]

# preprocessing
price_df["관리상위건축물대장PK"] = price_df["관리상위건축물대장PK"].map(lambda x: x.strip())
price_df = price_df.map_partitions(lambda df: df.sort_values(by="생성일자", ascending=False))
price_df = price_df.drop(["시도코드"], axis=1).drop_duplicates()

# convert dask to pandas
busan_price_pdf = price_df.compute()

# save csv
busan_price_pdf.to_csv("data/refined-data/busan-price.csv", encoding="utf-8", index=False)

# row 개수
print(f"주택가격의  행수는 {busan_price_pdf.shape[0]}개입니다.")

건축물대장의 주택가격이 분할된 파일 개수는 416개입니다.


ValueError: Metadata inference failed in `drop_by_shallow_copy`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
KeyError("['Unnamed: 0'] not found in axis")

Traceback:
---------
  File "c:\Users\datahub\anaconda3\lib\site-packages\dask\dataframe\utils.py", line 173, in raise_on_meta_error
    yield
  File "c:\Users\datahub\anaconda3\lib\site-packages\dask\dataframe\core.py", line 5125, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "c:\Users\datahub\anaconda3\lib\site-packages\dask\dataframe\utils.py", line 972, in drop_by_shallow_copy
    df2.drop(columns=columns, inplace=True, errors=errors)
  File "c:\Users\datahub\anaconda3\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\datahub\anaconda3\lib\site-packages\pandas\core\frame.py", line 4906, in drop
    return super().drop(
  File "c:\Users\datahub\anaconda3\lib\site-packages\pandas\core\generic.py", line 4150, in drop
    obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  File "c:\Users\datahub\anaconda3\lib\site-packages\pandas\core\generic.py", line 4185, in _drop_axis
    new_axis = axis.drop(labels, errors=errors)
  File "c:\Users\datahub\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 6017, in drop
    raise KeyError(f"{labels[mask]} not found in axis")


In [21]:
# 생성일자 기준 정렬 후 drop_duplicates
busan_price = dd.read_csv("data/refined-data/busan-price.csv", encoding="utf-8", dtype=str)

busan_price = busan_price.map_partitions(lambda df: df.sort_values(by="생성일자", ascending=False))
busan_price = busan_price.drop(["Unnamed: 0","시도코드"], axis=1).drop_duplicates()

busan_price_df = busan_price.compute()
print(busan_price_df.shape)

busan_price_df.to_csv("data/refined-data/busan-price-unique-rows.csv", encoding="utf-8", index=False)

(9183677, 4)


### (3) 건축물대장 표제부

In [58]:
%%time

columns = ["관리건축물대장PK","대장구분코드","대장구분코드명","대장종류코드","대장종류코드명","대지위치", \
            "도로명대지위치","건물명","시군구코드","법정동코드","대지구분코드","번","지","특수지명", \
            "블록","로트","외필지수","새주소도로코드","새주소법정동코드","새주소지상지하코드","새주소본번", \
            "새주소부번","동명","주부속구분코드","주부속구분코드명","대지면적(M2)","건축면적(M2)", \
            "건폐율(%)","연면적","용적률산정연면적","용적률","구조코드","구조코드명","기타구조", \
            "주용도코드","주용도코드명","기타용도","지붕코드","지붕코드명","기타지붕","세대수(세대)", \
            "가구수(가구)","높이(M)","지상층수","지하층수","승용승강기수","비상용승강기수","부속건축물수", \
            "부속건축물면적","총동연면적","옥내기계식대수(대)","옥내기계식면적","옥외기계식대수(대)", \
            "옥외기계식면적","옥내자주식대수(대)","옥내자주식면적","옥외자주식대수(대)","옥외자주식면적", \
            "허가일","착공일","사용승인일","허가번호년","허가번호기관코드","허가번호기관코드명", \
            "허가번호구분코드","허가번호구분코드명","호수(호)","에너지효율등급","에너지절감율", \
            "에너지EPI점수","친환경건축물등급","친환경건축물인증점수","지능형건축물등급","지능형건축물인증점수", \
            "생성일자","내진설계적용여부","내진능력"]

building = dd.read_csv("data/rawdata/building/표제부/mart_djy_03.txt", sep = "\|", engine='python', dtype=str, \
                    on_bad_lines='skip', keep_default_na=False, encoding = "cp949", header=None, names=columns)
print(f"건축물대장 표제부의 분할된 개수는 {building.npartitions}개입니다.")

busan_building = building[["관리건축물대장PK","대장종류코드명","도로명대지위치","시군구코드","시군구코드", \
                     "대지면적(M2)","건축면적(M2)","건폐율(%)", "연면적","주용도코드명","가구수(가구)","높이(M)","지상층수","지하층수", \
                     "사용승인일","허가번호기관코드","내진설계적용여부", \
                     "친환경건축물등급","친환경건축물인증점수","지능형건축물등급","지능형건축물인증점수","내진설계적용여부","내진능력"]]

busan_building["시도코드"] = building.apply(make_sido_code, axis=1, meta=object)
busan_building = busan_building[busan_building["시도코드"] == "26"].drop(["시군구코드"], axis=1)

# preprocessing
busan_building["관리건축물대장PK"] = busan_building["관리건축물대장PK"].map(lambda x: x.strip())
busan_building = busan_building.drop("시도코드", axis=1)

# convert dask to pandas
busan_building = busan_building.compute()

# save csv
busan_building.to_csv("data/refined-data/busan-표제부.csv", encoding="utf-8", index=False)

건축물대장 기본개요의 분할된 개수는 47개입니다.
Wall time: 5min 20s


Unnamed: 0,관리건축물대장PK,대장종류코드명,도로명대지위치,대지면적(M2),건축면적(M2),건폐율(%),연면적,주용도코드명,가구수(가구),높이(M),...,사용승인일,허가번호기관코드,내진설계적용여부,친환경건축물등급,친환경건축물인증점수,지능형건축물등급,지능형건축물인증점수,내진설계적용여부.1,내진능력,시도코드
76,26530-5448,일반건축물,부산광역시 사상구 모라로56번길 28,0,64.1,0,128.2,공장,0,0,...,19670610,,,,0,,0,,,26
86,26140-22667,일반건축물,,0,37.54,0,37.54,단독주택,1,0,...,19850629,,,,0,,0,,,26
162,26470-20656,일반건축물,부산광역시 연제구 거제대로230번길 45-1,0,56.2,0,81.32,단독주택,1,0,...,19751218,,,,0,,0,,,26
172,26290-31275,일반건축물,부산광역시 남구 양지골로251번가길 25,0,75.77,0,75.77,단독주택,1,0,...,19781115,,,,0,,0,,,26
279,26470-8512,일반건축물,부산광역시 연제구 구락로153번길 68,0,82.06,0,284.22,단독주택,4,0,...,19900517,,,,0,,0,,,26


### (4) KISTI가 제공하는 침수된 건물 정보

In [52]:
# KISTI 침수된 건물 정보
path = "data/rawdata/flooding-kisti-data/*.csv"

# Merge all files
flood_df = pd.concat([pd.read_csv(f, encoding="utf-8", keep_default_na=False) for f in glob.glob(path)])
flood_df = flood_df.drop(["OBJECTID_1", "OBJECTID", "full_pk", "pnu_1", "Shape_Length", "Shape_Area"], axis=1)
flood_df["관리건축물대장PK"] = flood_df["관리건축물대장PK"].apply(lambda x: x.strip())
print(flood_df.shape)

# Convert pandas to dask dataframe
flood_dd = dd.from_pandas(flood_df, npartitions=3)
flood_dd.head()

(141129, 15)


Unnamed: 0,PNU,BD_MGT_SN,시군구코드,법정동코드,번,지,관리건축물대장PK,건물명,새주소도로코드,새주소법정동코드,새주소지상지하코드,동명칭,대지구분코드,Cent_X,Cent_Y
0,2653010700102740020,,26530,10700,274,20,26530-22997,,265304217371.0,10701.0,0,,1,381261,185051
0,2611012000100140072,2611012000100140072008066,26110,12000,14,72,26110-7752,,261104175158.0,12001.0,0,,1,384394,180636
0,2611012000100140072,2611012000100140072008066,26110,12000,14,72,26110-7752,,261104175158.0,12001.0,0,,1,384394,180636
0,2611012000100140072,2611012000100140072008066,26110,12000,14,72,26110-7752,,261104175158.0,12001.0,0,,1,384394,180636
0,2614012400105830003,2614012400105830003018981,26140,12400,583,3,26140-24148,,,,0,,1,383781,176883


In [53]:
# 건축물대장 기본개요
busan_basis = dd.read_csv("data/refined-data/busan-기본개요.csv", encoding="utf-8", dtype=str, keep_default_na=False)
# busan_basis = busan_basis.drop("시도코드", axis=1)
# busan_basis["관리건축물대장PK"] = busan_basis["관리건축물대장PK"].map(lambda x: x.strip())
# busan_basis["관리상위건축물대장PK"] = busan_basis["관리상위건축물대장PK"].map(lambda x: x.strip())

busan_basis.head()

Unnamed: 0,관리건축물대장PK,관리상위건축물대장PK
0,26380-4195,
1,26380-4200,
2,26380-4260,26380-168
3,26380-4292,26380-1169
4,26380-4293,26380-1169


In [None]:
merged =  flood_dd.merge(busan_basis, how="left", on=["관리건축물대장PK"])
print(f"병합된 데이터프레임의 행수는 {merged.shape[0].compute()}개입니다.")

merged.head()

In [61]:
# 건축물대장 주택가격
busan_price = dd.read_csv("data/refined-data/busan-price-unique-rows.csv", encoding="utf-8", dtype=str, keep_default_na=False)
busan_price.head()

Unnamed: 0,관리건축물대장PK,시군구코드,주택가격,생성일자
0,26440-100291670,26440,138000000,20220517
1,26710-100262383,26710,199000000,20220517
2,26710-100262402,26710,185000000,20220517
3,26710-100262400,26710,264000000,20220517
4,26710-100262399,26710,264000000,20220517


In [59]:
#건축물대장 표제부
busan_building = dd.read_csv("data/refined-data/busan-표제부.csv", encoding="utf-8", dtype=str, keep_default_na=False)
busan_building.head()

Unnamed: 0,관리건축물대장PK,대장종류코드명,도로명대지위치,대지면적(M2),건축면적(M2),건폐율(%),연면적,주용도코드명,가구수(가구),높이(M),...,사용승인일,허가번호기관코드,내진설계적용여부,친환경건축물등급,친환경건축물인증점수,지능형건축물등급,지능형건축물인증점수,내진설계적용여부.1,내진능력,시도코드
0,26530-5448,일반건축물,부산광역시 사상구 모라로56번길 28,0,64.1,0,128.2,공장,0,0,...,19670610,,,,0,,0,,,26
1,26140-22667,일반건축물,,0,37.54,0,37.54,단독주택,1,0,...,19850629,,,,0,,0,,,26
2,26470-20656,일반건축물,부산광역시 연제구 거제대로230번길 45-1,0,56.2,0,81.32,단독주택,1,0,...,19751218,,,,0,,0,,,26
3,26290-31275,일반건축물,부산광역시 남구 양지골로251번가길 25,0,75.77,0,75.77,단독주택,1,0,...,19781115,,,,0,,0,,,26
4,26470-8512,일반건축물,부산광역시 연제구 구락로153번길 68,0,82.06,0,284.22,단독주택,4,0,...,19900517,,,,0,,0,,,26
