In [1]:
import io
import zipfile
from pathlib import Path
from tempfile import TemporaryDirectory, TemporaryFile
from typing import Mapping

import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm

from openeais import (bldrgst_dtypes, expos_dtypes, floor_dtypes,
                      openeais_to_parquet, recap_dtypes, title_dtypes)


In [None]:
# Glob pattern for each register's archive, keyed by the short dataset name
# used for the matching *_dtypes mapping.
archive_globs = {
    "bldrgst": "국토교통부_건축물대장_기본개요*.zip",
    "recap": "국토교통부_건축물대장_총괄표제부*.zip",
    "title": "국토교통부_건축물대장_표제부*.zip",
    "floor": "국토교통부_건축물대장_층별개요*.zip",
    "expos": "국토교통부_건축물대장_전유부*.zip",
}

# Print every matching archive, one dataset after another.
for pattern in archive_globs.values():
    for file in Path("data/").glob(pattern):
        print(file)


data\국토교통부_건축물대장_기본개요+(2022년+07월).zip
data\국토교통부_건축물대장_총괄표제부+(2022년+07월).zip
data\국토교통부_건축물대장_표제부+(2022년+07월).zip
data\국토교통부_건축물대장_층별개요+(2022년+07월).zip
data\국토교통부_건축물대장_전유부+(2022년+07월).zip


In [None]:
# Lexicographically last match; the file names embed the year and month.
sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"))[-1]


WindowsPath('data/국토교통부_건축물대장_기본개요+(2022년+07월).zip')

In [None]:
# Pick the lexicographically last bldrgst archive.
zpath = sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"))[-1]

# Decode only the first member as cp949 and show its first line.
with zipfile.ZipFile(zpath) as zf:
    for fname in zf.namelist()[:1]:
        with zf.open(fname) as bf, io.TextIOWrapper(bf, encoding="cp949") as f:
            print(f.readline())


45710-100181124|45710-4744|2|집합|4|전유부|전라북도 완주군 삼례읍 석전리 71-24번지| 전라북도 완주군 삼례읍 신수로 142|삼례읍 석전리 71-24 제1종근린생활시설 (석전리학동마을(대표:한병희))|45710|25026|0|0071|0024||||0|457103273043|25001|0|142|0|||||||20100619



In [None]:
# List the attributes exposed by the object TemporaryFile() hands back.
with TemporaryFile() as tf:
    attribute_names = dir(tf)
    print(attribute_names)


['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_closer', 'close', 'delete', 'file', 'name']


In [None]:
# Write bytes to a temporary file, rewind, and read them back.
with TemporaryFile() as tf:
    payload = b"Hello world!"
    tf.write(payload)
    tf.seek(0)  # back to the start so read() sees what was just written
    print(tf.read())


b'Hello world!'


In [None]:
# NOTE(review): unfinished cell. The inner `with` block below has no body, so
# this cell is a SyntaxError as written; complete it or delete it before
# running the notebook top to bottom.
zpath = sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"), reverse=True)[0]

with TemporaryFile() as tf:
    with zipfile.ZipFile(zpath) as zf:

In [None]:
# Show what the TemporaryDirectory context manager yields and its type.
with TemporaryDirectory() as tdir:
    tdir_type = type(tdir)
    print(tdir, tdir_type)


C:\Users\USER\AppData\Local\Temp\tmpt1iujjoj <class 'str'>


In [4]:
# Pick the lexicographically last bldrgst archive.
zpath = sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"))[-1]

# tdir is the temporary directory's path; it is created here but not used yet.
with TemporaryDirectory() as tdir, zipfile.ZipFile(zpath) as zf:
    # Member metadata: names plus compressed and uncompressed sizes.
    print(zf.infolist())


[<ZipInfo filename='mart_djy_01.txt' compress_type=deflate file_size=5125237721 compress_size=703468891>]


In [12]:
# Pick the lexicographically last bldrgst archive.
zpath = sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"))[-1]

with TemporaryDirectory() as tdir:
    # Unpack every member into the temporary directory, with a progress bar.
    with zipfile.ZipFile(zpath) as zf:
        members = tqdm(zf.infolist(), desc="Extracting zip")
        zf.extractall(path=tdir, members=members)

    # The extracted text files are header-less, pipe-delimited and cp949-encoded.
    paths = sorted(Path(tdir).glob("*.txt"))
    read_options = dict(
        encoding="cp949",
        header=None,
        sep="|",
        names=bldrgst_dtypes.keys(),
        dtype=bldrgst_dtypes,
    )
    # NOTE(review): tdir is removed when this block exits, while ddf presumably
    # reads the files lazily. Confirm ddf is not computed in a later cell.
    ddf = dd.read_csv(paths, **read_options)


Extracting zip: 100%|██████████| 1/1 [00:13<00:00, 13.31s/it]


In [24]:
def zip_to_parquet(
    path,
    savedir,
    dtypes: Mapping,
    member_pattern="*.txt",
    encoding="cp949",
    header=None,
    sep="|",
    index=None,
    **kwargs
):
    """Extract a zip of delimited text files and write them out as Parquet.

    The archive is unpacked into a temporary directory, the files matching
    ``member_pattern`` are read with ``dd.read_csv``, and the resulting dask
    dataframe is written with ``to_parquet``. Everything happens while the
    temporary directory still exists; it is removed before returning.

    Args:
        path: Zip archive to read.
        savedir: Destination passed to ``ddf.to_parquet``.
        dtypes: Mapping of column name to dtype. Its keys, in iteration
            order, become the column names.
        member_pattern: Glob pattern, relative to the extraction directory,
            selecting which extracted files to read.
        encoding: Text encoding of the extracted files.
        header: Forwarded to ``dd.read_csv``; ``None`` means no header row.
        sep: Field delimiter.
        index: Column to use as the index. ``None`` (the default) skips
            ``set_index``; any other value, including falsy labels such as
            ``0``, is honoured.
        **kwargs: Extra keyword arguments forwarded to ``dd.read_csv``.

    Raises:
        ValueError: If no extracted file matches ``member_pattern``.
    """
    with TemporaryDirectory() as tdir:
        with zipfile.ZipFile(path) as zf:
            zf.extractall(path=tdir, members=tqdm(zf.infolist(), desc="Extracting zip"))

        paths = sorted(Path(tdir).glob(member_pattern))
        if not paths:
            # Fail with a message that names the real cause instead of
            # passing an empty path list on to the CSV reader.
            raise ValueError(
                f"no extracted file from {path} matches pattern {member_pattern!r}"
            )

        ddf: dd.DataFrame = dd.read_csv(
            paths,
            encoding=encoding,
            header=header,
            sep=sep,
            names=list(dtypes),  # a concrete list rather than a dict-keys view
            dtype=dtypes,
            **kwargs
        )

        # Both steps run under the progress bar, and inside the temporary
        # directory's lifetime so the extracted files are still on disk.
        with ProgressBar():
            if index is not None:
                print("Setting index...")
                ddf = ddf.set_index(index)
            print("Saving...")
            ddf.to_parquet(savedir)


In [2]:
# Convert the lexicographically last bldrgst archive with the packaged helper;
# the call is the cell's last expression, so its return value is displayed.
bldrgst_archives = sorted(Path("data/").glob("국토교통부_건축물대장_기본개요*.zip"))
zpath = bldrgst_archives[-1]
openeais_to_parquet(zpath, "data/bldrgst", bldrgst_dtypes)


already done


WindowsPath('data/bldrgst')

In [3]:
# Display the archive path currently bound to zpath.
zpath

WindowsPath('data/국토교통부_건축물대장_기본개요+(2022년+07월).zip')

In [4]:
# Same directory that was passed to openeais_to_parquet as the output location.
ppath = Path("data/bldrgst")
ppath

WindowsPath('data/bldrgst')

In [5]:
# Inspect the archive's filesystem metadata (size and timestamps).
zpath.stat()

os.stat_result(st_mode=33206, st_ino=18295873486342750, st_dev=239204461, st_nlink=1, st_uid=0, st_gid=0, st_size=703469167, st_atime=1663289397, st_mtime=1663289397, st_ctime=1663289015)

In [7]:
# True when the zip was modified more recently than the parquet directory.
zpath.stat().st_mtime > ppath.stat().st_mtime

False

In [13]:
# The parquet output counts as current when it exists and its modification
# time is later than the source zip's. `and` short-circuits, so stat() is
# only called when ppath exists.
parquet_is_current = ppath.exists() and ppath.stat().st_mtime > zpath.stat().st_mtime
if parquet_is_current:
    print("nope")

nope
