In [107]:
import re
import os
from pathlib import Path
from typing import Union
from warnings import simplefilter
from datetime import date
from shutil import disk_usage
from math import floor
from multiprocessing import Pool
from multiprocessing import cpu_count
from zipfile import ZipFile
from sys import getsizeof

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
test = Path(r'C:\Users\wonhyeong\downloads\20211020_10-Q_edgar_data_1456802_0001477932-21-007463.txt') #ixbrl


In [None]:
# append all file in the path to given df


In [108]:
def clean_html(file: Path) -> str:
    docu_end = '</DOCUMENT'
    try:
        with open(file, 'r+', encoding='utf-8') as f:
            # read and rewrite
            text = f.read()
            found = text.find(docu_end)
            if found == -1:
                found = text.find(docu_end.lower())
            if found == -1:
                raise ValueError('No document end tag found')
            text = text[:text.find(docu_end)]
            f.close()
    except:
        try:
            f.close()
        except:
            pass
        return file, None, False
    return file, text, True

def make_folder(file: Union[str, Path]) -> Path:
    name = '_'.join([str(file.name),'cleaned'])
    cleaned = file.parent / name
    done= cleaned / 'done'
    errored = cleaned / 'errored'
    cleaned.mkdir(exist_ok=True)
    done.mkdir(exist_ok=True)
    errored.mkdir(exist_ok=True)
    return done, errored

def save_on_success(file: Path, text: str, done: Path) -> None:
    print(f'Saving {file.name}')
    with open(file, 'w', encoding='utf-8') as f:
        f.write(text)
        f.close()
    file.rename(done / file.name)
    

def save_on_failure(file: Path, errored_dir: Path) -> None:
    print(f'Error in {file.name}')
    file.rename(errored_dir / file.name)

def get_free_space():
    """return free space of disk as GB"""
    _,_,free = disk_usage('C:\\')
    result = round(free / 1024**3, 2)
    print(result)
    return result

def unzip_file(zip_file: Union[str, Path]) -> None:
    """Unzip a file to the same directory."""
    with ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(zip_file.parent)
    return None

In [None]:
qtr = Path(r'C:\Users\wonhyeong\workings\data\10X\QTR4')

if __name__ == '__main__':
    root = Path(r'D:\10-X')
    root_dir = [x for x in root.iterdir() if x.is_dir()]
    # list of files
    sub_dir = [list(f.iterdir()) for f in root_dir if f.is_dir()]
    sub_dir = sub_dir[1:]
    for quarters in sub_dir:
        for qtr in quarters:
            files = list(qtr.iterdir())
            # zip_list = [x for x in files if x.is_file() and x.suffix == '.zip']
            done, errored = make_folder(qtr)
            # free_space = get_free_space()
            # if free_space > 500:
            #     unzip_file(zip_list[0])
            #     zip_list = zip_list[1:]
            #     zip_list[0].unlink()
            with Pool(cpu_count() * 2) as p:
                result = p.imap(clean_html, files, chunksize=100) 
                for file, text, success in result:
                    if success:
                        save_on_success(file, text, done)
                    else:
                        save_on_failure(file, errored)
            print('Done')

## 혹시 안되면 아래 버전으로 실행해 주세요!

In [None]:
qtr = Path(r'C:\Users\wonhyeong\workings\data\10X\QTR4')

if __name__ == '__main__':
    root = Path(r'D:\10-X')
    root_dir = [x for x in root.iterdir() if x.is_dir()]
    # list of files
    sub_dir = [list(f.iterdir()) for f in root_dir if f.is_dir()]
    sub_dir = sub_dir[1:]
    for quarters in sub_dir:
        for qtr in quarters:
            files = list(qtr.iterdir())
            done, errored = make_folder(qtr)
            for file in files:
                path, text, success = clean_html(file)
                if success:
                    save_on_success(path, text, done)
                else:
                    save_on_failure(path, errored)
            print('Done')

In [129]:
def get_name_of(path: Path, delta: int) -> str:
    """Return the basename of the path."""
    name = '_'.join([path.parent.parent.name, path.parent.name, str(delta)]) + '.parquet'
    return name

def is_over_512MB(df: pd.DataFrame, file: Path) -> bool:
    """Return True if the size of the dataframe is over 128MB."""
    mb_512 = 512 * (1024**2) 
    if ( getsizeof(df) + file.stat().st_size ) < mb_512:
        return False
    return True

def to_parquet(df: pd.DataFrame, next: Path, path: Path) -> pd.DataFrame:
    """Save a dataframe as a parquet file."""
    if is_over_512MB(df, next):
        for i in range(1, 1000):
            name = path / get_name_of(path, i)
            if not name.exists():
                break
        df.to_parquet(name, compression='snappy', engine='pyarrow')
        print(f'Saving {name}')
        df = pd.DataFrame()
    return df

def get_header(txt: str) -> dict:
    """Return the header of the txt file."""
    def find_acc(txt: str) -> str:
        return txt[txt.find('ACCESSION NUMBER:') + 17:txt.find('CONFORMED SUBMISSION TYPE:')].strip()
    def find_cik(txt: str) -> str:
        return txt[txt.find('CENTRAL INDEX KEY:') + 18:txt.find('STANDARD INDUSTRIAL CLASSIFICATION:')].strip()
    def find_irs(txt: str) -> str:
        return txt[txt.find('IRS NUMBER:') + 11:txt.find('STATE OF INCORPORATION:')].strip()
    def find_form(txt: str) -> str:
        return txt[txt.find('CONFORMED SUBMISSION TYPE:') + 26:txt.find('PUBLIC DOCUMENT COUNT:')].strip()
    def find_name(txt: str) -> str:
        return txt[txt.find('COMPANY CONFORMED NAME:') + 23:txt.find('CENTRAL INDEX KEY:')].strip()
    def find_date(txt: str) -> str:
        return txt[txt.find('FILED AS OF DATE:') + 17:txt.find('DATE AS OF CHANGE:')].strip()
    def identify_markup(txt: str) -> str:
        text = txt.find('<TEXT>') + 6
        markup = txt[text: text+20].strip().casefold()
        if len(markup) == 0:
            raise ValueError('No markup found.')
        elif 'HTML'.casefold() in markup:
            return 'html'
        elif 'XML'.casefold() in markup:
            return 'xml'
        elif 'XBRL'.casefold() in markup:
            return 'xbrl'
        else:
            return 'text'
            
    header = txt[txt.find('<SEC-HEADER>') + 12:txt.find('</SEC-HEADER>')]
    header_dict = {
        'acc': find_acc(header),
        'cik': find_cik(header),
        'irs': find_irs(header),
        'form': find_form(header),
        'coname': find_name(header),
        'date': find_date(header),
        'markup': identify_markup(txt)
    }
    return header_dict

def get_content(txt: str) -> dict:
    """Return the content of a file."""
    content = txt[ txt.find('<TEXT>') + 6:txt.find( '</TEXT>' ) ].strip()
    return { 'text': content }

def get_single_row(file: Union[str, Path]) -> pd.DataFrame:
    """Return a dataframe of a single row."""
    txt = file.read_text(encoding='utf-8')
    header = get_header(txt)
    content = get_content(txt)
    # merge two dictionaries
    df = pd.DataFrame([{**header, **content}], )
    return df

def append_df(df: pd.DataFrame, file: Union[str, Path]) -> pd.DataFrame:
    """Return a dataframe from a file."""
    return df.append(get_single_row(file), ignore_index=True)

In [130]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
qtr = Path(r'C:\Users\wonhyeong\workings\data\10X\QTR4_cleaned\done')
files = list(qtr.iterdir())
li = []
for file in files:
    if file.name.endswith('.parquet'):
        print(f'Loading {file.name}, size is {file.stat().st_size / (1024**2):.2f}MB')
        li.append(pd.read_parquet(file))

In [126]:
for i in li:
    print(f'Loading, size is {getsizeof(i) / (1024**2):.2f}MB')

Loading, size is 85.13MB
Loading, size is 114.14MB
Loading, size is 93.40MB
Loading, size is 77.04MB
Loading, size is 92.70MB
Loading, size is 93.79MB
Loading, size is 114.79MB
Loading, size is 72.87MB
Loading, size is 37.37MB
Loading, size is 35.88MB
Loading, size is 119.68MB
Loading, size is 109.04MB
Loading, size is 118.37MB
Loading, size is 105.49MB
Loading, size is 110.55MB
Loading, size is 111.26MB
Loading, size is 106.65MB
Loading, size is 110.62MB
Loading, size is 112.55MB
Loading, size is 105.23MB
Loading, size is 92.29MB
Loading, size is 115.86MB


In [131]:
# sample test
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
qtr = Path(r'C:\Users\wonhyeong\workings\data\10X\QTR4_cleaned\done')

if __name__ == '__main__':
    files = list(qtr.iterdir())
    done, errored = make_folder(qtr)
    df = pd.DataFrame()
    for file in files:
        df = to_parquet(df, file, done)
        df = append_df(df, file)
    print('Done')

Saving C:\Users\wonhyeong\workings\data\10X\QTR4_cleaned\done_cleaned\done\QTR4_cleaned_done_cleaned_1.parquet
Saving C:\Users\wonhyeong\workings\data\10X\QTR4_cleaned\done_cleaned\done\QTR4_cleaned_done_cleaned_2.parquet
Done


In [132]:
path = Path(r'C:\Users\wonhyeong\workings\data\10X\QTR4_cleaned\done_cleaned\done')
files = list(path.iterdir())
li = [pd.read_parquet(file) for file in files]

In [135]:
for i in li:
    print(getsizeof(i) / (1024**2))

510.42223072052
509.6137046813965
