In [1]:
from collections import deque

# Jamo tables used to take Hangul syllables apart and put them back together.
# The position of each jamo inside its table is the index used in the
# code-point arithmetic of split_char / merge_char, so the order must not change.
srts = list('ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')  # initial consonants
mids = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')  # medial vowels
# Final consonants; the leading '' stands for "no final consonant".
ends = [''] + list('ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')


def is_hangul(char):
    """Tell whether ``char`` is a precomposed Hangul syllable.

    The code point of the single character ``char`` is compared against the
    inclusive range that runs from '가' to '힣'.

    Returns:
        True when the character lies inside that range, False otherwise.
    """
    first_syllable, last_syllable = ord('가'), ord('힣')
    return first_syllable <= ord(char) <= last_syllable


def split_char(char):
    """Decompose a precomposed Hangul syllable into its jamo.

    Only the first character of ``char`` is examined.

    Args:
        char: A string whose first character is a syllable in '가'..'힣'.

    Returns:
        A tuple ``(initial, medial, final)`` taken from ``srts``, ``mids`` and
        ``ends``; ``final`` is '' when the syllable has no final consonant.

    Raises:
        ValueError: If the first character is not a precomposed Hangul
            syllable. Without this check, code points just below '가' would
            be decomposed into unrelated jamo instead of failing.
    """
    key = ord(char[0]) - 44032  # offset from '가' (U+AC00)
    if not 0 <= key < 19 * 21 * 28:
        raise ValueError('not a Hangul syllable: {!r}'.format(char[0]))

    # offset == (initial * 21 + medial) * 28 + final, so two integer divmods
    # recover the three indices exactly, with no float arithmetic involved.
    rest, end = divmod(key, 28)
    srt, mid = divmod(rest, 21)
    return srts[srt], mids[mid], ends[end]


def merge_char(srt, mid, end):
    """Compose one Hangul syllable from an initial, a medial and a final jamo.

    Args:
        srt: Initial consonant, looked up in ``srts``.
        mid: Medial vowel, looked up in ``mids``.
        end: Final consonant, looked up in ``ends`` ('' for no final).

    Returns:
        The single-character string of the composed syllable.

    Raises:
        ValueError: From ``list.index`` when a jamo is missing from its table.
    """
    initial_idx = srts.index(srt)
    medial_idx = mids.index(mid)
    final_idx = ends.index(end)
    # 44032 is the code point of '가'; each initial spans 21 * 28 syllables
    # and each medial spans 28.
    return chr(44032 + (initial_idx * 21 + medial_idx) * 28 + final_idx)


def split_text(text):
    """Replace every Hangul syllable in ``text`` by its jamo sequence.

    Characters outside the '가'..'힣' range are copied through unchanged.

    Returns:
        The resulting string.
    """
    return ''.join(
        ''.join(split_char(ch)) if is_hangul(ch) else ch
        for ch in text
    )


def merge_text(text):
    """Recombine a jamo sequence into Hangul syllables.

    The input is scanned from left to right. At each position:

    * initial + medial + final is merged into one syllable, provided the
      final is not directly followed by a medial (in that case it is left to
      start the next syllable);
    * otherwise initial + medial is merged into a syllable without a final;
    * otherwise the element is copied through unchanged.

    Returns:
        The recombined string.
    """
    chars = list(text)
    total = len(chars)
    merged = []
    pos = 0

    while pos < total:
        remaining = total - pos
        starts_syllable = (
            remaining >= 2
            and chars[pos] in srts
            and chars[pos + 1] in mids
        )
        takes_final = (
            starts_syllable
            and remaining >= 3
            and chars[pos + 2] in ends
            and (remaining == 3 or chars[pos + 3] not in mids)
        )

        if takes_final:
            merged.append(merge_char(chars[pos], chars[pos + 1], chars[pos + 2]))
            pos += 3
        elif starts_syllable:
            merged.append(merge_char(chars[pos], chars[pos + 1], ''))
            pos += 2
        else:
            merged.append(chars[pos])
            pos += 1

    return ''.join(merged)

In [2]:
def parse_tagged(tagged):
    """Expand a tagged analysis into one ``(letter, tag)`` pair per letter.

    ``tagged`` is a '+'-joined sequence of ``surface/TAG`` morphemes, e.g.
    ``'나/NP+는/JX'``. Every letter of a surface inherits the morpheme's tag.

    The string is scanned morpheme by morpheme rather than split on every
    '+' and '/', so that a morpheme whose surface is itself '+' or '/'
    (``'+/SW'``, ``'//SP'``) is kept. Blind splitting turns those surfaces
    into empty strings and the morpheme disappears from the result.

    Args:
        tagged: The tagged analysis, without spaces.

    Returns:
        A list of ``(letter, tag)`` tuples in reading order; empty for ''.

    Raises:
        ValueError: If some part of ``tagged`` is not of the form
            ``surface/TAG``.
    """
    import re  # local import keeps this definition self-contained

    # surface (shortest possible, at least one char) '/' tag (no '/' or '+'),
    # followed by the '+' separator or the end of the string.
    morpheme = re.compile(r'(.+?)/([^/+]+)(?:\+|$)')

    umjol_tags = list()
    pos = 0
    while pos < len(tagged):
        found = morpheme.match(tagged, pos)
        if found is None:
            raise ValueError('malformed tagged text: {!r}'.format(tagged[pos:]))
        surface, tag = found.groups()
        umjol_tags.extend((letter, tag) for letter in surface)
        pos = found.end()

    return umjol_tags


def process_ejol(ejol):
    """Align the tagged letters of one corpus line with its original letters.

    ``ejol`` must consist of exactly three tab-separated fields: a code
    (unused here), the original text, and the tagged analysis that is handed
    to ``parse_tagged``.

    Returns:
        A tuple ``(origin, tagged, result)``:

        * ``origin`` - list of the characters of the original text;
        * ``tagged`` - list of ``(letter, tag)`` pairs from ``parse_tagged``;
        * ``result`` - one integer per entry of ``tagged``, meant as the
          index in ``origin`` that the tagged letter is attached to.

    Raises:
        ValueError: If ``ejol`` does not split into exactly three fields.
        Exception: With message 'Not Enough Tagged' when ``origin`` has more
            letters than ``tagged`` and contains a non-Hangul character.
    """
    code, origin, tagged = ejol.split('\t')
    origin = [l for l in origin]
    tagged = parse_tagged(tagged)

    # More original letters than tagged letters is only tolerated when every
    # original letter is a Hangul syllable.
    if len(origin) > len(tagged):
        if False in list(map(lambda x: is_hangul(x), origin)):
            raise Exception('Not Enough Tagged')

    # First pass: walk the tagged letters with a cursor `i` into `origin`.
    # A tagged letter that cannot be placed yet is recorded as -1.
    result = list()
    i = 0
    for j, tag in enumerate(tagged):
        # The first tagged letter is always attached to origin[0], without
        # comparing the letters.
        if j == 0:
            result.append(0)
            i += 1
            continue

        # The cursor has run past the end of origin: nothing left to match.
        if i >= len(origin):
            result.append(-1)
            continue

        if origin[i] == tag[0]:
            # Exact match at the cursor.
            result.append(i)
            i += 1
        elif tag[0] in origin[i:]:
            # The letter occurs further right; move the cursor there.
            i = origin.index(tag[0], i)
            # A position more than 2 past the previous placed index is
            # rejected as a match.
            # NOTE(review): the cursor is not moved back when the match is
            # rejected - confirm that this is intended.
            if result[-1] != -1 and i - result[-1] > 2:
                result.append(-1)
            else:
                result.append(i)
                i += 1
        else:
            result.append(-1)

    # Second pass: replace the -1 placeholders and repair the alignment.
    # Note that `i` is reused here as the position in `result`/`tagged`.
    for i, r in enumerate(result):
        if r == -1:
            # Reuse the previous index when the following index is already
            # taken somewhere in `result`; otherwise take the following index.
            if result[i - 1] + 1 in result:
                result[i] = result[i - 1]
            else:
                result[i] = result[i - 1] + 1

        # Clamp to the last valid index of origin.
        if result[i] > len(origin) - 1:
            result[i] = len(origin) - 1

        # The entry two positions back shares this entry's index. Unless the
        # letters at position i - 2 are both Hangul and agree in initial and
        # medial jamo, the earlier entry is shifted one index to the left.
        # NOTE(review): origin is indexed with the tagged position (i - 2),
        # not with result[i - 2]; this can go past the end of origin when
        # tagged is longer - confirm that this is intended.
        # NOTE(review): the decrement turns an index of 0 into -1.
        if i >= 2 and result[i - 2] == result[i]:
            if is_hangul(origin[i - 2]) and is_hangul(tagged[i - 2][0]) and \
                            split_char(origin[i - 2])[0:2] == split_char(tagged[i - 2][0])[0:2]:
                pass
            else:
                result[i - 2] -= 1

    return origin, tagged, result


def parse_sentence(stc):
    """Build the per-letter annotation of one sentence.

    Args:
        stc: An object whose ``.text`` holds one corpus line per eojeol,
            separated by newlines. All spaces are removed before parsing.

    Returns:
        A list of ``{'letter': ..., 'pos': [{'letter': ..., 'tag': ...}, ...]}``
        dicts. It opens with a '<시작>' (ZST) entry, closes with a '<끝>'
        (ZED) entry, and a ' ' (ZSP) entry separates consecutive eojeols.
        Letters that received no tag get a single 'ZNO' entry.
    """
    raw = stc.text.replace(' ', '').strip()
    sentence = [{'letter': '<시작>', 'pos': [{'letter': '<시작>', 'tag': 'ZST'}]}]

    for line in raw.split('\n'):
        origin, tagged, result = process_ejol(line)
        letters = [{'letter': ch, 'pos': []} for ch in origin]

        # Attach every tagged letter to the original letter it was aligned to.
        for idx, pair in enumerate(tagged):
            letters[result[idx]]['pos'].append({'letter': pair[0], 'tag': pair[1]})

        # Original letters left without any tag describe themselves.
        for entry in letters:
            if not entry['pos']:
                entry['pos'].append({'letter': entry['letter'], 'tag': 'ZNO'})

        sentence += letters
        sentence.append({'letter': ' ', 'pos': [{'letter': ' ', 'tag': 'ZSP'}]})

    # Drop the separator that was added after the last eojeol.
    del sentence[-1]
    sentence.append({'letter': '<끝>', 'pos': [{'letter': '<끝>', 'tag': 'ZED'}]})
    return sentence

In [3]:
from bs4 import BeautifulSoup
from pathlib import Path
import simplejson as json
from tqdm import tqdm

# Convert the corpus: every <p> element that parse_sentence accepts becomes
# one JSON line in output.txt.
korpus_dir = Path('corpus')
# Sorted so that the order of lines in output.txt does not depend on the
# order in which the filesystem happens to list the directory.
file_list = sorted(
    file for file in korpus_dir.iterdir()
    if file.name.endswith('.txt') and not file.name.startswith('.')
)

# Paragraphs that fail to parse are still skipped (best effort), but they are
# counted and reported instead of being swallowed without a trace.
written = 0
skipped = 0

# The context managers close the output file and every corpus file, also
# when reading or parsing raises part-way through.
with open('output.txt', 'w', encoding='utf-8') as output:
    for file in tqdm(file_list):
        with file.open('r', encoding='utf-16 le') as c_file:
            parsed = BeautifulSoup(c_file.read(), 'html.parser')
        ps = parsed.find_all('p')

        for p in ps:
            try:
                line = json.dumps(parse_sentence(p), ensure_ascii=False)
            except Exception:
                skipped += 1
                continue
            # Written outside the try so that a failing write is not
            # mistaken for an unparseable paragraph.
            output.write(line + '\n')
            written += 1

print('wrote {} sentences, skipped {} unparseable paragraphs'.format(written, skipped))

100%|██████████| 478/478 [08:20<00:00, 10.42it/s]
