In [None]:
from konlpy.tag import Komoran

# Bind the tagger to a module-level name: the `lemmatize` helper in the next
# cell looks up `komoran`, which would otherwise be undefined.
komoran = Komoran()
komoran.pos('했다') # [('하', 'VV'), ('았', 'EP'), ('다', 'EC')]

In [None]:
def lemmatize(word):
    """Return the dictionary form of ``word`` if it starts with a predicate.

    The word is tagged with the module-level ``komoran`` tagger. When the
    first morpheme carries the tag 'VA' or 'VV', that morpheme is returned
    with the ending '다' appended.

    Args:
        word: The surface form to lemmatize.

    Returns:
        The first morpheme plus '다', or ``None`` when the tagger returns no
        morphemes or the first morpheme is tagged neither 'VA' nor 'VV'.
    """
    morphtags = komoran.pos(word)
    # An empty tagging result has no first morpheme to inspect.
    if not morphtags:
        return None
    morph, tag = morphtags[0]
    if tag in ('VA', 'VV'):
        return morph + '다'
    return None

In [None]:
import konlpy
from konlpy.tag import Okt

print(konlpy.__version__) # 0.5.1

# Tag two conjugated forms; the recorded outputs show each one kept whole
# as a single 'Verb' token.
okt = Okt()
okt.pos('했다') # [('했다', 'Verb')]
okt.pos('했더라도') # [('했더라도', 'Verb')]

In [None]:
import konlpy
print(konlpy.__version__) # 0.4.4

from konlpy.tag import Twitter

# The tagger has to be instantiated before the loop can call it.
twitter = Twitter()

# Print the tagging result for several conjugated forms of the same verb.
for word in ['했다', '했지만', '하면서도', '했던', '하니까']:
    print(twitter.pos(word))

In [None]:
# NOTE(review): these five lists look like the printed tagger output for the
# five words tagged in the preceding cell, kept here for reference — confirm.
# As code they are bare expressions with no effect.
[('했', 'Verb'), ('다', 'Eomi')]
[('했지만', 'Josa')]
[('하면', 'Verb'), ('서도', 'Noun')]
[('했', 'Verb'), ('던', 'Eomi')]
[('하니', 'Verb'), ('까', 'Eomi')]

In [None]:
def conjugate(stem, eomi):
    """Build a surface form by appending the ending ``eomi`` to ``stem``.

    No phonological change is applied; the two pieces are simply joined.
    """
    surface = stem + eomi
    return surface

conjugate('시작하', '는') # '시작하는'

In [None]:
def conjugate(stem, eomi):
    """Join ``stem`` and ``eomi``, merging a leading bare consonant if possible.

    The last character of ``stem`` and the first character of ``eomi`` are
    split with ``decompose``. When the stem's third component and the
    ending's second component are both ' ', the ending's first component is
    folded into the stem's last character via ``compose``; otherwise the two
    strings are concatenated unchanged.

    NOTE(review): assumes ``decompose`` yields (initial, medial, final) jamo
    with ' ' marking an absent slot — confirm against the helper.
    """
    stem_onset, stem_vowel, stem_coda = decompose(stem[-1])
    eomi_onset, eomi_vowel, _ = decompose(eomi[0])
    mergeable = stem_coda == ' ' and eomi_vowel == ' '
    if not mergeable:
        return stem + eomi
    merged = compose(stem_onset, stem_vowel, eomi_onset)
    return stem[:-1] + merged + eomi[1:]

In [None]:
# surface syllable -> set of (stem ending, eomi beginning) pairs
lemma_rules = {
    '란': {('랗', 'ㄴ')},
    '했': {('하', '았')},
}
# (stem ending, eomi beginning) -> set of surface syllables
conju_rules = {
    ('랗', 'ㄴ'): {'란'},
    ('하', '았'): {'했'},
}

def conjugate(stem, eomi, rules):
    """List candidate surface forms for ``stem`` + ``eomi``.

    Args:
        stem: The stem string (non-empty).
        eomi: The ending string (non-empty).
        rules: Mapping from (last char of stem, first char of eomi) to a
            collection of replacement strings for that boundary.

    Returns:
        A list whose first item is the plain concatenation, followed by one
        form per matching rule with the boundary characters replaced.
    """
    boundary = (stem[-1], eomi[0])
    head, tail = stem[:-1], eomi[1:]
    replaced = [head + form + tail for form in rules.get(boundary, {})]
    return [stem + eomi] + replaced

# The plain concatenation comes first, then the rule-based surface forms.
conjugate('파랗', 'ㄴ', conju_rules) # ['파랗ㄴ', '파란']

In [None]:
def _lemmatize(word, i, rules):
    key = word[i-1]
    lemmas = [(word[:i], word[i:])]
    for s, e in rules.get(key, {}):
        lemmas.append((word[:i-1] + s, e + word[i:]))
    return lemmas

# With i=2 the plain split is ('파란', ''); the rule for '란' restores ('파랗', 'ㄴ').
_lemmatize('파란', 2, lemma_rules) # [('파란', ''), ('파랗', 'ㄴ')]

In [None]:
def lemmatize(word, rules, adjectives, verbs, eomis):
    """Return the dictionary-validated (stem, eomi) analyses of ``word``.

    Candidates are generated with ``_lemmatize`` at every split position
    from 1 to ``len(word)`` and then filtered against the dictionaries.

    Args:
        word: The surface form to analyse.
        rules: Lemmatization rules passed through to ``_lemmatize``.
        adjectives: Collection of known adjective stems.
        verbs: Collection of known verb stems.
        eomis: Collection of known endings.

    Returns:
        A list of (stem, eomi) pairs, in generation order, whose stem is a
        known adjective or verb and whose eomi is a known ending.
    """
    # generate candidates
    candidates = []
    for i in range(1, len(word) + 1):
        candidates += _lemmatize(word, i, rules)
    # check dictionary: a stem from either predicate dictionary is accepted,
    # so verb stems are kept as well as adjective stems
    return [
        (stem, eomi)
        for stem, eomi in candidates
        if (stem in adjectives or stem in verbs) and eomi in eomis
    ]

In [None]:
# Surface substring (1 to 3 syllables) -> set of (stem ending, eomi beginning)
# pairs it can be restored to.
lemma_rules = {
    '했': {('하', '았')},
    '랬': {('랗', '았')},
    '추운': {('춥', '은')},
    '했다': {('하', '았다')},
    '가우니': {('갑', '니')},
}

# (stem ending, eomi beginning) -> set of surface substrings the pair can be
# realized as.
conju_rules = {
    ('하', '았'): {'했'},
    ('랗', '았'): {'랬'},
    ('춥', '은'): {'추운'},
    ('하', '았다'): {'했다'},
    ('갑', '니'): {'가우니'},
}

In [None]:
def get_lemma_candidates(word, rules):
    """Generate (stem, eomi) candidates for ``word``.

    For every position ``i`` three kinds of candidates are produced:

    * the plain split ``(word[:i+1], word[i+1:])``, unless it would leave an
      empty right part;
    * for each rule keyed by the single character ``word[i]``, the split
      with that character replaced by the rule's stem ending and the rule's
      eomi beginning prepended to the rest of the word;
    * the same for each rule keyed by the 2- or 3-character substring
      starting at ``i``, with the text after that substring appended.

    Args:
        word: The surface form to analyse.
        rules: Mapping from a surface substring (1 to 3 characters) to a
            collection of (stem ending, eomi beginning) pairs.

    Returns:
        A list of (stem, eomi) tuples. Candidates are not checked against
        any dictionary.
    """
    n = len(word)
    candidates = []
    for i, c in enumerate(word):
        l_ = word[:i]
        r = word[i+1:]

        # concatenation
        if i < n - 1:
            candidates.append((word[:i+1], r))

        # 1 syllable conjugation; a single pass over the rules, so each
        # candidate is emitted once
        for stem, eomi in rules.get(c, ()):
            candidates.append((l_ + stem, eomi + r))

        # 2 or 3 syllables conjugation
        for size in (2, 3):
            end = i + size
            # Near the end of the word the slice would be shorter than
            # `size` and re-match a shorter rule; skip those.
            if end > n:
                break
            # The remainder starts after the whole matched substring.
            for stem, eomi in rules.get(word[i:end], ()):
                candidates.append((l_ + stem, eomi + word[end:]))
    return candidates

In [None]:
from sejong_corpus_cleaner.simplier import eojeol_morphtags_to_lr

# Convert a morpheme-level analysis of one eojeol with the corpus helper.
eojeol_morphtags_to_lr(
    '로드무비였다',
    [
        ('로드', 'NNG'),
        ('무비', 'NNG'),
        ('이', 'VCP'),
        ('었', 'EP'),
        ('다', 'EC'),
    ],
    separate_xsv=False,
)

In [None]:
from soylemma import extract_rule

# Surface form, followed by the (word, tag) of its left and right parts.
eojeol = '로드무비였다'
lw, lt = '로드무비이', 'Adjective'
rw, rt = '었다', 'Eomi'

extract_rule(eojeol, lw, lt, rw, rt)