In [82]:
# !pip install russian-tagsets
# !pip install sklearn-crfsuite
# !pip install pymorphy2
# !pip install conllu
# !pip install scikit-learn==0.23.2
# !pip install gitpython

In [83]:
from russian_tagsets import converters
from pymorphy2 import MorphAnalyzer
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
import conllu
import git
import os

In [84]:
# repo_url = "https://github.com/UniversalDependencies/UD_Russian-SynTagRus.git"
# repo_dir = os.path.join(os.getcwd(), "UD_Russian-SynTagRus")
# git.Repo.clone_from(repo_url, repo_dir)

file_path = os.path.join(repo_dir, "ru_syntagrus-ud-train-c.conllu")

In [85]:
morph_analyzer = MorphAnalyzer()
to_ud = converters.converter('opencorpora-int', 'ud20')
infile = open(file_path, "rt", encoding="UTF8")
train_conllu_iter = conllu.parse_incr(infile)

In [86]:
def process_sentence(sent):
    result = []
    for token in sent:
        try:
            word = token['form']
        except:
            word = token
        parses = morph_analyzer.parse(word)
        top_3_parses = parses[:3]
        ud_parses = []
        for parse in top_3_parses:
            ud_parse = to_ud(str(parse.tag))
            parse_l = ud_parse.split()
            lemma = parse.normal_form
            upos = parse_l[0]
            feats = parse_l[1]
            ud_parses.append({'lemma': lemma, 'upos': upos, 'feats': feats})
        result.append((word, ud_parses))
    return result

In [87]:
def create_features(processed_sentence):
    result = []
    for pos in range(len(processed_sentence)):
        word_features = {}
        for j in range(-1, 2):
            shift = pos + j
            if 0 <= shift < len(processed_sentence):
                word, ud_parses = processed_sentence[shift]
                for k, parse in enumerate(ud_parses):
                    prefix = f'{j}_{k}_'
                    word_features[prefix + 'lemma'] = parse['lemma']
                    word_features[prefix + 'upos'] = parse['upos']
                    # word_features[prefix + 'feats'] = parse['feats']
        result.append(word_features)
    return result

In [88]:
X_train = []
y_train = []

for idx, sent in enumerate(train_conllu_iter):
    processed_sentence = process_sentence(sent)
    sent_features = create_features(processed_sentence)
    sent_labels = [token['upos'] for token in sent]
    X_train.append(sent_features)
    y_train.append(sent_labels)

In [93]:
X_train[0]

[{'0_0_lemma': 'мой',
  '0_0_upos': 'DET',
  '0_1_lemma': 'мыть',
  '0_1_upos': 'VERB',
  '1_0_lemma': 'мать',
  '1_0_upos': 'NOUN',
  '1_1_lemma': 'мать',
  '1_1_upos': 'NOUN'},
 {'-1_0_lemma': 'мой',
  '-1_0_upos': 'DET',
  '-1_1_lemma': 'мыть',
  '-1_1_upos': 'VERB',
  '0_0_lemma': 'мать',
  '0_0_upos': 'NOUN',
  '0_1_lemma': 'мать',
  '0_1_upos': 'NOUN',
  '1_0_lemma': ',',
  '1_0_upos': 'PUNCT'},
 {'-1_0_lemma': 'мать',
  '-1_0_upos': 'NOUN',
  '-1_1_lemma': 'мать',
  '-1_1_upos': 'NOUN',
  '0_0_lemma': ',',
  '0_0_upos': 'PUNCT',
  '1_0_lemma': 'анна',
  '1_0_upos': 'PROPN'},
 {'-1_0_lemma': ',',
  '-1_0_upos': 'PUNCT',
  '0_0_lemma': 'анна',
  '0_0_upos': 'PROPN',
  '1_0_lemma': 'всеволодович',
  '1_0_upos': 'PROPN'},
 {'-1_0_lemma': 'анна',
  '-1_0_upos': 'PROPN',
  '0_0_lemma': 'всеволодович',
  '0_0_upos': 'PROPN',
  '1_0_lemma': 'мохов',
  '1_0_upos': 'PROPN',
  '1_1_lemma': 'мохов',
  '1_1_upos': 'PROPN',
  '1_2_lemma': 'мохов',
  '1_2_upos': 'PROPN'},
 {'-1_0_lemma': 'всев

In [90]:
y_train[0]

['DET',
 'NOUN',
 'PUNCT',
 'PROPN',
 'PROPN',
 'PROPN',
 'PUNCT',
 'PROPN',
 'PUNCT',
 'PUNCT',
 'VERB',
 'ADJ',
 'NOUN',
 'ADJ',
 'NOUN',
 'PUNCT']

In [123]:
testfile_path = os.path.join(repo_dir, "ru_syntagrus-ud-test.conllu")
testfile = open(testfile_path, "rt", encoding="UTF8")

In [124]:
X_test, y_test = [], []

test_conllu_iter = conllu.parse_incr(testfile)
for idx, sent in enumerate(test_conllu_iter):
    processed_sentence = process_sentence(sent)
    sent_features = create_features(processed_sentence)
    sent_labels = [token['upos'] for token in sent]
    X_test.append(sent_features)
    y_test.append(sent_labels)

In [125]:
print(y_test[0])
print(X_test[0])

['ADP', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'PROPN', 'PUNCT', 'NOUN', 'ADP', 'PROPN', 'VERB', 'ADP', 'NUM', 'NOUN', 'PUNCT']
[{'0_0_lemma': 'в', '0_0_upos': 'ADP', '0_1_lemma': 'в', '0_1_upos': 'NOUN', '0_2_lemma': 'в', '0_2_upos': 'NOUN', '1_0_lemma': 'советский', '1_0_upos': 'ADJ', '1_1_lemma': 'советский', '1_1_upos': 'ADJ'}, {'-1_0_lemma': 'в', '-1_0_upos': 'ADP', '-1_1_lemma': 'в', '-1_1_upos': 'NOUN', '-1_2_lemma': 'в', '-1_2_upos': 'NOUN', '0_0_lemma': 'советский', '0_0_upos': 'ADJ', '0_1_lemma': 'советский', '0_1_upos': 'ADJ', '1_0_lemma': 'период', '1_0_upos': 'NOUN', '1_1_lemma': 'период', '1_1_upos': 'NOUN'}, {'-1_0_lemma': 'советский', '-1_0_upos': 'ADJ', '-1_1_lemma': 'советский', '-1_1_upos': 'ADJ', '0_0_lemma': 'период', '0_0_upos': 'NOUN', '0_1_lemma': 'период', '0_1_upos': 'NOUN', '1_0_lemma': 'время', '1_0_upos': 'NOUN', '1_1_lemma': 'время', '1_1_upos': 'NOUN', '1_2_lemma': 'время', '1_2_upos': 'NOUN'}, {'-1_0_lemma': 'период', '-1_0_upos': 'NOUN', '-1_1_lemma': 'период',

In [126]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    keep_tempfiles=None,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [127]:
y_pred = crf.predict(X_test)

In [154]:
import warnings
from seqeval.metrics import classification_report as seq_classification_report

warnings.filterwarnings("ignore", category=UserWarning)

print(seq_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ART       0.84      0.88      0.86      4575
        CONJ       0.93      0.97      0.95      8632
          DJ       0.91      0.91      0.91     13443
          DP       1.00      1.00      1.00     15061
          DV       0.92      0.86      0.89      7526
         ERB       0.98      0.97      0.98     17019
          ET       0.81      0.94      0.87      4016
         NTJ       0.94      0.65      0.77        23
         OUN       0.96      0.97      0.96     31428
         RON       0.95      0.93      0.94      7641
        ROPN       0.87      0.68      0.76      4669
          UM       0.90      0.81      0.85      2502
        UNCT       1.00      1.00      1.00     27479
          UX       0.92      0.95      0.93      1483
          YM       0.99      1.00      1.00       165
           _       0.53      0.92      0.67       283

   micro avg       0.95      0.95      0.95    145945
   macro avg       0.90   

In [155]:
def disambiguate_sentence(sentence, crf):
    processed_sentence = process_sentence(sentence.split())
    sent_features = create_features(processed_sentence)
    predicted_labels = crf.predict_single(sent_features)
    result = []
    for word, label in zip(sentence.split(), predicted_labels):
        result.append((word, label))
    return result

In [162]:
sentence = "Мама мыла раму, а я остался без мыла"

In [163]:
disambiguated_sentence = disambiguate_sentence(sentence, crf)
disambiguated_sentence

[('Мама', 'NOUN'),
 ('мыла', 'VERB'),
 ('раму,', 'X'),
 ('а', 'CCONJ'),
 ('я', 'PRON'),
 ('остался', 'VERB'),
 ('без', 'ADP'),
 ('мыла', 'NOUN')]