# 4 задание по курсу формальных языков

In [1]:
! pip install pymorphy2 russian_tagsets sklearn_crfsuite conllu
! wget -O ru_syntagrus-ud-train.conllu "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu"
! wget -O ru_syntagrus-ud-test.conllu "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2023-06-21 00:29:59--  https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40736581 (39M) [text/plain]
Saving to: ‘ru_syntagrus-ud-train.conllu’


2023-06-21 00:30:00 (219 MB/s) - ‘ru_syntagrus-ud-train.conllu’ saved [40736581/40736581]

--2023-06-21 00:30:00--  https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubuserconte

## Функции для препроцессинга

In [2]:
from functools import lru_cache

import conllu
import sklearn_crfsuite

from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters
from tqdm import tqdm

In [3]:
# Shared pymorphy2 analyzer; the feature extractor calls morph.parse() on every token form.
morph = MorphAnalyzer()
# Converter applied to pymorphy2 tag strings to obtain UD 2.0 style tags
# ('opencorpora-int' -> 'ud20').
to_ud20 = converters.converter('opencorpora-int', 'ud20')

In [4]:
# Names of the per-analysis features, in the order they are written into the feature dict:
#   lemma       - normal (dictionary) form,
#   lemma[-3:]  - last three characters of the normal form (ending / suffix / root),
#   upos        - part of speech,
#   feats       - morphological features (case, number, gender, ...).
_FEATURE_KEYS = ('lemma', 'lemma[-3:]', 'upos', 'feats')

# Placeholder written for missing analyses and for positions outside the sentence.
_PAD_VALUE = '--'


@lru_cache(maxsize=None)
def _analyze_form(form, max_parses):
    """Return feature tuples for the first `max_parses` pymorphy2 analyses of `form`.

    Each element is `(lemma, lemma[-3:], upos, feats)`, where `upos` and `feats`
    come from converting the pymorphy2 tag with `to_ud20`.

    The result is memoised: every token is requested once for each context window
    that contains it (and again for every repetition of the word in the corpus),
    so caching avoids re-running the analyzer and the tag conversion.
    Assumes `morph.parse` is deterministic for a given form. The cache is
    unbounded and grows with the number of distinct word forms seen.
    """
    analyses = []
    for parse in morph.parse(form)[:max_parses]:
        # The converter output is unpacked as exactly two space-separated
        # fields; anything else raises ValueError (same as before).
        upos, word_feats = to_ud20(str(parse.tag)).split(' ')
        lemma = parse.normal_form
        analyses.append((lemma, lemma[-3:], upos, word_feats))
    return tuple(analyses)


def word_to_features_and_label(sentence, word_position, window_size=5, use_pos_forms=3):
    """Build the CRF feature dict and the gold label for one token of a sentence.

    For every position `i` in the half-open window
    `[word_position - window_size, word_position + window_size)` the first
    `use_pos_forms` pymorphy2 analyses of token `i` are encoded with the four
    features listed in `_FEATURE_KEYS`. Analyses that do not exist, and
    positions that fall outside the sentence, are filled with '--'.

    Feature keys have the form `<feature name><relative offset><analysis index>`,
    e.g. 'upos-10' is the part of speech of analysis 0 of the previous token.

    NOTE(review): the window is asymmetric - offsets run from `-window_size`
    to `window_size - 1`. This is kept unchanged because altering it would
    change the feature set the model is trained on; confirm whether a symmetric
    window was intended.

    Args:
        sentence: indexable sequence of tokens; each token is read via
            `token['form']` and `token['upos']`.
        word_position: index of the token to describe.
        window_size: half-width of the context window (default 5).
        use_pos_forms: number of analyses encoded per token (default 3).

    Returns:
        `(features, label)` where `features` is a dict of string features and
        `label` is `sentence[word_position]['upos']`.
    """
    features = {}
    label = sentence[word_position]['upos']
    padding = (_PAD_VALUE,) * len(_FEATURE_KEYS)

    for i in range(word_position - window_size, word_position + window_size):
        relative_offset = str(i - word_position)

        if 0 <= i < len(sentence):
            analyses = _analyze_form(sentence[i]['form'], use_pos_forms)
        else:
            # Position outside the sentence (an artefact of the window size).
            analyses = ()

        for pos_form_ind in range(use_pos_forms):
            # Fewer analyses than requested -> pad the remaining slots.
            values = analyses[pos_form_ind] if pos_form_ind < len(analyses) else padding
            key_suffix = relative_offset + str(pos_form_ind)
            for key, value in zip(_FEATURE_KEYS, values):
                features[key + key_suffix] = value

    return features, label

In [5]:
def get_dataset(connlu_iter):
    '''
    Turn an iterable of sentences into parallel feature / label lists.

    Returns a pair (X, y) with one entry per sentence, in input order:
        X: [ [ {feature dict of one word}, ... ], ... ]
        y: [ [ part-of-speech label of one word, ... ], ... ]

    Each word is described by word_to_features_and_label(sentence, position).
    '''
    features_by_sentence = []
    labels_by_sentence = []

    for sentence in connlu_iter:
        # One (features, label) pair per token, kept in token order.
        word_pairs = [
            word_to_features_and_label(sentence, word_position)
            for word_position in range(len(sentence))
        ]
        features_by_sentence.append([word_features for word_features, _ in word_pairs])
        labels_by_sentence.append([word_label for _, word_label in word_pairs])

    return features_by_sentence, labels_by_sentence

## Обучение модели

Будем использовать марковские случайные поля, а именно условные случайные поля — CRF (Conditional Random Fields).

In [6]:
%%time

# Build the training features and labels from the downloaded CoNLL-U file.
# get_dataset() is called inside the `with` block because it iterates over the
# parser object created from the open handle (presumably read lazily - keep the
# file open until the dataset is fully built).
with open('ru_syntagrus-ud-train.conllu', 'rt', encoding="utf8") as f:
    train_connlu_iter = conllu.parse_incr(f)
    X_train, y_train = get_dataset(train_connlu_iter)

CPU times: user 10min 59s, sys: 10.3 s, total: 11min 10s
Wall time: 11min 18s


In [7]:
# CRF sequence tagger trained with the 'lbfgs' algorithm, capped at 100 iterations.
# c1 / c2 are presumably the L1 / L2 regularization coefficients - see the
# sklearn-crfsuite API reference to confirm before tuning.
model = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [8]:
%%time
try:
    model.fit(X_train, y_train)
except AttributeError as fit_error:
    # Presumably a workaround for the sklearn-crfsuite / recent scikit-learn
    # incompatibility, where an AttributeError is raised although training itself
    # has completed (the model is used successfully in the cells below).
    # Only this error type is tolerated - and it is reported, not hidden - so that
    # genuine training failures and KeyboardInterrupt are no longer swallowed.
    print(f'Ignoring AttributeError raised by CRF.fit: {fit_error}')

CPU times: user 10min 50s, sys: 3.17 s, total: 10min 54s
Wall time: 10min 56s


In [9]:
# Show the labels known to the fitted model; the same list is passed as `labels=`
# to the metrics in the validation section.
print(list(model.classes_))

['NOUN', 'PUNCT', 'ADJ', 'PROPN', 'AUX', 'VERB', 'ADP', 'ADV', 'CCONJ', 'PART', 'PRON', 'DET', 'SCONJ', 'NUM', '_', 'INTJ', 'X', 'SYM']


## Валидация модели

In [10]:
%%time

# Build the evaluation features and labels the same way as for training.
# The file stored as 'ru_syntagrus-ud-test.conllu' is the one downloaded in the
# first cell; get_dataset() runs while the file handle is still open.
with open('ru_syntagrus-ud-test.conllu', 'rt', encoding="utf8") as f:
    test_connlu_iter = conllu.parse_incr(f)
    X_test, y_test = get_dataset(test_connlu_iter)

CPU times: user 3min 59s, sys: 3.19 s, total: 4min 2s
Wall time: 4min 4s


In [11]:
%%time

# Predict labels for the evaluation sentences; the result is flattened row by row
# in the next cell, i.e. it holds one label sequence per sentence.
y_pred = model.predict(X_test)

CPU times: user 14.2 s, sys: 58.5 ms, total: 14.2 s
Wall time: 14.4 s


In [12]:
# Flatten the per-sentence label lists into flat token-level lists so that the
# sklearn metrics below can compare predictions and gold labels element by element.
y_pred_flatten = [item for row in y_pred for item in row]
y_test_flatten = [item for row in y_test for item in row]

In [14]:
from sklearn import metrics

# Token-level F1 with average='weighted', restricted to the labels known to the model.
metrics.f1_score(
    y_test_flatten,
    y_pred_flatten,
    average='weighted',
    labels=list(model.classes_)
)

0.9681154394969362

In [15]:
# Per-class statistics (precision / recall / F1 / support) for every label known to the model.
print(
    metrics.classification_report(
        y_test_flatten,
        y_pred_flatten,
        labels=list(model.classes_)
    )
)

              precision    recall  f1-score   support

        NOUN       0.97      0.99      0.98     36238
       PUNCT       1.00      1.00      1.00     29186
         ADJ       0.95      0.96      0.96     15103
       PROPN       0.90      0.80      0.85      5473
         AUX       0.93      0.91      0.92      1390
        VERB       0.98      0.98      0.98     17110
         ADP       1.00      1.00      1.00     13717
         ADV       0.94      0.96      0.95      7783
       CCONJ       0.96      0.98      0.97      5672
        PART       0.95      0.90      0.92      5125
        PRON       0.95      0.96      0.95      7444
         DET       0.94      0.89      0.91      4265
       SCONJ       0.91      0.94      0.92      2865
         NUM       0.93      0.92      0.92      1734
           _       1.00      1.00      1.00       265
        INTJ       0.68      0.54      0.60        24
           X       0.56      0.04      0.07       134
         SYM       1.00    

```
"The tag X is used for words that for some reason cannot be assigned a real part-of-speech category".
```
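Вследствие этого на классе `X` и наблюдается резкое ухудшение качества (recall 0.04, f1-score 0.07): в эту категорию попадают разнородные слова, не имеющие общих признаков.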