In [None]:
def read_corpus(path):
    """Lazily read a whitespace-separated, CoNLL-like corpus sentence by sentence.

    Each non-blank line must contain at least five whitespace-separated
    columns: an ignored first column, the word form, the lemma, the
    part-of-speech and the tag line (``Name=Value`` pairs joined by ``|``,
    or ``_`` when there are no tags). Blank lines separate sentences.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Yields:
        One non-empty list per sentence; every element is a dict with the
        keys ``'Word'``, ``'POS'``, ``'Lemma'`` (lower-cased) and ``'Tags'``
        (dict of tag name to tag value).

    Raises:
        ValueError: If a line has fewer than five columns or a tag element
            does not contain ``=``.
    """
    with open(path, encoding='utf8') as file:
        sentence = []
        for line in file:
            if not line.strip():
                # Sentence boundary. Consecutive blank lines must not produce
                # empty sentences: they carry no tokens to train on.
                if sentence:
                    yield sentence
                    sentence = []
                continue

            _, word, lemma, pos, tagline, *_ = line.split()

            tags = {}
            if tagline != '_':
                # maxsplit=1 keeps any '=' inside the value intact.
                tags.update(elem.split('=', 1) for elem in tagline.split('|'))

            sentence.append({'Word': word, 'POS': pos, 'Lemma': lemma.lower(), 'Tags': tags})

        # The file may end without a trailing blank line; do not lose the
        # last sentence in that case.
        if sentence:
            yield sentence

In [3]:
# data is located at https://github.com/dialogue-evaluation/morphoRuEval-2017
# Both corpus files are read into one flat list of sentences.
corpus_paths = (
    'models/data/gikrya_new_train.out',
    'models/data/gikrya_new_test.out',
)

data = []
for corpus in corpus_paths:
    # read_corpus is a generator; `+=` drains it into the list.
    data += read_corpus(corpus)

In [4]:
import re
import unicodedata


def preprocess(word):
    """Normalise a surface word form before feature extraction.

    Applied in this order: strip surrounding whitespace, lower-case,
    replace ``_`` with a space, replace every digit with ``D``, replace the
    ``<emo>`` placeholder with ``.``.

    Args:
        word: Raw word form.

    Returns:
        The normalised string.
    """
    word = word.strip().lower().replace('_', ' ')
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    word = re.sub(r'\d', 'D', word)
    word = word.replace('<emo>', '.')
    return word


def is_useful_example(word):
    """Tell whether a corpus token should be used as a labelled example.

    A token is rejected when any of the following holds:
      * its POS is ``PUNCT``;
      * it carries the tag ``NumForm=Digit``;
      * it is one of two specific word forms tagged as ``VERB``;
      * its word form consists solely of punctuation characters (an empty
        word form is rejected by the same check);
      * its word form is a plain number, optionally with a ``.`` or ``,``
        fractional part.

    Args:
        word: Token dict with the keys ``'Word'``, ``'POS'`` and ``'Tags'``.

    Returns:
        ``True`` if the token should be kept, ``False`` otherwise.
    """
    word_form = word['Word']
    return (
        word['POS'] != 'PUNCT' and
        word['Tags'].get('NumForm') != 'Digit' and
        not (word['POS'] == 'VERB' and word_form in ('гуля', 'МАША')) and
        not all(unicodedata.category(ch)[0] == 'P' for ch in word_form) and
        # Raw string avoids the invalid '\d' escape of a plain literal.
        not re.match(r'\d+([.,]\d+)?$', word_form)
    )

In [5]:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

# Some verbs in the corpus are tagged Tense=Notpast. Resolve each of them to
# 'Pres' or 'Fut' using pymorphy2, or drop the Tense tag when no verb/gerund
# parse carries a present or future tense. Tokens are modified in place.
for sent in data:
    for word in sent:
        if not is_useful_example(word):
            continue
        if word['POS'] != 'VERB' or word['Tags'].get('Tense') != 'Notpast':
            continue

        # Hand-picked forms that are always mapped to the present tense
        # without consulting pymorphy2.
        if word['Word'].lower() in ('нет', 'нету', 'мятясь', 'внемлет', 'ебу', 'упоминаеться'):
            word['Tags']['Tense'] = 'Pres'
            continue

        # The first verb/gerund parse with a future or present tense wins.
        for parse in morph.parse(word['Word']):
            if parse.tag.POS not in ('VERB', 'GRND'):
                continue
            if parse.tag.tense == 'futr':
                word['Tags']['Tense'] = 'Fut'
                break
            if parse.tag.tense == 'pres':
                word['Tags']['Tense'] = 'Pres'
                break
        else:
            # No parse settled the tense: remove the tag altogether.
            word['Tags'].pop('Tense')

In [6]:
# Add an Aspect tag to every useful VERB token, taken from the first pymorphy2
# parse that is itself a VERB and has a perfective or imperfective aspect.
# Tokens without such a parse are left unchanged. Tokens are modified in place.
for sent in data:
    for word in sent:
        if not is_useful_example(word) or word['POS'] != 'VERB':
            continue

        for parse in morph.parse(word['Word']):
            if parse.tag.POS == 'VERB':
                if parse.tag.aspect == 'perf':
                    word['Tags']['Aspect'] = 'Perf'
                    break
                elif parse.tag.aspect == 'impf':
                    word['Tags']['Aspect'] = 'Imp'
                    break

In [7]:
# Useful adjectives without an explicit Variant tag get Variant=Full.
# An existing Variant value is never overwritten.
for sent in data:
    for word in sent:
        if is_useful_example(word) and word['POS'] == 'ADJ':
            word['Tags'].setdefault('Variant', 'Full')

In [8]:
from sklearn.preprocessing import LabelEncoder
from maru.grammeme import (
    Animacy,
    Aspect,
    Case,
    Degree,
    Gender,
    Mood,
    Number,
    NumericalForm,
    Person,
    PartOfSpeech,
    Tense,
    Variant,
    VerbForm,
    Voice,
)
from maru.tag import Tag

# Lower-cased tag category name -> maru grammeme type used by `to_tag` to
# convert the textual value of that category.
GRAMMEMES = dict(
    animacy=Animacy,
    aspect=Aspect,
    case=Case,
    degree=Degree,
    gender=Gender,
    mood=Mood,
    number=Number,
    numform=NumericalForm,
    person=Person,
    pos=PartOfSpeech,
    tense=Tense,
    variant=Variant,
    verbform=VerbForm,
    voice=Voice,
)



def to_tag(parts):
    """Build a maru ``Tag`` from ``'label=value'`` strings.

    Every label is looked up in ``GRAMMEMES`` and the matching grammeme type
    is called with the value; the results are passed to ``Tag`` as keyword
    arguments. If a label occurs more than once, the last value wins.

    Args:
        parts: Iterable of ``'label=value'`` strings.

    Returns:
        The constructed ``Tag``.

    Raises:
        ValueError: If a part does not split into exactly two pieces on ``=``.
        KeyError: If a label is not present in ``GRAMMEMES``.
    """
    pairs = (part.split('=') for part in parts)
    return Tag(**{label: GRAMMEMES[label](value) for label, value in pairs})


def get_class(word):
    """Return the ``Tag`` describing a corpus token.

    The tag is assembled from the token's POS plus all of its morphological
    tags, with tag names lower-cased to match the keys of ``GRAMMEMES``.

    Args:
        word: Token dict with the keys ``'POS'`` and ``'Tags'``.
    """
    parts = [f"pos={word['POS']}"]
    for name, value in word['Tags'].items():
        parts.append(f'{name.lower()}={value}')
    return to_tag(parts)

Using TensorFlow backend.


In [None]:
# tags: Tag -> class id, stored as a string and numbered from '1' in order
#       of first appearance.
# y:    one list per sentence holding, for every token, its class id or ''
#       when the token is not a useful example.
tags = {}
y = []

for sent in data:
    sentence_labels = []
    for word in sent:
        label = ''
        if is_useful_example(word):
            tag = get_class(word)
            # A falsy tag is stored as-is and gets no class id.
            label = tags.setdefault(tag, str(len(tags) + 1)) if tag else tag
        sentence_labels.append(label)
    y.append(sentence_labels)

In [11]:
len(y)

83150

In [12]:
len(tags)

343

In [13]:
import re

# Character -> integer id, numbered from 1 in order of first appearance over
# the preprocessed word forms of every token in `data`.
char_vocabulary = {}

for sent in data:
    for word in sent:
        for sym in preprocess(word['Word']):
            if sym not in char_vocabulary:
                char_vocabulary[sym] = len(char_vocabulary) + 1

In [14]:
char_vocabulary

{'ч': 1,
 'ь': 2,
 'я': 3,
 '-': 4,
 'т': 5,
 'о': 6,
 'р': 7,
 'у': 8,
 'к': 9,
 'а': 10,
 'л': 11,
 'е': 12,
 'г': 13,
 'м': 14,
 'н': 15,
 'п': 16,
 '.': 17,
 'д': 18,
 ',': 19,
 'з': 20,
 'в': 21,
 'ж': 22,
 'и': 23,
 'б': 24,
 'с': 25,
 'ц': 26,
 'ю': 27,
 'ш': 28,
 'ы': 29,
 'х': 30,
 'э': 31,
 'й': 32,
 'щ': 33,
 'ф': 34,
 'ё': 35,
 ':': 36,
 '—': 37,
 ' ': 38,
 'ъ': 39,
 'D': 40,
 ')': 41,
 '?': 42,
 '!': 43,
 '"': 44,
 '(': 45,
 ';': 46,
 '/': 47,
 '[': 48,
 ']': 49,
 '+': 50,
 '>': 51,
 '<': 52,
 "'": 53,
 '|': 54}

In [15]:
import maru.feature.extractor
import maru.feature.window

# Pymorphy-based feature extractor wrapped in maru's Cache.
# NOTE(review): `hypotheses` and `size` presumably bound the number of
# parses considered and the number of cached words — confirm against maru.
extractor = maru.feature.extractor.Cache(
    maru.feature.extractor.PymorphyExtractor(hypotheses=10_000_000),
    size=40_000,
)

In [16]:
from maru.feature.vocabulary import FeatureVocabulary

In [17]:
# Extracted features of every useful token, one list per token, in corpus
# order. Used only to train the feature vocabulary.
features = [
    list(extractor.extract(preprocess(word['Word'])))
    for sent in data
    for word in sent
    if is_useful_example(word)
]

In [18]:
# Build the feature vocabulary from the features of all useful tokens.
# NOTE(review): min_count=10 presumably drops features seen fewer than
# 10 times — confirm against maru's FeatureVocabulary.
grammeme_vocabulary = FeatureVocabulary.train(features, min_count=10)

In [19]:
len(grammeme_vocabulary)

599

In [20]:
from maru.vectorizer.sparse import SparseFeatureVectorizer

# Vectorizer over the trained feature vocabulary; `get_input` calls its
# `transform` on the extracted features of one sentence at a time.
vectorizer = SparseFeatureVectorizer(grammeme_vocabulary)

In [21]:
# Preprocessed word forms, one list per sentence. Every token is kept
# (including those rejected by is_useful_example) so that each sentence stays
# aligned position by position with its label list in `y`.
word_data = [[preprocess(word['Word']) for word in sent] for sent in data]

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the sentences for evaluation; random_state is fixed so the
# split is the same on every run.
train_data, test_data, train_labels, test_labels = train_test_split(word_data, y, test_size=0.05, random_state=12)

In [23]:
from keras.layers import Input, Embedding, BatchNormalization, Activation
from keras.layers.core import Dense, Reshape, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.merge import concatenate
from keras.models import Model

# Number of characters fed to the network per word; `get_input` pads or
# truncates every word to this length.
MAX_WORD_LENGTH = 12

# Character ids start at 1, so one extra slot is needed for id 0, which
# `get_input` uses for unknown characters and padding.
VOCABULARY_SIZE = len(char_vocabulary) + 1
# Derived from the label inventory instead of being hard-coded: class ids run
# from 1 to len(tags), and 0 is used for tokens without a label.
CLASS_COUNT = len(tags) + 1

# Dropout used for both the inputs and the recurrent state of the LSTMs.
RNN_DROPOUT = 0.2

# Character-level branch.
CHAR_EMBEDDING_SIZE = 24
CHAR_HIDDEN_LAYER_SIZE = 256
CHAR_OUTPUT_LAYER_SIZE = 256
CHAR_EMBEDDING_DROPOUT = 0.3

# Grammeme (pymorphy feature) branch.
GRAMMEME_EMBEDDING_SIZE = 128
GRAMMEME_DROPOUT = 0.3

# Size of the dense projection that feeds the first LSTM.
LSTM_INPUT_SIZE = 128

# Units per direction in each bidirectional LSTM.
WORD_LSTM_SIZE = 128

# Per-token dense layer placed before the softmax.
DENSE_SIZE = 128
DENSE_DROPOUT = 0.5


def create_grammeme_embedding():
    """Create the grammeme branch of the network.

    The input named ``'grammemes'`` holds, per token, a vector of length
    ``len(grammeme_vocabulary)``. It goes through dropout and a ReLU dense
    layer of size ``GRAMMEME_EMBEDDING_SIZE``.

    Returns:
        A ``(input_tensor, embedding_tensor)`` pair.
    """
    inputs = Input(shape=(None, len(grammeme_vocabulary)), name='grammemes')
    dropped = Dropout(GRAMMEME_DROPOUT)(inputs)
    projected = Dense(GRAMMEME_EMBEDDING_SIZE, activation='relu')(dropped)
    return inputs, projected


def create_char_embedding():
    """Create the character branch of the network.

    The input named ``'chars'`` holds ``MAX_WORD_LENGTH`` character ids per
    token. Each id is embedded, the per-token embeddings are flattened into
    a single vector, and the result goes through two ReLU dense layers with
    dropout in between.

    Returns:
        A ``(input_tensor, embedding_tensor)`` pair.
    """
    char_input = Input(shape=(None, MAX_WORD_LENGTH), name='chars')
    # A single Dropout layer instance, reused at every call site below.
    char_dropout = Dropout(CHAR_EMBEDDING_DROPOUT)
    char_embedding = Embedding(VOCABULARY_SIZE, CHAR_EMBEDDING_SIZE, name='char_embedding')
    char_embedding = TimeDistributed(char_embedding)(char_input)
    # Flatten the per-character embeddings of each token into one vector of
    # MAX_WORD_LENGTH * CHAR_EMBEDDING_SIZE values.
    char_embedding = Reshape((-1, MAX_WORD_LENGTH * CHAR_EMBEDDING_SIZE))(char_embedding)
    char_embedding = char_dropout(char_embedding)
    char_embedding = char_dropout(Dense(CHAR_HIDDEN_LAYER_SIZE, activation='relu')(char_embedding))
    # NOTE(review): dropout is applied a second time here, directly on the
    # already dropped-out output of the previous line — confirm this is intended.
    char_embedding = char_dropout(char_embedding)
    char_embedding = char_dropout(Dense(CHAR_OUTPUT_LAYER_SIZE, activation='relu')(char_embedding))
    return char_input, char_embedding


def create_network():
    """Build and compile the tagging model.

    The grammeme and character embeddings are concatenated, projected by a
    ReLU dense layer and passed through two stacked bidirectional LSTMs that
    return full sequences. Each time step then goes through dense, dropout,
    batch normalisation and ReLU, and finally a softmax over ``CLASS_COUNT``
    classes.

    Returns:
        The model, compiled with the ``rmsprop`` optimizer, categorical
        cross-entropy loss and the accuracy metric. Its inputs are the
        ``'grammemes'`` and ``'chars'`` tensors, in that order.
    """
    grammeme_input, grammeme_embedding = create_grammeme_embedding()
    char_input, char_embedding = create_char_embedding()

    embeddings = concatenate([grammeme_embedding, char_embedding], name='lstm_input')

    lstm_input = Dense(LSTM_INPUT_SIZE, activation='relu')(embeddings)
    
    # Two stacked bidirectional LSTMs; both return the whole sequence so that
    # every token gets its own output.
    lstm_1 = LSTM(WORD_LSTM_SIZE, dropout=RNN_DROPOUT, recurrent_dropout=RNN_DROPOUT, return_sequences=True, name='lstm_1')
    lstm_1 = Bidirectional(lstm_1)(lstm_input)
  
    lstm_2 = LSTM(WORD_LSTM_SIZE, dropout=RNN_DROPOUT, recurrent_dropout=RNN_DROPOUT, return_sequences=True, name='lstm_2')
    lstm_2 = Bidirectional(lstm_2)(lstm_1)
    
    # Per-token head: dense -> dropout -> batch normalisation -> ReLU.
    dense = TimeDistributed(Dense(DENSE_SIZE))(lstm_2)
    dense = TimeDistributed(Dropout(DENSE_DROPOUT))(dense)
    dense = TimeDistributed(BatchNormalization())(dense)
    dense = TimeDistributed(Activation('relu'))(dense)
    
    prob = Dense(CLASS_COUNT, activation='softmax')(dense)

    model = Model(inputs=[grammeme_input, char_input], outputs=prob)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [24]:
# Instantiate the compiled network that the training cell below fits.
model = create_network()

In [25]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
chars (InputLayer)              (None, None, 12)     0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 12, 24) 1320        chars[0][0]                      
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, None, 288)    0           time_distributed_1[0][0]         
__________________________________________________________________________________________________
dropout_2 (Dropout)             multiple             0           reshape_1[0][0]                  
                                                                 dense_2[0][0]                    
          

In [None]:
import tqdm
from keras.callbacks import Callback


class ModelEvaluation(Callback):
    """Periodically report tag and sentence accuracy on the held-out data.

    Every ``period`` epochs the model being trained predicts each sentence of
    the notebook-level ``test_data`` / ``test_labels`` one at a time (through
    ``iter_batches`` with a batch size of 1). Two numbers are printed: the
    share of correctly predicted tokens and the share of sentences in which
    every token is correct. Tokens without a label count as class 0.

    Args:
        period: Evaluate after every ``period``-th epoch. Defaults to 5.
    """

    def __init__(self, period=5):
        super().__init__()
        self.period = period

    def on_epoch_end(self, epoch, logs=None):
        # Imported locally so the callback does not depend on a numpy import
        # made by some other cell.
        import numpy

        if (epoch + 1) % self.period != 0:
            return

        predictions = []
        for sent, lab in tqdm.tqdm_notebook(zip(test_data, test_labels), total=len(test_data)):
            predictions.append(
                # self.model is the model being trained (set by Keras), so
                # the callback does not rely on a global `model` variable.
                self.model.predict_generator(
                    iter_batches([sent], [lab], 1),
                    steps=1,
                ).argmax(axis=2)
            )

        tag_acc = []
        sent_acc = []
        for pred, true in zip(predictions, test_labels):
            # Each prediction is a batch of one sentence.
            pred = pred[0]
            true = [int(x or 0) for x in true]

            sent_acc.append(all(x == y for x, y in zip(pred, true)))
            tag_acc.extend(x == y for x, y in zip(pred, true))

        print(f'Tag accuracy: {numpy.mean(tag_acc)}')
        print(f'Sentence accuracy: {numpy.mean(sent_acc)}')

In [27]:
import random

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


# Sentence-length buckets used by `iter_batches`: a batch only ever contains
# sentences whose lengths fall into the same range. Consecutive edges form the
# half-open ranges [1, 7), [7, 15), [15, 26), [26, 41), [41, 51), [51, 1000000).
_bucket_edges = [1, 7, 15, 26, 41, 51, 1000000]
buckets = [range(start, stop) for start, stop in zip(_bucket_edges, _bucket_edges[1:])]


def get_input(batch):
    """Turn a batch of sentences into padded network inputs and targets.

    Args:
        batch: Sequence of ``(words, labels)`` pairs. ``words`` is the list
            of preprocessed word strings of one sentence; ``labels`` holds
            one class id per word (a falsy label is treated as class 0).

    Returns:
        A pair ``(inputs, targets)``. ``inputs`` is a dict with the keys
        ``'grammemes'`` (vectorised extractor features per word) and
        ``'chars'`` (character ids per word, unknown characters mapped to 0,
        every word padded or truncated to ``MAX_WORD_LENGTH``). ``targets``
        holds the one-hot encoded labels. All three are padded to the
        longest sentence of the batch with all-zero rows.
    """
    targets = pad_sequences(
        [
            to_categorical(
                [int(label or 0) for label in labels],
                num_classes=CLASS_COUNT,
            )
            for _, labels in batch
        ],
        value=[0] * CLASS_COUNT,
    )

    char_ids = pad_sequences(
        [
            pad_sequences(
                [[char_vocabulary.get(sym, 0) for sym in word] for word in words],
                maxlen=MAX_WORD_LENGTH,
                padding='pre',
            )
            for words, _ in batch
        ],
        value=[0] * MAX_WORD_LENGTH,
    )

    grammeme_vectors = pad_sequences(
        [
            vectorizer.transform([extractor.extract(word) for word in words]).todense()
            for words, _ in batch
        ],
        value=[0] * len(grammeme_vocabulary),
    )

    return {'grammemes': grammeme_vectors, 'chars': char_ids}, targets


def iter_batches(data, labels, batch_size):
    """Endlessly generate bucketed, shuffled batches of network inputs.

    On every pass the ``(sentence, labels)`` pairs are shuffled and then
    distributed over the length ranges in ``buckets``. A batch is emitted as
    soon as a bucket holds ``batch_size`` sentences; at the end of the pass
    the partially filled buckets are emitted as smaller batches. One pass can
    therefore produce more than ``ceil(len(data) / batch_size)`` batches.

    Args:
        data: Sentences, each a list of preprocessed words.
        labels: Label lists aligned with ``data``.
        batch_size: Maximum number of sentences per batch.

    Yields:
        The result of ``get_input`` for each batch.

    Raises:
        AssertionError: If a sentence length falls into no bucket (for
            example, an empty sentence).
    """
    data_labels = list(zip(data, labels))

    # Without this guard an empty data set would spin forever in the loop
    # below without yielding anything.
    if not data_labels:
        return

    while True:  
        random.shuffle(data_labels)
        
        batches = [([], sizes) for sizes in buckets]
        
        for x in data_labels:
            for batch, size in batches:
                if len(x[0]) in size:
                    batch.append(x)
                    if len(batch) == batch_size:
                        yield get_input(batch)
                        batch.clear()
                    break
            else:
                # Report the length itself, not the whole sentence.
                raise AssertionError(f'Bucket not found for sentence of length {len(x[0])}')
        # Flush whatever is left in the buckets at the end of the pass.
        for batch, _ in batches:
            if batch:
                yield get_input(batch)
                batch.clear()

In [29]:
import math

from keras.callbacks import ModelCheckpoint, TensorBoard
from keras_tqdm.tqdm_notebook_callback import TQDMNotebookCallback

BATCH_SIZE = 256

# NOTE(review): iter_batches also emits the partially filled buckets at the
# end of each pass, so these step counts do not necessarily correspond to
# exactly one pass over the data — confirm this is acceptable.
train_steps = int(math.ceil(len(train_data) / BATCH_SIZE))
validation_steps = int(math.ceil(len(test_data) / BATCH_SIZE))

training_callbacks = [
    TQDMNotebookCallback(),
    ModelEvaluation(),
    # One checkpoint file per epoch, named after the epoch and its val_loss.
    ModelCheckpoint('{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss'),
    TensorBoard(log_dir='rnn_logs'),
]

model.fit_generator(
    generator=iter_batches(train_data, train_labels, BATCH_SIZE),
    steps_per_epoch=train_steps,

    validation_data=iter_batches(test_data, test_labels, BATCH_SIZE),
    validation_steps=validation_steps,

    epochs=50,
    verbose=0,

    callbacks=training_callbacks,
)

HBox(children=(IntProgress(value=0, description='Training', max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 2', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 3', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 4', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9740025978845797
Sentence accuracy: 0.7493987493987494


HBox(children=(IntProgress(value=0, description='Epoch 5', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 6', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 7', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 8', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 9', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9742809426609761
Sentence accuracy: 0.7532467532467533


HBox(children=(IntProgress(value=0, description='Epoch 10', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 11', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 12', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 13', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 14', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9747448506216366
Sentence accuracy: 0.7556517556517557


HBox(children=(IntProgress(value=0, description='Epoch 15', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 16', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 17', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 18', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 19', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9747819632584895
Sentence accuracy: 0.7566137566137566


HBox(children=(IntProgress(value=0, description='Epoch 20', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 21', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 22', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 23', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 24', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9750788643533123
Sentence accuracy: 0.7619047619047619


HBox(children=(IntProgress(value=0, description='Epoch 25', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 26', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 27', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 28', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 29', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9749489701243274
Sentence accuracy: 0.7602212602212602


HBox(children=(IntProgress(value=0, description='Epoch 30', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 31', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 32', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 33', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 34', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9748561885321952
Sentence accuracy: 0.7602212602212602


HBox(children=(IntProgress(value=0, description='Epoch 35', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 36', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 37', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 38', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 39', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9753200964928558
Sentence accuracy: 0.7633477633477633


HBox(children=(IntProgress(value=0, description='Epoch 40', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 41', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 42', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 43', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 44', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9755242159955465
Sentence accuracy: 0.765993265993266


HBox(children=(IntProgress(value=0, description='Epoch 45', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 46', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 47', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 48', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 49', max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4158), HTML(value='')))

Tag accuracy: 0.9751345333085916
Sentence accuracy: 0.7623857623857624



<keras.callbacks.History at 0x225676736d8>

In [32]:
from keras.models import load_model

# Re-save a checkpoint written by ModelCheckpoint during training as the
# packaged tagger model. The checkpoint file name (epoch 45, val_loss 0.07)
# is specific to the training run recorded above.
model_best = load_model('45-0.07.h5')
model_best.save('maru/model/rnn/tagger.h5')

In [33]:
import pickle
from sklearn.externals import joblib

# Persist a freshly constructed PymorphyExtractor (not the Cache-wrapped
# `extractor` used during training) next to the model.
# NOTE(review): `sklearn.externals.joblib` is not available in newer
# scikit-learn releases — confirm the pinned version or use `joblib` directly.
joblib.dump(maru.feature.extractor.PymorphyExtractor(hypotheses=10000000), 'maru/model/rnn/extractor.joblib', compress=True, protocol=pickle.HIGHEST_PROTOCOL)

['maru/model/rnn/extractor.joblib']

In [34]:
import json

# Save the feature vocabulary as human-readable JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
with open('maru/model/rnn/grammeme_vocabulary.json', 'w', encoding='utf8') as f:
    json.dump(grammeme_vocabulary, f, indent=4, ensure_ascii=False)

In [35]:
# Persist the inverse label mapping: integer class id -> Tag. The ids are
# stored as strings in `tags`, hence the int() conversion.
joblib.dump({int(num): tag for tag, num in tags.items()}, 'maru/model/rnn/tags.joblib', compress=True, protocol=pickle.HIGHEST_PROTOCOL)

['maru/model/rnn/tags.joblib']

In [None]:
# Save the character -> id mapping as JSON; ensure_ascii=False keeps the
# Cyrillic characters readable in the file.
with open('maru/model/rnn/char_vocabulary.json', 'w', encoding='utf8') as f:
    json.dump(char_vocabulary, f, indent=4, ensure_ascii=False)