## Prepare normalized texts

In [10]:
import glob
import numpy as np
import re

def get_claims_texts(fname):
    """Read a claims file and split it into individual claim texts.

    The whole file is read into memory and split on every run of seven
    consecutive digits; the digits themselves are discarded.

    Args:
        fname: Path of the text file to read.

    Returns:
        list[str]: The pieces of text between the seven-digit separators,
        in file order. The piece before the first separator is included
        even when it is empty.
    """
    # NOTE(review): the file is opened with the platform default encoding,
    # as before - confirm the data files match it (presumably UTF-8).
    with open(fname) as f:
        all_claims_text = f.read()
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    return re.split(r"\d{7}", all_claims_text)

# Category name -> list of claim texts. The category name is the matched
# path with its first 5 characters ("1551/") and last 4 (".txt") removed.
claims_texts = {
    fname[5:-4]: list(get_claims_texts(fname))
    for fname in glob.glob("1551/*.txt")
}


import langid

def is_uk(text):
    """Return True when the first element of ``langid.classify(text)`` is 'uk'."""
    return langid.classify(text)[0] == 'uk'

# Keep only the claims that pass the ``is_uk`` check, category by category.
for cat, cat_texts in claims_texts.items():
    prev_len = len(cat_texts)
    claims_texts[cat] = [claim for claim in cat_texts if is_uk(claim)]
    #print(f'{cat}: {len(claims_texts[cat])}/{prev_len}')


# Position in this list is the integer label of the category.
y2cat = list(claims_texts)

def cat2y(cat):
    """Return the integer label of ``cat``, i.e. its index in ``y2cat``."""
    return y2cat.index(cat)


import random

TRAIN_SIZE = 0.7   # fraction of every category that goes to the train set
SPLIT_SEED = 42    # fixed seed so the split is the same on every run

# Private RNG: the split is reproducible and does not depend on (or disturb)
# the state of the global ``random`` generator.
split_rng = random.Random(SPLIT_SEED)

X_train_texts, Y_train = [], []
X_test_texts, Y_test = [], []
for cat in claims_texts:
    y = cat2y(cat)
    # Shuffle a copy so that re-running this cell starts from the same
    # ``claims_texts`` order and therefore yields the same split.
    shuffled = list(claims_texts[cat])
    split_rng.shuffle(shuffled)
    train_size = int(len(shuffled) * TRAIN_SIZE)
    # Per-category split: the first ``train_size`` shuffled texts are train,
    # the remainder is test; both get the category's label ``y``.
    X_train_texts.extend(shuffled[:train_size])
    Y_train.extend([y] * train_size)
    X_test_texts.extend(shuffled[train_size:])
    Y_test.extend([y] * (len(shuffled) - train_size))


from stop_words import get_stop_words
import tokenize_uk
import pymorphy2
import re

# Morphological analyzer configured with lang='uk'; text2norm_words uses it
# to replace each word with the normal form of its first parse.
morph = pymorphy2.MorphAnalyzer(lang='uk')
# Stop words requested for 'ukrainian'; text2norm_words drops words found here.
stop_words = get_stop_words('ukrainian')

def text2norm_words(text):
    """Tokenize ``text`` and reduce it to a list of normalized words.

    A token is kept only if it is longer than 3 characters and purely
    alphabetic. Kept tokens are lower-cased, dropped if they appear in
    ``stop_words``, and finally replaced by the ``normal_form`` of their
    first ``morph.parse`` result.

    Returns:
        list[str]: The normalized words in their original order, or
        ``['']`` when no token survives the filters.
    """
    normalized = []
    for token in tokenize_uk.tokenize_uk.tokenize_words(text):
        # f1: 0.3 -> 0.36
        if len(token) <= 3 or not token.isalpha():
            continue
        lowered = token.lower()

        # f1: 0.36 -> 0.39
        if lowered in stop_words:
            continue
        normalized.append(morph.parse(lowered)[0].normal_form)
    #TODO: filter by POS

    #words = list(set(words)) -> f1 -= 0.04
    return normalized if normalized else ['']

# Replace every raw text with its list of normalized words.
# NOTE: the names are rebound in place, so this must run exactly once per split.
X_train_texts = list(map(text2norm_words, X_train_texts))
X_test_texts = list(map(text2norm_words, X_test_texts))

In [11]:
import json
# Persist the normalized texts and their labels so later cells can reload them.
with open('xy_texts.json', 'w') as f:
    data = {
        'X_train_texts': X_train_texts,
        'X_test_texts': X_test_texts,
        'Y_train': Y_train,
        'Y_test': Y_test,
    }
    f.write(json.dumps(data))

In [8]:
import json
# Reload the normalized texts and labels written to xy_texts.json.
with open('xy_texts.json') as f:
    data = json.loads(f.read())

train_texts, test_texts = data['X_train_texts'], data['X_test_texts']
Y_train, Y_test = data['Y_train'], data['Y_test']

In [11]:
from gensim.models import KeyedVectors

# Word vectors are read from a text-format (binary=False) word2vec file.
uk_vectors_file = 'news.lowercased.tokenized.word2vec.300d'
uk_vectors = KeyedVectors.load_word2vec_format(uk_vectors_file, binary=False)

def word2vec(word):
    """Return the vector stored for ``word`` in ``uk_vectors``.

    Words that are not in ``uk_vectors`` map to a zero vector of length 300.
    """
    if word in uk_vectors:
        return np.array(uk_vectors[word])
    return np.zeros(300)

def text2vecs(text):
    """Convert a text into the list of its word vectors.

    Args:
        text: Either a raw string, which is first normalized with
            ``text2norm_words``, or an already normalized list of words
            such as the entries loaded from ``xy_texts.json``.

    Returns:
        list: One ``word2vec`` vector per word, in word order.
    """
    # The texts stored in xy_texts.json are already token lists; running them
    # through text2norm_words again would hand a list to the tokenizer.
    words = text2norm_words(text) if isinstance(text, str) else text
    return [word2vec(word) for word in words]

# One list of word vectors per text, for the train and the test split.
train_vecs_sequences = list(map(text2vecs, train_texts))
test_vecs_sequences = list(map(text2vecs, test_texts))

## Baseline classifier
Text -> average(words_vectors)

In [12]:
import numpy as np

def average_sequence(sequence, first_n):
    """Average the first ``first_n`` vectors of ``sequence``.

    Args:
        sequence: List of equally sized numeric vectors.
        first_n: Maximum number of leading vectors to take into account.

    Returns:
        numpy.ndarray: Element-wise mean (float64) of the first
        ``min(len(sequence), first_n)`` vectors. When there is nothing to
        average (empty sequence or ``first_n <= 0``) a zero vector of
        length 300 is returned.
    """
    count = min(len(sequence), first_n)
    if count <= 0:
        return np.zeros(300)
    # Divide by the number of vectors used: a plain sum would scale with the
    # text length instead of being an average.
    return np.mean(np.asarray(sequence[:count], dtype=float), axis=0)

def vectorize_sequences(train_vecs):
    """Stack a list of vectors into one 2-D float array, one vector per row.

    The number of columns is taken from the length of the first vector.
    NOTE: a later cell of this notebook defines another function with the
    same name and a different signature, which replaces this one.
    """
    n_rows = len(train_vecs)
    row_width = len(train_vecs[0])
    results = np.zeros((n_rows, row_width))
    for row_idx in range(n_rows):
        results[row_idx, :] = train_vecs[row_idx]
    return results

# Number of leading word vectors of each text that feed the baseline features.
FIRST_N = 20

train_features = [average_sequence(seq, FIRST_N) for seq in train_vecs_sequences]
test_features = [average_sequence(seq, FIRST_N) for seq in test_vecs_sequences]
X_train = vectorize_sequences(train_features)
X_test = vectorize_sequences(test_features)

# Labels as float32 arrays.
Y_train = np.array(Y_train, dtype='float32')
Y_test = np.array(Y_test, dtype='float32')

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

# Baseline network: 300 inputs -> Dense(50, relu) -> Dense(188, softmax).
baseline_layers = [
    keras.layers.Dense(50, activation='relu', input_shape=(300,)),
    keras.layers.Dense(188, activation='softmax'),
]
model = keras.Sequential(baseline_layers)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#model.fit(X_train, Y_train, epochs=5)

### Select optimal params
Test accuracy reported by `model.evaluate(X_test, Y_test)` for different settings:
* layer_size = 20, FIRST_N=5:  acc=0.33
* layer_size = 20, FIRST_N=10: acc=0.42
* layer_size = 20, FIRST_N=20: acc=0.43
* layer_size = 20, FIRST_N=50: acc=0.41
* layer_size = 20, FIRST_N=1000: acc=0.40


* FIRST_N=20, layer_size = 20: acc=0.43
* FIRST_N=20, layer_size = 50: acc=0.46
* FIRST_N=20, layer_size = 100: acc=0.47

## LSTM-based classifier
Let's replace the first layer of the previous network with an LSTM layer.

In [16]:
def vectorize_sequences(X, new_x_len, dim=300):
    """Pad or truncate every vector sequence to exactly ``new_x_len`` steps.

    Args:
        X: Sequence of samples; each sample is a list of vectors with at
            least ``dim`` components.
        new_x_len: Number of steps every sample has in the result.
        dim: Number of leading components copied from every vector
            (default 300).

    Returns:
        numpy.ndarray: Float array of shape ``(len(X), new_x_len, dim)``.
        Samples shorter than ``new_x_len`` are zero-padded at the end,
        longer ones are cut after ``new_x_len`` vectors.
    """
    results = np.zeros((len(X), new_x_len, dim))
    for i, vecs in enumerate(X):
        kept = min(new_x_len, len(vecs))
        if kept > 0:
            # One slice assignment per sample instead of copying every
            # component in nested Python loops.
            results[i, :kept] = np.asarray(vecs[:kept])[:, :dim]
    return results

In [17]:
# Fixed-length inputs for the LSTM: every sample becomes 20 steps of word vectors.
X_train, X_test = (
    vectorize_sequences(vecs_sequences, 20)
    for vecs_sequences in (train_vecs_sequences, test_vecs_sequences)
)

In [21]:
# LSTM classifier: LSTM(32) over (20, 300) inputs -> regularized Dense(50, relu)
# -> Dense(188, softmax).
lstm_layers = [
    keras.layers.LSTM(32, input_shape=(20, 300)),
    keras.layers.Dense(
        50,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.01),
        activity_regularizer=regularizers.l1(0.01),
    ),
    keras.layers.Dense(188, activation='softmax'),
]
model = keras.Sequential(lstm_layers)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 20% of the training data is held out for validation during fitting.
history = model.fit(X_train, Y_train, epochs=5, validation_split=0.2)

Train on 35035 samples, validate on 8759 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Evaluate the fitted model on the test split; the displayed result holds the
# loss followed by the 'accuracy' metric requested in compile().
model.evaluate(X_test, Y_test)



[4.049109916319398, 0.42894208]

### Also tested with the LSTM network:
* dropout and recurrent_dropout in LSTM layer
* dropout in dense layers
* units in dense and LSTM layers
* bigger train sequence(FIRST_N)
* no hidden layer

All these approaches gave similar performance.