In [None]:
# Do not remove stop words
# LSTM + Dense + Dense
# No embeddings (50 dims)
# Val: 0.76 after 13 epochs

In [None]:
import json
import random

# Load the annotated dataset: a JSON Lines file with one JSON object per line.
# NOTE(review): the absolute Windows path is machine-specific; consider a
# configurable data directory.
raw_data = []
# Read-only mode is sufficient; "r+" additionally required write permission.
with open("C:\\Projects\\Research\\Events\\notebooks\\file.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line:
            raw_data.append(json.loads(line))

In [None]:
# Reduce every raw record to a (text, [label, ...]) pair.
data = [
    (record['text'], [annotation['label'] for annotation in record['annotations']])
    for record in raw_data
]

In [None]:
# Distinct annotation labels across the whole dataset, in sorted order.
labels = sorted({label for _, text_labels in data for label in text_labels})
# Human-readable (Russian) event-type names.
# NOTE(review): presumably index-aligned with `labels` -- confirm.
label_names = [
    "Вечеринка", "Выставка", "Интенсив", "Квиз", "Конкурс", "Конференция",
    "Концерт", "Лекция", "Мастер-класс", "Семинар", "Спектакль", "Тренинг",
    "Фестиваль", "Шоу", "Встреча", "Презентация", "Прием заявок", "Просмотр",
    "Экскурсия", "Ярмарка", "Автограф-сессия", "Квест", "Модный показ",
    "Хакатон", "Игра", "Стендап", "Спортивное мероприятие",
]

In [None]:
import urlextract
import re
import math

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

class TextPreprocessor:
    """Normalise raw event descriptions before tokenisation.

    ``preprocess_text`` lower-cases the text, strips HTML tags, replaces URLs,
    e-mail addresses, numbers and dollar signs with placeholder words, blanks
    out punctuation, drops Russian stop words and collapses whitespace.
    """

    # Patterns are compiled once at class level; they remain reachable as
    # instance attributes (``self.tag_regex`` and so on).
    tag_regex = re.compile(r"<[^>]*>")
    email_regex = re.compile(r"[^\s]+@[^\s]+")
    # Integer part, optional fraction, optional exponent. The previous pattern
    # made the exponent mandatory after a decimal point, so "3.14" was split
    # into two separate numbers.
    number_regex = re.compile(r"\d+(?:\.\d+)?(?:[eE][+-]?\d+)?")
    dollar_regex = re.compile(r"[$]+")
    spaces_regex = re.compile(r"\s+")

    def __init__(self):
        # NOTE(review): the stemmer is not used by any method of this class.
        self.stemmer = PorterStemmer()
        self.url_extractor = urlextract.URLExtract()
        # Characters that remove_special_characters() replaces with a space.
        self.special_chars = [
            "<", "[", "]", "`", "^", ">", "+", "?", "!", "'", ".", ",", ":",
            "*", "%", "#", "_", "=", "-", "&", '/', '\\', '(', ')', ";", "\"", "«", "»", "|", "•", "—", "–", "●", "►", "\n",
            "@"
        ]
        self.stop_words = set(stopwords.words('russian'))

    def preprocess_text(self, text):
        """Run the full cleaning pipeline and return the normalised string."""
        text = text.lower()
        text = self.remove_html_tags(text)
        # URLs and e-mails are replaced before punctuation is stripped, while
        # they are still recognisable.
        text = self.replace_urls(text)
        text = self.replace_emails(text)
        text = self.replace_numbers(text)
        text = self.replace_dollar_signs(text)
        text = self.remove_special_characters(text)
        text = self.remove_stop_words(text)
        text = self.spaces_regex.sub(' ', text)
        return text.strip()

    def remove_html_tags(self, text):
        """Replace every ``<...>`` tag with a space and drop empty pieces."""
        pieces = self.tag_regex.sub(" ", text).split(" ")
        return ' '.join(piece for piece in pieces if piece)

    def replace_urls(self, text):
        """Replace every URL found by the extractor with ``httpaddr``."""
        # Longest first, so a URL that is a prefix of another one does not
        # leave the tail of the longer URL behind.
        urls = sorted(set(self.url_extractor.find_urls(text)), key=len, reverse=True)
        for url in urls:
            text = text.replace(url, " httpaddr ")
        return text

    def replace_emails(self, text):
        """Replace e-mail-like tokens with ``emailaddr``."""
        return self.email_regex.sub(" emailaddr ", text)

    def replace_numbers(self, text):
        """Replace numbers (integer, decimal, exponent forms) with ``number``."""
        return self.number_regex.sub(" number ", text)

    def replace_dollar_signs(self, text):
        """Replace each run of ``$`` characters with ``dollar``."""
        return self.dollar_regex.sub(" dollar ", text)

    def remove_special_characters(self, text):
        """Replace every character listed in ``special_chars`` with a space."""
        for char in self.special_chars:
            text = text.replace(str(char), " ")
        return text

    def remove_stop_words(self, text):
        """Drop stop words and return the remaining words joined by single spaces.

        Works on whitespace-separated words, so stop words at the very start
        or end of the text and repeated stop words are removed as well (the
        earlier ``" word "`` substring replacement missed those).
        """
        return " ".join(word for word in text.split() if word not in self.stop_words)

# Shared instances used by tokenize() below.
preprocessor = TextPreprocessor()
import pymorphy2
# Morphological analyzer; tokenize() takes the normal form of each token from it.
morph = pymorphy2.MorphAnalyzer()

def tokenize(text):
    """Clean ``text`` and return the list of its normalised tokens.

    The text is passed through ``preprocessor.preprocess_text``, split with
    ``word_tokenize``, and each token is replaced by the ``normal_form`` of
    its first ``morph.parse`` result.
    """
    cleaned = preprocessor.preprocess_text(text)
    lemmas = []
    for token in word_tokenize(cleaned):
        first_parse = morph.parse(token.lower())[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

def calculate_tf_idf(documents):
    """Compute tf-idf weights for every token of every document.

    Args:
        documents: iterable of token sequences, one per document.

    Returns:
        A list parallel to ``documents``. Element ``i`` is a dict mapping each
        distinct token of document ``i`` to ``tf * idf``, where ``tf`` is the
        token's share of the document's tokens and ``idf`` is
        ``log(n_documents / n_documents_containing_token)``. An empty
        document yields an empty dict.
    """
    token_counts = [Counter(document) for document in documents]
    n_documents = len(token_counts)

    # Document frequency is computed once up front; rescanning every document
    # for every token made the function quadratic in the corpus size.
    document_freq = Counter()
    for counts in token_counts:
        document_freq.update(counts.keys())

    tf_idf = []
    for counts in token_counts:
        total = sum(counts.values())
        tf_idf.append({
            token: (count / total) * math.log(n_documents / document_freq[token])
            for token, count in counts.items()
        })
    return tf_idf

def get_top(i, k):
    """Return up to ``k`` tokens of document ``i`` with the largest weights.

    Weights are read from the module-level ``words_idf`` (indexed by document,
    then by token), which must be defined before this is called.
    """
    def weight(pair):
        return pair[1]

    ranked = sorted(words_idf[i].items(), key=weight, reverse=True)
    return [token for token, _ in ranked[:k]]

def get_bottom(i, k):
    """Return up to ``k`` tokens of document ``i`` with the smallest weights.

    Weights are read from the module-level ``words_idf`` (indexed by document,
    then by token), which must be defined before this is called.
    """
    def weight(pair):
        return pair[1]

    ranked = sorted(words_idf[i].items(), key=weight)
    return [token for token, _ in ranked[:k]]

In [None]:
# Replace each raw text with its token list. The isinstance guard keeps the
# cell idempotent: re-running it leaves already tokenized entries untouched
# instead of passing a token list to tokenize().
data = [
    (tokenize(text) if isinstance(text, str) else text, text_labels)
    for text, text_labels in data
]

In [None]:
# Load embeddings
# Imported here so the cell also runs on a fresh kernel: elsewhere in the
# notebook numpy and defaultdict are only imported in later cells.
from collections import defaultdict

import numpy as np

vocab = []
embedding_dim = 300
# Row 0 is the all-zero <PAD> vector, row 1 a random <UNK> vector.
# NOTE(review): the random vector is unseeded, so it differs between runs.
embeddings = [np.zeros(embedding_dim), np.random.rand(embedding_dim)]

idx2word = ["<PAD>", "<UNK>"]
# Unknown words fall back to <UNK> (index 1); a default of 0 would silently
# turn them into padding.
word2idx = defaultdict(lambda: 1)
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

# Read-only mode is sufficient; "r+" additionally required write permission.
with open("static embeddings\\model.txt", "r", encoding="utf-8") as f:
    f.readline() # skip header
    for line in f:
        line = line.strip()
        if not line:
            continue
        word_pos, *vector = line.split(" ")
        # Entries are "<word>_<POS>". Split on the last underscore only, so a
        # word that itself contains "_" (or has no POS suffix) does not raise.
        word, sep, pos = word_pos.rpartition("_")
        if not sep:
            word = word_pos
        vector = np.array([float(v) for v in vector])
        idx2word.append(word)
        # A word listed under several POS tags keeps the index of its last entry.
        word2idx[word] = len(idx2word) - 1
        embeddings.append(vector)

embeddings = np.array(embeddings)

In [None]:
# custom vocab

from collections import defaultdict

# Vocabulary: every distinct token of the tokenized dataset, sorted.
vocab = sorted({token for tokens, _ in data for token in tokens})

special_tokens = ["<PAD>", "<UNK>"]
# Copy the list, so appending the vocabulary does not also grow `special_tokens`.
idx2word = list(special_tokens)
# Unknown words fall back to <UNK> (index 1); a default of 0 would silently
# turn them into padding.
word2idx = defaultdict(lambda: 1)
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

# Vocabulary words take the indices that follow the special tokens.
for word in vocab:
    idx2word.append(word)
    word2idx[word] = len(idx2word) - 1

In [None]:
# Sanity check: the special tokens should come first, followed by vocabulary words.
idx2word[:10]

In [None]:
import matplotlib.pyplot as plt

# Histogram of document lengths, measured in tokens.
lengths = [len(tokens) for tokens, _ in data]
plt.hist(lengths, bins=50)
plt.show()

In [None]:
# Fixed sequence length in tokens: used as the model Input shape further down;
# the padding helpers default to the same value.
max_length = 600
import numpy as np

def pad_data(tokens, max_length=600):
    """Convert tokens to vocabulary indices, padded or truncated to ``max_length``.

    Args:
        tokens: iterable of token strings.
        max_length: length of the returned array.

    Returns:
        ``np.ndarray`` of ``max_length`` indices taken from the module-level
        ``word2idx``. Tokens missing from the vocabulary map to ``<UNK>``;
        positions past the end of a short sequence hold ``<PAD>``.
    """
    pad_idx = word2idx["<PAD>"]
    unk_idx = word2idx["<UNK>"]
    # Explicit .get(): word2idx may be a defaultdict whose default is not the
    # <UNK> index, and plain indexing would also insert every unseen token.
    idxs = [word2idx.get(word, unk_idx) for word in tokens][:max_length]
    idxs.extend([pad_idx] * (max_length - len(idxs)))
    return np.array(idxs)

def get_tfidf(tokens, document_id, max_length=600):
    """Return the per-token weights of a document, padded or truncated to ``max_length``.

    Each token's weight is looked up in the module-level ``words_idf`` under
    ``document_id``; missing positions at the end are filled with 0.
    """
    weights = [words_idf[document_id][token] for token in tokens]
    shortfall = max_length - len(weights)
    if shortfall > 0:
        weights += [0] * shortfall
    return np.array(weights[:max_length])

def labels_to_one_hot(text_labels):
    """Encode ``text_labels`` as a 0/1 vector over the module-level ``labels``.

    The result has one position per entry of ``labels``; a position is 1 when
    that label occurs in ``text_labels``. A label that is not in ``labels``
    raises ``ValueError`` (from ``list.index``).
    """
    encoded = np.zeros(len(labels))
    for position in map(labels.index, text_labels):
        encoded[position] = 1
    return encoded

In [None]:
# with tfidf

import random 

# Token lists of all documents in dataset order (`data` holds (tokens, labels)
# pairs), and their tf-idf weights. Both names were used without ever being
# defined; get_tfidf() reads the global `words_idf`.
tokenized_texts = [tokens for tokens, _ in data]
words_idf = calculate_tf_idf(tokenized_texts)

val_size = 161  # number of shuffled samples held out for validation

# One (token indices, tf-idf weights, multi-hot labels) triple per document.
train_data = [
    (pad_data(tokens), get_tfidf(tokens, i), labels_to_one_hot(data[i][1]))
    for i, tokens in enumerate(tokenized_texts)
]

# Fixed seed so the train/validation split is reproducible across re-runs.
random.seed(42)
random.shuffle(train_data)
val_data = train_data[:val_size]
train_data = train_data[val_size:]

x_train_data = np.array([d[0] for d in train_data])
i_train_data = np.array([d[1] for d in train_data])
y_train_data = np.array([d[2] for d in train_data])

x_val_data = np.array([d[0] for d in val_data])
i_val_data = np.array([d[1] for d in val_data])
y_val_data = np.array([d[2] for d in val_data])

In [None]:
# without tfidf

import random 

# Token lists of all documents in dataset order (`data` holds (tokens, labels)
# pairs); the name was used without ever being defined.
tokenized_texts = [tokens for tokens, _ in data]

val_size = 161  # number of shuffled samples held out for validation

# One (token indices, multi-hot labels) pair per document.
train_data = [
    (pad_data(tokens), labels_to_one_hot(data[i][1]))
    for i, tokens in enumerate(tokenized_texts)
]

# Fixed seed so the train/validation split is reproducible across re-runs.
random.seed(42)
random.shuffle(train_data)

val_data = train_data[:val_size]
train_data = train_data[val_size:]

x_train_data = np.array([d[0] for d in train_data])
y_train_data = np.array([d[1] for d in train_data])

x_val_data = np.array([d[0] for d in val_data])
y_val_data = np.array([d[1] for d in val_data])

In [None]:
# No embeddings
# TFIDF

import tensorflow as tf
from tensorflow.keras import Model, Input, Sequential, initializers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Layer, InputSpec, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Add, Flatten, Activation
from tensorflow.keras.regularizers import L1L2


n_words = len(idx2word)
embedding_dim = 50
batch_size = 32

# Two inputs per sample: padded token indices and the matching tf-idf weights.
tokens_input = Input(shape=(max_length,))
tf_idf_input = Input(shape=(max_length,))

# Embedding learned from scratch, followed by a bidirectional LSTM encoder.
embedding = Embedding(input_dim=n_words, output_dim=embedding_dim)(tokens_input)
features = Bidirectional(LSTM(64))(embedding)
features = Dropout(0.1)(features)
# Append the raw tf-idf sequence to the encoded text.
features = tf.keras.layers.concatenate([features, tf_idf_input])
# NOTE(review): this Dense layer has no activation, so it is a purely linear
# projection -- confirm whether a non-linearity was intended.
dense = Dense(512)(features)
# Feed the hidden layer into the classifier. Previously `dense` was built but
# the output was wired to `features`, leaving the layer out of the model.
output = Dense(len(labels), activation='sigmoid')(dense)

model = Model(inputs=[tokens_input, tf_idf_input], outputs=[output])
# Independent sigmoid outputs over multi-hot targets call for binary
# cross-entropy. The metric is pinned to categorical (top-1) accuracy, which
# is presumably what 'accuracy' resolved to with the previous loss, so the
# reported numbers stay comparable.
model.compile('adam', 'binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

In [None]:
# Train the two-input model; the held-out split is evaluated after each epoch.
model.fit(
    [x_train_data, i_train_data],
    y_train_data,
    batch_size=batch_size,
    epochs=15,
    verbose=1,
    validation_data=([x_val_data, i_val_data], y_val_data),
)

In [None]:
# Embeddings

import tensorflow as tf
from tensorflow.keras import Model, Input, Sequential, initializers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Layer, InputSpec, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Add, Flatten, Activation
from tensorflow.keras.regularizers import L1L2


n_words = len(idx2word)
embedding_dim = 300
batch_size = 32

# The pretrained matrix needs exactly one row per vocabulary index. Fail early
# with a clear message if `idx2word` was rebuilt after the embeddings were
# loaded, instead of failing inside Keras.
if embeddings.shape != (n_words, embedding_dim):
    raise ValueError(
        "embeddings has shape %s but the vocabulary needs %s; re-run the "
        "embeddings loading cell so idx2word/word2idx match the matrix"
        % (embeddings.shape, (n_words, embedding_dim))
    )

tokens_input = Input(shape=(max_length,))

# Frozen pretrained vectors; index 0 (<PAD>) is masked out for the LSTM.
embedding = Embedding(input_dim=n_words, output_dim=embedding_dim, weights=[embeddings], trainable=False, mask_zero=True)(tokens_input)
features = Bidirectional(LSTM(64))(embedding)
features = Dropout(0.1)(features)
# NOTE(review): the Dense layers have no activation, so together they are a
# purely linear projection -- confirm whether a non-linearity was intended.
dense = Dense(512)(features)
dense = Dense(256)(dense)
output = Dense(len(labels), activation='sigmoid')(dense)

model = Model(tokens_input, output)
# Independent sigmoid outputs over multi-hot targets call for binary
# cross-entropy. The metric is pinned to categorical (top-1) accuracy, which
# is presumably what 'accuracy' resolved to with the previous loss, so the
# reported numbers stay comparable.
model.compile('adam', 'binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

In [None]:
# Train the single-input model; the held-out split is evaluated after each epoch.
model.fit(
    x_train_data,
    y_train_data,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_data=(x_val_data, y_val_data),
)

In [None]:
# The model built most recently above takes only the token input, so passing
# the tf-idf array as a second input would not match its signature.
y_val_pred = model.predict(x_val_data)

In [None]:
# Spot check: index of the highest-scoring label for validation sample 15.
np.argmax(y_val_pred[15])

In [None]:
import tensorflow as tf
from tensorflow.keras import Model, Input, Sequential, initializers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Layer, InputSpec, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Add, Flatten, Activation
from tensorflow.keras.regularizers import L1L2


n_words = len(idx2word)
embedding_dim = 50
batch_size = 32

tokens_input = Input(shape=(max_length,))

# Embedding learned from scratch, followed by a bidirectional LSTM encoder.
embedding = Embedding(input_dim=n_words, output_dim=embedding_dim)(tokens_input)
features = Bidirectional(LSTM(128))(embedding)
features = Dropout(0.1)(features)
# Chain the hidden layers into the classifier. Previously both Dense layers
# were applied to `features` and the output was wired to `features` too, so
# neither hidden layer was part of the model.
# NOTE(review): the Dense layers have no activation, so together they are a
# purely linear projection -- confirm whether a non-linearity was intended.
dense = Dense(512)(features)
dense = Dense(256)(dense)
output = Dense(len(labels), activation='sigmoid')(dense)

model = Model(inputs=[tokens_input], outputs=[output])
# Independent sigmoid outputs over multi-hot targets call for binary
# cross-entropy. The metric is pinned to categorical (top-1) accuracy, which
# is presumably what 'accuracy' resolved to with the previous loss, so the
# reported numbers stay comparable.
model.compile('adam', 'binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

In [None]:
# 13 epoch - max val value
model.fit(
    [x_train_data],
    y_train_data,
    batch_size=batch_size,
    epochs=15,
    verbose=1,
    validation_data=([x_val_data], y_val_data),
)

In [None]:
# Further fit() call on the same `model` object (it is not rebuilt here); the
# returned History is kept in `history2`.
history2 = model.fit(
    [x_train_data],
    y_train_data,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_data=([x_val_data], y_val_data),
)