In [None]:
import numpy as np
import random
import nltk
import numpy as np
import tensorflow as tf
import scipy
from itertools import islice
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense, Conv1D, Concatenate, TimeDistributed
from tensorflow.keras import utils
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.utils import to_categorical
# Fetch the NLTK resources used by this notebook. 'punkt_tab' is the Punkt
# model format that recent NLTK releases load for sentence tokenization;
# older releases simply report it as an unknown package and carry on, so
# requesting both 'punkt' and 'punkt_tab' keeps the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
  nltk.download(nltk_resource)

# Shared lemmatizer instance, used when vectorizing the review text.
lemmatizer = WordNetLemmatizer()

Формирование словаря эмбеддингов

In [2]:
# Build the token -> embedding lookup table.
# Two synthetic tokens are registered first: '<empty_sent>' stands in for a
# missing sentence and '<pad_sent>' for a missing word; both get a constant
# 25-dimensional vector of 0.1.
word_dict = {}
word_dict['<empty_sent>'] = np.array([0.1] * 25)
word_dict['<pad_sent>'] = np.array([0.1] * 25)

# Each line of the embedding file is "<token> <v1> <v2> ...". The file is
# opened in a `with` block so the handle is always released, and decoded
# explicitly as UTF-8 so that non-ASCII tokens do not depend on the platform's
# default encoding.
with open("glove.twitter.27B.25d.txt", 'r', encoding = 'utf-8') as embedding_file:
  for line in embedding_file:
    # Split off the token only; the remainder of the line is the vector text.
    word, vector = line.split(maxsplit = 1)
    word_dict[word] = np.fromstring(vector, "f", sep = " ")

Загрузка и предобработка данных

In [3]:
# Read every record of the corpus into memory; the `with` block guarantees the
# file handle is closed once the lines have been collected.
with open("imdb62.txt", 'r') as dataset:
  data_list = list(dataset)

# Shuffle with a dedicated, fixed-seed generator so the train/validation/test
# partition taken from this order is the same on every run (and the global
# `random` state is left untouched).
random.Random(42).shuffle(data_list)

# Each record is six tab-separated fields; only the author id and the review
# body are kept, in shuffled order.
user_list = []
content_list = []
for line in data_list:
  reviewId, userId, itemId, rating, title, content = line.split('\t')
  user_list.append(userId)
  content_list.append(content)

In [4]:
def vectorize(text_data, sent_num, word_num, embed_dim):
  """Turn raw texts into a dense 4-D array of word embeddings.

  Every text is split into sentences with ``tokenize.sent_tokenize`` and every
  sentence into whitespace-separated tokens. Both levels are truncated or
  padded to a fixed size: missing sentences are replaced by '<empty_sent>' and
  missing words by '<pad_sent>'. Each token is lemmatized, lower-cased and
  looked up in the module-level ``word_dict``; tokens without an entry keep an
  all-zero vector.

  Args:
    text_data: sequence of strings to encode.
    sent_num: number of sentence slots per text.
    word_num: number of word slots per sentence.
    embed_dim: length of each embedding vector stored in ``word_dict``.

  Returns:
    ``numpy.ndarray`` of shape ``(len(text_data), sent_num, word_num, embed_dim)``.
  """
  vec_data = np.zeros(shape = (len(text_data), sent_num, word_num, embed_dim))
  # Raw token -> embedding (or None when the token is unknown). Lemmatization
  # dominates the cost of this loop and the same tokens recur constantly, so
  # each distinct token is lemmatized and looked up only once per call.
  vector_cache = {}
  for k, message in enumerate(text_data):
    sent_list = tokenize.sent_tokenize(message)
    # Pad first, then cut: guarantees exactly sent_num entries.
    sent_list_pad = (sent_list + ['<empty_sent>'] * sent_num)[:sent_num]
    for j, sentence in enumerate(sent_list_pad):
      word_list_pad = (sentence.split() + ['<pad_sent>'] * word_num)[:word_num]
      for i, word in enumerate(word_list_pad):
        if word not in vector_cache:
          vector_cache[word] = word_dict.get(lemmatizer.lemmatize(word).lower())
        vector = vector_cache[word]
        if vector is not None:
          # Write straight into the output; unknown tokens stay zero.
          vec_data[k, j, i] = vector
  return vec_data

In [5]:
# Fixed geometry of the encoded input: length of one embedding vector,
# word slots per sentence, and sentence slots per text.
embed_dim, word_num, sent_num = 25, 40, 15

In [6]:
# Encode all review bodies into one (texts, sentences, words, embedding) array.
text_data = vectorize(content_list, sent_num, word_num, embed_dim)

In [7]:
# Partition the encoded texts by position: the first 40000 rows for training,
# the next 10000 for validation and rows 50000-61999 for testing.
train_text, val_text, test_text = (
  text_data[:40000],
  text_data[40000:50000],
  text_data[50000:62000],
)

In [8]:
# Assign an integer class index to every distinct author id.
unique_user = set(user_list)

# Enumerate the ids in sorted order rather than in set-iteration order: the
# iteration order of a set of strings can differ between interpreter runs, which
# would silently change which index denotes which author.
user_index = {user_id: index for index, user_id in enumerate(sorted(unique_user))}

# Class index of each review's author, aligned with user_list.
user = [user_index[user_id] for user_id in user_list]

In [9]:
# One-hot encode the author indices over 62 classes, using the same positional
# boundaries as the text partition.
train_label = utils.to_categorical(user[:40000], num_classes = 62)
val_label = utils.to_categorical(user[40000:50000], num_classes = 62)
test_label = utils.to_categorical(user[50000:62000], num_classes = 62)

Блок кодирующего компонента

In [10]:
class EncoderBlock(Layer):
    """Self-attention block followed by a position-wise feed-forward network.

    The input attends to itself through ``MultiHeadAttention``; the result goes
    through dropout, is added back to the input and layer-normalized. The same
    dropout / residual / layer-norm pattern is then applied around a two-layer
    dense network (``ffn_dim`` units with ReLU, then ``embed_dim`` units).
    The output has the same shape as the input.

    Args:
        embed_dim: size of the last input axis; also used as the attention
            ``key_dim`` and as the width of the final dense layer.
        num_heads: number of attention heads.
        ffn_dim: width of the hidden dense layer.
        rate: dropout rate applied after attention and after the dense network.
        name: optional layer name, forwarded to the base ``Layer``.
        **kwargs: any further base ``Layer`` arguments (e.g. ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ffn_dim, rate = 0.1, name = None, **kwargs):
        # Forward `name` (and other base-layer options) so the layer actually
        # carries the name the caller asked for.
        super(EncoderBlock, self).__init__(name = name, **kwargs)
        # Constructor arguments are kept so get_config() can reproduce them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim
        self.rate = rate
        self.att = MultiHeadAttention(num_heads = num_heads, key_dim = embed_dim)
        self.ffn = Sequential(
            [Dense(ffn_dim, activation = "relu"), 
             Dense(embed_dim),]
        )
        self.layernorm_att = LayerNormalization(epsilon = 1e-8)
        self.layernorm_ffn = LayerNormalization(epsilon = 1e-8)
        self.dropout_att = Dropout(rate)
        self.dropout_ffn = Dropout(rate)

    def call(self, inputs, training = None):
        """Apply the block to `inputs`.

        `training` defaults to None so the layer can be called without it; the
        value is passed on to both dropout layers.
        """
        att_output = self.att(inputs, inputs)
        att_output = self.dropout_att(att_output, training = training)
        # Residual connection around attention, then normalization.
        out_att = self.layernorm_att(inputs + att_output)
        ffn_output = self.ffn(out_att)
        ffn_output = self.dropout_ffn(ffn_output, training = training)
        # Residual connection around the feed-forward network.
        return self.layernorm_ffn(out_att + ffn_output)
    
    def compute_output_shape(self, input_shape):
        """The block preserves the shape of its input."""
        return input_shape

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(EncoderBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ffn_dim": self.ffn_dim,
            "rate": self.rate,
        })
        return config

Формирование модели

In [None]:
# Hyper-parameters of the two-level (word -> sentence) model.
ffn_word_dim, ffn_sent_dim = 25, 50
word_output_dim, sentence_output_dim = 50, 100
class_num = 62
num_heads = 4
dropout_rate = 0.1

# Word level: encode the words of a single sentence, average over the word
# axis and project to a word_output_dim-sized sentence vector.
word_level_input = Input(shape = (word_num, embed_dim), name = 'word_level_input')
word_encoder = EncoderBlock(
    embed_dim = embed_dim,
    num_heads = num_heads,
    ffn_dim = ffn_word_dim,
    name = 'word_encoder',
)(word_level_input)
word_pooling = GlobalAveragePooling1D(name = 'word_pooling')(word_encoder)
word_dropout = Dropout(dropout_rate, name = 'word_dropout')(word_pooling)
word_level_out = Dense(
    word_output_dim, activation = 'relu', name = 'word_level_output'
)(word_dropout)
# From here on `word_encoder` refers to the reusable sentence-encoding model.
word_encoder = Model(word_level_input, word_level_out)

word_encoder.summary()

# Sentence level: run the word-level model over every sentence slot, encode the
# resulting sequence of sentence vectors, average and project.
sentence_input = Input(shape = (sent_num, word_num, embed_dim), name = 'sentence_level_input')
sentence_repr = TimeDistributed(word_encoder, name = 'sentence_representation')(sentence_input)
sentence_encoder = EncoderBlock(
    embed_dim = word_output_dim,
    num_heads = num_heads,
    ffn_dim = ffn_sent_dim,
    name = 'sentence_encoder',
)(sentence_repr)
sentence_pooling = GlobalAveragePooling1D(name = 'sentence_pooling')(sentence_encoder)
sentence_dropout = Dropout(dropout_rate, name = 'sentence_dropout')(sentence_pooling)
sentence_level_out = Dense(
    sentence_output_dim, activation = 'relu', name = 'sentence_level_output'
)(sentence_dropout)

# Softmax over the class_num output classes.
classifier = Dense(class_num , activation = 'softmax', name = 'classifier')(sentence_level_out)

model = keras.Model(sentence_input, classifier)
model.summary()

Обучение и оценка модели

In [None]:
# Configure training for one-hot targets and track accuracy.
model.compile(
    loss = "categorical_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"],
)

# Train for 35 epochs in batches of 32, scoring the validation split after
# every epoch; the returned History object is kept in `history`.
history = model.fit(
    x = train_text,
    y = train_label,
    validation_data = (val_text, val_label),
    batch_size = 32,
    epochs = 35,
)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(test_text, test_label, verbose=2)

# Print each reported metric next to its name, rounded to three decimals.
for name, value in zip(model.metrics_names, results):
    metric_line = "%s: %.3f" % (name, value)
    print(metric_line)