Данные берем отызывы за лето

На вебинаре мы говорили, что долгое время CNN и RNN архитектуры были конурируещими выяснить какая архитектура больше подходит для нашей задачи
- построить свёрточные архитектуры
- построить различные архитектуры с RNN
- построить совместные архитектуры CNN -> RNN или (RNN -> CNN)

In [40]:
import re
import os
import nltk
import numpy as np
import pandas as pd
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, Flatten, SimpleRNN, LSTM, GRU, Reshape
from keras.losses import categorical_crossentropy, SparseCategoricalCrossentropy

In [41]:
data_path = r'/home/dmitriy/Downloads/ai_nlp_data/hw_8/'

In [42]:
data = pd.read_excel(os.path.join(data_path, r'отзывы за лето.xls'))

In [43]:
max_words = 20000
max_len = 150
num_classes = 5
epochs = 10
batch_size = 512
print_batch_n = 100

In [44]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dmitriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dmitriy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [45]:
stopwordslist = stopwords.words("russian")
ptrn = r'[^a-zA-Zа-яА-Я0-9]'
morpher = MorphAnalyzer()


def words_only(text):
    text=str(text)
    return text.lower()   


def remove_punkt(text):
    return re.sub(ptrn, ' ', text)


def to_token(text):
    return nltk.tokenize.word_tokenize(text)


def remove_stopwords(text):
    text_list = [w for w in text if w not in stopwordslist]
    return ' '.join(word for word in text_list)


def morphe_text(text):
    text = [morpher.parse(word)[0].normal_form for word in text.split() if word not in stopwordslist]
    return " ".join(text)


def normalize_text(text):
    text = words_only(text)
    text = remove_punkt(text)
    text = to_token(text)
    text = remove_stopwords(text)
    text = morphe_text(text)
    return text

In [46]:
data['normalized_content'] = data['Content'].apply(normalize_text)

In [47]:
train_corpus = " ".join(data['normalized_content'])
train_tokens = word_tokenize(train_corpus)
train_tokens_filtered = [word for word in train_tokens if word.isalnum()]

In [48]:
dist = FreqDist(train_tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
voc = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [49]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in voc:
            result.append(voc[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]


data_train = np.asarray(
    [text_to_sequence(text, max_len) for text in data['normalized_content']], 
    dtype=np.int32)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(data_train, data.Rating, test_size=0.3, random_state=1)
le = LabelEncoder()
y_train = le.fit_transform(y_train) 
y_test = le.transform(y_test)

CNN

In [51]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669ca88940>

In [52]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.DataFrame(columns=['model', 'Test score', 'Test accuracy'])
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['CNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

 1/13 [=>............................] - ETA: 1s - loss: 0.7886 - accuracy: 0.7305

Test score: 0.7740876078605652
Test accuracy: 0.7282994389533997


SimpleRNN

In [53]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(SimpleRNN(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Reshape((1,128)))
model.add(SimpleRNN(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669d6f7d90>

In [54]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['SimpleRNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.9805441498756409
Test accuracy: 0.735237181186676


LSTM

In [55]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(LSTM(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Reshape((1,128)))
model.add(LSTM(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
 2/23 [=>............................] - ETA: 10s - loss: 1.6087 - accuracy: 0.4180 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f662e402ec0>

In [56]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['LSTM'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.7613303661346436
Test accuracy: 0.750887393951416


GRU

In [57]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(GRU(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Reshape((1,128)))
model.add(GRU(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669b0915d0>

In [58]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['GRU'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.808746337890625
Test accuracy: 0.749112606048584


CNN -> LSTM

In [59]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669ca887c0>

In [60]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['CNN -> LSTM'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.7965673804283142
Test accuracy: 0.7460471391677856


CNN -> GRU

In [61]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(GRU(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
 5/23 [=====>........................] - ETA: 9s - loss: 1.5730 - accuracy: 0.5820 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669af3cc10>

In [62]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['CNN -> GRU'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.8225601315498352
Test accuracy: 0.7421749234199524


CNN -> SimpleRNN

In [63]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(SimpleRNN(128))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f669a920b80>

In [64]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['CNN -> SimpleRNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.7766216397285461
Test accuracy: 0.7497580051422119


SimpleRNN -> CNN

In [65]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(SimpleRNN(128, return_sequences=True))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f65cc2e5bd0>

In [66]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['SimpleRNN -> CNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 1.1343705654144287
Test accuracy: 0.7247499227523804


LSTM -> CNN

In [67]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(LSTM(128, return_sequences=True))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f66171102e0>

In [68]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['LSTM -> CNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.8934483528137207
Test accuracy: 0.7460471391677856


GRU -> CNN

In [69]:
model = Sequential()
model.add(Embedding(input_dim=max_words, 
                    output_dim=128, 
                    input_length=max_len,
                    trainable=True,
                    mask_zero=True))
model.add(GRU(128, return_sequences=True))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f6616e50550>

In [70]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
metrics_df = pd.concat([metrics_df, pd.DataFrame({
    'model': ['GRU -> CNN'],
    'Test score': [score[0]],
    'Test accuracy': [score[1]],
})])
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.9781825542449951
Test accuracy: 0.7376573085784912


Итоги

In [71]:
metrics_df.sort_values(by='Test score', ascending=False)

Unnamed: 0,model,Test score,Test accuracy
0,SimpleRNN -> CNN,1.134371,0.72475
0,SimpleRNN,0.980544,0.735237
0,GRU -> CNN,0.978183,0.737657
0,LSTM -> CNN,0.893448,0.746047
0,CNN -> GRU,0.82256,0.742175
0,GRU,0.808746,0.749113
0,CNN -> LSTM,0.796567,0.746047
0,CNN -> SimpleRNN,0.776622,0.749758
0,CNN,0.774088,0.728299
0,LSTM,0.76133,0.750887


In [72]:
metrics_df.sort_values(by='Test accuracy', ascending=False)

Unnamed: 0,model,Test score,Test accuracy
0,LSTM,0.76133,0.750887
0,CNN -> SimpleRNN,0.776622,0.749758
0,GRU,0.808746,0.749113
0,CNN -> LSTM,0.796567,0.746047
0,LSTM -> CNN,0.893448,0.746047
0,CNN -> GRU,0.82256,0.742175
0,GRU -> CNN,0.978183,0.737657
0,SimpleRNN,0.980544,0.735237
0,CNN,0.774088,0.728299
0,SimpleRNN -> CNN,1.134371,0.72475
