## Урок 8. Рекуррентные нейронные сети RNN LSTM GRU

Материалы.<br>
У вас лекционный ноутбук в архиве он с картинками и презентация тоже есть в архиве

**Задание**

На вебинаре мы говорили что долгое время CNN и RNN архитектуры были конурируещими.

Постарайтесь выяснить какая архитектура больше подходит для задачи сантимент анализа на данных с вебинара
  1. построить свёрточные архитектуры
  2. построить различные архитектуры с RNN
  3. построить совместные архитектуры CNN -> RNN и (RNN -> CNN)
  4. сдлать выводы что получилось лучше


In [1]:
!pip install stop_words



In [2]:
# Попробуем запрограммировать простую рекурентную сеть. 
# Возьмем датасет с прошлого занятия

import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from tqdm import tqdm

from utils import apostrophe_dict, emoticon_dict, short_word_dict  # см. файл utils.py

tqdm.pandas()

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_val = pd.read_csv("data/val.csv")

In [4]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [5]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

# Теперь повторим это для всех записей.
def replase_words(text,dict_): 
    output = ''
    for word in text.split(' '): # не будем делить текст на части будем искать подстроку в строке. это касается только смайликов.
        word = word.strip()
        if word in dict_.keys(): 
            output += ' ' + dict_[word]
        else:
            output += ' ' + word
    return output

def preprocess_text(txt):
    txt = str(txt)
    txt = re.sub("[\,]","",txt)
    txt = re.sub("@[\w]*","",txt)
    # txt = re.sub("RT","",txt)
    # Заменим эмодзи на соответствующие им слова.
    txt = replase_words(txt, emoticon_dict)
     # Заменим сокращения на их полные формы
    txt = replase_words(txt, apostrophe_dict)
    txt = replase_words(txt, short_word_dict)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)#[w for w in txt if len(w)>1])

In [6]:
df_train['text'] = df_train['text'].progress_apply(preprocess_text)
df_val['text'] = df_val['text'].progress_apply(preprocess_text)
df_test['text'] = df_test['text'].progress_apply(preprocess_text)

100%|██████████| 181467/181467 [02:49<00:00, 1072.07it/s]
100%|██████████| 22683/22683 [00:23<00:00, 973.71it/s] 
100%|██████████| 22684/22684 [00:36<00:00, 626.89it/s]


In [7]:
df_train.head()

Unnamed: 0,id,text,class
0,0,уезжаааааааать ❤ тожена хотеть уезжать,0
1,1,rt ребята девчата кино любовь завтра вотэтолюбовь,1
2,2,rt ктоненавидеть пробка ретвит rt,0
3,3,rt хотеться котлета покиевск запретный плод happy,1
4,4,босапоп есбосан бояться мороз,1


In [53]:
import numpy as np
# import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking,MaxPooling1D, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
# from tensorflow.keras.objectives import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  

In [9]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [10]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)
X_test = pad_sequences(sequences_test, maxlen=training_length)

In [51]:
word_count, training_length

(188690, 28)

In [23]:
y_train = df_train['class'].values
y_val = df_val['class'].values
# y_test = df_test['class'].values

In [24]:
results = {
    "NN":[],
    "loss":[],
    "accuracy":[]
}

In [25]:
early_stopping=EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

### CNN

In [26]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    # validation_data=[X_test, y_test],
                    # validation_batch_size=512,
                    epochs=5,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/5


In [28]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
# print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4386669099330902
Test accuracy: 0.7766609191894531


## SimpleRNN

In [29]:

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    # validation_data=[X_test, y_test],
                    # validation_batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/10


In [31]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("SimpleRNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4422447085380554
Test accuracy: 0.7722523212432861


## LSTM

In [32]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    # validation_data=[X_test, y_test],
                    # validation_batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10


In [34]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("LSTM")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4356742799282074
Test accuracy: 0.775646984577179


## GRU

In [35]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(GRU(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [36]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10


In [37]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("GRU")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4343072474002838
Test accuracy: 0.7766168713569641


In [38]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))

model.add(MaxPooling1D(2))
model.add(Activation("relu"))

model.add(Conv1D(64, 3))
model.add(Activation("relu"))

# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))

model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10


In [40]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4320409595966339
Test accuracy: 0.7788652181625366


## CNN+RNN

In [91]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
                    input_length=training_length,
                    output_dim=30,
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(Conv1D(32, 2))
model.add(Activation("relu"))
# model.add(BatchNormalization())
model.add(MaxPooling1D(1))

model.add(Conv1D(16, 2))
model.add(Activation("relu"))
model.add(MaxPooling1D(1))
# model.add(BatchNormalization())

# model.add(LSTM(32,return_sequences=True))
model.add(LSTM(16))
# model.add(Dense(64, activation='relu'))

# model.add(LSTM(64))

# model.add(Dense(32, activation='relu'))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))

# model.add(GlobalMaxPool1D())
model.add(Dense(32))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [92]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])
                    
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN+RNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Test score: 0.43829575181007385
Test accuracy: 0.7751179337501526


In [43]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))

model.add(MaxPooling1D(2))
model.add(Activation("relu"))

model.add(Conv1D(64, 3))
model.add(Activation("relu"))

model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))

# model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [44]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])
                    
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN+RNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Test score: 0.43146824836730957
Test accuracy: 0.7774544954299927


## RNN+CNN

In [45]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(GRU(128,return_sequences=True))
# model.add(GlobalMaxPool1D())

model.add(Conv1D(128, 3))
model.add(Activation("relu"))

model.add(MaxPooling1D(2))
model.add(Activation("relu"))

model.add(Conv1D(64, 3))
model.add(Activation("relu"))

# model.add(Masking(mask_value=0.0))
# model.add(GRU(64))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))

model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# model.summary()

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])
                    
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("RNN+CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Test score: 0.44357970356941223
Test accuracy: 0.7723405361175537


In [47]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=128,
              trainable=True,
              mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(LSTM(128,return_sequences=True))

model.add(Conv1D(128, 3))
model.add(Activation("relu"))

model.add(MaxPooling1D(2))
model.add(Activation("relu"))

model.add(Conv1D(64, 3))
model.add(Activation("relu"))

# model.add(Masking(mask_value=0.0))
# model.add(LSTM(64))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))

model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [48]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])
                    
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("RNN+CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Test score: 0.4330320358276367
Test accuracy: 0.7791297435760498


In [86]:
pd.DataFrame(results)

Unnamed: 0,NN,loss,accuracy
0,CNN,0.438667,0.776661
1,SimpleRNN,0.442245,0.772252
2,LSTM,0.435674,0.775647
3,GRU,0.434307,0.776617
4,CNN,0.432041,0.778865
5,CNN+RNN,0.429332,0.782172
6,CNN+RNN,0.431468,0.777454
7,RNN+CNN,0.44358,0.772341
8,RNN+CNN,0.433032,0.77913
9,CNN+RNN,0.610862,0.71278
