### Задание 8. Рекуррентные нейронные сети

In [1]:
import pandas as pd
import numpy as np

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from utils import apostrophe_dict, emoticon_dict, short_word_dict

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Masking
from tensorflow.keras.layers import Conv1D, GlobalMaxPool1D, MaxPooling1D, SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping  

from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('val.csv')

In [3]:
df_train.head(4)

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1


In [4]:
df_val.head(4)

Unnamed: 0,id,text,class
0,181467,RT @TukvaSociopat: Максимальный репост! ))) #є...,1
1,181468,чтоб у меня з.п. ежегодно индексировали на инд...,0
2,181469,@chilyandlime нехуя мне не хорошо !!! :((((,0
3,181470,"@inafish нее , когда ногами ахахах когда?ахаха...",0


In [5]:
sw = set(get_stop_words('ru'))
exclude = set(punctuation)
morpher = MorphAnalyzer()

In [6]:
def replace_words(text,dict_): 
    output = ''
    
    for word in text.split(' '):
        word = word.strip()
        if word in dict_.keys(): 
            output += ' ' + dict_[word]
        else:
            output += ' ' + word
            
    return output

def preprocess_text(txt):
    txt = str(txt)
    txt = re.sub('[\,]','',txt)
    txt = re.sub('@[\w]*','',txt)
    txt = replace_words(txt, emoticon_dict)
    txt = replace_words(txt, apostrophe_dict)
    txt = replace_words(txt, short_word_dict)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub('\sне', 'не', txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    
    return ' '.join(txt)

In [7]:
df_train['text'] = df_train['text'].progress_apply(preprocess_text)
df_val['text'] = df_val['text'].progress_apply(preprocess_text)

100%|████████████████████████████████████████████████████████████████████████| 181467/181467 [02:28<00:00, 1225.54it/s]
100%|██████████████████████████████████████████████████████████████████████████| 22683/22683 [00:18<00:00, 1222.03it/s]


In [8]:
df_train.head(4)

Unnamed: 0,id,text,class
0,0,уезжаааааааать ❤ тожена хотеть уезжать,0
1,1,rt ребята девчата кино любовь завтра вотэтолюбовь,1
2,2,rt ктоненавидеть пробка ретвит rt,0
3,3,rt хотеться котлета покиевск запретный плод happy,1


In [9]:
df_val.head(4)

Unnamed: 0,id,text,class
0,181467,rt максимальный репост happy євромайдан httptc...,1
1,181468,зп ежегодно индексировать индекс инфляция тари...,0
2,181469,нехуй мнен,0
3,181470,нога ахахи когдаахах честнна помнить завтра шк...,0


In [10]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values

In [11]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')

tokenizer.fit_on_texts(text_corpus_train)

In [12]:
sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)

In [13]:
word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

In [14]:
X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [15]:
y_train = df_train['class'].values
y_val = df_val['class'].values

In [16]:
results = {
    'NN': [],
    'loss': [],
    'accuracy': []
}

In [17]:
early_stopping = EarlyStopping(monitor='val_loss')

#### CNN

In [18]:
model = Sequential()

model.add(Embedding(input_dim=word_count, 
                    input_length=training_length, 
                    output_dim=30, 
                    trainable=True, 
                    mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation('relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [19]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=5,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/5
Epoch 2/5


In [20]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append('CNN')
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.47195789217948914
Test accuracy: 0.7719878554344177


#### SimpleRNN

In [21]:
model = Sequential()

model.add(Embedding(input_dim=word_count, 
                    input_length=training_length, 
                    output_dim=30, 
                    trainable=True, 
                    mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/10
Epoch 2/10


In [23]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append('SimpleRNN')
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4812166690826416
Test accuracy: 0.7678437829017639


#### CNN+RNN

In [24]:
model = Sequential()

model.add(Embedding(input_dim=word_count,
                    input_length=training_length,
                    output_dim=30,
                    trainable=True,
                    mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(Conv1D(32, 2))
model.add(Activation('relu'))
model.add(Conv1D(16, 2))
model.add(Activation('relu'))
model.add(MaxPooling1D(1))
model.add(LSTM(16,return_sequences=True))
model.add(LSTM(16))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [26]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append('CNN+RNN')
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.46573373675346375
Test accuracy: 0.7727813720703125


#### RNN+CNN

In [27]:
model = Sequential()

model.add(Embedding(input_dim=word_count, 
                    input_length=training_length, 
                    output_dim=30, 
                    trainable=True, 
                    mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(LSTM(32,return_sequences=True))
model.add(LSTM(32,return_sequences=True))
model.add(Conv1D(32, 3))
model.add(Activation('relu'))
model.add(MaxPooling1D(2))
model.add(Activation('relu'))
model.add(Conv1D(16, 3))
model.add(Activation('relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [28]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [29]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append('RNN+CNN')
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4682137072086334
Test accuracy: 0.7696512937545776


#### Результат

In [30]:
pd.DataFrame(results)

Unnamed: 0,NN,loss,accuracy
0,CNN,0.471958,0.771988
1,SimpleRNN,0.481217,0.767844
2,CNN+RNN,0.465734,0.772781
3,RNN+CNN,0.468214,0.769651


Результат у всех примерно одинаковый, обычный CNN выглядит чуть предпочтительнее.