# Практическое задание к уроку 6

In [77]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  
from sklearn.model_selection import train_test_split

In [9]:
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.1.0
Eager mode:  True
Hub version:  0.10.0
GPU is available


In [29]:
data=pd.read_excel('отзывы за лето.xls')
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [36]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

data['Content'] =data['Content'].apply(preprocess_text)


In [37]:
df_train=data.copy()
df_train, df_val=train_test_split(data, test_size=0.33, random_state=42)
#df_train, df_val=train_test_split(train, test_size=0.2, random_state=42) 

In [38]:
text_corpus_train = df_train['Content'].values
text_corpus_valid = df_val['Content'].values

In [51]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

x_train = pad_sequences(sequences_train, maxlen=training_length)
x_val = pad_sequences(sequences_val, maxlen=training_length)

In [52]:
num_classes=data['Rating'].value_counts().shape[0]
num_classes+=1

In [53]:
y_train = keras.utils.to_categorical(df_train["Rating"], num_classes)
y_val = keras.utils.to_categorical(df_val["Rating"], num_classes)

In [69]:
RNN = Sequential()

RNN.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
RNN.add(Masking(mask_value=0.0))

RNN.add(SimpleRNN(64))
RNN.add(Dense(64, activation='relu'))
RNN.add(Dropout(0.5))
RNN.add(Dense(num_classes, activation='softmax'))

RNN.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = RNN.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=20,
                    verbose=False,
                    validation_data=(x_val,y_val)
                    )


In [70]:
score = RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.2754876183891758
Test accuracy: 0.9217023849487305


In [78]:
LSTM_model = Sequential()

LSTM_model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
LSTM_model.add(Masking(mask_value=0.0))
LSTM_model.add(LSTM(64, recurrent_dropout=0.2))
LSTM_model.add(Dense(64, activation='relu'))
LSTM_model.add(Dropout(0.5))
LSTM_model.add(Dense(num_classes, activation='softmax'))

LSTM_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = LSTM_model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=20,
                    verbose=False,
                    validation_data=(x_val,y_val)
                    )


In [79]:
score = LSTM_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.27873765752615554
Test accuracy: 0.9229002594947815


In [80]:
GRU_model = Sequential()

GRU_model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
GRU_model.add(Masking(mask_value=0.0))
GRU_model.add(GRU(64, recurrent_dropout=0.2))
GRU_model.add(Dense(64, activation='relu'))
GRU_model.add(Dropout(0.5))
GRU_model.add(Dense(num_classes, activation='softmax'))

GRU_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = GRU_model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=20,
                    verbose=False,
                    validation_data=(x_val,y_val)
                    )


In [85]:
scoreRNN = RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
scoreLSTM = LSTM_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
scoreGRU = GRU_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)

print('\n')
print(f'Test score: RNN {scoreRNN[0]}, LSTM {scoreLSTM[0]},  GRU {scoreGRU[0]}')
print(f'Test accuracy: RNN {scoreRNN[1]}, LSTM {scoreLSTM[1]},  GRU {scoreGRU[1]}')



Test score: RNN 0.2754876183891758, LSTM 0.27873765752615554,  GRU 0.2718669737747388
Test accuracy: RNN 0.9217023849487305, LSTM 0.9229002594947815,  GRU 0.922508955001831


LSTM и GRU показали немного лучше RNN в третьем знаке. В данном тесте можно сказать что все три модели отработали с одинаковой точностью