### Задание
Данные берем отызывы за лето
На вебинаре мы говорили, что долгое время CNN и RNN архитектуры были конурируещими выяснить какая архитектура больше подходит для нашей задачи
1. построить свёрточные архитектуры
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN или (RNN -> CNN)

In [76]:
import pandas as pd
import re
import keras
from stop_words import get_stop_words
from string import punctuation
from pymorphy2 import MorphAnalyzer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Masking, SimpleRNN, Dense, Dropout, LSTM, GRU, Conv1D, Activation, GlobalMaxPool1D, Flatten
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_excel('отзывы за лето.xls')

df.sample(15)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20659 entries, 0 to 20658
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20659 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20659 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [3]:
df.Content = df.Content.astype(str)

In [11]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

In [12]:
def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("не\s", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [15]:
df['Content'] = df['Content'].apply(preprocess_text)

df.sample(5)

Unnamed: 0,Rating,Content,Date
2789,5,хороший приложение,2017-08-10
2581,4,вроде нормально заход второй раз раздражать на...,2017-08-10
5081,5,,2017-08-07
7613,5,достаточно понятно прозрачный,2017-08-03
12678,4,принцип неплохо недоработка,2017-07-26


In [18]:
train_df, test_val_df = train_test_split(df, test_size = 0.3, random_state = 49)
test_df, val_df = train_test_split(test_val_df, test_size = 0.5, random_state = 49)

In [20]:
train_df.to_csv('data/train_df.csv')
test_df.to_csv('data/test_df.csv')
val_df.to_csv('data/val_df.csv')
test_val_df.to_csv('data/test_val_df.csv')

In [21]:
text_corpus_train = train_df['Content'].values
text_corpus_valid = val_df['Content'].values
text_corpus_test = test_df['Content'].values

In [22]:
num_classes = len(df['Rating'].unique())
num_classes

5

In [34]:
tokenizer = Tokenizer(num_words = None, 
                     filters = '#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

y_train = keras.utils.np_utils.to_categorical(train_df['Rating'], num_classes+1)
y_test = keras.utils.np_utils.to_categorical(test_df['Rating'], num_classes+1)
y_val = keras.utils.np_utils.to_categorical(val_df['Rating'], num_classes+1)

In [50]:
# RNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))

model.add(SimpleRNN(132))
model.add(Dense(132, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 132, 30)           294870    
                                                                 
 masking_4 (Masking)         (None, 132, 30)           0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 132)               21516     
                                                                 
 dense_3 (Dense)             (None, 132)               17556     
                                                                 
 dropout_1 (Dropout)         (None, 132)               0         
                                                                 
 dense_4 (Dense)             (None, 6)                 798       
                                                                 
Total params: 334,740
Trainable params: 334,740
Non-tr

In [51]:
history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [52]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6884294748306274
Test accuracy: 0.7673442959785461


In [53]:
results = []

results.append(['RNN', score[0], score[1]])

In [56]:
# LSTM

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))
model.add(LSTM(132, recurrent_dropout = 0.2))
model.add(Dense(132, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

In [57]:
model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 132, 30)           294870    
                                                                 
 masking_6 (Masking)         (None, 132, 30)           0         
                                                                 
 lstm (LSTM)                 (None, 132)               86064     
                                                                 
 dense_5 (Dense)             (None, 132)               17556     
                                                                 
 dropout_2 (Dropout)         (None, 132)               0         
                                                                 
 dense_6 (Dense)             (None, 6)                 798       
                                                                 
Total params: 399,288
Trainable params: 399,288
Non-tr

In [58]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  

history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [59]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['LSTM', score[0], score[1]])



Test score: 0.6746797561645508
Test accuracy: 0.766376256942749


In [63]:
# GRU

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))
model.add(GRU(64, recurrent_dropout = 0.2))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))
model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 132, 30)           294870    
                                                                 
 masking_9 (Masking)         (None, 132, 30)           0         
                                                                 
 gru (GRU)                   (None, 64)                18432     
                                                                 
 dense_7 (Dense)             (None, 64)                4160      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 6)                 390       
                                                                 
Total params: 317,852
Trainable params: 317,852
Non-t

In [64]:
early_stopping = EarlyStopping(monitor = 'val_loss',patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [65]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['GRU', score[0], score[1]])



Test score: 0.6965758800506592
Test accuracy: 0.7637947797775269


In [72]:
# CNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 132, 30)           294870    
                                                                 
 conv1d_2 (Conv1D)           (None, 130, 128)          11648     
                                                                 
 activation_1 (Activation)   (None, 130, 128)          0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_9 (Dense)             (None, 10)                1290      
                                                                 
 activation_2 (Activation)   (None, 10)                0         
                                                     

In [73]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['CNN', score[0], score[1]])



Test score: 0.7424861788749695
Test accuracy: 0.7699257731437683


In [77]:

# RNN + CNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(SimpleRNN(132, recurrent_dropout = 0.2, return_sequences = "True"))
model.add(Conv1D(132, 3, activation = "linear"))
model.add(Conv1D(64, 1, activation = "linear")) 
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(num_classes+1, activation = "softmax"))      


model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 132, 30)           294870    
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 132, 132)          21516     
                                                                 
 conv1d_5 (Conv1D)           (None, 130, 132)          52404     
                                                                 
 conv1d_6 (Conv1D)           (None, 130, 64)           8512      
                                                                 
 flatten (Flatten)           (None, 8320)              0         
                                                                 
 dropout_5 (Dropout)         (None, 8320)              0         
                                                                 
 dense_11 (Dense)            (None, 6)               

In [78]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [79]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['RNN + CNN', score[0], score[1]])



Test score: 0.6968570947647095
Test accuracy: 0.7686350345611572


In [80]:
results_df = pd.DataFrame(results, columns = ['Model', 'Test score', 'Test accuracy'])
results_df

Unnamed: 0,Model,Test score,Test accuracy
0,RNN,0.688429,0.767344
1,LSTM,0.67468,0.766376
2,GRU,0.696576,0.763795
3,CNN,0.742486,0.769926
4,RNN + CNN,0.696857,0.768635


#### Вывод:
Модельки справились примерно с одинаковой accuracy, однако loss меньше у RNN 