# ДЗ 7. Сверточные нейронные сети для анализа текста

**Задание**  

Берем отызывы за лето (из архива с материалами или предыдущего занятия).  
1. Учим conv сеть для классификации  
2. Рассмотреть 2-а варианта сеточек  
2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/  
2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)  

Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше.  

## Загрузка и обзор данных

In [1]:
!pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32911 sha256=5b3949299193d8c892f8d56a90f616dd4f5d263e739028a9cb6d25317bfb158c
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [2]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.0 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 6.6 MB/s 
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [3]:
!pip install gensim



In [4]:
import numpy as np
import pandas as pd
import re

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from sklearn.preprocessing import LabelEncoder

In [5]:
df = pd.read_excel('отзывы за лето.xls')
df.head(10)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14
5,5,Всё удобно норм 👍👍👍,2017-08-14
6,5,Очень удобное приложение.,2017-08-14
7,5,Все устраивает,2017-08-14
8,5,У меня работает все четко. В отличии от банком...,2017-08-14
9,5,Очень все хорошо👍,2017-08-14


## Предобработка данных

In [6]:
# Зададим параметры
max_words = 200
max_len = 150
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [7]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)
    
df['Content'] = df['Content'].apply(preprocess_text)

In [8]:
df.head(10)

Unnamed: 0,Rating,Content,Date
0,5,it just works,2017-08-14
1,4,целое удобноной приложениеиз минус хотеть боль...,2017-08-14
2,5,отлично,2017-08-14
3,5,зависать 1 работа антивирус ранее пользоваться...,2017-08-14
4,5,удобно работать быстро,2017-08-14
5,5,удобно норма 👍👍👍,2017-08-14
6,5,удобный приложение,2017-08-14
7,5,устраивать,2017-08-14
8,5,работать чётко отличие банкомат вечно зависать...,2017-08-14
9,5,хорошо👍,2017-08-14


In [9]:
# Разделим на трейн и тест
X_train, X_test, y_train, y_test = train_test_split(df.Content, df.Rating, test_size=0.30, random_state=42)

In [10]:
train_corpus = " ".join(X_train)
train_corpus = train_corpus.lower()

In [11]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [13]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [14]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'телефон',
 'супер']

In [15]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [16]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [17]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [18]:
x_train.shape

(14461, 150)

In [19]:
max_len

150

In [20]:
x_train[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  1,  2, 32],
      dtype=int32)

## CNN

In [21]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Bidirectional, LSTM, SpatialDropout1D, concatenate
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Conv1D, Flatten, MaxPooling1D
from keras.layers import GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping 
import tensorflow as tf

In [22]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [23]:
num_classes = 6
y_train = tf.keras.utils.to_categorical(y_train['Rating'], num_classes)
y_test = tf.keras.utils.to_categorical(y_test['Rating'], num_classes)

In [24]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [25]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [26]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20


In [27]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.6554709672927856
Test accuracy: 0.09793481975793839


In [28]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Bidirectional(LSTM(units=32, return_sequences=True)))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [29]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [30]:
tensorboard1=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping1=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard1, early_stopping1])

Epoch 1/20


In [31]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.0882039070129395
Test accuracy: 0.7086156606674194


## Word2Vec

In [32]:
ru_w2v = KeyedVectors.load_word2vec_format('https://rusvectores.org/static/models/news_upos_cbow_300_2_2017.bin.gz', binary=True)

In [33]:
def code_w2v_txt(txt, max_len = 100):
    sent_w2v = []
    zero_point = np.zeros(300)
    txt = txt.split()
   
    for i in range(max_len):
        try:
            word = ru_w2v[txt[i]]
        except:
            word = zero_point
        sent_w2v.append(word)
    return np.array(sent_w2v)

max_len = 50
df['w2v'] = df['Content'].apply(code_w2v_txt, max_len = max_len)
del(ru_w2v)

In [34]:
X = list(df['w2v'].values)
X = np.array(X)
y = df['Rating'].values

In [35]:
# Разделим на трейн и тест
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [36]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(y_train) 
test_enc_labels = le.transform(y_test)
le.classes_

array([1, 2, 3, 4, 5])

In [37]:
num_classes = len(df.Rating.unique())

inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
dropout_embeds = SpatialDropout1D(0.05)(inputs)
x = Bidirectional(LSTM(units=7, return_sequences=True))(dropout_embeds)
pooled_avg_sequences = GlobalAveragePooling1D()(x)
pooled_max_sequences = GlobalMaxPooling1D()(x)
concated = concatenate([pooled_avg_sequences, pooled_max_sequences])
dense_intermediate = Dense(64, activation='elu')(concated)
x = Dense(num_classes, activation='sigmoid')(dense_intermediate)

model = Model(inputs=inputs, outputs=x)
model.summary()
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50, 300)]    0                                            
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 50, 300)      0           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 50, 14)       17248       spatial_dropout1d[0][0]          
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 14)           0           bidirectional_1[0][0]            
______________________________________________________________________________________________

In [38]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, train_enc_labels,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20


In [39]:
score = model.evaluate(X_test, test_enc_labels)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.2631239891052246
Test accuracy: 0.7086156606674194


In [40]:

inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
l_cov1= Conv1D(128, 5, activation='relu')(inputs)
l_pool1 = MaxPooling1D(2)(l_cov1)
l_cov2 =Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(2)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(5)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(64, activation='softmax')(l_dense)
preds = Dense(num_classes, activation='softmax')(l_dense)

model = Model(inputs=inputs, outputs=preds)

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

In [41]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, train_enc_labels,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20


In [42]:
score = model.evaluate(X_test, test_enc_labels)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.5832996368408203
Test accuracy: 0.7086156606674194


Закончилась память на сервере, не могу попробовать больше моделей.