## Homework #4

In [24]:
import os
import sys
import re

import numpy as np
import pandas as pd

import requests

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer

import gensim
from gensim.models.word2vec import Word2Vec

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf

import keras

from keras.preprocessing.text import Tokenizer

from keras.utils import to_categorical, pad_sequences

from keras.layers import Activation, Conv2D, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D, GlobalMaxPool1D
from keras.layers import LSTM, Input
from keras.layers import MaxPool1D

from keras.optimizers import RMSprop
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.models import Sequential

##### Параметры

In [25]:
RANDOM_STATE = 1234
MAX_ROWS = 2000
BATCH_SIZE = 128
EPOCHS = 3

DATASET_TRUE_URL = "https://raw.githubusercontent.com/danilkladnitsky/nlp-tasks/hw_4/hw_4/true.csv"
DATASET_FAKE_URL = "https://raw.githubusercontent.com/danilkladnitsky/nlp-tasks/hw_4/hw_4/fake.csv"

##### Вспомогательные функции

In [26]:
def download_file(url, filename):
    already_downloaded = os.path.isfile(filename)

    if (already_downloaded):
        return True

    r = requests.get(url, allow_redirects=True)
    open(filename, 'wb').write(r.content)


def preprocess_text_context(content):
    sentence = re.sub('[^a-zA-Z]', ' ', content)

    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    sentence = re.sub(r'\s+', ' ', sentence)

    stops = stopwords.words('english')
    porter = PorterStemmer()
    for word in sentence.split():
        if word in stops:
            sentence = sentence.replace(word, '')
        sentence = sentence.replace(word, porter.stem(word))
    return sentence.lower()

##### Подготовка

In [27]:
download_file(DATASET_TRUE_URL, 'true.csv')
download_file(DATASET_FAKE_URL, 'fake.csv')
nltk.download('stopwords')

true_news_frame = pd.read_csv('true.csv')
fake_news_frame = pd.read_csv('fake.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\firem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
print(true_news_frame.shape)
print(fake_news_frame.shape)

(21417, 4)
(23481, 4)


In [29]:
fake_news_frame.reset_index(drop=True, inplace=True)
fake_news_frame['is_fake'] = 1
fake_news_frame.drop(columns=['subject', 'date'], inplace=True)
fake_news_frame = fake_news_frame.head(MAX_ROWS)


true_news_frame.reset_index(drop=True, inplace=True)
true_news_frame['is_fake'] = 0
true_news_frame.drop(columns=['subject', 'date'], inplace=True)
true_news_frame = true_news_frame.head(MAX_ROWS)

In [30]:
true_news_frame.head(5)

Unnamed: 0,title,text,is_fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [31]:
fake_news_frame.head(5)

Unnamed: 0,title,text,is_fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1


##### Объединим данные в один фрейм

In [32]:
news_frame = pd.concat([fake_news_frame, true_news_frame],
                       ignore_index=True, sort=False)

In [33]:
print(news_frame.shape)

(4000, 3)


In [34]:
news_frame.tail()

Unnamed: 0,title,text,is_fake
3995,Trump rescinds Obama limits on transfer of mil...,WASHINGTON (Reuters) - U.S. President Donald T...,0
3996,Lawmakers should OK relief for Harvey victims:...,WASHINGTON (Reuters) - U.S. House of Represent...,0
3997,Energy Secretary Perry cancels Kazakhstan visi...,ALMATY (Reuters) - United States Energy Secret...,0
3998,Trump's firm sought Moscow real estate deal du...,WASHINGTON (Reuters) - Donald Trump’s company ...,0
3999,Trump renews threat to scrap NAFTA going into ...,WASHINGTON (Reuters) - U.S. President Donald T...,0


##### Подготовим текстовые данные

In [35]:
plain_news_content = []

for content in news_frame.text:
    plain_news_content.append(preprocess_text_context(content).split())

print(plain_news_content[:5])

[['nld', 'trump', 'wh', 'ri', 'hppi', 'new', 'yer', 'lev', 'th', 'sted', 'give', 'sh', 'enemi', 'rs', 't', 'dhonest', 'fke', 'news', 'di', 't', 'r', 'rely', 'show', 'str', 'one', 'job', 'countri', 'rpidli', 'grow', 'stronger', 'smrter', 'wnt', 'wh', 'friend', 'support', 'enemi', 'rs', 'even', 't', 'dhonest', 'fke', 'news', 'di', 'hppi', 'lthi', 'new', 'yer', 'presid', 'ngri', 'pnt', 'tweet', 'gre', 'yer', 'ric', 'countri', 'rpidli', 'grow', 'stronger', 'smrter', 'wnt', 'wh', 'friend', 'support', 'enemi', 'rs', 'even', 't', 'dhonest', 'fke', 'news', 'di', 'hppi', 'lthi', 'new', 'yer', 'gre', 'yer', 'ric', 'nld', 'trump', 'relnldtrump', 'decemr', 'trump', 'tweet', 'went', 'wn', 'b', 'wel', 'expect', 'wh', 'kd', 'presid', 'send', 'new', 'yer', 'greet', 'like', 't', 'despic', 'petti', 'fntil', 'gibrh', 'onli', 'trump', 'hi', 'lck', 'decenc', 'even', 'ow', 're', 't', 'gutter', 'long', 'eugh', 'wh', 't', 'ri', 'cizens', 'hppi', 'new', 'yer', 'bhop', 'tlrt', 'swn', 'tlrtswn', 'decemr', 'one',

In [36]:
w2v_model = Word2Vec(plain_news_content, sample=500,
                     window=3, min_count=1, workers=4)

In [37]:
token = Tokenizer(RANDOM_STATE)
token.fit_on_texts(news_frame.text)
text = token.texts_to_sequences(news_frame.text)
text = pad_sequences(text, 75)
print(text[:2])

[[  13  201   28  126   32  749    9  744   50   85   24    1  111   12
     7    6    1  116  520  159   16   20   50  166  974  749   26    4
     1  448  749    9    4  358   98  440   34  729  749    2   56    1
    48   34  642   11   24    4   13   31  374  445   13  370   93  387
    40   63  687   98  440  838  220   37  374    5   83   13    9    4
    98  577   22  135  132]
 [  15    7  569   28    1   17    1   12    1  362   78   35  124  288
   416   61  133  933  810  170  590    1  281   38  589    2   46  156
   145    2  544  462    5   94  156    5  184  170   17    3    1  716
   937    2 1045    2    1 1210   41    5   12   83    4   17  292  441
   504  334    9  357   24  505   37    4  554   14    9    4  774  289
   577   22  377  135  132]]


In [38]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(news_frame.is_fake)
y = to_categorical(y)
y[:2]

array([[0., 1.],
       [0., 1.]], dtype=float32)

#### Реализовать классификацию двумя моделями: CNN, LSTM


## CNN

In [39]:
x_train, x_test, y_train, y_test = train_test_split(
    np.array(text), y, test_size=0.2, stratify=y
)

##### get_keras_embedding analog

In [40]:
def gensim_to_keras_embedding(model, train_embeddings=False):
    keyed_vectors = model.wv
    weights = keyed_vectors.vectors

    index_to_key = keyed_vectors.index_to_key

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [41]:
keras_model = Sequential()

keras_model.add(gensim_to_keras_embedding(w2v_model))

keras_model.add(Dropout(0.2))

keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))

keras_model.add(MaxPool1D())

keras_model.add(Dropout(0.2))

keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))

keras_model.add(MaxPool1D())

keras_model.add(Dropout(0.2))

keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))

keras_model.add(GlobalMaxPool1D())

keras_model.add(Dropout(0.2))

keras_model.add(Dense(200))

keras_model.add(Activation('relu'))

keras_model.add(Dropout(0.2))

keras_model.add(Dense(2))

keras_model.add(Activation('softmax'))

keras_model.compile(loss='binary_crossentropy',
                    metrics=['acc'], optimizer='adam')

In [42]:
%%time

keras_model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                validation_data=(x_test, y_test))

Epoch 1/3




Epoch 2/3
Epoch 3/3
CPU times: total: 1min 23s
Wall time: 29.5 s


<keras.callbacks.History at 0x20c1b1b5430>

## LSTM

In [43]:
LSTM_VALUE = 64
DENSITY = 256
DROPOUT_COEFFICIENT = 0.5
DENSITY = 1
INPUT_MAX_LEN = 75


def create_lstm_model():
    inputs = Input(name='inputs', shape=[INPUT_MAX_LEN])
    layer = gensim_to_keras_embedding(w2v_model)(inputs)
    layer = LSTM(LSTM_VALUE)(layer)
    layer = Dense(DENSITY, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(DROPOUT_COEFFICIENT)(layer)
    layer = Dense(DENSITY, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [44]:
model = create_lstm_model()
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(), metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 75)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 75, 100)           4813800   
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 FC1 (Dense)                 (None, 1)                 65        
                                                                 
 activation_6 (Activation)   (None, 1)                 0         
                                                                 
 dropout_11 (Dropout)        (None, 1)                 0         
                                                                 
 out_layer (Dense)           (None, 1)                 2   

In [45]:
X = text
Y = news_frame['is_fake']

le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [47]:
%%time
tf.config.run_functions_eagerly(True)

model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

Epoch 1/3




Epoch 2/3
CPU times: total: 38.5 s
Wall time: 28.5 s


<keras.callbacks.History at 0x20c1a900130>

In [48]:
accuracy = model.evaluate(X_test, Y_test)

accuracy



[0.693257212638855, 0.48750001192092896]

#### Сравнить качество обученных моделей

Запуск LSTM и RNN моделей показал, что для RNN достаточно относительно небольшего количества эпох (3-5) для обучения. 
LSTM же для большей точности требует ~8-10.

In [50]:
model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

accuracy = model.evaluate(X_test, Y_test)
accuracy

Epoch 1/10