* this notebook is very similar to the 'BiLSTM network' notebook, the major difference is the embedding layer

### Read the data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("../data/reliable_news_prep", "r") as reliable_file:
    rel = [line.strip() for line in reliable_file]
with open("../data/fake_news_prep", "r") as fake_file:
    fake = [line.strip() for line in fake_file]

In [8]:
from training_preprocess import prepare_fastText_embedding_matrix as prepare_embedding
from training_preprocess import sequence_vectorize
from training_preprocess import train_val_test_split as split

import numpy as np

In [4]:
def label_to_vector(labels):
    array = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        array[i, label] = 1
    return array

In [7]:
length=1000

# we decreased the sentence length because the ELMo embedding used to much memory, 
max_sentence_len=512

text = rel[:length]+fake[:length]
labels = [0 if i<length else 1 for i in range(2*length)] # reliable - 0; fake - 1

X_train, X_val, X_test, Y_train, Y_test, Y_val = split(text, labels, 0.2, 0.1)
X_train = [' '.join(t.split()[:max_sentence_len]) for t in X_train]
X_val = [' '.join(t.split()[:max_sentence_len]) for t in X_val]

x_train = np.array(X_train, dtype=object)[:, np.newaxis]
x_val = np.array(X_val, dtype=object)[:, np.newaxis]

y_train = label_to_vector(Y_train)
y_val = label_to_vector(Y_val)


### Define the model

In [2]:
def bidirectional_LSTM(dropout_rate):
    
    # Add an Input Layer
    input_layer = layers.Input((1, ), dtype="string")

    # Add the word embedding Layer
    embedding_layer = ElmoEmbeddingLayer()(input_layer)
    embedding_layer = layers.SpatialDropout1D(dropout_rate)(embedding_layer)
    
    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.LSTM(20))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(70, activation="relu")(lstm_layer)
    output_layer2 = layers.Dropout(dropout_rate)(output_layer1)
    output_layer3 = layers.Dense(2, activation="softmax")(output_layer2)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer3)
    model.compile(optimizer='Adam', loss="binary_crossentropy", metrics=['accuracy'])
    model.summary()
    
    return model

In [3]:
from keras import backend as K
import tensorflow_hub as hub
import tensorflow as tf
from keras import layers, models, optimizers, regularizers
from keras.engine import Layer

#this code is based on https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb this repository
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['elmo']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 48, self.dimensions)

Using TensorFlow backend.


In [4]:
model = bidirectional_LSTM(0.5)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 48, 1024)          4         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 48, 1024)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                167200    
_________________________________________________________________
dense_1 (Dense)              (None, 70)                2870      
_________________________________________________________________
dropout_1 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 142       
Total para

In [14]:
from keras import callbacks

callbacks = [callbacks.EarlyStopping(monitor='val_loss', patience=3)]

model.fit(x_train,
          y_train,
          epochs=10,
          callbacks=callbacks,
          validation_data=(x_val, y_val),
          verbose=2,  # Logs once per epoch.
          batch_size=16)

model.save_weights('./BiLSTM_ELMo.h5')

Train on 1400 samples, validate on 400 samples
Epoch 1/10
 - 243s - loss: 0.5280 - acc: 0.7321 - val_loss: 0.3710 - val_acc: 0.8275
Epoch 2/10
 - 238s - loss: 0.3362 - acc: 0.8593 - val_loss: 0.3263 - val_acc: 0.8575
Epoch 3/10
 - 238s - loss: 0.2711 - acc: 0.8957 - val_loss: 0.2748 - val_acc: 0.8650
Epoch 4/10
 - 239s - loss: 0.2149 - acc: 0.9121 - val_loss: 0.2766 - val_acc: 0.8825
Epoch 5/10
 - 238s - loss: 0.1938 - acc: 0.9236 - val_loss: 0.2445 - val_acc: 0.9050
Epoch 6/10
 - 238s - loss: 0.1675 - acc: 0.9350 - val_loss: 0.2594 - val_acc: 0.8875
Epoch 7/10
 - 238s - loss: 0.1745 - acc: 0.9307 - val_loss: 0.2481 - val_acc: 0.8950
Epoch 8/10
 - 238s - loss: 0.1563 - acc: 0.9371 - val_loss: 0.2565 - val_acc: 0.8850


### Testing

In [5]:
# if machine is true we read the machine vs real task, else we read the fake vs real task

def read_data(machine):
    if(machine):
        with open("../../test_data/gpt2_generated.txt", "r") as gpt2, open("../../test_data/grover_generated.txt", "r") as grover:
            X_test = [line for line in gpt2]+[line for line in grover]
        
        with open("../../test_data/x_test.txt") as data, open("../../test_data/y_test.txt") as label_file:
            index = 0
            labels = [label.strip() for label in label_file]
            for i, line in enumerate(data):
                if(index==180):
                    break
                if(labels[i]=="0"):
                    X_test.append(line)
                    index += 1
        Y_test = ["fake" if i < 180 else "real" for i in range(360)]
    else:
        with open("../../test_data/x_test.txt") as f:
            X_test = [line for line in f]
        with open("../../test_data/y_test.txt") as f:
            Y_test = ["fake" if line.strip() == "1" else "real" for line in f]
    return X_test, Y_test

In [6]:
X_test, Y_test = read_data(False)

In [9]:
max_sentence_len = 512
X_test = [' '.join(t.split()[:max_sentence_len]) for t in X_test]
x_test = np.array(X_test, dtype=object)[:, np.newaxis]

In [10]:
from text_preprocess import en_lemmatize, filter_stopwords, filter_punctuation, filter_urls

def predict(model, datas):
    probs = model.predict(datas)
    result = []
    for i in probs:
        if np.argmax(i)==1:
            result.append("fake")
        else:
            result.append("real")
    return result

In [11]:
model.load_weights('./BiLSTM_ELMo.h5')

In [12]:
from sklearn.metrics import classification_report

In [13]:
prediction = predict(model, x_test)

In [54]:
# machine vs real
# print(classification_report(Y_test, prediction))

              precision    recall  f1-score   support

        fake       0.53      0.48      0.50       180
        real       0.53      0.58      0.55       180

    accuracy                           0.53       360
   macro avg       0.53      0.53      0.53       360
weighted avg       0.53      0.53      0.53       360



In [14]:
# fake vs real
print(classification_report(Y_test, prediction))

              precision    recall  f1-score   support

        fake       0.71      0.88      0.79      4947
        real       0.85      0.65      0.74      5053

    accuracy                           0.76     10000
   macro avg       0.78      0.76      0.76     10000
weighted avg       0.78      0.76      0.76     10000

