<a href="https://colab.research.google.com/github/fanghe-ma/IRC_SET_NLP_Fake_News_repo/blob/master/comparison_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# The dataset, GloVe and checkpoint paths used in this notebook all sit under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
from keras import backend as K
import pandas as pd
import numpy as np
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Embedding, Flatten, LSTM, Input, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix as cm
from keras.initializers import Constant

# Path of the .h5 file that the ModelCheckpoint callback writes the best model to during training.
model_file = '/content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5'

In [None]:
def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Counts true positives as the rounded, [0, 1]-clipped product of
    ``y_true`` and ``y_pred`` and divides by the rounded, clipped count of
    positives in ``y_true``. ``K.epsilon()`` is added to the denominator so
    the division is defined when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_total = K.sum(hit_mask)
    actual_total = K.sum(actual_mask)
    return hit_total / (actual_total + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Counts true positives as the rounded, [0, 1]-clipped product of
    ``y_true`` and ``y_pred`` and divides by the rounded, clipped count of
    positives in ``y_pred``. ``K.epsilon()`` is added to the denominator so
    the division is defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    hit_total = K.sum(hit_mask)
    flagged_total = K.sum(flagged_mask)
    return hit_total / (flagged_total + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: twice the product of precision and recall over their sum.

    Precision and recall come from ``precision_m`` and ``recall_m``;
    ``K.epsilon()`` in the denominator keeps the result defined when both
    are zero.
    """
    prec = precision_m(y_true, y_pred)
    rec = recall_m(y_true, y_pred)
    numerator = prec * rec
    denominator = prec + rec + K.epsilon()
    return 2 * (numerator / denominator)

def _list_data_files(directory):
  '''
  Return the full paths of the entries of `directory`, sorted by name.

  Sorting makes the file order (and everything derived from it) independent
  of the order the filesystem happens to list entries in. A missing
  directory (e.g. Drive not mounted yet) yields an empty list instead of
  failing while the class is being defined; Data then raises a ValueError
  as soon as it tries to read the files.
  '''
  if not os.path.isdir(directory):
    return []
  return [os.path.join(directory, name) for name in sorted(os.listdir(directory))]


class Data():
  '''
  Data object to facilitate loading data for training or testing

  Every .csv file in `data_dir` is read; each must provide the columns
  'article', 'evidence' and 'label'.

  3 data files
  popat_2019_res.csv - evi from Popat et al., 2019, claim vs evidence/reporting article
  popat_2019_modified_res.csv - evi from google, 'distant supervision', article vs evidence
  liar_res.csv - evi from google, claim vs evidence

  instantiating a data object takes one argument
    return_all :
      True  : use every file found in `data_dir`
      False : ask on stdin (y/n) for each file whether to include it

  The tokenizer is always fitted on ALL csv files in `files`, regardless of
  which files were selected for loading.
  '''

  data_dir = '/content/gdrive/My Drive/My fake news stuff/project/data'
  # Computed once, when the class is defined.
  files = _list_data_files(data_dir)
  test_split = 0.1   # fraction of rows held out as test data
  max_words = 60000  # num_words passed to the Tokenizer
  maxlen = 500       # length every sequence is padded/truncated to


  def __init__(self, return_all = False):
    self.return_all = bool(return_all)
    # Shuffled frame of the last load_data() call; read by get_ratio().
    self.res = None

    self.target_files = self.get_file(return_all)
    self.tokenizer = self.get_tokenizer()

  @staticmethod
  def _read_frames(files):
    '''
    Read every path in `files` that ends in '.csv' (first column as index),
    drop rows containing missing values and concatenate the results.

    Raises ValueError when `files` contains no .csv path.
    '''
    frames = [
      pd.read_csv(path, index_col = 0).dropna()
      for path in files
      if path.endswith('.csv')
    ]
    if not frames:
      raise ValueError('no .csv files found among: ' + repr(list(files)))
    return pd.concat(frames)

  @staticmethod
  def _split_arrays(arrays, test_fraction):
    '''
    Split each array in `arrays` into a leading train part and a trailing
    test part holding int(length * test_fraction) rows.

    The cut is a non-negative index counted from the front: slicing with a
    negated test length breaks when that length is 0, because [:-0] is
    empty and [-0:] is everything.

    Returns (train_tuple, test_tuple), each in the order of `arrays`.
    '''
    length = arrays[0].shape[0]
    cut = length - int(length * test_fraction)
    train = tuple(arr[:cut] for arr in arrays)
    test = tuple(arr[cut:] for arr in arrays)
    return train, test

  def get_tokenizer(self):
    '''
    Fit a Tokenizer limited to `max_words` on the 'article' and 'evidence'
    columns of all csv files in `files` and return it.
    '''
    res = self._read_frames(self.files)
    tokenizer = Tokenizer(num_words = self.max_words)
    print('fitting on corpus')
    tokenizer.fit_on_texts(res['article'])
    tokenizer.fit_on_texts(res['evidence'])
    print('fitting completed')
    return tokenizer

  def vectorize_data(self, input_arr, t, maxlen):
    '''
    Convert an array of texts to integer sequences with tokenizer `t` and
    pad them to `maxlen` with pad_sequences.
    '''
    print('vectorizing data')
    input_list = input_arr.tolist()
    seq = t.texts_to_sequences(input_list)
    output = pad_sequences(seq, maxlen = maxlen)
    return output

  def get_file(self, return_all):
    '''
    Return the list of files to load: all of `files` when `return_all` is
    truthy, otherwise those the user answers 'y' for on stdin.
    '''
    if return_all:
      # Copy so callers cannot mutate the shared class-level list.
      return list(self.files)

    target_files = []
    for file in self.files:
      state = input(file + ': y/n?')
      if state == 'y':
        target_files.append(file)

    return target_files

  def load_data(self, random_state = None):
    '''
    Read the selected files, shuffle the rows, vectorize the texts and split
    off the last `test_split` fraction as test data.

    random_state : forwarded to DataFrame.sample; pass an int for a
      reproducible shuffle (default None keeps the unseeded behaviour).

    Returns (train_data, test_data), each a tuple
    (article, evidence, label). The shuffled frame is kept in `self.res`.
    '''
    res = self._read_frames(self.target_files) #join frames
    print('dataframes read')
    res = res.sample(frac = 1, random_state = random_state) #shuffles frame
    self.res = res

    article = np.asarray(res['article'])
    label = np.asarray(res['label'])
    evidence = np.asarray(res['evidence'])

    article = self.vectorize_data(article, self.tokenizer, self.maxlen)
    evidence = self.vectorize_data(evidence, self.tokenizer, self.maxlen)

    return self._split_arrays((article, evidence, label), self.test_split)

  def get_ratio(self):
    '''
    Count the rows of the loaded data labelled 1 (true) and 0 (fake).

    Returns (true_count, fake_count, true_count / fake_count); the ratio is
    nan when there are no fake rows. Raises RuntimeError when load_data()
    has not been called yet.
    '''
    res = getattr(self, 'res', None)
    if res is None:
      raise RuntimeError('call load_data() before get_ratio()')

    labels = res['label']
    true_count = int((labels == 1).sum())
    fake_count = int((labels == 0).sum())
    ratio = true_count / fake_count if fake_count else float('nan')

    return (true_count, fake_count, ratio)



In [None]:
# Load every dataset, fit the tokenizer and get the vectorized train/test splits.
data = Data(return_all = True)
train_data, test_data = data.load_data()

word_index = data.tokenizer.word_index

# Parse the GloVe text file: each line is a token followed by its vector
# components. The context manager closes the file even if a line fails to
# parse, and the explicit encoding avoids depending on the locale default.
embeddings_index = {}
with open('/content/gdrive/My Drive/My fake news stuff/project/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_dim = 100
max_words = data.max_words
maxlen = data.maxlen

# Row i holds the GloVe vector of the token with tokenizer index i. Rows stay
# all-zero for tokens missing from GloVe; indices >= max_words are skipped.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

fitting on corpus
fitting completed
dataframes read
debug 1
vectorizing data
vectorizing data
debug 2


In [None]:
# Two branches with the same layout (separate weights): embedding -> BiLSTM -> Dense.
# Input length and embedding width come from `maxlen` / `embedding_dim` so the
# model cannot drift out of sync with the padded sequences and `embedding_matrix`.
article_in = Input(shape=(maxlen,))
embedded_article = Embedding(
    max_words, embedding_dim,
    input_length=maxlen,
    embeddings_initializer=Constant(embedding_matrix))(article_in)
article_representation = Bidirectional(LSTM(32))(embedded_article)
article_representation = Dense(16, activation = 'relu')(article_representation)

evidence_in = Input(shape=(maxlen,))
embedded_evidence = Embedding(
    max_words, embedding_dim,
    input_length=maxlen,
    embeddings_initializer=Constant(embedding_matrix))(evidence_in)
evidence_representation = Bidirectional(LSTM(32))(embedded_evidence)
evidence_representation = Dense(16, activation = 'relu')(evidence_representation)

# Concatenate both representations and reduce them to a single sigmoid output.
x = keras.layers.concatenate([article_representation, evidence_representation], axis = -1)
x = Dense(32, activation = 'relu')(x)
comparison = Dense(1, activation = 'sigmoid')(x)

pipeline = Model(inputs = [article_in, evidence_in], outputs = comparison)
pipeline.compile(
            optimizer='rmsprop',
            loss='binary_crossentropy',
            metrics=['acc', f1_m, precision_m, recall_m])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Both callbacks watch the same quantity: validation loss, lower is better.
_val_loss_watch = dict(monitor = 'val_loss', mode = 'min', verbose = 1)

# Stop training once val_loss has gone one epoch without improving.
es = EarlyStopping(patience = 1, **_val_loss_watch)

# Write the full model (not just the weights) to model_file whenever val_loss improves.
mc = ModelCheckpoint(
      model_file,
      save_best_only = True,
      save_weights_only = False,
      **_val_loss_watch)


In [None]:
# Loss weight per label: samples labelled 1 count 2.5x as much as samples labelled 0.
class_weight = {0 : 1, 1 : 2.5}

# train_data is (article, evidence, label) as returned by Data.load_data().
train_articles, train_evidence, train_labels = train_data

history = pipeline.fit(
    x = [train_articles, train_evidence],
    y = train_labels,
    batch_size = 512,
    epochs = 20,
    validation_split = 0.1,
    class_weight = class_weight,
    callbacks = [es, mc],
    )

Train on 105607 samples, validate on 11735 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.74849, saving model to /content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.74849 to 0.55621, saving model to /content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5
Epoch 3/20

Epoch 00003: val_loss improved from 0.55621 to 0.29733, saving model to /content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5
Epoch 4/20

Epoch 00004: val_loss improved from 0.29733 to 0.22659, saving model to /content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5
Epoch 5/20

Epoch 00005: val_loss improved from 0.22659 to 0.21030, saving model to /content/gdrive/My Drive/My fake news stuff/project/results/comparison/comparison_BiLSTM.h5
Epoch 6/20

Epoch 00006: val_loss improved from 0.21030 to 0.20944, savin