In [0]:
# Colab magic requesting the TensorFlow 1.x runtime; it is issued ahead of the
# import below so the request is made before TensorFlow is loaded.
%tensorflow_version 1.x
import tensorflow
# Show which TensorFlow version was actually picked up.
print(tensorflow.__version__)

In [0]:
from google.colab import drive
# Mount Google Drive at /gdrive; the dataset folder this notebook changes into
# afterwards lives under that mount point.
drive.mount('/gdrive')

In [0]:
# Work from the dataset folder: the CSV files, the '<id>.jpg' images and the
# GloVe vectors are all opened through relative paths later in the notebook.
%cd /gdrive/My Drive/fakenews_data

In [0]:
import nltk
# Fetch NLTK's 'punkt' resource.
# NOTE(review): no visible cell calls an NLTK tokenizer (text is tokenised with
# Keras' text_to_word_sequence) -- confirm this download is still needed.
nltk.download('punkt')

In [0]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

# Number of word ids kept per article (width of the padded text matrix and the
# Embedding layer's input_length).
MAX_SENT_LENGTH = 500
# Vocabulary cap: passed to the Tokenizer and used to drop word ids at or above it.
MAX_NB_WORDS = 20000
# Width of the word embeddings (matches the 100-d GloVe file loaded later) and
# of every hidden/feature layer in the generator and discriminator.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out for validation.
VALIDATION_SPLIT = 0.25

# Raw PolitiFact tables, read relative to the current working directory.
# The later cells read the `id`, `text` and `meta_data` columns from them.
data_train_fake = pd.read_csv('politifact_fake.csv')
data_train_real = pd.read_csv('politifact_real.csv')

In [0]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

from nltk import tokenize

import cv2

def _load_split(frame):
    """Collect the usable rows of one PolitiFact table.

    For every row the cover image '<id>.jpg' is read from the working directory
    and resized to 224x224. Rows whose image cannot be loaded are skipped
    entirely, so the three returned lists stay aligned index-for-index.

    Args:
        frame: DataFrame exposing `id`, `text` and `meta_data` columns.

    Returns:
        (texts, metas, images): article bodies as str, meta data as sets of
        space-separated tokens, and images as uint8 arrays of shape (224, 224, 3).
    """
    split_texts, split_metas, split_images = [], [], []
    for idx in range(frame.text.shape[0]):
        try:
            image = cv2.imread(str(frame.id[idx]) + '.jpg', cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread reports a missing/undecodable file by returning
                # None rather than raising, so test for it explicitly.
                continue
            image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_CUBIC)
        except cv2.error:
            # Best effort: an image OpenCV cannot process drops the whole row.
            continue
        # Keep the ndarray as-is (no .tolist()): every image then shares the
        # uint8 dtype and np.array(img_data) stays compact.
        split_images.append(image)
        split_texts.append(str(frame.text[idx]))
        split_metas.append(set(str(frame.meta_data[idx]).split(' ')))
    return split_texts, split_metas, split_images


fake_texts, fake_metas, fake_images = _load_split(data_train_fake)
real_texts, real_metas, real_images = _load_split(data_train_real)

# Fake articles come first, real ones after; the labels follow the same order.
texts = fake_texts + real_texts
metas = fake_metas + real_metas
img_data = fake_images + real_images
replies = []  # reply threads are not loaded by this notebook

# `train_size` is the number of usable fake articles (label 1); real ones get 0.
train_size = len(fake_texts)
labels = [1] * train_size + [0] * len(real_texts)

# `num_words` is the current name of the deprecated `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# Zero-initialised word-id matrix; it is filled in by the encoding step that follows.
data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')

# Multi-hot encode each article's set of meta-data tokens.
one_hot = MultiLabelBinarizer()
metas = one_hot.fit_transform(metas)

In [0]:
# Encode every article as up to MAX_SENT_LENGTH word ids, left-aligned and
# zero-padded. Words whose id falls outside the MAX_NB_WORDS cap are skipped.
for i, text in enumerate(texts):
  k = 0
  for word in text_to_word_sequence(text):
    if k >= MAX_SENT_LENGTH:
      break  # row is full; nothing further can be written
    word_id = tokenizer.word_index.get(word)
    if word_id is not None and word_id < MAX_NB_WORDS:
      data[i, k] = word_id
      k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle all modalities with one shared permutation so rows stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
metas = metas[indices]
img_data = np.array(img_data)[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on a positive index: slicing with [:-n] would leave the training set
# empty whenever n is 0.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
x_meta_train = metas[:split_at]
x_img_train = img_data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
x_meta_val = metas[split_at:]
x_img_val = img_data[split_at:]
y_val = labels[split_at:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

# Load the pre-trained GloVe vectors: one "<word> <100 floats>" entry per line.
GLOVE_DIR = "."
embeddings_index = {}
# The context manager closes the file even if a line fails to parse; the
# encoding is pinned so the read does not depend on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

# Row i holds the vector of the word with id i (row 0 is the padding id).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the GloVe index keep their random initialisation.
        embedding_matrix[i] = embedding_vector

In [0]:
from keras.layers import Input, Embedding, LSTM, Dense, Concatenate, LeakyReLU, Dropout
from keras.models import Model
from keras.applications.vgg16 import VGG16
from numpy.random import randint
# from keras.applications.mobilenet_v2 import MobileNetV2

def get_generative(input_tensor, main_input, auxiliary_input):
  """Build and compile the three-branch classifier used as the GAN's generator.

  Branches (each ends in an EMBEDDING_DIM-wide feature vector):
    * image: full VGG16 (ImageNet weights, 1000-way top) -> Dense 'img_output'
    * text:  GloVe-initialised, trainable Embedding -> LSTM 'text_output'
    * meta:  two Dense layers, the second named 'meta_output'
  The features are concatenated, passed through one Dense layer and classified
  into two classes by 'main_output'.

  Args:
    input_tensor: Keras input for the cover image.
    main_input: Keras input for the padded word-id sequence.
    auxiliary_input: Keras input for the multi-hot meta-data vector.

  Returns:
    (G, main_output): the compiled model, whose outputs are
    [class probabilities, text features], and the class-probability tensor.
  """
  # Input tensor: the cover image
  base_model = VGG16(include_top=True, weights='imagenet', input_tensor=input_tensor, input_shape=None, pooling=None, classes=1000)
  im = base_model.output
  dense1_1 = Dense(EMBEDDING_DIM, activation='relu', name='img_output')
  im = dense1_1(im)

  embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

  # Main input: the news content
  embedded_sequences = embedding_layer(main_input)
  lstm_layer = LSTM(EMBEDDING_DIM, name='text_output')
  x = lstm_layer(embedded_sequences) # MLP, LSTM, BiGRU here

  # Auxiliary input: the news profile/meta data
  dense2_1 = Dense(EMBEDDING_DIM, activation='relu')
  me = dense2_1(auxiliary_input)
  dense2_2 = Dense(EMBEDDING_DIM, activation='relu', name='meta_output')
  me = dense2_2(me)

  out = Concatenate(axis=1)([x, me, im])

  # We stack a deep densely-connected network on top
  out = Dense(EMBEDDING_DIM, activation='relu')(out)

  # And finally we add the output layer. The targets are one-hot over two
  # mutually exclusive classes and the loss is categorical cross-entropy, so
  # the scores must form a probability distribution: softmax, not independent
  # per-class sigmoids.
  main_output = Dense(2, activation='softmax', name='main_output')(out)

  # The text features are exposed as a second output so the GAN can feed them
  # to the discriminator.
  G = Model(inputs=[input_tensor, main_input, auxiliary_input], outputs=[main_output, x], name="generator")
  # NOTE(review): a single loss string is presumably applied to both outputs,
  # including the text features; the visible code only ever trains G through
  # the GAN model, so this compile mainly serves G.predict -- confirm.
  G.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
  return G, main_output

def get_discriminative(D_in, drate = .2, leak=.2):
  """Build and compile the modality discriminator.

  The network maps an EMBEDDING_DIM-wide feature vector to three sigmoid
  scores (one per modality) through two LeakyReLU-activated Dense layers,
  with dropout after the first one.

  Args:
    D_in: Keras input tensor carrying the feature vector.
    drate: dropout rate applied after the first hidden layer.
    leak: negative-slope coefficient of both LeakyReLU activations.

  Returns:
    (D, D_out): the compiled model and its output tensor.
  """
  # First hidden stage: linear projection, leaky activation, dropout.
  hidden = Dense(EMBEDDING_DIM)(D_in)
  hidden = LeakyReLU(leak)(hidden)
  hidden = Dropout(drate)(hidden)

  # Second hidden stage: same shape, no dropout.
  hidden = Dense(EMBEDDING_DIM)(hidden)
  hidden = LeakyReLU(leak)(hidden)

  # Three-way scoring head.
  D_out = Dense(3, activation='sigmoid', name='discriminator_output')(hidden)

  D = Model(inputs=D_in, outputs=D_out, name="discriminator")
  D.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['acc'])
  return D, D_out

def set_trainability(model, trainable=False):
  """Set the `trainable` flag on `model` and on each entry of `model.layers`.

  Args:
    model: object exposing a `layers` iterable (a Keras model in this notebook).
    trainable: value assigned to every `trainable` attribute; defaults to False.
  """
  flag = trainable
  # The model itself first, then every direct layer.
  setattr(model, 'trainable', flag)
  for sublayer in list(model.layers):
    setattr(sublayer, 'trainable', flag)

def make_gan(GAN_input_tensor, GAN_main_input, GAN_auxiliary_input, G, D):
  """Chain generator and discriminator into one compiled adversarial model.

  D is marked non-trainable before the combined model is compiled. G is applied
  to the three inputs, and its second output is what D scores.

  Args:
    GAN_input_tensor: Keras input for the cover image.
    GAN_main_input: Keras input for the word-id sequence.
    GAN_auxiliary_input: Keras input for the meta-data vector.
    G: generator model returning [class output, features for D].
    D: discriminator model.

  Returns:
    (GAN, GAN_out): the compiled model with outputs [D's scores, G's class
    output], and the tensor holding D's scores.
  """
  set_trainability(D, False)

  gan_inputs = (GAN_input_tensor, GAN_main_input, GAN_auxiliary_input)
  G_output, features_for_D = G(list(gan_inputs))
  GAN_out = D(features_for_D)

  GAN = Model(inputs=list(gan_inputs), outputs=[GAN_out, G_output])
  # Losses and weights are keyed by output name, i.e. the names of G and D.
  GAN.compile(
      optimizer='rmsprop',
      loss={"generator": "categorical_crossentropy",
            "discriminator": "mean_squared_error"},
      loss_weights={"generator": 1.0, "discriminator": 0.1})
  return GAN, GAN_out

def pretrain(G, D, x_img_train, x_train, x_meta_train, batch_size=32):
  """Warm up the discriminator on modality features produced by G.

  Draws batch_size//3 random samples per modality, runs them through G's
  'img_output', 'text_output' and 'meta_output' layers, and fits D to tell the
  three feature kinds apart (one-hot targets in the order image, text, meta).

  Args:
    G: generator model exposing the three named feature layers.
    D: discriminator model; made trainable before fitting.
    x_img_train: image inputs.
    x_train: word-id sequences.
    x_meta_train: meta-data vectors.
    batch_size: three times the number of samples drawn per modality; also
      passed to D.fit as `steps_per_epoch`.

  Raises:
    ValueError: if batch_size is smaller than 3 (no sample could be drawn).
  """
  print("Pretraining...")
  print("Getting results from G...")

  L = batch_size//3
  if L < 1:
    raise ValueError("batch_size must be at least 3, got {}".format(batch_size))

  # Sample from the whole training set. Bounding the indices by the batch size
  # would only ever use the first rows and fail on sets smaller than that.
  # The three draws are independent because each feature layer depends on its
  # own modality only.
  img_indices = np.random.randint(len(x_img_train), size=L)
  text_indices = np.random.randint(len(x_train), size=L)
  meta_indices = np.random.randint(len(x_meta_train), size=L)
  x_img_train, x_train, x_meta_train = x_img_train[img_indices], x_train[text_indices], x_meta_train[meta_indices]

  img_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('img_output').output)
  text_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('text_output').output)
  meta_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('meta_output').output)
  img_output = img_layer_model.predict([x_img_train, x_train, x_meta_train])
  text_output = text_layer_model.predict([x_img_train, x_train, x_meta_train])
  meta_output = meta_layer_model.predict([x_img_train, x_train, x_meta_train])

  # predict() returns NumPy arrays, so stack them with NumPy instead of adding
  # a concat op to the TensorFlow graph.
  X = np.concatenate([img_output, text_output, meta_output], axis=0)
  y = np.zeros((L*3, 3))
  y[:L, 0] = 1
  y[L:2*L, 1] = 1
  y[2*L:, 2] = 1

  set_trainability(D, True)
  print("Getting results from D...")
  D.fit(X, y, steps_per_epoch=batch_size)

def train(GAN, G, D, dataset, y_train, epochs=10, batch_size=32, verbose=True, v_freq=1):
  """Alternate discriminator and adversarial updates for `epochs` rounds.

  Each round:
    1. G's image/text/meta feature layers are evaluated on the full dataset.
    2. D is trained on one batch mixing roughly a third of each feature kind,
       with one-hot targets in the order image, text, meta.
    3. The combined GAN is trained twice on the full dataset, once with D's
       target set to "image" and once to "meta", together with the real class
       labels for G's classification output.

  Args:
    GAN: combined model with outputs [D's scores, G's class output].
    G: generator model exposing 'img_output', 'text_output', 'meta_output'.
    D: discriminator model.
    dataset: [x_img_train, x_train, x_meta_train].
    y_train: one-hot class labels for G's classification output.
    epochs: number of rounds.
    batch_size: accepted for interface compatibility; not used by the body.
    verbose: print the latest losses every `v_freq` rounds.
    v_freq: reporting period in rounds.

  Returns:
    (d_loss, g_loss): per-round D losses and GAN losses (two GAN entries per
    round, one for each adversarial target).
  """
  print("Training...")
  x_img_train, x_train, x_meta_train = dataset
  inputs = [x_img_train, x_train, x_meta_train]
  d_loss = []
  g_loss = []

  # Feature extractors share their layers (and therefore weights) with G, so
  # building them once is enough; predictions still reflect every update.
  img_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('img_output').output)
  text_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('text_output').output)
  meta_layer_model = Model(inputs=G.inputs, outputs=G.get_layer('meta_output').output)

  for epoch in range(epochs):
    img_output = img_layer_model.predict(inputs)
    text_output = text_layer_model.predict(inputs)
    meta_output = meta_layer_model.predict(inputs)

    L = img_output.shape[0]
    third = L//3
    img_indices = np.random.randint(L, size=third)
    text_indices = np.random.randint(L, size=third)
    meta_indices = np.random.randint(L, size=L-2*third)
    # NumPy concatenation: the predictions are arrays already, and this avoids
    # adding a new concat op to the TensorFlow graph on every round.
    X = np.concatenate([img_output[img_indices], text_output[text_indices], meta_output[meta_indices]], axis=0)
    y = np.zeros((L, 3))
    y[:third, 0] = 1 #image
    y[third:2*third, 1] = 1 #text
    y[2*third:, 2] = 1 #meta data

    print("Training D...")
    set_trainability(D, True)
    d_loss.append(D.train_on_batch(X, y))

    print("Training GAN...")
    set_trainability(D, False)

    # Each adversarial target is a fresh one-hot matrix. Reusing one array for
    # both would leave the "image" column set in the "meta" target.
    as_image = np.zeros((L, 3))
    as_image[:, 0] = 1 #fool D that all texts are images
    g_loss.append(GAN.train_on_batch(inputs, [as_image, y_train]))

    as_meta = np.zeros((L, 3))
    as_meta[:, 2] = 1 #fool D that all texts data are meta data
    g_loss.append(GAN.train_on_batch(inputs, [as_meta, y_train]))

    if verbose and (epoch + 1) % v_freq == 0:
      print("Epoch #{}: Generative Loss: {}, Discriminative Loss: {}".format(epoch + 1, g_loss[-1], d_loss[-1]))

  return d_loss, g_loss

In [0]:
# --- Generator: image + text + meta classifier -------------------------------
G_input_tensor = Input(shape=(224, 224, 3))
G_main_input = Input(shape=(MAX_SENT_LENGTH,))
G_auxiliary_input = Input(shape=(metas.shape[1],))
G, G_out = get_generative(
    G_input_tensor,
    G_main_input,
    G_auxiliary_input,
)
G.summary()

# --- Discriminator: scores EMBEDDING_DIM-wide feature vectors ----------------
D_in = Input(shape=(EMBEDDING_DIM,))
D, D_out = get_discriminative(D_in)
D.summary()

# --- Combined model: fresh inputs with the same shapes as the generator's ----
GAN_input_tensor = Input(shape=(224, 224, 3))
GAN_main_input = Input(shape=(MAX_SENT_LENGTH,))
GAN_auxiliary_input = Input(shape=(metas.shape[1],))
GAN, GAN_out = make_gan(
    GAN_input_tensor,
    GAN_main_input,
    GAN_auxiliary_input,
    G,
    D,
)
GAN.summary()

# --- Training: warm up D first, then run the adversarial rounds --------------
dataset = [x_img_train, x_train, x_meta_train]
pretrain(G, D, x_img_train, x_train, x_meta_train)
d_loss, g_loss = train(GAN, G, D, dataset, y_train, verbose=True)

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix, auc, roc_curve, precision_recall_curve
# G returns [class scores, text features]; only the class scores are evaluated.
yhat_probs, _ = G.predict([x_img_val, x_val, x_meta_val])

# Hard predictions and ground truth as class indices (column 1 is the class
# whose samples were labelled 1 when the dataset was assembled).
yhat_classes = yhat_probs.argmax(axis=1)
yhat_probs = yhat_probs[:, 1]
y_classes = y_val.argmax(axis=1)

# Each metric is computed and reported in turn on the validation split.
accuracy = accuracy_score(y_classes, yhat_classes)
print('Accuracy: %f' % (accuracy,))

precision = precision_score(y_classes, yhat_classes)
print('Precision: %f' % (precision,))

recall = recall_score(y_classes, yhat_classes)
print('Recall: %f' % (recall,))

f1 = f1_score(y_classes, yhat_classes)
print('F1 score: %f' % (f1,))