* the base of this code https://github.com/spierre91/medium_code/blob/master/fake_news_classifcation.py


### Read the data

In [3]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

import sys

sys.path.append("/home/work/deep_learning/you_are_fake_news/src/")

from training_preprocess import prepare_fastText_embedding_matrix as prepare_embedding
from training_preprocess import sequence_vectorize
from training_preprocess import train_val_test_split as split

In [20]:
rel = []
fake = []
length = 10

with open("../you_are_fake_news/data/reliable_news_prep", "r") as reliable_file:
    for i, line in enumerate(reliable_file):
        if(i==length):
            break
        rel.append(line.strip())
with open("../you_are_fake_news/data/fake_news_prep", "r") as fake_file:
    for i, line in enumerate(fake_file):
        if(i==length):
            break
        fake.append(line.strip())


In [22]:
text = rel+fake
labels = ["reliable" if i<length else "fake" for i in range(2*length)] # reliable - 0; fake - 1
X_train, _, _, Y_train, _, _ = split(text, labels, 0.2, 0.1)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Every text must begin with the special symbol CLS
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], X_train))
train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))

# The max sequence length what BERT can handle is 512
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
train_y = np.array(Y_train) == 'fake'

# mask the padded tokens
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]

#make the input tensors
train_masks_tensor = torch.tensor(train_masks)
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

# the batch size options were limited: with 8 batch size we had resource exhaustion error
BATCH_SIZE = 4

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)


### Define the modell

In [6]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        
        # pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # the final dense layer, with sigmoid activation
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [7]:
bert_clf = BertBinaryClassifier()

# we used GPU acceleration
bert_clf = bert_clf.cuda()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

In [8]:
bert_clf.train()
train_loss = 0

In [9]:
# train the network
for step_num, batch_data in enumerate(train_dataloader):
    token_ids, masks, labels = tuple(t.to('cuda') for t in batch_data)
    probas = bert_clf(token_ids, masks)
    loss_func = nn.BCELoss()
    batch_loss = loss_func(probas, labels)
    train_loss += batch_loss.item()
    bert_clf.zero_grad()
    batch_loss.backward()
    optimizer.step()
    print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(X_train) / BATCH_SIZE, train_loss / (step_num + 1)))

0/3499.75 loss: 0.7116120457649231 
1/3499.75 loss: 0.684618353843689 
2/3499.75 loss: 0.7199406623840332 
3/3499.75 loss: 0.7274846732616425 
4/3499.75 loss: 0.7098124980926513 
5/3499.75 loss: 0.7021535933017731 
6/3499.75 loss: 0.6916425994464329 
7/3499.75 loss: 0.6898057758808136 
8/3499.75 loss: 0.6951464944415622 
9/3499.75 loss: 0.6933646082878113 
10/3499.75 loss: 0.7016787366433577 
11/3499.75 loss: 0.7028011232614517 
12/3499.75 loss: 0.6978159913649926 
13/3499.75 loss: 0.6896444771971021 
14/3499.75 loss: 0.6923034429550171 
15/3499.75 loss: 0.6914227902889252 
16/3499.75 loss: 0.7004877118503346 
17/3499.75 loss: 0.7004280818833245 
18/3499.75 loss: 0.69693481608441 
19/3499.75 loss: 0.6940961807966233 
20/3499.75 loss: 0.6921637398856026 
21/3499.75 loss: 0.6892447796734896 
22/3499.75 loss: 0.6878418196802554 
23/3499.75 loss: 0.6842647343873978 
24/3499.75 loss: 0.6832451176643372 
25/3499.75 loss: 0.6805199659787692 
26/3499.75 loss: 0.6809845853734899 
27/3499.75 los

In [10]:
torch.save(bert_clf, "./models/trained_bert.pt")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


### Testing

In [24]:
from collections import Counter

In [50]:
# if machine is true we read the machine vs real task, else we read the fake vs real task
def read_data(machine):
    if(machine):
        with open("../test_data/gpt2_generated.txt", "r") as gpt2, open("../test_data/grover_generated.txt", "r") as grover:
            X_test = [line for line in gpt2]+[line for line in grover]
        
        with open("../test_data/x_test.txt") as data, open("../test_data/y_test.txt") as label_file:
            index = 0
            labels = [label.strip() for label in label_file]
            for i, line in enumerate(data):
                if(index==180):
                    break
                if(labels[i]=="0"):
                    X_test.append(line)
                    index += 1
        Y_test = ["fake" if i < 180 else "reliable" for i in range(360)]
    else:
        with open("../test_data/x_test.txt") as f:
            X_test = [line for line in f]
        with open("../test_data/y_test.txt") as f:
            Y_test = ["fake" if line.strip() == "1" else "reliable" for line in f]
    return X_test, Y_test

In [51]:
X_test, Y_test = read_data(True)

In [53]:
# test input preprocessing
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], X_test))

test_y = np.array(Y_test) == 'fake'
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
test_masks_tensor = torch.tensor(test_masks)
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()


test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [54]:
# test prediction

bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t.to('cuda') for t in batch_data)
        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

In [55]:
# machine vs real news testing
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.48      0.93      0.63       180
        True       0.07      0.01      0.01       180

    accuracy                           0.47       360
   macro avg       0.28      0.47      0.32       360
weighted avg       0.28      0.47      0.32       360



In [32]:
# Fake vs reliable news testing 
# print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.81      0.95      0.87      5053
        True       0.93      0.77      0.84      4947

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000

