### Imports and input data

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import BertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch import nn
from IPython.display import clear_output
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
import torch
from keras.preprocessing.sequence import pad_sequences

In [2]:
# df_fake = pd.read_csv('../datasets/News _dataset/Fake.csv')
# df_fake['label'] = 0
# df_true = pd.read_csv('../datasets/News _dataset/True.csv')
# df_true['label'] = 1

In [3]:
# df_fake.head()

In [4]:
# df = pd.concat([df_fake, df_true])
# df.shape

In [5]:
# df.head()

In [6]:
### Create a dataset
# dataset_out = pd.DataFrame(columns=['data', 'label'], data=np.array([list(map('  '.join, zip(df.title.values, df.text.values))), df.label.values]).T)
# #
# dataset_out.to_csv('../datasets/News _dataset/concated_news_dataset.csv', index=False)

In [7]:
dataset_out = pd.read_csv('../datasets/News _dataset/concated_news_dataset.csv')

In [8]:
dataset_out.head()

Unnamed: 0,data,label
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0


### Preprocessing

In [9]:
# shuffle the dataset
dataset_out = dataset_out.sample(frac=1).reset_index(drop=True)
dataset_out.head()

# split into train and test
training_data = dataset_out[:int(len(dataset_out)*0.8)]
testing_data = dataset_out[int(len(dataset_out)*0.8):]


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [11]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], training_data.data.values))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], testing_data.data.values))

len(train_tokens), len(test_tokens)

(35918, 8980)

In [12]:
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

((35918, 512), (8980, 512))

In [13]:
train_y = training_data.label.astype(np.float32).values
test_y = testing_data.label.astype(np.float32).values
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((35918,), (8980,), 0.47872934, 0.4701559)

In [14]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

### Baseline model

In [15]:
#
# baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(training_data.data.values, training_data.label.values)
#
# baseline_predicted = baseline_model.predict(testing_data.data.values)
#
# print(classification_report(testing_data.label.values, baseline_predicted))

### Bert model

In [16]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased', return_dict=False)

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        pooled_output = self.bert(tokens, attention_mask=masks, output_hidden_states=False)[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

bert_clf = BertBinaryClassifier() # BertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_clf = bert_clf.cuda()

# str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'
#
# x = torch.tensor(train_tokens_ids[:3]).to(device)
# y = bert_clf.bert(x)
# x, y

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
BATCH_SIZE = 4
EPOCHS = 10

train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


param_optimizer = list(bert_clf.sigmoid.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(bert_clf.parameters(), lr=3e-6)

torch.cuda.empty_cache()

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        logits = bert_clf(token_ids, masks)

        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()

        bert_clf.zero_grad()
        batch_loss.backward()

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(training_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  1
2051/8979.5 loss: 0.06623308316329446 
1765.73184M


KeyboardInterrupt: 

In [19]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()

        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])



In [20]:
np.mean(bert_predicted)

0.47126948775055677

In [23]:
bert_predicted

[False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 Fals

In [32]:
pd.DataFrame(columns=['testing_data', 'predictions'], data=np.array([
    testing_data.label.values, [int(x) for x in bert_predicted]]).T)

Unnamed: 0,testing_data,predictions
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
...,...,...
8975,0,0
8976,1,1
8977,0,0
8978,1,1


In [22]:
torch.save(bert_clf, '../datasets/bert_clf.pth')

In [21]:
print(classification_report(testing_data.label.values, bert_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4758
           1       1.00      1.00      1.00      4222

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

