# Overview
This notebook contains the code for fine-tuning a BERT-based classifier that detects clickbait text.

In [None]:
#This code block has just standard setup code for running in Python

# Import PyTorch
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
import numpy as np

# Fix the random seeds for reproducibility
SEED = 8942764
torch.random.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

# Select the compute device automatically so the notebook runs unchanged on
# Colab / Nvidia GPUs, Apple Silicon machines and CPU-only machines:
#   - 'cuda:0' when an Nvidia GPU is available (e.g. on Colab),
#   - 'mps'    when an Apple Silicon GPU is available (should be about 3-4
#              times faster than running on a plain CPU),
#   - 'cpu'    otherwise.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older PyTorch builds
if torch.cuda.is_available():
    device = 'cuda:0'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

# To force a specific device, uncomment and edit the line below:
# device = 'cpu'

# note that in handin.py these next two steps would need to be removed
# if you are going run this on you personal machine these would need to be done
# in the shell/terminal to update your python libraries

!pip install transformers
!pip install datasets

from transformers import AutoTokenizer, BertModel
from datasets import load_dataset




In [None]:
# Load the dataset from the Hugging Face repositories.
# If you substitute a different dataset, make sure it has the layout the rest of
# this notebook relies on: 'train', 'validation' and 'test' splits whose items
# each have a 'text' field and a 'label' field.

dataset = load_dataset("christinacdl/clickbait_notclickbait_dataset")
dataset


In [None]:
# Initialize the pretrained tokenizer for the "bert-base-uncased" checkpoint
# (the same checkpoint name the classifier is built from later in this notebook).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Optional: look at one sample item from each of the three splits
for split_name, item_index in (("train", 8), ("validation", 6), ("test", 0)):
    print(dataset[split_name][item_index])

In [None]:
# This dataset has 3 splits (train, validation and test); each item has a text and a label.
# Data from the dataset can generally be accessed like a Python dict.

# Pick one training sentence and tokenize it once, then show every stage.
sample_text = dataset['train'][8]['text']
sample_tokens = tokenizer.tokenize(sample_text)

# The original sentence.
print('Original: ', sample_text)

# The sentence split into tokens.
print('Tokenized: ', sample_tokens)

# The tokens mapped to their token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

In [None]:
# Collate function that tokenizes a batch of examples (passed as collate_fn for the train, validation and test data)
def tokenize(batch):
  '''
  Collate a list of dataset items into one dict of tensors.

  args:
  - batch: list of dict-like items, each providing a 'text' entry and a
           'label' entry, i.e.
    [
      {
        'text': ...,
        'label': ...,
        ...
      },
      ...
    ]

  returns:
  - dict holding everything the tokenizer returns for the texts (padded,
    truncated, as PyTorch tensors) plus a 'label' entry with the labels as a
    LongTensor.
  '''
  texts = []
  targets = []
  for example in batch:
    texts.append(example['text'])
    targets.append(example['label'])
  target_tensor = torch.LongTensor(targets)

  # padding=True pads every text to the longest one in this batch
  encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  collated = dict(encoded)
  collated['label'] = target_tensor
  return collated

In [None]:
# This cell defines evaluate(), which scores a trained model on a dataset, and train(),
# which trains the model and calls evaluate() on the validation data after every epoch.
# You probably should not need to make any changes to this code.
# During training, it prints some progress messages.
from tqdm import tqdm
@torch.no_grad()
def evaluate(model, dataset, batch_size, device, collate_fn=None):
  '''
  Compute the average NLL loss and accuracy of `model` over `dataset`.

  Both metrics are averaged per example, so a smaller final batch does not get
  more weight than the full-size batches.

  args:
  - model (nn.Module): called as model(**batch) with every tensor entry of the
                       batch except 'label'. NLLLoss is applied directly to
                       its output, so it is expected to return
                       log-probabilities.
  - dataset: dataset to evaluate on (iterated in order, without shuffling)
  - batch_size (int): number of examples per batch
  - device: device the model and the batches are moved to
  - collate_fn: optional collate function handed to the DataLoader; batches
                must be dicts containing a 'label' tensor

  returns:
  - (loss, accuracy) as numpy floats; both are NaN when the dataset is empty.

  Note: the model is switched to eval mode, moved to `device`, and left that way.
  '''
  model = model.eval().to(device)
  dataloader = DataLoader(dataset, batch_size, shuffle=False, collate_fn=collate_fn)
  lossfn = nn.NLLLoss()

  total_loss = 0.0
  total_correct = 0
  total_examples = 0
  for batch in dataloader:
      # keep only tensor entries and move them to the target device
      batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
      y = batch.pop('label')
      n_examples = y.numel()

      logits = model(**batch)
      # lossfn returns the batch mean; scale it back to a sum so that batches
      # of different sizes are weighted correctly
      total_loss += lossfn(logits, y).item() * n_examples
      total_correct += (logits.argmax(1) == y).sum().item()
      total_examples += n_examples

  if total_examples == 0:
      return np.float64('nan'), np.float64('nan')
  # numpy floats keep `.item()` available for callers
  return (np.float64(total_loss / total_examples),
          np.float64(total_correct / total_examples))

def train(model,
          train_dataset,
          val_dataset,
          num_epochs,
          batch_size,
          optimizer_cls,
          lr,
          weight_decay,
          device,
          collate_fn=None):
  '''
  Train `model` with NLL loss and evaluate it on the validation data after
  every epoch.

  args:
  - model (nn.Module): called as model(**batch) with every tensor entry of the
                       batch except 'label'. NLLLoss is applied directly to
                       its output, so it is expected to return
                       log-probabilities.
  - train_dataset: dataset used for training (reshuffled by the DataLoader)
  - val_dataset: dataset passed to evaluate() at the end of each epoch
  - num_epochs (int): number of passes over train_dataset
  - batch_size (int): batch size for both training and validation
  - optimizer_cls (str): 'SGD' or 'Adam'
  - lr (float): learning rate handed to the optimizer
  - weight_decay (float): weight decay handed to the optimizer
  - device: device the model and the batches are moved to
  - collate_fn: optional collate function handed to the DataLoader; batches
                must be dicts containing a 'label' tensor

  returns:
  - the trained model
  - tuple (train_loss_history, train_acc_history, val_loss_history,
    val_acc_history), each a list with one value per epoch

  raises:
  - ValueError: if optimizer_cls is neither 'SGD' nor 'Adam'
  '''
  # Validate up front instead of failing later with an undefined `optimizer`.
  if optimizer_cls not in ('SGD', 'Adam'):
    raise ValueError(f"optimizer_cls must be 'SGD' or 'Adam', got {optimizer_cls!r}")

  model = model.train().to(device)
  dataloader = DataLoader(train_dataset, batch_size, shuffle=True,
                          collate_fn=collate_fn)

  if optimizer_cls == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr, weight_decay=weight_decay)
  else:
    # lr must be passed explicitly; otherwise Adam silently falls back to its
    # own default learning rate and the requested value is ignored.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

  train_loss_history = []
  train_acc_history = []
  val_loss_history = []
  val_acc_history = []

  lossfn = nn.NLLLoss()
  for e in tqdm(range(num_epochs)):
    # evaluate() switches the model to eval mode at the end of every epoch, so
    # training mode (e.g. dropout) has to be re-enabled here.
    model.train()

    epoch_loss_history = []
    epoch_acc_history = []
    for i, batch in enumerate(dataloader):
      # keep only tensor entries and move them to the target device
      batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
      y = batch.pop('label')

      logits = model(**batch)
      loss = lossfn(logits, y)

      pred = logits.argmax(1)
      acc = (pred == y).float().mean()

      epoch_loss_history.append(loss.item())
      epoch_acc_history.append(acc.item())

      # running averages over the current epoch, reported every 100 iterations
      if (i % 100 == 0):
        print(f'epoch: {e}\t iter: {i}\t train_loss: {np.mean(epoch_loss_history):.3e}\t train_accuracy:{np.mean(epoch_acc_history):.3f}')
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    val_loss, val_acc = evaluate(model, val_dataset, batch_size, device, collate_fn=collate_fn)

    train_loss_history.append(np.mean(epoch_loss_history))
    train_acc_history.append(np.mean(epoch_acc_history))
    # float() accepts both numpy scalars and plain Python floats
    val_loss_history.append(float(val_loss))
    val_acc_history.append(float(val_acc))
    print(f'epoch: {e}\t train_loss: {train_loss_history[-1]:.3e}\t train_accuracy:{train_acc_history[-1]:.3f}\t val_loss: {val_loss_history[-1]:.3e}\t val_accuracy:{val_acc_history[-1]:.3f}')

  return model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history)


In [None]:
# This code defines the text classification class using BERT.
# The classifier is defined on top of BERT's pooled output.

class BertForTextClassification(nn.Module):
  '''
  Pretrained BERT encoder followed by a small classification MLP.

  The MLP reads BERT's pooled output and ends in a LogSoftmax, so the module
  returns log-probabilities over the classes.
  '''

  def __init__(self, bert_pretrained_config_name, num_classes, freeze_bert=False):
    '''
    args:
    - bert_pretrained_config_name (str): model name from huggingface hub
    - num_classes (int): number of classes in the classification task
    - freeze_bert (bool): [default False] If true, gradients are not computed
                          for BERT's parameters.
    '''
    super().__init__()

    encoder = BertModel.from_pretrained(bert_pretrained_config_name)
    encoder.requires_grad_(not freeze_bert)
    self.bert = encoder

    # MLP head: hidden_size -> 132 -> num_classes, then log-probabilities
    head_layers = [
        nn.Linear(encoder.config.hidden_size, 132),
        nn.ReLU(),
        nn.Dropout(p=0.1),
        nn.Linear(132, num_classes),
        nn.LogSoftmax(dim=-1),
    ]
    self.classifier = nn.Sequential(*head_layers)

  def forward(self, **bert_kwargs):
    '''
    Forward all keyword arguments to BERT and classify its pooled output.

    returns:
    - output of the classification head (log-probabilities over the classes)
    '''
    pooled = self.bert(**bert_kwargs).pooler_output
    return self.classifier(pooled)

In [None]:
# This is where fine-tuning of the classifier happens.
# It is set to batch size 16 for 10 epochs, feel free to fine-tune.

# Re-seed right before building the model so this cell gives the same result
# every time it is run.
finetune_seed = 8942764
torch.random.manual_seed(finetune_seed)
torch.cuda.manual_seed(finetune_seed)
np.random.seed(finetune_seed)

# Batch size shared by training, validation and the final test run.
finetune_batch_size = 16

bert_cls = BertForTextClassification('bert-base-uncased', 2, freeze_bert=False)

trainable_param_count = sum(p.numel() for p in bert_cls.parameters() if p.requires_grad)
print(f'num_trainable_params={trainable_param_count}\n')

train_settings = dict(
    num_epochs=10,
    batch_size=finetune_batch_size,
    optimizer_cls='Adam',
    lr=2e-5,
    weight_decay=1e-4,
    device=device,
    collate_fn=tokenize,
)
bert_cls, bert_cls_logs = train(bert_cls, dataset['train'], dataset['validation'],
                                **train_settings)

# Run the test data (from huggingface) through the trained model and compute
# the test loss and test accuracy.
print('\n')
print('Starting test run')
test_loss, test_acc = evaluate(bert_cls, dataset['test'],
                               batch_size=finetune_batch_size,
                               device=device, collate_fn=tokenize)
print(f'Test Complete.\t Test Loss: {test_loss:.3e}\t Test Accuracy: {test_acc:.3f}')
