In [None]:
# Install the Hugging Face transformers library (imported below) and the watermark extension.
# NOTE(review): neither package is version-pinned, so a fresh run may pull a different release.
!pip install -qq transformers
!pip install -q -U watermark

: 

In [None]:
# %reload_ext watermark
# %watermark -v -p numpy,pandas,torch,transformers

In [None]:
from collections import defaultdict

import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, auc

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda', index=0)

In [None]:
# Fixed seed so the train/validation/test partition is identical on every run.
RANDOM_SEED = 42

data = pd.read_csv("news.csv")

# Half of the rows are used for training; the held-out half is then split
# evenly into validation and test sets.
data_train, data_holdout = train_test_split(data, test_size=0.5, random_state=RANDOM_SEED)
data_val, data_test = train_test_split(data_holdout, test_size=0.5, random_state=RANDOM_SEED)

data_train.shape, data_val.shape, data_test.shape

((23481, 5), (11740, 5), (11741, 5))

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer below is loaded from it.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# Tokenizer matching the checkpoint above (downloads its files on first use).
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class NewsDataset(Dataset):
  """Map-style dataset that tokenizes one news text per item.

  Args:
    text: indexable collection of raw texts (each item is passed through ``str``).
    target: indexable collection of integer class labels, aligned with ``text``.
    tokenizer: object exposing ``encode_plus`` that returns a mapping with
      ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_len: length every sequence is truncated / padded to. Defaults to 512,
      the value that was previously hard-coded.
  """

  def __init__(self, text, target, tokenizer, max_len=512):
    self.text = text
    self.target = target
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.text)

  def __getitem__(self, index):
    """Tokenize the text at ``index``.

    Returns:
      dict with keys ``'ids'`` (flattened input ids), ``'target'`` (label as
      a ``torch.long`` tensor) and ``'attention_mask'`` (flattened mask).
    """
    text = str(self.text[index])
    target = self.target[index]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,  # add '[CLS]' and '[SEP]'
      truncation=True,
      max_length=self.max_len,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt'
    )

    # return_tensors='pt' yields a leading batch axis; flatten() removes it so
    # the DataLoader can stack items into a batch itself.
    return {'ids': encoding['input_ids'].flatten(),
            'target': torch.tensor(target, dtype=torch.long),
            'attention_mask': encoding['attention_mask'].flatten()
            }
    

In [None]:
BATCH_SIZE = 8

# Training batches are reshuffled every epoch so the model does not see the
# examples in the same fixed order on each pass.
train_data_loader = DataLoader(
    NewsDataset(data_train.text.to_numpy(), data_train.target.to_numpy(), tokenizer),
    batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# Validation and test loaders keep a fixed order so evaluation is repeatable.
val_data_loader = DataLoader(
    NewsDataset(data_val.text.to_numpy(), data_val.target.to_numpy(), tokenizer),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_data_loader = DataLoader(
    NewsDataset(data_test.text.to_numpy(), data_test.target.to_numpy(), tokenizer),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [None]:
#@title 

# class NewsDataset(Dataset):

#   def __init__(self, text, target, tokenizer):
#     self.text = text
#     self.target = target
#     self.tokenizer = tokenizer
#     self.max_len = 512
  
#   def __len__(self):
#     return len(self.text)
  
#   def __getitem__(self, index):
#     text = str(self.text[index])
#     target = self.target[index]

#     encoding = self.tokenizer.encode(
#       text,
#       add_special_tokens=True, # Add '[CLS]' and '[SEP]',
#       truncation = True,
#       max_length=self.max_len,
#       padding = 'max_length',
#       return_tensors='pt',
#     )   
#     return {'ids': encoding.flatten(), 
#             'target': torch.tensor(target, dtype=torch.long)}
  
# def create_data_loader(df, tokenizer, batch_size):
#   ds = NewsDataset(
#         text=df.text.to_numpy(),
#         target=df.target.to_numpy(),
#         tokenizer=tokenizer,
#       )

#   return DataLoader(
#     ds,
#     batch_size=batch_size,
#     num_workers=4
#   )
# BATCH_SIZE = 16
# train_data_loader = create_data_loader(data_train, tokenizer, BATCH_SIZE)
# val_data_loader = create_data_loader(data_val, tokenizer,  BATCH_SIZE)
# test_data_loader = create_data_loader(data_test, tokenizer,  BATCH_SIZE)

In [None]:
class NewsClassifier(nn.Module):
  """Frozen BERT encoder followed by a linear classification layer.

  Args:
    n_classes: number of output classes (size of the logit vector).
  """

  def __init__(self, n_classes = 2):
    super(NewsClassifier, self).__init__()
    # The bare encoder (BertModel) is needed here. BertForSequenceClassification
    # returns only logits when no labels are passed, so indexing its output
    # with [1] fails and there is no hidden_size-wide tensor for self.linear.
    self.bert = transformers.BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # The encoder weights are kept fixed.
    for param in self.bert.parameters():
      param.requires_grad = False
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return class logits for a batch of token ids and attention masks."""
    encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of BertModel's output is the pooled representation; integer
    # indexing works for both tuple and ModelOutput return types.
    pooled_output = encoder_output[1]
    return self.linear(pooled_output)

In [None]:
# Create the classifier (it was previously referenced before being defined)
# and move it to the selected device.
model = NewsClassifier().to(device)

# Freeze every parameter, then re-enable gradients for the whole linear head.
# The previous code relied on the leaked loop variable and therefore unfroze
# only the last parameter returned by model.parameters().
for param in model.parameters():
    param.requires_grad = False
for param in model.linear.parameters():
    param.requires_grad = True

In [None]:
#@title 

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
  """Run one training pass over ``data_loader``.

  Each batch is a mapping with ``"ids"``, ``"attention_mask"`` and
  ``"target"`` tensors. For every batch the loss is back-propagated, the
  gradient norm is clipped to 1.0 and the optimizer takes one step.

  Returns:
    (accuracy, mean_loss): number of correct predictions divided by
    ``n_examples`` (as a double tensor) and the mean of the per-batch losses.
  """
  model = model.train()

  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    ids = batch["ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["target"].to(device)

    logits = model(input_ids=ids, attention_mask=mask)

    loss = loss_fn(logits, labels)
    # Index of the largest logit per row is the predicted class.
    predicted = torch.max(logits, dim=1)[1]

    n_correct = n_correct + (predicted == labels).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    # Clear gradients so they do not accumulate into the next batch.
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without computing gradients.

  Each batch is a mapping with ``"ids"``, ``"attention_mask"`` and
  ``"target"`` tensors.

  Returns:
    (accuracy, mean_loss): number of correct predictions divided by
    ``n_examples`` (as a double tensor) and the mean of the per-batch losses.
  """
  model = model.eval()

  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["target"].to(device)

      logits = model(input_ids=ids, attention_mask=mask)

      loss = loss_fn(logits, labels)
      # Index of the largest logit per row is the predicted class.
      predicted = torch.max(logits, dim=1)[1]

      n_correct = n_correct + (predicted == labels).sum()
      batch_losses.append(loss.item())

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss

In [None]:
# Number of full passes over the training data.
EPOCHS = 4

# NOTE(review): every model parameter is handed to the optimizer, including
# any whose requires_grad flag is False.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# NOTE(review): total_steps is not referenced by any other visible cell —
# presumably intended for a learning-rate scheduler; confirm or remove.
total_steps = len(train_data_loader) * EPOCHS
# Cross-entropy over the class logits, placed on the same device as the model.
loss_fn = nn.CrossEntropyLoss().to(device)

In [1]:
%%time

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
# defaultdict comes from the collections import in the notebook's import cell.
history = defaultdict(list)
# Best validation accuracy seen so far; a checkpoint is written whenever it improves.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training set.
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    len(data_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Gradient-free pass over the validation set.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(data_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Keep only the weights of the epoch with the highest validation accuracy.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc