<a href="https://colab.research.google.com/github/epadam/Machine-Learning-Tutorial-Demo-Resources/blob/master/notebooks/nlp/Bert_sst5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
!pip install transformers==3
!pip install datasets

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 4.0 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 28.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.6 MB/s 
Installing collected packages: tokenizers, sentencepiece, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.3
    Uninstalling tokenizers-0.10.3:
      Successfully uninstalled tokenizers-0.10.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.10.2
    Uninstalling transformers-4.10.2:
      Successfully uninstalled transformers-4.10.2
Successfully installed sentencepiece-0.1.96 tokenizers-0.8.0rc4 trans



In [41]:
import time
import numpy as np
import pandas as pd

import torch
from torch.utils.data import ConcatDataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [42]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [43]:
from datasets import load_dataset
raw_data = load_dataset('sst')
train = raw_data['train']
validation = raw_data['validation']
test = raw_data['test']

No config specified, defaulting to: sst/default
Reusing dataset sst (/root/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

In [44]:
train_text = train['sentence']
train_label = train['label']
validation_text = validation['sentence']
validation_label = validation['label']



In [37]:
train['sentence'] = list(map(lambda x: x.lower(), train['sentence']))
validation_text = list(map(lambda x: x.lower(), validation_text))

TypeError: ignored

In [45]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

In [46]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [47]:
class sstDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        super().__init__()

        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)
    

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [48]:
def create_data_loader(train_text, train_label, tokenizer, max_len, batch_size):
    dataset = sstDataset(
        reviews=np.asarray(train_text),
        labels= np.asarray(train_label),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset,
        batch_size=batch_size
    )

In [49]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()

        self.bert = bert_model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [50]:
NUM_CLASSES = 1
model = SentimentClassifier(NUM_CLASSES)
model = model.to(device)

In [51]:
BATCH_SIZE = 8
train_data_loader = create_data_loader(train_text, train_label, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(validation_text, validation_label , tokenizer, MAX_LEN, BATCH_SIZE)

In [52]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
MAX_LEN = 512
loss_fn = nn.MSELoss().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
EPOCHS = 5
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [54]:
def train(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()
  losses = []
  correct_predictions = 0
  for index, batch in enumerate(data_loader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["label"].to(device)

    # forward pass
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )

    # loss
    loss = loss_fn(outputs, targets)

    # accuracy
    _, preds = torch.max(outputs, dim=1)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    # backward pass
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# optimization
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    if index % 100 == 0:
        print(f"processed {index} batches")
  return correct_predictions.double() / n_examples, np.mean(losses)

In [55]:
def evaluation(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["label"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False
      )
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  return correct_predictions.double() / n_examples, np.mean(losses)

In [56]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [57]:
torch.cuda.empty_cache()

In [58]:

from collections import defaultdict

import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

In [59]:
history = defaultdict(list)
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()

    train_acc, train_loss = train(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_text))
    val_acc, val_loss = evaluation(model, val_data_loader, loss_fn, device, len(validation_text))
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: ignored