In [19]:
import torch
from torch import nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import pandas as pd

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
import csv

In [7]:
df = pd.read_csv("./bbc_encoded.csv")

In [13]:
torch.manual_seed(42)

<torch._C.Generator at 0x79782e7a6330>

In [10]:
#texts = df['text'].astype(str).tolist()
#texts = (df['text'].astype(str) + ' ' + df['lsa_summary'].astype(str)).tolist()
labels = df['label_encoded'].tolist()
texts = df['text'].astype(str).tolist()

In [20]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size = 0.2, random_state=42 )

In [58]:
class TextClassificationDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.texts)
  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]
    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length = self.max_len,
        padding = 'max_length',
        truncation=True,
        return_tensors='pt'
    )
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'token_type_ids': encoding['token_type_ids'].flatten(),
        'label':torch.tensor(label, dtype=torch.long)
    }

In [59]:
class BertClassifier(nn.Module):
  def __init__(self, bert_model_name, num_classes):
    super(BertClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(bert_model_name)
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
  def forward(self, input_ids,attention_mask, token_type_ids):
    output = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
    pooled_output= output.last_hidden_state[:,0,:]
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    return logits, pooled_output

In [60]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer,max_len=128)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_len=128)

In [61]:
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size = batch_size)

In [62]:
num_classes = len(labels)
model = BertClassifier('bert-base-uncased', num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [63]:
optimizer= AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [64]:
def train():
  best_accuracy = 0
  for epoch in range(epochs):
    print(f"Epoch {epoch + 1} / {epochs}")
    model.train()
    train_loss = 0
    for batch in train_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      token_type_ids = batch['token_type_ids'].to(device)
      labels= batch['label'].to(device)
      model.zero_grad()
      logits, _ = model(input_ids, attention_mask, token_type_ids)
      loss_fn = nn.CrossEntropyLoss()
      loss  = loss_fn(logits, labels)
      train_loss += loss.item()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")
    model.eval()
    val_accuracy = 0
    predictions =[]
    true_labels = []
    with torch.no_grad():
          for batch in val_dataloader:
              input_ids = batch['input_ids'].to(device)
              attention_mask = batch['attention_mask'].to(device)
              token_type_ids = batch['token_type_ids'].to(device)
              labels = batch['label'].to(device)
              logits, _ = model(input_ids, attention_mask, token_type_ids)
              preds = torch.argmax(logits, dim=1).flatten()
              predictions.extend(preds.cpu().numpy())
              true_labels.extend(labels.cpu().numpy())
          accuracy = accuracy_score(true_labels, predictions)
          print(f"Validation accuracy: {accuracy:.4f}")
          if int(accuracy) > int(best_accuracy):
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_bert_classifier.pt')
            print("Saved best model!")
          print(classification_report(true_labels, predictions))

In [65]:
train()

Epoch 1 / 4
Average training loss: 2.426787297580844
Validation accuracy: 0.9624
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       104
           1       0.92      1.00      0.96        78
           2       0.95      0.99      0.97        73
           3       0.99      1.00      0.99        98
           4       0.96      0.93      0.94        73

    accuracy                           0.96       426
   macro avg       0.96      0.96      0.96       426
weighted avg       0.96      0.96      0.96       426

Epoch 2 / 4
Average training loss: 0.22362154635174253
Validation accuracy: 0.9742
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       104
           1       0.95      1.00      0.97        78
           2       0.97      0.99      0.98        73
           3       0.99      1.00      0.99        98
           4       0.95      0.99      0.97        73

    accuracy         

In [66]:
def get_improved_embeddings(texts, model, tokenizer):
    model.eval()
    embeddings = []

    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            token_type_ids = encoding['token_type_ids'].to(device)

            _, pooled_output = model(input_ids, attention_mask, token_type_ids)
            embeddings.append(pooled_output.cpu().numpy())

    return np.vstack(embeddings)

In [70]:
improved_embeddings = get_improved_embeddings(texts, model, tokenizer)

In [72]:
embeddings_array = np.array(improved_embeddings)
np.save('bert_embeddings.npy', embeddings_array)

In [73]:
np.load('bert_embeddings.npy')

array([[-0.08014025, -0.13654245,  0.64201427, ...,  0.3688584 ,
         0.35854894,  0.5685002 ],
       [-0.00840259, -0.3292898 ,  0.72282505, ...,  0.48212647,
         0.33529934,  0.52673876],
       [-0.03428885, -0.32123187,  0.7387707 , ...,  0.48857313,
         0.30526623,  0.512945  ],
       ...,
       [ 0.33852705, -0.5042293 , -0.03658124, ..., -0.9534209 ,
         0.12205429,  0.82690305],
       [ 0.32440546, -0.48330057, -0.14341506, ..., -1.0444727 ,
         0.13929965,  0.833652  ],
       [ 0.3512009 , -0.47030488, -0.27021798, ..., -1.0455874 ,
         0.1081123 ,  0.81076807]], dtype=float32)

In [74]:
embeddings_array.shape

(2127, 768)