In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets sentencepiece torchinfo

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary
from datasets import Dataset

import sentencepiece
import transformers
import pandas as pd
import numpy as np
import torch

In [4]:
prefix = "tedtalks-1k-4sent"
dir = "/content/drive/MyDrive/PTvsBR/data/1k-4sent"
train_dataset = pd.read_csv(f'{dir}/{prefix}-train-raw.csv',sep=";")
dev_dataset = pd.read_csv(f'{dir}/{prefix}-dev-raw.csv',sep=";")
test_dataset = pd.read_csv(f'{dir}/{prefix}-test-raw.csv',sep=";")
train_dataset.label = train_dataset.label.apply(lambda x: 0 if x == "BR" else 1)
dev_dataset.label = dev_dataset.label.apply(lambda x: 0 if x == "BR" else 1)
test_dataset.label = test_dataset.label.apply(lambda x: 0 if x == "BR" else 1)

## Model Training

In [None]:
model_name = "PORTULAN/albertina-ptbr-base"
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

model.deberta.requires_grad_(False)
#model.pooler.requires_grad_(False)

input_data = torch.zeros((32, 512), dtype=torch.int32).to(device)
summary(model, input_data=input_data, dtypes=['torch.IntTensor'], device=device)

In [None]:
print([x for x in model.parameters() if x.requires_grad])

In [6]:
criterion = torch.nn.BCELoss()
learning_rate = 1e-3
batch_size = 128
epochs = 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

In [None]:
train_subset = train_dataset.sample(1000, random_state=42, ignore_index=True)
dev_subset = dev_dataset.sample(500, random_state=42, ignore_index=True)

train_subset = train_subset.dropna()
dev_subset = dev_subset.dropna()

train_subset = Dataset.from_pandas(train_subset)
dev_subset = Dataset.from_pandas(dev_subset)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
  return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_train_subset = train_subset.map(tokenize,batched=True)
tokenized_dev_subset = dev_subset.map(tokenize,batched=True)

data_collator = transformers.DataCollatorWithPadding(tokenizer)

tokenized_train_subset = tokenized_train_subset.remove_columns(["text"])
tokenized_dev_subset = tokenized_dev_subset.remove_columns(["text"])
tokenized_train_subset = tokenized_train_subset.rename_column("label", "labels")
tokenized_dev_subset = tokenized_dev_subset.rename_column("label", "labels")

train_dataloader = DataLoader(tokenized_train_subset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
dev_dataloader = DataLoader(tokenized_dev_subset, batch_size=batch_size, collate_fn=data_collator)

print(train_dataloader.dataset)
for batch in train_dataloader:
  break
print({k:v.shape for k,v in batch.items()})

In [8]:
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, device='cpu'):
    softmax = torch.nn.Softmax(dim=1)
    train_loss_history = []
    val_loss_history = []
    for epoch in range(epochs):
        model.train()
        number_of_batches = 0
        train_loss_history.append(0)

        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            X = batch["input_ids"].to(device)
            y = torch.nn.functional.one_hot(batch["labels"], num_classes=2).float().to(device)
            optimizer.zero_grad()
            y_pred = softmax(model(X).logits).to(torch.float32)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss_history[-1] += loss.item()
            number_of_batches += 1

        train_loss_history[-1] /= number_of_batches

        model.eval()
        number_of_batches = 0
        val_loss_history.append(0)
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                X = batch["input_ids"]
                y = torch.nn.functional.one_hot(batch["labels"], num_classes=2).float().to(device)
                y_pred = softmax(model(X).logits).to(torch.float32)
                loss = criterion(y_pred, y)
                val_loss_history[-1] += loss.item()
                number_of_batches += 1

        val_loss_history[-1] /= number_of_batches
        print('Epoch: {} - Train Loss: {:.6f} - Val Loss: {:.6f}'.format(epoch+1, train_loss_history[-1], val_loss_history[-1]))

    return model, train_loss_history, val_loss_history

In [9]:
model, train_loss, val_loss = train_model(model, criterion, optimizer, train_dataloader, dev_dataloader, epochs, device)

OutOfMemoryError: ignored

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_loss, label=f'Train Loss')
plt.plot(val_loss, label=f'Val Loss')
plt.legend()
plt.show()

In [18]:
def get_metrics(model, val_loader, device='cpu'):
  softmax = torch.nn.Softmax(dim=1)
  model.eval()
  y_pred = []
  y_true = []
  with torch.no_grad():
      for batch in val_loader:
          batch = {k: v.to(device) for k, v in batch.items()}
          X = batch["input_ids"]
          y = torch.nn.functional.one_hot(batch["labels"], num_classes=2).float().to(device)
          y_pred.append(torch.argmax(softmax(model(X).logits).to(torch.float32).cpu(), dim=1))
          y_true.append(batch["labels"].cpu())
  y_pred = np.concatenate(y_pred).flatten()
  y_true = np.concatenate(y_true)
  y_pred = np.where(y_pred > 0.5, 1, 0)
  print(y_pred, y_true)
  return accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred)

In [None]:
metrics = get_metrics(model, dev_dataloader, device=device)

In [None]:
print(f"Accuracy: {metrics[0]}")
print(f"Precision: {metrics[1]}")
print(f"Recall: {metrics[2]}")
print(f"F1-Score: {metrics[3]}")

Accuracy: 0.614
Precision: 0.6406926406926406
Recall: 0.5736434108527132
F1-Score: 0.6053169734151329
