In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset
import os
import transformers

import pandas as pd

from dataset.besstie import dataset_besstie

root_folder = "dataset/besstie/"
splits = {'train': 'train.csv', 'validation': 'valid.csv'}
if not os.path.exists(root_folder):
    os.makedirs(root_folder)
if not os.path.exists(os.path.join(root_folder, splits["train"])) or not os.path.exists(os.path.join(root_folder, splits["validation"])):
    print("Downloading BESSTIE dataset...")
    # Login using e.g. `huggingface-cli login` to access this dataset
    df = pd.read_csv("hf://datasets/unswnlporg/BESSTIE/" + splits["train"])
    df.to_csv(os.path.join(root_folder, splits["train"]), index=False)
    df = pd.read_csv("hf://datasets/unswnlporg/BESSTIE/" + splits["validation"])
    df.to_csv(os.path.join(root_folder, splits["validation"]), index=False)


In [15]:
CFG = {
    'lr': 2e-5,
    'epochs': 30,
    'batch_size': 8,
    'max_length': 200,
    'min_length': 1,
    'task': 'Sentiment',
    'variety': 'en-UK',
    'source': 'Google',
    'model_name': 'bert-base-uncased'
}

df_train = pd.read_csv(os.path.join(root_folder, splits['train']))
labels_count = df_train["label"].value_counts().sort_index()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(labels_count)
print("Using device:", device)

label
0    12092
1     5668
Name: count, dtype: int64
Using device: cuda


In [19]:
bert_model_name = CFG['model_name']
tokenizer = transformers.BertTokenizer.from_pretrained(bert_model_name)

# load classifier model
model = transformers.BertForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=2
).to(device)

train_ds = dataset_besstie.BesstieDataSet(
    root_folder=root_folder,
    file_name=splits['train'],
    classes=['0', '1'],
    tokenizer=tokenizer,
    min_length=CFG['min_length'],
    max_length=CFG['max_length'],
    variety=CFG['variety'],
    source=CFG['source'],
    task=CFG['task']
)

optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['lr'])
criterion = torch.nn.CrossEntropyLoss(
    weight=torch.tensor(labels_count.values/sum(labels_count), dtype=torch.float).to(device)
)

train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=CFG['batch_size'],
    shuffle=True
)

for epoch in range(CFG['epochs']):
    train_loss = 0.0
    print(f"Epoch {epoch+1}/{CFG['epochs']}")
    for batch in train_loader:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device)
        }
        local_labels = batch['label'].tolist()
        outputs = model(**inputs, labels=torch.tensor(local_labels).to(device))
        loss = criterion(outputs.logits, torch.tensor(local_labels).to(device))
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item() * inputs['input_ids'].size(0)

    epoch_loss = train_loss / len(train_ds)
    print(f"Training Loss: {epoch_loss:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/30
Training Loss: 0.2813
Epoch 2/30
Training Loss: 0.0773
Epoch 3/30


KeyboardInterrupt: 