In [1]:
from pathlib import Path
import numpy as np
import random
from collections import Counter
from omnibelt import load_json

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from torch.utils.data import Dataset, DataLoader
import torch


In [2]:

# Directory holding the CLadder v1 data files, relative to this notebook.
root = Path('../data/backup')

# Question file that is actually loaded. Other variants this notebook has pointed at:
#   root / 'cladder-v1-q-aggregate.json'
#   root / 'cladder-v1-q-commonsense.json'
#   '../data/cladder-v1-common-easy.json'
path = root / 'cladder-v1-q-balanced.json'
full = load_json(path)
# Index the question entries by their question_id.
full_ids = {entry['question_id']: entry for entry in full}

# Model metadata, plus a lookup table keyed by model_id.
models = load_json(root / 'cladder-v1-meta-models.json')
model_table = {info['model_id']: info for info in models}
len(full), len(models)

(10112, 7064)

In [3]:
# Build one text prompt per question: the model's background text, then the
# question's given information, then the question itself (newline-separated).
prompts = []
labels = []
for entry in full:
    # Look the model up by id in `model_table` instead of indexing the `models`
    # list, so the pairing does not depend on the list being ordered by model_id.
    # The metadata lives in `model_info` so that the name `model` stays free for
    # the classifier created later in the notebook.
    model_info = model_table[entry['meta']['model_id']]
    prompt = f'{model_info["background"]}\n{entry["given_info"]}\n{entry["question"]}'
    prompts.append(prompt)
    labels.append(entry['answer'])
len(prompts), len(labels)

(10112, 10112)

In [4]:
# Checkpoint name shared by the tokenizer, the config and the classifier.
model_id = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# Attach readable class names to the two output ids (0 <-> 'no', 1 <-> 'yes').
config = AutoConfig.from_pretrained(model_id)
config.update(
    {
        "id2label": {0: 'no', 1: 'yes'},
        "label2id": {'no': 0, 'yes': 1},
    }
)
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

# Data Preparation
class SimpleDataset(Dataset):
    """Dataset that tokenizes one prompt per access and pairs it with a 0/1 label.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``. Its
            result must provide ``'input_ids'`` and ``'attention_mask'`` entries
            that support ``.flatten()``.
        prompts: Indexable collection of input texts.
        labels: Indexable collection of answers, aligned with ``prompts``.
        max_length: Length handed to the tokenizer for truncation and padding.
    """

    def __init__(self, tokenizer, prompts, labels, max_length=256):
        self.tokenizer, self.max_length = tokenizer, max_length
        self.prompts, self.labels = prompts, labels

    def __len__(self):
        """Return the number of prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for position ``idx``."""
        prompt_text = self.prompts[idx]
        # 'yes' becomes class 1; any other answer becomes class 0.
        target = int(self.labels[idx] == 'yes')
        encoded = self.tokenizer(
            prompt_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output is flattened so each field is one-dimensional per example.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target)
        return item


# Hold out 30% of the examples, then cut that hold-out 2:1 into validation and
# test, which gives a 70/20/10 train/validation/test split overall.
train_prompts, temp_prompts, train_labels, temp_labels = train_test_split(
    prompts, labels, test_size=0.3, random_state=42
)
val_prompts, test_prompts, val_labels, test_labels = train_test_split(
    temp_prompts, temp_labels, test_size=1/3, random_state=42
)

# Wrap each split in a SimpleDataset that shares the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    SimpleDataset(tokenizer, split_prompts, split_labels)
    for split_prompts, split_labels in (
        (train_prompts, train_labels),
        (val_prompts, val_labels),
        (test_prompts, test_labels),
    )
)
len(train_dataset), len(val_dataset), len(test_dataset)

(7078, 2022, 1012)

In [6]:
# Model Training
training_args = TrainingArguments(
    # Where checkpoints and logs are written; nothing is pushed to the Hub.
    output_dir='./results',
    logging_dir='./results/logs',
    push_to_hub=False,
    # Batch sizes and number of passes over the training data.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    # Logging, evaluation and checkpointing all use the same 10-step interval.
    logging_first_step=True,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    # Best-model selection is driven by the "accuracy" metric (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    remove_unused_columns=False,
    # Enable fp16 only when CUDA is available.
    fp16=torch.cuda.is_available(),
)


# Metric callback passed to the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy from logits and reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` must be convertible with
            ``torch.tensor`` and carry the class scores along its last axis.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is what ``accuracy_score``
        returns for the labels and the arg-max predictions.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest score along the last axis.
    predicted_classes = torch.tensor(scores).argmax(dim=-1).tolist()
    accuracy = accuracy_score(references, predicted_classes)
    return {'accuracy': accuracy}

# Bundle the classifier, its training configuration, the train/validation
# datasets and the accuracy callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [7]:

# Run fine-tuning with the settings held in `training_args`.
trainer.train()




Step,Training Loss,Validation Loss



KeyboardInterrupt



In [None]:

# Evaluating on test set
# `results` keeps whatever trainer.evaluate returns for `test_dataset`.
results = trainer.evaluate(test_dataset)


In [None]:

# You can save the model after training
# Only the model is saved here, into ./my_finetuned_roberta; the tokenizer is not written by this call.
model.save_pretrained('./my_finetuned_roberta')

In [2]:

# Replace this value with your own model repository, e.g. <hugging-face-user>/<model-name>
repository_id = "achimoraites/roberta-base_ag_news"
# Identifier of the dataset passed to load_dataset
dataset_id = "ag_news"

In [4]:
# Load dataset
dataset = load_dataset(dataset_id)

# The 'train' split is used as-is for training. The 'test' split is cut into
# two shards: shard 0 is kept for testing, shard 1 serves as validation data.
# NOTE: these assignments rebind the train/val/test names used for the
# SimpleDataset splits earlier in the notebook.
train_dataset = dataset['train']
test_dataset, val_dataset = (
    dataset['test'].shard(num_shards=2, index=shard_index)
    for shard_index in (0, 1)
)

Downloading builder script:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /is/ei/fleeb/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /is/ei/fleeb/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Preprocessing
# Load the fast RoBERTa tokenizer for the `model_id` checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:



def tokenize(batch):
    """Tokenize a batch of examples with the RoBERTa tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, so the output
    length does not depend on which examples happen to share a batch.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        The tokenizer's output for ``batch["text"]``.
    """
    # padding="max_length" is what guarantees the fixed length; padding=True
    # would only pad up to the longest sequence in the current batch.
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

# Tokenize every split; each split is processed as one single batch
# (batch_size equals the number of rows in that split).
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train_dataset, val_dataset, test_dataset)
)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Set dataset format
# Every split exposes only the model inputs and the label, in "torch" format.
for _split in (train_dataset, val_dataset, test_dataset):
    _split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [10]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Extract the number of classes and their names
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping and its inverse from the dataset's class names
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Update the model's configuration with the dataset's own label mappings.
# The mapping used previously here was the two-class {0: 'no', 1: 'yes'} one,
# which does not match the classes printed above.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [11]:
# Instantiate the sequence classifier from the `model_id` checkpoint with the current `config`.
# This rebinds the name `model`.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
