In [41]:
import os

import numpy as np
import torch
from datasets import load_dataset
from transformers import RobertaTokenizer

In [18]:
# Fixed sequence length shared by tokenization and label padding
MAX_LENGTH = 32

In [19]:
# Fetch the pretrained tokenizer that matches the RoBERTa base checkpoint
ROBERTA_CHECKPOINT = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

In [20]:
# Download (or reuse the cached copy of) the dataset from Hugging Face
DATASET_ID = "telord/ner-mountains-first-dataset"
dataset = load_dataset(DATASET_ID)

In [21]:
# Show the available splits, their columns and their row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 3064
    })
    test: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 340
    })
})

In [22]:
# function to prepare data labels for training (padding and changing label classes)

def prepare_labels(data, max_length):
    """Binarize label sequences and fit each one to ``max_length`` entries.

    Every non-zero class id is collapsed to 1 and zero stays 0. Each output
    row is laid out as ``[0] + labels + [0] + [0] * padding``: one slot for a
    leading special token, one for a trailing special token, then padding on
    the right. Sequences that do not fit are cut to ``max_length - 2`` labels
    so both special-token slots are always present.

    NOTE(review): this layout matches an encoding that is padded on the right
    (the tokenizer's default) -- verify ``tokenizer.padding_side``. It also
    assumes one label per encoded token; if words are split into several
    sub-tokens the labels need word-level alignment -- TODO confirm.

    Args:
        data: Iterable of label sequences (one sequence of ints per example).
        max_length: Length of every returned row; must be at least 2.

    Returns:
        A list holding, for each input sequence, a list of exactly
        ``max_length`` ints that are each 0 or 1.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold both special tokens")

    # Number of real label positions left after reserving the two special slots.
    content_budget = max_length - 2

    padded_data = []
    for label in data:
        # Collapse all entity classes into a single positive class, then truncate.
        binary = [int(x != 0) for x in label][:content_budget]
        trailing_pad = [0] * (content_budget - len(binary))
        padded_data.append([0] + binary + [0] + trailing_pad)
    return padded_data

In [23]:
# Encoding options shared by both splits: inputs are already split into words,
# every sequence is truncated/padded to MAX_LENGTH, and PyTorch tensors with
# attention masks are returned.
encode_options = dict(
    truncation=True,
    padding="max_length",
    max_length=MAX_LENGTH,
    is_split_into_words=True,
    return_tensors="pt",
    return_attention_mask=True,
)

# encode the train split
train_data = tokenizer(dataset["train"]["tokens"], **encode_options)

# encode the test split
test_data = tokenizer(dataset["test"]["tokens"], **encode_options)

In [24]:
# Build the padded label tensors for the train and test splits in one pass.
# NOTE(review): torch.Tensor(...) yields the default floating-point dtype;
# confirm the training loss expects float targets rather than integer class ids.
train_labels, test_labels = (
    torch.Tensor(prepare_labels(dataset[split_name]["labels"], MAX_LENGTH))
    for split_name in ("train", "test")
)

In [25]:
# Sanity check: one row per example and MAX_LENGTH label slots per row
train_labels.shape, test_labels.shape

(torch.Size([3064, 32]), torch.Size([340, 32]))

In [26]:
# Store each split's label tensor next to its encoded inputs
for split_encoding, split_labels in zip((train_data, test_data),
                                        (train_labels, test_labels)):
    split_encoding["labels"] = split_labels

In [27]:
class CustomRoBERTaDataset(torch.utils.data.Dataset):
    """Map-style dataset over already-encoded inputs and their labels.

    The three fields are indexed in parallel: item ``i`` is built from
    ``input_ids[i]``, ``attention_mask[i]`` and ``labels[i]``.
    """

    def __init__(self, input_ids, attention_mask, labels, max_length=None):
        """Store the parallel fields after checking that they line up.

        Args:
            input_ids: Indexable collection of encoded inputs, one per example.
            attention_mask: Indexable collection parallel to ``input_ids``.
            labels: Indexable collection parallel to ``input_ids``.
            max_length: Optional sequence length the fields were prepared
                with; kept only as metadata on ``self.max_length``.

        Raises:
            ValueError: If the three collections differ in length.
        """
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, got "
                f"{len(input_ids)}, {len(attention_mask)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels
        # Previously accepted but discarded; retained so it can be read back.
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of its three fields."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [28]:
# Wrap each encoded split in a torch-compatible dataset object
train_dataset = CustomRoBERTaDataset(
    input_ids=train_data["input_ids"],
    attention_mask=train_data["attention_mask"],
    labels=train_data["labels"],
    max_length=MAX_LENGTH,
)

test_dataset = CustomRoBERTaDataset(
    input_ids=test_data["input_ids"],
    attention_mask=test_data["attention_mask"],
    labels=test_data["labels"],
    max_length=MAX_LENGTH,
)

In [29]:
# save datasets

# Create the output folder first so a fresh checkout without "data/" does not
# make torch.save fail.
os.makedirs("data", exist_ok=True)

# NOTE(review): torch.save pickles the whole dataset object, so loading these
# files requires CustomRoBERTaDataset to be importable/defined at load time.
torch.save(train_dataset, "data/processed_train_dataset.pt")
torch.save(test_dataset, "data/processed_test_dataset.pt")

In [39]:
# Fraction of training examples that contain at least one non-zero label
train_label_rows = dataset["train"]["labels"]
rows_with_entity = sum(
    any(tag != 0 for tag in label_row) for label_row in train_label_rows
)
rows_with_entity / len(train_label_rows)

0.5003263707571801