In [1]:
import torch
from transformers import RobertaTokenizerFast
from datasets import load_dataset
import numpy as np
import os
import dotenv

In [3]:
# Load variables from the .env file into the process environment;
# DATA_PATH is read from the environment in a later cell.
dotenv.load_dotenv()

True

In [42]:
# Notebook-wide settings: the output directory (taken from the environment,
# None when the variable is not defined) and the fixed sequence length that
# is passed to the tokenizer for padding/truncation.
DATA_PATH = os.environ.get("DATA_PATH")
MAX_LENGTH = 32

In [3]:
# Instantiate the fast RoBERTa tokenizer from the "roberta-base" checkpoint.
# add_prefix_space=True is set because the tokenizer is later called on
# pre-split words (is_split_into_words=True in prepare_dataset).
tokenizer = RobertaTokenizerFast.from_pretrained(
    "roberta-base",
    add_prefix_space=True,
)

In [4]:
# Sanity check: tokenize a sample string and display the resulting encoding
# (input_ids and attention_mask).
tokenizer("some sentence")

{'input_ids': [0, 103, 3645, 2], 'attention_mask': [1, 1, 1, 1]}

In [5]:
# Load the mountain NER dataset from Hugging Face.
# Each row provides 'sentence', 'tokens' and 'labels' (see the output of the next cell).

dataset = load_dataset("telord/ner-mountains-first-dataset")

In [6]:
# Display the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 3064
    })
    test: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 340
    })
})

In [22]:
# function to prepare dataset for training 
# function solves the problem of subtokens and special tokens

def prepare_dataset(data, tokenizer, max_length):
    """Tokenize pre-split sentences and align binary labels with the sub-tokens.

    Every sub-token receives the label of the word it came from, collapsed to
    0/1 (1 when the word label is non-zero). Positions that do not belong to
    any word (special tokens such as <s>, </s>, <pad>) receive label 0.

    Args:
        data: Object indexable by "tokens" (a list of word lists) and
            "labels" (a list of per-word integer labels, parallel to "tokens").
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            result must provide ``word_ids(batch_index=...)`` and support
            item assignment.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output with an additional "labels" entry: a float
        tensor holding one row of 0/1 values per sentence.
    """
    tokenized_inputs = tokenizer(
        data["tokens"],
        truncation=True,
        padding="max_length",
        # Use the argument, not the notebook-global MAX_LENGTH, so the function
        # honours what the caller asked for and works without hidden state.
        max_length=max_length,
        is_split_into_words=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    new_labels = []  # one aligned label row per sentence
    for i, label in enumerate(data["labels"]):
        # word_ids maps each token position to its source word index,
        # or None for special tokens.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        new_labels.append([
            0 if word_idx is None else int(label[word_idx] != 0)
            for word_idx in word_ids
        ])

    # float32 matches what torch.Tensor(...) produced before, so the saved
    # format is unchanged.
    # NOTE(review): cast to long downstream if a cross-entropy loss is used.
    tokenized_inputs["labels"] = torch.tensor(new_labels, dtype=torch.float32)

    return tokenized_inputs

In [23]:
# create torch dataset object

class CustomRoBERTaDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel input ids, attention masks and labels.

    Args:
        input_ids: Indexable collection with one entry per example.
        attention_mask: Indexable collection parallel to ``input_ids``.
        labels: Indexable collection parallel to ``input_ids``.
        max_length: Optional sequence length the inputs were prepared with.
            It is stored for reference only and is not used for indexing.

    Raises:
        ValueError: If the three collections differ in length.
    """

    def __init__(self, input_ids, attention_mask, labels, max_length=None):
        # Fail early on misaligned inputs instead of with an IndexError while
        # iterating the dataset later.
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, got "
                f"{len(input_ids)}, {len(attention_mask)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels
        self.max_length = max_length  # previously accepted but discarded

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of its three fields."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [24]:
# Tokenize the train and test splits with identical settings and align
# their labels with the produced sub-tokens.
train_data, test_data = (
    prepare_dataset(data=dataset[split_name], tokenizer=tokenizer, max_length=MAX_LENGTH)
    for split_name in ("train", "test")
)

In [35]:
# Wrap the tokenized tensors of each split in a torch Dataset.
train_dataset, test_dataset = (
    CustomRoBERTaDataset(
        encoded["input_ids"],
        encoded["attention_mask"],
        encoded["labels"],
        max_length=MAX_LENGTH,
    )
    for encoded in (train_data, test_data)
)

In [36]:
# Persist both processed datasets under DATA_PATH.
# NOTE(review): torch.save pickles the whole CustomRoBERTaDataset object, so the
# class definition must be available wherever the files are loaded, and recent
# PyTorch releases may need torch.load(..., weights_only=False) for such
# objects. Confirm against the loading code.

if DATA_PATH is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError("DATA_PATH is not set; define it in the environment or in the .env file")

# torch.save does not create missing directories.
os.makedirs(DATA_PATH or ".", exist_ok=True)

for split_dataset, filename in (
    (train_dataset, "processed_train_dataset.pt"),
    (test_dataset, "processed_test_dataset.pt"),
):
    torch.save(split_dataset, os.path.join(DATA_PATH, filename))

In [39]:
# Fraction of training sentences that mention a mountain, i.e. that contain
# at least one non-zero word label. (The share of sentences WITHOUT a mention
# is 1 minus this value.)

train_labels = dataset["train"]["labels"]  # fetch the column once instead of twice
mention_fraction = sum(
    any(tag != 0 for tag in sentence_labels) for sentence_labels in train_labels
) / len(train_labels)
mention_fraction

0.5003263707571801