In [18]:
import torch
from transformers import RobertaTokenizer
from datasets import load_dataset
import numpy as np

In [19]:
MAX_LENGTH=32

In [20]:
# Load the tokenizer for the RoBERTa model

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [21]:
# Load the dataset from hugging face

dataset = load_dataset("telord/ner-mountains-first-dataset")

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 3064
    })
    test: Dataset({
        features: ['sentence', 'tokens', 'labels'],
        num_rows: 340
    })
})

In [22]:
# function to prepare data labels for training (padding and changing label classes)

def prepare_labels(data, max_length):
    padded_data = []
    for label in data:
        label = list(map(lambda x: int(x != 0), label))
        
        if len(label) + 1 <= max_length:
            padded_label = [0] * (max_length - (len(label) + 1)) + label + [0]
        else:
            padded_label = [0] + label[:max_length - 2] + [0]
            
        padded_data.append(padded_label)
    return padded_data

In [23]:
# tokenize train data tokens

train_data = tokenizer(dataset["train"]["tokens"],
                      truncation=True,
                      padding="max_length",
                      max_length=MAX_LENGTH,
                      is_split_into_words=True,
                      return_tensors="pt",
                      return_attention_mask=True)

# tokenize test data_tokens

test_data = tokenizer(dataset["test"]["tokens"],
                      truncation=True,
                      padding="max_length",
                      max_length=MAX_LENGTH,
                      is_split_into_words=True,
                      return_tensors="pt",
                      return_attention_mask=True)

In [24]:
# prepare train labels
train_labels = torch.Tensor(prepare_labels(dataset["train"]["labels"], MAX_LENGTH))
# prepare test labels
test_labels = torch.Tensor(prepare_labels(dataset["test"]["labels"], MAX_LENGTH))

In [25]:
train_labels.shape, test_labels.shape

(torch.Size([3064, 32]), torch.Size([340, 32]))

In [26]:
train_data["labels"] = train_labels
test_data["labels"] = test_labels

In [27]:
class CustomRoBERTaDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels, max_length):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [28]:
# create datasets
train_dataset = CustomRoBERTaDataset(train_data["input_ids"],
                                     train_data["attention_mask"],
                                     train_data["labels"], max_length=MAX_LENGTH)

test_dataset = CustomRoBERTaDataset(test_data["input_ids"],
                                    test_data["attention_mask"],
                                    test_data["labels"], max_length=MAX_LENGTH)

In [29]:
# save datasets

torch.save(train_dataset, "data/processed_train_dataset.pt")
torch.save(test_dataset, "data/processed_test_dataset.pt")

In [39]:
s = 0
for lst in dataset["train"]["labels"]:
    lst = list(map(lambda x: int(x != 0), lst))
    s += any(lst)
s / len(dataset["train"]["labels"])

0.5003263707571801

In [60]:
import torch
import os
from transformers import RobertaTokenizer
import peft
from colorama import Fore, Back, Style, init


GREETING_TEXT = """
Hello!! Here you can interact with the NER model (Fine-Tuned RoBERTa)!!!!
To make a query just enter some sentence and the model will show you what words are Mountains and what are not.
(The mountains will be colored in blue)

Below are some EXAMPLES if you cannot come up with one:

    * "So how it was on Kilimanjaro?"

    * "White Glacier is a broad westward flowing tributary glacier which joins the Land Glacier on the north side of Mount McCoy in Marie Byrd Land ."

    * "Other notable sections of the cemetery are the cemetery of the Finnish Guard , the Artist 's Hill and the Statesmen 's Grove ."
    
    * "Why dont we hang out together? Lets go on a trip. What about Alpas?"
"""
MAX_LENGTH = 32

def inference_ner():
    print(GREETING_TEXT)
    
    path_to_model = input("Enter path to the model (recommended to use RoBERTa version):")
    sentence = input("Enter the sentence you want to pass into the model:")
    
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenized_sentence = tokenizer(sentence, 
                                   truncation=True,
                                   padding="max_length",
                                   max_length=MAX_LENGTH,
                                   return_tensors="pt",
                                   return_attention_mask=True,
                                   clean_up_tokenization_spaces=True)
    
    model = torch.load(path_to_model)
    model.eval()
    
    with torch.inference_mode():
        outputs = model(input_ids=tokenized_sentence["input_ids"],
                        attention_mask=tokenized_sentence["attention_mask"])

    outputs = torch.softmax(outputs.logits, dim=2).argmax(dim=2)
    
        
    outputs = list(outputs.squeeze().cpu().numpy())
    tokenized_sentence = list(tokenized_sentence["input_ids"].squeeze().cpu().numpy())
    
    print(Fore.GREEN + "\nHERE IS THE RESULT:\n")
    
    for i in range(len(outputs)):
        label = outputs[i]
        token = tokenized_sentence[i]
        if token in [0, 1, 2]:
            continue
            
        decoded_word = tokenizer.decode(token)
        if label:
            print(Fore.ORANGE + decoded_word, end=" ")
        else:
            print(Style.RESET_ALL + decoded_word, end=" ")    
    

if __name__ == "__main__":
    inference_ner()


Hello!! Here you can interact with the NER model (Fine-Tuned RoBERTa)!!!!
To make a query just enter some sentence and the model will show you what words are Mountains and what are not.
(The mountains will be colored in blue)

Below are some EXAMPLES if you cannot come up with one:

    * "So how it was on Kilimanjaro?"

    * "White Glacier is a broad westward flowing tributary glacier which joins the Land Glacier on the north side of Mount McCoy in Marie Byrd Land ."

    * "Other notable sections of the cemetery are the cemetery of the Finnish Guard , the Artist 's Hill and the Statesmen 's Grove ."
    
    * "Why dont we hang out together? Lets go on a trip. What about Alpas?"

Enter path to the model (recommended to use RoBERTa version):/home/user/Стільниця/python/jpnb_projects/NER_mountain/data/models/roberta_fine_tuned.pt
Enter the sentence you want to pass into the model:So how it was on Kilimanjaro?


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.


[32m
HERE IS THE RESULT:

[0mSo [0m how [0m it [0m was [0m on 

AttributeError: 'AnsiFore' object has no attribute 'ORANGE'

In [16]:
/home/user/Стільниця/python/jpnb_projects/NER_mountain/data/models/roberta_fine_tuned.pt

NameError: name 'home' is not defined

In [48]:
dataset["train"]["sentence"][24], dataset["train"]["labels"][24]

('This top , now known as Cadair Berwyn , is listed as Cadair Berwyn New Top on the Nuttall list .',
 [0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0])

In [66]:
print(Fore.LIGHTYELLOW_EX + "fsdf")

[93mfsdf
