In [7]:
import string

class DatasetFormatConverter():
    """
    """
    def __init__(self, dataset):
        self.dataset = dataset
        self.label2id = { "O": 0, "B": 1, "I": 2}

    def get_id2label(self):
        id2label = {v: k for k, v in self.label2id.items()}
        return id2label
    
    def get_label2id(self):
        return self.label2id
    
    def get_label_list(self):
        return list(self.label2id.keys())
    
    def _reformat_entities_dict(self, enitities_dicts_list):
        return [{item.get('text') : item.get('offsets')} for item in enitities_dicts_list]
    
    def _generate_char_based_labels_list(self, example):
        labels = ["O"] * len(example["sentence"])
        for entity in example['entities']:
            # print('entity: ', entity)
            start = entity["offsets"][0]
            end = entity["offsets"][1]
            type = entity["type"]
            labels[start] = f"B-{type}"
            for i in range(start+1, end):
                # print('char: ', example["sentence"][i])
                labels[i] = f"I-{type}"
        return labels
    
    def _contains_punctuation(self, word):
        return any(char in string.punctuation for char in word)

    def _is_only_punctuation(self, word):
        return all(char in string.punctuation for char in word)
    
    def _remove_punctuation_and_count(self, text, punctuation_to_remove = '!"#&\'(),-./:;<=>?@[\\]^_`|'):
        """
        Remove punctuation from the beginning and end of the text and count how many characters were removed.
        """
        count_beginning = len(text) - len(text.lstrip(punctuation_to_remove))
        count_end = len(text) - len(text.rstrip(punctuation_to_remove))
        word_no_punct = text.strip(punctuation_to_remove)
        return word_no_punct, count_beginning, count_end

    def _entities_from_dict_to_labels_list(self, example, word_level=True, token_level=False, tokenizer=None):
        if word_level and token_level:
            raise ValueError("Only one of word_level and token_level can be True")
        if not word_level and not token_level:
            raise ValueError("One of word_level and token_level must be True")
        if token_level and tokenizer is None:
            raise ValueError("tokenizer must be provided if token_level is True")
        if word_level:
            words = example["sentence"].split()
        elif token_level:
            raise NotImplementedError
        labels = [0] * len(words)
        # print(example["entities"])
        chars_based_labels = self._generate_char_based_labels_list(example)
        word_starting_position = 0
        for i, word in enumerate(words):
            # print(f'processing word: {word}\n starting position: {word_starting_position}\n encompassing labels {chars_based_labels[word_starting_position:word_starting_position+len(word)]}')
            if self._is_only_punctuation(word):
                word_starting_position = word_starting_position + len(word) + 1
                continue
            if self._contains_punctuation(word):
                _, count_beginning, count_end = self._remove_punctuation_and_count(word)
                # print(f'remove punctuation from word: {word}\n count beginning: {count_beginning}\n count end: {count_end}')
            else:
                count_beginning, count_end = 0, 0
            word_length = len(word)
            start_word = word_starting_position + count_beginning
            end_word = word_starting_position + word_length - count_end
            chars_labels_of_this_word = chars_based_labels[start_word : end_word]
            if (chars_labels_of_this_word[0].startswith("B-") or chars_labels_of_this_word[0].startswith("I-")) \
                and all([label.startswith("I-") for label in chars_labels_of_this_word[1:]]):
                labels[i] = self.label2id.get(chars_labels_of_this_word[0][0], -1)
            word_starting_position = word_starting_position + word_length + 1
        # print(labels)
        example['words'] = words
        example['word_level_labels'] = labels
        return example

    def apply(self):
        self.dataset = self.dataset.map(self._entities_from_dict_to_labels_list)
        self.dataset = self.dataset.rename_column("word_level_labels", "ner_tags")
        self.dataset = self.dataset.rename_column("words", "tokens")

    def set_tokenizer(self, tokenizer):
        self.tokenizer = tokenizer

    def set_max_seq_length(self, max_seq_length):
        self.max_seq_length = max_seq_length

    # def tokenize_and_align_labels(self, examples): COPIED FROM HF, WRONG
    #     """
    #     """
    #     tokenized_inputs = self.tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=self.max_seq_length, truncation=True)

    #     labels = []
    #     for i, label in enumerate(examples[f"ner_tags"]):
    #         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
    #         previous_word_idx = None
    #         label_ids = []
    #         for word_idx in word_ids:  # Set the special tokens to -100.
    #             if word_idx is None:
    #                 label_ids.append(-100)
    #             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
    #                 label_ids.append(label[word_idx])
    #             else:
    #                 label_ids.append(-100)
    #             previous_word_idx = word_idx
    #         labels.append(label_ids)
    #     tokenized_inputs["labels"] = labels
    #     return tokenized_inputs

    def tokenize_and_align_labels(self, examples):
        tokenized_inputs = self.tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

        labels = []
        for i, words_label in enumerate(examples[f"ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
            label_ids = []
            for k, word_idx in enumerate(word_ids): 
                same_word_as_previous  = False if (word_idx != word_ids[k-1] or k==0) else True
                if word_idx is None:
                    token_label = -100
                elif words_label[word_idx] == self.label2id['O']:
                    token_label = self.label2id['O']
                elif same_word_as_previous:
                    token_label = self.label2id['I']
                elif not same_word_as_previous:
                    token_label = words_label[word_idx]
                label_ids.append(token_label)
                # if word_idx is not None:#  and k>12:
                #     print("word_label: ", words_label[word_idx])
                # print(tokenizer.decode(tokenized_inputs[i].ids[k]), ": ",word_idx,  "\nassigned_token_label:",  label_ids[k], '\n')
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs
        

In [8]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from peft import PeftModel, PeftConfig
from dotenv import dotenv_values
import torch
from tqdm.auto import tqdm

# from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification

WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

adapters = "ferrazzipietro/LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002"
peft_config = PeftConfig.from_pretrained(adapters)
BASE_MODEL_CHECKPOINT = peft_config.base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")
DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
label2id = dataset_format_converter.label2id
id2label = dataset_format_converter.get_id2label()
label_list = dataset_format_converter.get_label_list()
dataset_format_converter.set_tokenizer(tokenizer)
dataset_format_converter.set_max_seq_length(256)
tokenized_ds = ds.map(lambda x: dataset_format_converter.tokenize_and_align_labels(x), batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)


In [9]:
tokenized_ds = ds.map(lambda x: dataset_format_converter.tokenize_and_align_labels(x), batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)
train_data[0]['labels']

[-100, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [4]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, words_label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        label_ids = []
        for k, word_idx in enumerate(word_ids): 
            same_word_as_previous  = False if (word_idx != word_ids[k-1] or k==0) else True
            if word_idx is None:
                token_label = -100
            elif words_label[word_idx] == label2id['O']:
                token_label = label2id['O']
            elif same_word_as_previous:
                token_label = label2id['I']
            elif not same_word_as_previous:
                token_label = words_label[word_idx]
            label_ids.append(token_label)
            # if word_idx is not None:#  and k>12:
            #     print("word_label: ", words_label[word_idx])
            # print(tokenizer.decode(tokenized_inputs[i].ids[k]), ": ",word_idx,  "\nassigned_token_label:",  label_ids[k], '\n')
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenized_ds = ds.map(lambda x: tokenize_and_align_labels(x), batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)
train_data[0]['labels']

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map: 100%|██████████| 1520/1520 [00:00<00:00, 12776.36 examples/s]


[-100, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [3]:
bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                )

model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2id), id2label=id2label, label2id=label2id,
    token = HF_TOKEN,
    cache_dir='/data/disk1/share/pferrazzi/.cache',
    device_map='auto',
    quantization_config = bnb_config)
model = PeftModel.from_pretrained(model, adapters, token = HF_TOKEN)
model = model.merge_and_unload()

INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.36s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [27]:
class OutputGeneration():
    def __init__(self, model, tokenizer, id2label):
        self.model = model
        self.tokenizer = tokenizer
        self.id2label = label2id
    
    def _create_prediction_list(self, model_output):
        model_output_logits = model_output.logits.cpu().detach().float().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.label2id[label] for label in pred])
        return preds_list

    def _generate_batch(self, input_sentences, model, tokenizer):
        encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=True, padding=True)
        model_inputs = encodeds.to('cuda')
        generated_ids = model(**model_inputs)
        preds = self._create_prediction_list(generated_ids)
        return preds
    

    def add_output_column(self, data, model, tokenizer, batch_size:int) -> None:
        """
        Adds a column with the response of the model to the actual query.
        
        params:
        model: the model to use to generate the response
        tokenizer: the tokenizer to use to generate the response
        batch_size: the batch size to use to process the examples. Increasing this makes it faster but requires more GPU. Default is 8.
        """
        responses_col = []
        total_rows = len(data)
        indexes = [i for i in range(len(data)) if i % batch_size == 0]
        max_index = data.shape[0]

        with tqdm(total=total_rows, desc="generating responses") as pbar:
            for i, idx in enumerate(indexes[:-1]):
                indici = list(range(idx, indexes[i+1]))
                tmp = self._generate_batch(data.select(indici)['sentence'], model, tokenizer)
                responses_col.extend(tmp)
                pbar.update(batch_size)
            indici = list(range(indexes[len(indexes[:-1])], max_index))
            tmp = self._generate_batch(data.select(indici)['sentence'], model, tokenizer)
            responses_col.extend(tmp)
            pbar.update(batch_size)
        
        print('len=', len(responses_col), responses_col)
        data = data.add_column('model_responses', responses_col)
        return data
   
output_generation = OutputGeneration(model, tokenizer, id2label)
tmp = output_generation.add_output_column(train_data.select(range(6)), model, tokenizer, 3) 


generating responses:   0%|          | 0/6 [00:00<?, ?it/s]

sequence_output.shape=torch.Size([3, 48, 4096])


generating responses:  50%|█████     | 3/6 [00:00<00:00,  5.00it/s]

sequence_output.shape=torch.Size([3, 76, 4096])


generating responses: 100%|██████████| 6/6 [00:01<00:00,  4.85it/s]

len= 6 [['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'O', 'O', 'I', 'B', 'B', 'O', 'B', 'B', 'O', 'O', 'B', 'B', 'O', 'O'], ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'B', 'B', 'B', 'B', 'O', 'B', 'O', 'B', 'O'], ['I', 'O', 'O', 'B', 'B', 'B', 'B', 'I', 'B', 'B', 'O', 'B', 'O', 'O', 'B', 'O', 'O', 'B', 'B', 'B', 'B', 'B', 'B', 'O', 'I', 'I', 'B', 'O', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'O', 'O', 'B', 'B', 'O', 'B', 'B', 'O', 'O', 'O'], ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'O', 'I', 'B', 'O', 'B', 'I', 'B', 'B', 'B', 'O', 'O', 'B', 'O', 'B', 'O', 




In [26]:
examples = [train_data['sentence'][0], train_data['sentence'][5]]
input_sentences = examples
encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=True, padding=True)
model_inputs = encodeds.to('cuda')
generated_ids = model(**model_inputs)
gen = OutputGeneration(model, tokenizer, id2label)
pl = gen._create_prediction_list(generated_ids)

sequence_output.shape=torch.Size([2, 76, 4096])


NameError: name 'OutputGeneration' is not defined

In [91]:
for i in range(len(examples)):
    print("sentence:", examples[i])
    print("predicted labels:", pl[i])
    print("tokens", encodeds[i])
    print("n processed tokens", len(encodeds[i]))
    print("n tokens", len(tokenizer.tokenize(examples[i])))
    print("mask: ", encodeds[i].attention_mask)


sentence: Their child acquired walking at the age of 14 months.
predicted labels: ['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'I', 'O', 'I', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B', 'O', 'O', 'O']
tokens Encoding(num_tokens=76, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
n processed tokens 76
n tokens 13
mask:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
sentence: Pertinent laboratory studies included a hemoglobin level of 10 g/dL, platelet count was normal, blood urea of 1,2 g/l (0,18-0,45 g/

In [76]:
print(len(encodeds[1]))
print(len(tmp[1]['model_responses']))
print(len(tmp[1]['original_id']))
print(len(tokenizer(tmp[1]['sentence'], return_tensors='pt')['input_ids'].squeeze(0).tolist()))

75
48
8
14


In [162]:
data['ner_tags'][3]

'[0 1 0 0 1 1 0 0 0 1 0 0 1 2 2 2 0 0 0 1 1 0 0 0 0 0 0 0 1 2 2]'

In [14]:
import pandas as pd


token_classifier = pipeline("token-classification", model=model, 
                            tokenizer=tokenizer, 
                            aggregation_strategy="simple", batch_size=12)

l = []
for out in tqdm(token_classifier(KeyDataset(train_data.select(range(24)), "sentence"))):
    l.append(out)

tmp = train_data.select(range(24)).add_column('model_output', l)
tmp[6]['model_output']

The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForToken

sequence_output.shape=torch.Size([12, 166, 4096])


 50%|█████     | 1/2 [00:04<00:04,  4.01s/it]

sequence_output.shape=torch.Size([12, 43, 4096])


24it [00:05,  4.52it/s]                      


[{'end': 32,
  'entity_group': 'B',
  'score': 0.615,
  'start': 18,
  'word': 'found Glasgow'},
 {'end': 41, 'entity_group': 'I', 'score': 0.9956, 'start': 38, 'word': '(G'},
 {'end': 43, 'entity_group': 'B', 'score': 0.5854, 'start': 41, 'word': 'CS'},
 {'end': 48, 'entity_group': 'B', 'score': 0.768, 'start': 44, 'word': 'of '},
 {'end': 51, 'entity_group': 'B', 'score': 0.99, 'start': 50, 'word': '/'},
 {'end': 55, 'entity_group': 'I', 'score': 0.983, 'start': 53, 'word': '('},
 {'end': 58, 'entity_group': 'B', 'score': 0.961, 'start': 55, 'word': 'eye'},
 {'end': 69,
  'entity_group': 'I',
  'score': 0.724,
  'start': 58,
  'word': 'opening at'},
 {'end': 71, 'entity_group': 'I', 'score': 0.995, 'start': 70, 'word': '5'},
 {'end': 76,
  'entity_group': 'B',
  'score': 0.907,
  'start': 71,
  'word': ', ver'},
 {'end': 91,
  'entity_group': 'I',
  'score': 0.803,
  'start': 76,
  'word': 'bal response at'},
 {'end': 93, 'entity_group': 'I', 'score': 0.818, 'start': 92, 'word': '2'}

In [20]:
print(train_data[0]['tokens'])
example = train_data[0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['Their', 'child', 'acquired', 'walking', 'at', 'the', 'age', 'of', '14', 'months.']


['<s>',
 '▁Their',
 '▁child',
 '▁acquired',
 '▁walking',
 '▁at',
 '▁the',
 '▁age',
 '▁of',
 '▁',
 '1',
 '4',
 '▁months',
 '.']

In [61]:
import pandas as pd


token_classifier = pipeline("ner", model=model, 
                            tokenizer=tokenizer, 
                            aggregation_strategy="simple", batch_size=12)


#token_classifier(train_data[0]['tokens'])
l = []
for out in tqdm(token_classifier(KeyDataset(train_data.select(range(24)), "sentence"))):
    l.append(out)

tmp = train_data.select(range(24)).add_column('model_output', l)
tmp[6]['model_output']

The model 'MistralForTokenClassification' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForTokenClassification', 

sequence_output.shape=torch.Size([12, 166, 4096])


 50%|█████     | 1/2 [00:03<00:03,  3.95s/it]

sequence_output.shape=torch.Size([12, 43, 4096])


24it [00:05,  4.56it/s]                      


[{'end': 32,
  'entity_group': 'B',
  'score': 0.615,
  'start': 18,
  'word': 'found Glasgow'},
 {'end': 41, 'entity_group': 'I', 'score': 0.9956, 'start': 38, 'word': '(G'},
 {'end': 43, 'entity_group': 'B', 'score': 0.5854, 'start': 41, 'word': 'CS'},
 {'end': 48, 'entity_group': 'B', 'score': 0.768, 'start': 44, 'word': 'of '},
 {'end': 51, 'entity_group': 'B', 'score': 0.99, 'start': 50, 'word': '/'},
 {'end': 55, 'entity_group': 'I', 'score': 0.983, 'start': 53, 'word': '('},
 {'end': 58, 'entity_group': 'B', 'score': 0.961, 'start': 55, 'word': 'eye'},
 {'end': 69,
  'entity_group': 'I',
  'score': 0.724,
  'start': 58,
  'word': 'opening at'},
 {'end': 71, 'entity_group': 'I', 'score': 0.995, 'start': 70, 'word': '5'},
 {'end': 76,
  'entity_group': 'B',
  'score': 0.907,
  'start': 71,
  'word': ', ver'},
 {'end': 91,
  'entity_group': 'I',
  'score': 0.803,
  'start': 76,
  'word': 'bal response at'},
 {'end': 93, 'entity_group': 'I', 'score': 0.818, 'start': 92, 'word': '2'}

In [18]:
import pandas as pd
import ast
import evaluate
seqeval = evaluate.load("seqeval")
data = load_dataset("csv", data_files="data/evaluation/train_data_LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002.csv")
def helper(example):
    example['model_output'] = ast.literal_eval(example['model_output'].replace('\n', ','))
    return example
data = data.map(lambda x: helper(x))

def compute_metrics(logits, labels):
    model_output_logits = logits.cpu().detach().float().numpy()    
    predictions = np.argmax(model_output_logits, axis=1)
    print(predictions)
    print(type(predictions))
    print(type(labels))
    lista = []
    for i in range(len(labels)):
        print('pred: ', predictions[i], 'label: ', labels[i])
        if labels[i] != -100:
         lista.append(label_list[predictions[i]])
    print('lista: ', lista)

    print( [label_list[prediction] for i, prediction in enumerate(predictions) if labels[i] != -100 ] )

    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["soverall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [109]:
label_list

['O', 'B', 'I']

In [76]:
def _create_prediction_list(self, model_output):
        model_output_logits = model_output.logits.cpu().detach().float().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.id2label[label] for label in pred])
        return preds_list


[['B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'B',
  'I',
  'O',
  'I',
  'B',
  'B',
  'O',
  'O',
  'B',
  'O',
  'O',
  'B',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'O',
  'O',
  'B',
  'B',
  'O',
  'O',
  'B',
  'O',
  'B',
  'O',
  'O',
  'I',
  'O',
  'B',
  'O',
  'O',
  'O',
  'B',
  'B',
  'O',
  'B',
  'B',
  'O',
  'O',
  'I',
  'O',
  'O',
  'B',
  'B',
  'O',
  'B',
  'O',
  'B',
  'O',
  'B',
  'B',
  'O',
  'I',
  'I',
  'B',
  'B',
  'O',
  'I',
  'I',
  'O',
  'B',
  'O',
  'O',
  'O',
  'O',
  'B',
  'B',
  'B',
  'O',
  'B',
  'B',
  'I',
  'I',
  'O',
  'B',
  'O'

In [112]:
generated_ids['logits'][0]
print(type(label_list))

<class 'list'>


In [24]:
print(len(train_data[0]['labels']))
print(len(train_data[0]['sentence']))
print(len(generated_ids['logits'][0]))

166
53
76


In [29]:
examples = train_data.select(range(2))
examples

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2
})

In [55]:
print((examples[0]['ner_tags']))
print(len(examples[0]['sentence']))
print((examples[0]['sentence']))
print((examples[0]['labels']))
#print(len(examples['logits'][0]))
print(labels[0])
label2id

[1, 2, 1, 1, 0, 0, 0, 0, 0, 0]
53
Their child acquired walking at the age of 14 months.
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

{'O': 0, 'B': 1, 'I': 2}

In [120]:
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
label2id = dataset_format_converter.label2id
id2label = dataset_format_converter.get_id2label()
label_list = dataset_format_converter.get_label_list()
dataset = ds.map(lambda x: tokenize_and_align_labels(x), batched = True)

Map: 100%|██████████| 1520/1520 [00:00<00:00, 13777.85 examples/s]


In [126]:
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(dataset, TRAIN_LAYER)
for i in range(8):
    print('sentece: ', train_data[i]['sentence'], '\nlabels: ', train_data[i]['labels'])

sentece:  Their child acquired walking at the age of 14 months. 
labels:  [-100, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sentece:  An abdominal ultrasound examination was reported as normal. 
labels:  [-100, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2]
sentece:  Her blood pressure was 100/70 mmHg with a pulse rate of 98 beats/min, respiratory rate about 16/min and oral temperature of 37°C. 
labels:  [-100, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2, 2, 2, 0, 0, 1, 0, 1, 2, 2, 2, 2, 2]
sentece:  Emergency neck computed tomography angiography showed a contrast-enhanced abscess cavity posterior to the left retropharyngeal space, and a low-density area surrounded by an area without contrast enhancement in the posterior neck. 
labels:  [-100, 0, 0, 1, 0, 0, 0, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2]
sentece:  The mitotic rate was extremely high (1

In [117]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, words_label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        label_ids = []
        for k, word_idx in enumerate(word_ids): 
            same_word_as_previous  = False if (word_idx != word_ids[k-1] or k==0) else True
            if word_idx is None:
                token_label = -100
            elif words_label[word_idx] == label2id['O']:
                token_label = label2id['O']
            elif same_word_as_previous:
                token_label = label2id['I']
            elif not same_word_as_previous:
                token_label = words_label[word_idx]
            label_ids.append(token_label)
            # if word_idx is not None:#  and k>12:
            #     print("word_label: ", words_label[word_idx])
            # print(tokenizer.decode(tokenized_inputs[i].ids[k]), ": ",word_idx,  "\nassigned_token_label:",  label_ids[k], '\n')
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenize_and_align_labels(ds.select(range(2, 4)))

{'input_ids': [[1, 1094, 534, 2920, 1475, 9271, 12423, 687, 20976, 403, 5745, 390, 4123, 28723], [1, 21127, 277, 4475, 8289, 302, 272, 16594, 28765, 6642, 369, 23096, 8894, 654, 5278, 354, 264, 9194, 302, 1581, 6752, 1716, 404, 325, 5072, 28731, 325, 5072, 28770, 28781, 28725, 8204, 28740, 28774, 28725, 8204, 28740, 28734, 28725, 8204, 28750, 28750, 304, 19966, 5278, 354, 8204, 28781, 28782, 28731, 8735, 288, 334, 7016, 9237, 3000, 678, 628, 305, 1082, 721, 806, 23096, 297, 2088, 434, 352, 28723]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2], [-100, 0, 1, 2, 2, 0, 1, 2, 2, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 0,

In [19]:
# examples = [train_data['sentence'][0] , train_data['sentence'][5]]
# input_sentences = examples
# encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=True, padding=True)
# model_inputs = encodeds.to('cuda')
# generated_ids = model(**model_inputs)
# model_output_logits = generated_ids.logits.cpu().detach().float().numpy()
examples = [train_data['sentence'][0], train_data['sentence'][5]]
input_sentences = examples
encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=True, padding=True)
model_inputs = encodeds.to('cuda')
generated_ids = model(**model_inputs)
#gen = OutputGeneration(model, tokenizer, id2label)
#pl = gen._create_prediction_list(generated_ids)
compute_metrics(generated_ids['logits'][0],   train_data[0]['labels'])

sequence_output.shape=torch.Size([2, 76, 4096])
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 1 1 0 0 1 0 0 1 0
 0 0]
<class 'numpy.ndarray'>
<class 'list'>
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 label:  -100
pred:  1 

IndexError: index 76 is out of bounds for axis 0 with size 76

In [81]:
train_data[0]['labels']

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [78]:
train_data[0].keys()

dict_keys(['sentence', 'entities', 'original_text', 'original_id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'])

In [66]:
example = tmp[1]
labels = [label_list[i] for i in example[f"ner_tags"]]
print(labels)
print(example['tokens'])
print(tokenizer(example['sentence']))
print(tokenizer.decode(tokenizer(example['sentence'])['input_ids']))
print(example['model_output'])

['O', 'O', 'O', 'B', 'O', 'B', 'O', 'B']
['An', 'abdominal', 'ultrasound', 'examination', 'was', 'reported', 'as', 'normal.']
{'input_ids': [1, 1094, 534, 2920, 1475, 9271, 12423, 687, 20976, 403, 5745, 390, 4123, 28723], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
<s> An abdominal ultrasound examination was reported as normal.
[{'end': 8, 'entity_group': 'B', 'score': 0.648, 'start': 5, 'word': 'dom'}, {'end': 15, 'entity_group': 'B', 'score': 0.928, 'start': 12, 'word': 'ul'}, {'end': 35, 'entity_group': 'B', 'score': 0.9463, 'start': 23, 'word': 'examination'}, {'end': 48, 'entity_group': 'B', 'score': 0.8286, 'start': 39, 'word': 'reported'}, {'end': 58, 'entity_group': 'B', 'score': 0.6094, 'start': 51, 'word': 'normal'}]
