In [1]:
import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from dotenv import dotenv_values
import datetime
import os


from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'mistralai/Mistral-7B-Instruct-v0.2'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")

DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
offset=False
instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
simplest_prompt=False
dataset_text_field="prompt"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                instruction_on_response_format=instruction_on_response_format,
                                                simplest_prompt=simplest_prompt)
dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
ds = ds.rename_column("word_level_labels", "ner_tags")
ds = ds.rename_column("words", "tokens")
label2id = dataset_format_converter.label2id
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=256, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)


In [52]:
train_data[0]

{'sentence': 'Their child acquired walking at the age of 14 months.',
 'entities': [{'id': '10101',
   'offsets': [12, 20],
   'role': '',
   'semantic_type_id': '',
   'text': 'acquired',
   'type': 'EVENT'},
  {'id': '10116',
   'offsets': [21, 28],
   'role': '',
   'semantic_type_id': '',
   'text': 'walking',
   'type': 'EVENT'},
  {'id': '10541',
   'offsets': [0, 11],
   'role': 'PATIENT',
   'semantic_type_id': '',
   'text': 'Their child',
   'type': 'ACTOR'}],
 'original_text': "Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours. His parents had noted an ankle trauma in the previous week. The primary care physician had initially suspected a talus fracture. His right ankle was immobilized by a plaster splint and his parents were instructed to apply ice. However, the child continued to experience ankle pain and fever. He was returned to 

In [3]:
base_model = MistralForTokenClassification.from_pretrained(
    BASE_MODEL_CHECKPOINT, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = LLAMA_TOKEN,
    load_in_4bit=True,
    device_map = 'auto',
    cache_dir='/data/disk1/share/pferrazzi/.cache'
    )# .bfloat16()
adapters = "ferrazzipietro/LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002"
merged_model = PeftModel.from_pretrained(base_model, 
                                                     adapters, 
                                                     token=HF_TOKEN, 
                                                     device_map='auto',
                                                     is_trainable = False)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [84]:
# BUONo
model_inputs

{'input_ids': tensor([[  415,  6676,  2068,   298,   272,  6556, 28386,   508,   570,  3871,
           815,   545,  3871,  4379],
        [    2,     2,   272, 28705, 28740, 28750,  1267,  1571,  2746,  2068,
           298,  1032,   272,  6676]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [88]:
# cattivo
model_inputs

{'input_ids': tensor([[ 2744,  7886, 18834, 18946,   465,   302,  1560,   534,   824,   409,
           274, 28725,  9837,   395,  2725,   321,  3130, 28726,   505,  5827,
         28725,   403,  6053,   304,   272,  7749, 28742, 28713, 15193,  4644,
         11957, 28723],
        [    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,  1094,
           534,  2920,  1475,  9271, 12423,   687, 20976,   403,  5745,   390,
          4123, 28723]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [36]:
train_data['sentence'][:4]

['Their child acquired walking at the age of 14 months.',
 'An abdominal ultrasound examination was reported as normal.',
 'Her blood pressure was 100/70 mmHg with a pulse rate of 98 beats/min, respiratory rate about 16/min and oral temperature of 37°C.',
 'Emergency neck computed tomography angiography showed a contrast-enhanced abscess cavity posterior to the left retropharyngeal space, and a low-density area surrounded by an area without contrast enhancement in the posterior neck.']

In [30]:
device = "cuda"
tokenizer.padding_side = "left"
# examples=['The doctor went to the hospital Precipitevolissimevolmente', 'the 12 years old girl went to see the doctor']
examples = [train_data[0]['sentence'], train_data[3]['sentence']]
input_sentences = examples
prompts = examples
input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * 4)
# if self.preprocessor.model_type == 'gemma':
#     add_special_tokens = True
encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to(device)
generated_ids = merged_model(**model_inputs, 
                             #do_sample=True, 
                             #max_new_tokens=max_new_tokens,  
                               #pad_token_id=tokenizer.pad_token_id,
                               #temperature = 1.0
                               ) # max_new_tokens=max_new_tokens,
# print(generated_ids)
# generated_ids = generated_ids[:, encodeds.input_ids.shape[1]:]
# decoded = tokenizer.batch_decode(generated_ids)

In [9]:
import torch

class Postprocessor():
    def __init__(self, tokenizer, label_list):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.id2label = {v: k for k, v in label_list.items()}
        self.label2id = label_list

    # def aggregate_tokens_into_words(self, tokenized_sentence, labels):
    #     words = []
    #     word = []
    #     word_label = None
    #     for token, label in zip(tokens, labels):
    #         print(token)
    #         if token.startswith("##"):
    #             word.append(token[2:])
    #         else:
    #             if word:
    #                 words.append((word, word_label))
    #             word = [token]
    #             word_label = label
    #     if word:
    #         words.append((word, word_label))
    #     return words
    
    def postprocess(self, model_output):
        model_output_logits = model_output.logits.cpu().detach().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.id2label[label] for label in pred])
        return preds_list

In [33]:
len(preds_list[0])
len(tokens[0])
tokens

['▁Their',
 '▁child',
 '▁acquired',
 '▁walking',
 '▁at',
 '▁the',
 '▁age',
 '▁of',
 '▁',
 '1',
 '4',
 '▁months',
 '.']

In [34]:
tokens = tokenizer.tokenize(examples[1], is_split_into_words=False)
postprocessor = Postprocessor(tokenizer, label2id)
preds_list = postprocessor.postprocess(generated_ids)

for i in range(len(preds_list[1])):
    print(tokens[i], preds_list[1][i])

▁Emer O
gency O
▁neck O
▁computed O
▁tom B
ography O
▁ang O
i O
ography O
▁showed O
▁a O
▁contrast B
- I
enh B
anced O
▁ab O
sc I
ess O
▁c O
avity O
▁posterior O
▁to O
▁the O
▁left B
▁retro O
ph B
ary I
n I
ge I
al O
▁space O
, O
▁and O
▁a O
▁low O
- I
density O
▁area O
▁surrounded I
▁by O
▁an O
▁area O
▁without O
▁contrast O
▁enhancement O
▁in O
▁the O
▁posterior B
▁neck I
. O


In [91]:
postprocessor = Postprocessor(tokenizer, label2id)
preds_list = postprocessor.postprocess(generated_ids)
print("predictions", preds_list)
postprocessor.aggregate_tokens_into_words(input_sentences_tokenized['input_ids'], preds_list[0])
input_sentences

predictions [['O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
▁Prec
ip
ite
vol
iss
ime
vol
mente


["Percutaneous drainage of both abscesses, combined with antimicrobial treatment, was successful and the patient's clinical condition improved.",
 'An abdominal ultrasound examination was reported as normal.']

In [51]:
for el in input_sentences_tokenized['input_ids'].numpy():
    print(el)

[   2    2    2    2    2    2    1  415 6676 2068  298  272 6556]
[    1   272 28705 28740 28750  1267  1571  2746  2068   298  1032   272
  6676]


In [1]:
print(base_model.state_dict())

NameError: name 'base_model' is not defined

In [1]:

import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from dotenv import dotenv_values
import datetime
import os
import torch



from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification



WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'mistralai/Mistral-7B-Instruct-v0.2'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")

DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
offset=False
instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
simplest_prompt=False
dataset_text_field="prompt"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                instruction_on_response_format=instruction_on_response_format,
                                                simplest_prompt=simplest_prompt)
dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
ds = ds.rename_column("word_level_labels", "ner_tags")
ds = ds.rename_column("words", "tokens")
label2id = dataset_format_converter.label2id
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=256, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
adapters = "ferrazzipietro/LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002"
peft_config = PeftConfig.from_pretrained(adapters)
base_model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = HF_TOKEN,
    device_map = 'auto',
    torch_dtype = torch.bfloat16,
    cache_dir='/data/disk1/share/pferrazzi/.cache'
    )#.bfloat16()
merged_model = PeftModel.from_pretrained(base_model, adapters,  token=HF_TOKEN,  
                                         device_map='auto',
                                         is_trainable = False)


merged_model = merged_model.merge_and_unload()

device = "cuda"
tokenizer.padding_side = "left"
# examples=['The doctor went to the hospital Precipitevolissimevolmente', 'the 12 years old girl went to see the doctor']

Downloading shards: 100%|██████████| 2/2 [02:31<00:00, 75.93s/it] 
INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.88s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:


class Postprocessor():
    def __init__(self, tokenizer, label_list):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.id2label = {v: k for k, v in label_list.items()}
        self.label2id = label_list
    def postprocess(self, model_output):
        model_output_logits = model_output.logits.cpu().detach().float().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.id2label[label] for label in pred])
        return preds_list


In [7]:
train_data['sentence'][0]

'Their child acquired walking at the age of 14 months.'

In [23]:
from transformers import pipeline

token_classifier = pipeline("token-classification", model=base_model, tokenizer=tokenizer, aggregation_strategy="simple")
#tokenizer.ma
tokens = token_classifier("I live in Hong Kong. I am a student at Hong Kong PolyU.") #train_data['sentence'][0])
print(tokens)


The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForToken

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType

In [7]:

lista = [4 * i for i in range(1, 3)]
for i in range(len(lista)-1):
    examples = train_data['sentence'][lista[i]:lista[i+1]]
    input_sentences = examples
    encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=False, padding=True)
    model_inputs = encodeds.to(device)
    generated_ids = merged_model(**model_inputs,)
    postprocessor = Postprocessor(tokenizer, label2id)
    preds_list = postprocessor.postprocess(generated_ids)

    for el in range(len(examples)):
        tokens = tokenizer.convert_ids_to_tokens(encodeds['input_ids'][el])
        for k in range(len(tokens)):
            print(f"{tokens[k]} : {preds_list[el][k]}")

</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
<

In [15]:
examples = [train_data['sentence'][0], train_data['sentence'][3]]
input_sentences = examples
encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to(device)
generated_ids = merged_model(**model_inputs,)
postprocessor = Postprocessor(tokenizer, label2id)
preds_list = postprocessor.postprocess(generated_ids)
for el in range(len(examples)):
    tokens = tokenizer.convert_ids_to_tokens(encodeds['input_ids'][el])
    print( [el for el in zip(tokens, preds_list[el]) ])

[('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('▁Their', 'O'), ('▁child', 'B'), ('▁acquired', 'O'), ('▁walking', 'O'), ('▁at', 'O'), ('▁the', 'O'), ('▁age', 'B'), ('▁of', 'O'), ('▁', 'O'), ('1', 'O'), ('4', 'O'), ('▁months', 'O'), ('.', 'O')]
[('▁Emer', 'O'), ('gency', 'O'), ('▁neck', 'B'), ('▁computed', 'O'), ('▁tom', 'B'), ('ography', 'O'), ('▁ang', 'O'), ('i', 'B'), ('ography', 'O'), ('▁showed', 'O'), ('▁a', 'O'), ('▁contrast', 'O'), ('-', 'I'), ('enh', 'B'), ('anced', 'O'), ('▁ab',

In [19]:
print([el for el in zip(train_data[3]['sentence'].split(' '), train_data[3]['ner_tags']) ])

[('Emergency', 0), ('neck', 1), ('computed', 0), ('tomography', 0), ('angiography', 1), ('showed', 1), ('a', 0), ('contrast-enhanced', 0), ('abscess', 0), ('cavity', 1), ('posterior', 0), ('to', 0), ('the', 1), ('left', 2), ('retropharyngeal', 2), ('space,', 2), ('and', 0), ('a', 0), ('low-density', 0), ('area', 1), ('surrounded', 1), ('by', 0), ('an', 0), ('area', 0), ('without', 0), ('contrast', 0), ('enhancement', 0), ('in', 0), ('the', 1), ('posterior', 2), ('neck.', 2)]


In [1]:
from transformers import AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
# from billm import MistralForTokenClassification
from src.billm.modeling_mistral import MistralForTokenClassification
from dotenv import dotenv_values
import torch 

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in label2id.items()}
model_id = 'WhereIsAI/billm-mistral-7b-conll03-ner'
tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = PeftConfig.from_pretrained(model_id)
model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2id), id2label=id2label, label2id=label2id,
    token = HF_TOKEN,
    device_map = 'auto',
    torch_dtype = torch.bfloat16,
    cache_dir='/data/disk1/share/pferrazzi/.cache'
)
model = PeftModel.from_pretrained(model, model_id)
# merge and unload is necessary for inference
model = model.merge_and_unload()

  from .autonotebook import tqdm as notebook_tqdm
INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
token_classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
sentence = ["I live in Hong Kong. I am a student at Hong Kong PolyU.", "hello im here"]
tokens = token_classifier(sentence)
print(tokens)

The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForToken

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType

In [5]:
from transformers import AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
from src.billm.modeling_mistral import MistralForTokenClassification

from dotenv import dotenv_values
import torch 

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in label2id.items()}
model_id = 'WhereIsAI/billm-mistral-7b-conll03-ner'
tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = PeftConfig.from_pretrained(model_id)
model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2id), id2label=id2label, label2id=label2id,
    token = HF_TOKEN,
    cache_dir='/data/disk1/share/pferrazzi/.cache'
)
model = PeftModel.from_pretrained(model, model_id)
# merge_and_unload is necessary for inference
model = model.merge_and_unload()

token_classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
sentence = "I live in Hong Kong. I am a student at Hong Kong PolyU."
tokens = token_classifier(sentence)
print(tokens)


INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassificati

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType