In [5]:

import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from dotenv import dotenv_values
import datetime
import os
import torch



from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification



WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'mistralai/Mistral-7B-v0.1'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")

DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
offset=False
instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
simplest_prompt=False
dataset_text_field="prompt"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                instruction_on_response_format=instruction_on_response_format,
                                                simplest_prompt=simplest_prompt)
dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
ds = ds.rename_column("word_level_labels", "ner_tags")
ds = ds.rename_column("words", "tokens")
label2id = dataset_format_converter.label2id
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=256, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)

ModuleNotFoundError: No module named 'dotenv'

In [1]:
import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from dotenv import dotenv_values
import datetime
import os
import torch



from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification



WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'mistralai/Mistral-7B-v0.1'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")

DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"

dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(dataset, TRAIN_LAYER, input_column='sentence')
dataset_format_converter = DatasetFormatConverter(dataset)
label2id = dataset_format_converter.label2id
id2label = dataset_format_converter.get_id2label()
label_list = dataset_format_converter.get_label_list()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
adapters = "ferrazzipietro/LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002"
peft_config = PeftConfig.from_pretrained(adapters)
base_model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = HF_TOKEN,
    device_map = 'cuda:0',
    load_in_4bit = True,
    # torch_dtype = torch.bfloat16,
    cache_dir='/data/disk1/share/pferrazzi/.cache'
    )#.bfloat16()
merged_model = PeftModel.from_pretrained(base_model, adapters,  token=HF_TOKEN,  
                                         device_map='cuda:0',
                                         is_trainable = False)


merged_model = merged_model.merge_and_unload()

device = "cuda:0"
tokenizer.padding_side = "left"
# examples=['The doctor went to the hospital Precipitevolissimevolmente', 'the 12 years old girl went to see the doctor']

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.23s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [6]:


class Postprocessor():
    def __init__(self, tokenizer, label_list):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.id2label = {v: k for k, v in label_list.items()}
        self.label2id = label_list
    def postprocess(self, model_output):
        model_output_logits = model_output.logits.cpu().detach().float().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.id2label[label] for label in pred])
        return preds_list


In [7]:
train_data['sentence'][0]

'Their child acquired walking at the age of 14 months.'

In [3]:
# LA PIPELINE NON FUNZIONA SU MULTIPLE GPU

from transformers import pipeline

token_classifier = pipeline("token-classification", model=base_model, tokenizer=tokenizer, aggregation_strategy="simple")
#tokenizer.ma
tokens = token_classifier("I live in Hong Kong. I am a student at Hong Kong PolyU.") #train_data['sentence'][0])
print(tokens)


The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForToken

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType

In [7]:

lista = [4 * i for i in range(1, 3)]
for i in range(len(lista)-1):
    examples = train_data['sentence'][lista[i]:lista[i+1]]
    input_sentences = examples
    encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=False, padding=True)
    model_inputs = encodeds.to(device)
    generated_ids = merged_model(**model_inputs,)
    postprocessor = Postprocessor(tokenizer, label2id)
    preds_list = postprocessor.postprocess(generated_ids)

    for el in range(len(examples)):
        tokens = tokenizer.convert_ids_to_tokens(encodeds['input_ids'][el])
        for k in range(len(tokens)):
            print(f"{tokens[k]} : {preds_list[el][k]}")

</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
</s> : O
<

In [15]:
examples = [train_data['sentence'][0], train_data['sentence'][3]]
input_sentences = examples
encodeds = tokenizer(input_sentences, return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to(device)
generated_ids = merged_model(**model_inputs,)
postprocessor = Postprocessor(tokenizer, label2id)
preds_list = postprocessor.postprocess(generated_ids)
for el in range(len(examples)):
    tokens = tokenizer.convert_ids_to_tokens(encodeds['input_ids'][el])
    print( [el for el in zip(tokens, preds_list[el]) ])

[('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('</s>', 'O'), ('▁Their', 'O'), ('▁child', 'B'), ('▁acquired', 'O'), ('▁walking', 'O'), ('▁at', 'O'), ('▁the', 'O'), ('▁age', 'B'), ('▁of', 'O'), ('▁', 'O'), ('1', 'O'), ('4', 'O'), ('▁months', 'O'), ('.', 'O')]
[('▁Emer', 'O'), ('gency', 'O'), ('▁neck', 'B'), ('▁computed', 'O'), ('▁tom', 'B'), ('ography', 'O'), ('▁ang', 'O'), ('i', 'B'), ('ography', 'O'), ('▁showed', 'O'), ('▁a', 'O'), ('▁contrast', 'O'), ('-', 'I'), ('enh', 'B'), ('anced', 'O'), ('▁ab',

In [6]:
print([el for el in zip(train_data[0]['sentence'].split(' '), train_data[0]['ner_tags']) ])

NameError: name 'train_data' is not defined

In [5]:
from transformers import AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
from src.billm.modeling_mistral import MistralForTokenClassification

from dotenv import dotenv_values
import torch 

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in label2id.items()}
model_id = 'WhereIsAI/billm-mistral-7b-conll03-ner'
tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = PeftConfig.from_pretrained(model_id)
model = MistralForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2id), id2label=id2label, label2id=label2id,
    token = HF_TOKEN,
    cache_dir='/data/disk1/share/pferrazzi/.cache'
)
model = PeftModel.from_pretrained(model, model_id)
# merge_and_unload is necessary for inference
model = model.merge_and_unload()

token_classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
sentence = "I live in Hong Kong. I am a student at Hong Kong PolyU."
tokens = token_classifier(sentence)
print(tokens)


INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'MistralForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassificati

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType