In [1]:
import json
import sys
import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from dotenv import dotenv_values
import datetime
import os


from utils import DataPreprocessor, DatasetFormatConverter
from src.billm.modeling_mistral import MistralForTokenClassification


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

WANDB_KEY = dotenv_values(".env.base")['WANDB_KEY']
BASE_MODEL_CHECKPOINT = 'mistralai/Mistral-7B-Instruct-v0.2'
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,
                                          token =HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
# seqeval = evaluate.load("seqeval")

DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
offset=False
instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'# 'Return the result in a json format.'
simplest_prompt=False
dataset_text_field="prompt"
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, 
                                tokenizer)
dataset = load_dataset(DATASET_CHEKPOINT) #download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                instruction_on_response_format=instruction_on_response_format,
                                                simplest_prompt=simplest_prompt)
dataset = dataset.map(lambda samples: tokenizer(samples[dataset_text_field]), batched=True)
dataset_format_converter = DatasetFormatConverter(dataset)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
ds = ds.rename_column("word_level_labels", "ner_tags")
ds = ds.rename_column("words", "tokens")
label2id = dataset_format_converter.label2id
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=256, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)# dataset_format_converter.dataset.map(tokenize_and_align_labels, batched=True)
train_data, val_data, test_data = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)


In [3]:
train_data

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'prompt', 'input_ids', 'attention_mask', 'tokens', 'ner_tags', 'labels'],
    num_rows: 669
})

In [4]:

base_model = MistralForTokenClassification.from_pretrained(
    BASE_MODEL_CHECKPOINT, 
    num_labels=len(label2id), 
    id2label=id2label, 
    label2id=label2id,
    token = LLAMA_TOKEN,
    load_in_4bit=True,
    device_map = 'auto',
    cache_dir='/data/disk1/share/pferrazzi/.cache'
    )# .bfloat16()
adapters = "ferrazzipietro/LS_Mistral-7B-v0.1_adapters_en.layer1_NoQuant_16_32_0.01_2_0.0002"
merged_model = PeftModel.from_pretrained(base_model, 
                                                     adapters, 
                                                     token=HF_TOKEN, 
                                                     device_map='auto',
                                                     is_trainable = False)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
INFO:BiLLM:Here is the Bi-MistralModel! BiLLM_START_INDEX=0
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.42s/it]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [5]:
device = "cuda"
tokenizer.padding_side = "left"
examples=['The doctor went to the hospital', 'the 12 years old girl went to see the doctor']

input_sentences = examples
prompts = examples
input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * 4)
# if self.preprocessor.model_type == 'gemma':
#     add_special_tokens = True
encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to(device)
generated_ids = merged_model(**model_inputs, 
                             #do_sample=True, 
                             #max_new_tokens=max_new_tokens,  
                               #pad_token_id=tokenizer.pad_token_id,
                               #temperature = 1.0
                               ) # max_new_tokens=max_new_tokens,
# print(generated_ids)
# generated_ids = generated_ids[:, encodeds.input_ids.shape[1]:]
# decoded = tokenizer.batch_decode(generated_ids)



In [9]:
import torch

class Postprocessor():
    def __init__(self, tokenizer, label_list):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.id2label = {v: k for k, v in label_list.items()}
        self.label2id = label_list
    
    def postprocess(self, model_output_logits):
        modmodel_output_logitsel_output = model_output_logits.cpu().deatch().numpy()
        preds = np.argmax(model_output_logits, axis=2)
        preds_list = []
        for pred in preds:
            preds_list.append([self.id2label[label] for label in pred])
        return preds_list

In [10]:
postprocessor = Postprocessor(tokenizer, label2id)
preds_list = postprocessor.postprocess(generated_ids.logits)
print("predictions", preds_list)

UnboundLocalError: cannot access local variable 'model_output' where it is not associated with a value

In [1]:
print(base_model.state_dict())

NameError: name 'base_model' is not defined

In [None]:
# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = seqeval.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["soverall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }