In [None]:
from utils.data_preprocessor import DataPreprocessor
from datasets import Dataset
from tqdm import tqdm

class TestDataProcessor():
    def __init__(self, test_data: Dataset, preprocessor:DataPreprocessor, n_shots_inference:int, language:str, tokenizer) -> None:
        """
        Initialize the TestDataProcessor class.
        pass to this the same DataPreprocessor used for the training data. This will ensure that the inference prompt is formatted in the same way as the training prompt.
        """
        self.test_data = test_data
        self.preprocessor = preprocessor
        self.language = language
        self.tokenizer = tokenizer
        self.model_type = preprocessor.model_type
        self.input_sentence_field = 'sentence'
        self.few_shots_dict = {'en':{'questions':['We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.',
                                                   'Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.',
                                                   'There was no evidence of lung lesions.',
                                                   'Locally diminished actin coloration indicated atrophy of smooth muscle fibers.'],
                                        'responses':['[{"entity": "present"}, {"entity": "history"}, {"entity": "enlargement"}]',
                                                     '[{"entity": "presented"}, {"entity": "refusal"}, {"entity": "bear"}, {"entity": "peaks"}]',
                                                      '[{"entity": "evidence"}, {"entity": "lung lesions"]',
                                                      '[{"entity": "coloration"}, {"entity": "indicated"}, {"entity": "atrophy"}, {"entity": "atrophy of smooth muscle fibers"}, {"entity": "smooth muscle fibers"'],
                                        'responses_offset': ['[{"entity": "present", "offset": [3, 10]}, {"entity": "history", "offset": [48, 55]}, {"entity": "enlargement", "offset": [67, 78]}]',
                                                             '[{"entity": "presented", "offset": [39, 48]}, {"entity": "refusal", "offset": [95, 102]}, {"entity": "bear", "offset": [106, 110]}, {"entity": "peaks", "offset": [159, 164]}]',
                                                             '[{ "entity": "evidence", "offsets": [13, 21]}, {"entity": "lung lesions", "offsets": [25, 37]} ]',
                                                             '[{"entity": "coloration", "offsets": [25, 35]}, {"entity": "indicated", "offsets": [36, 45]}, {"entity": "atrophy","offsets": [46, 53]}, {"entity": "atrophy of smooth muscle fibers", "offsets": [46, 77]}, {"entity": "smooth muscle fibers", "offsets": [57, 77]} ]'],
                                    },
                                'it':{'questions':['In considerazione dell’inefficacia della terapia somministrata, in assenza di ulteriori opzioni terapeutiche standard potenzialmente efficaci e dopo colloquio con i genitori si decide di avviare la paziente a trapianto aploidentico, possibilmente NK allo reattivo, da genitore.',
                                                    'L’esame istologico dimostrava mucosa gastrica atrofica con flogosi cronica, marcato edema ed incremento del connettivo del corion, focale metaplasia intestinale, il tutto sovrastante un tessuto fibromuscolare.',
                                                    'Giunge nel nostro reparto per stranguria in assenza di altri sintomi.',
                                                    'All’età di 16 mesi, nuovo ricovero per febbre (39°C) e stato di abbattimento.'],
                                       'responses':['[{"entity": "inefficacia"}, {"entity": "opzioni"}, {"entity": "colloquio"}, {"entity": "avviare"}, {"entity": "trapianto"}, {"entity": "genitori"}, {"entity": "paziente"}, {"entity": "genitore"}]',
                                                    '[{"entity": "mucosa gastrica atrofica"}, {"entity": "flogosi\r\cronica"}]',
                                                    '[{"entity": "Giunge"}, {"entity": "stranguria"}, {"entity": "sintomi"}, {"entity": "stranguria"}]',
                                                    '[{"entity": "ricovero"}, {"entity": "febbre"}, {"entity": "stato"}, {"entity": "febbre"}, {"entity": "39°C"} ]'],
                                       'responses_offset':['[{"entity": "inefficacia", "offset": [23, 34]}, {"entity": "opzioni", "offset": [88,95]}, {"entity": "colloquio", "offset": [149,158]}, {"entity": "avviare", "offset": [187,194]}, {"entity": "trapianto", "offset": [209,218]}, {"entity": "genitori", "offset": [163,173]}, {"entity": "paziente", "offset": [195,106]}, {"entity": "genitore", "offset": [268,276]}]',
                                                           '[{"entity": "mucosa gastrica atrofica", "offset": [30,54]}, {"entity": "flogosi\r\cronica", "offset": [59,75]}]',
                                                           '[{"entity": "Giunge", "offset": [0,6]}, {"entity": "stranguria", "offset": [30,40]}, { "entity": "sintomi", "offset": [61,68]}, {"entity": "stranguria", "offset": [ 30, 40 ]} ]',
                                                           '[{"entity": "ricovero", "offset": [26,34]}, {"entity": "febbre", "offset": [ 39, 45 ]}, {"entity": "stato", "offset": [ 55, 60 ]}, {"entity": "febbre", "offset": [ 39, 45 ]}, {"entity": "39°C", "offset": [47,51]} ]']},
                                'slo': {'questions':[],
                                       'responses':[],
                                       'responses_offset':[]}}
        if len(self.few_shots_dict[self.language]['questions']) < n_shots_inference:
            raise ValueError(f'The number of shots for the inference prompt is greater than the number of examples available.')
        if len(self.few_shots_dict[self.language]['responses']) < n_shots_inference:
            raise ValueError(f'The number of shots for the inference prompt is greater than the number of responses available.')
        self.n_shots_inference = n_shots_inference
    
    def _extract_ground_truth(self, prompt:str) -> str:
        # print('PROMPT: ', prompt)
        end_of_prompt_string = self.preprocessor.special_tokens_instruction['user_end'] + self.preprocessor.special_tokens_instruction['model_start']
        # print('end_of_prompt_string: ', end_of_prompt_string)
        out = prompt.split(end_of_prompt_string, 1)
        out = out[1].strip().replace(self.preprocessor.special_tokens_instruction['model_start'], '').replace(self.preprocessor.special_tokens_instruction['model_end'], '')
        # print('OUT: ', out)
        return {'ground_truth': out}
    
    def _format_prompt_inference(self, input: str, instruction_on_response_format:str, n_shots:int, offset: bool, simplest_prompt:bool, output:str='', list_of_examples: [str]=[], list_of_responses:[str]=[]) -> str:
        """
        Format the input and output into a prompt for the finetuning

        Args:
            task: the task for which the prompt is generated, either 'finetuning' or 'inference'
            input: the input text
            instruction_on_response_format: the instruction on the response format. E.g. "The response must be a list of dictionaries, where each dictionary contains the keys 'text' and 'offset'"
            n_shots: the number of examples to provide as few shot prompting
            offset: whether to require the offset in the response
            tokenizer: the tokenizer to use
            output: the output text
            list_of_examples: the list of examples to provide as few shot prompting
            list_of_responses: the list of responses to provide as few shot prompting

        Returns:
            the formatted prompt
        """
        if output != '':
            raise ValueError("The output must be an empty string when generating prompts for the inference")

        if len(list_of_examples) != len(list_of_responses):
            raise ValueError("The number of examples and responses must be the same")
        if n_shots != len(list_of_examples):
            raise ValueError("The number of examples and shots must be the same")
        if n_shots != len(list_of_responses):
            raise ValueError("The number of responses and shots must be the same")
        
        if simplest_prompt:
            base_prompt = self.preprocessor._simplest_base_prompt_input(input)
        elif not simplest_prompt:
            base_prompt = self.preprocessor._base_prompt_input(input, instruction_on_response_format)

        one_shot_example = self.preprocessor.one_shot_example_no_offset if not offset else self.preprocessor.one_shot_example
            
        prompt = ''
        for shot_example in range(n_shots):
            prompt += one_shot_example.format(
                instruction_on_response_format=instruction_on_response_format, 
                example_query=list_of_examples[shot_example], 
                example_response=list_of_responses[shot_example],
                user_start=self.preprocessor.special_tokens_instruction['user_start'],
                user_end=self.preprocessor.special_tokens_instruction['user_end'],
                model_start=self.preprocessor.special_tokens_instruction['model_start'],
                model_end=self.preprocessor.special_tokens_instruction['model_end'])
        
        bos_token = self.preprocessor.tokenizer.bos_token
        if self.model_type == 'qwen':
            bos_token = ''
        prompt = bos_token + prompt + base_prompt + output 
                            
        return prompt
    
    def _extract_inference_prompt(self, sentence:str, simplest_prompt:bool) -> str:
        if self.preprocessor.offset:
            few_shots_responses = self.few_shots_dict[self.language]['responses_offset']
        else:
            few_shots_responses = self.few_shots_dict[self.language]['responses']
        if self.n_shots_inference == 0:
            list_of_examples = []
            list_of_responses = []
        else:
            list_of_examples = self.few_shots_dict[self.language]['questions'][0:self.n_shots_inference]
            list_of_responses = few_shots_responses[0:self.n_shots_inference]
        inference_prompt = self._format_prompt_inference(input=sentence, 
                                                        instruction_on_response_format=self.preprocessor.instruction_on_response_format,
                                                        offset=self.preprocessor.offset,
                                                        output='',
                                                        n_shots=self.n_shots_inference,
                                                        simplest_prompt=simplest_prompt,
                                                        list_of_examples=list_of_examples,
                                                        list_of_responses=list_of_responses)
        return {'inference_prompt': inference_prompt}
    
    def add_inference_prompt_column(self, simplest_prompt:bool) -> None:
        """
        Add the inferencePrompt and groundTruth columns to the test_data dataframe.
        """
        self.test_data = self.test_data.map(lambda x: self._extract_inference_prompt(x[self.input_sentence_field], simplest_prompt=simplest_prompt))
    
    def add_ground_truth_column(self) -> None:
        """
        Add the groundTruth column to the test_data dataframe.
        """
        self.test_data = self.test_data.map(lambda x: self._extract_ground_truth(x['prompt']))

    def _generate_model_response(self, examples, model, tokenizer, max_new_tokens_factor:float, stopping_criteria=[], temperature:float=1.0) -> str:
        device = "cuda"
        tokenizer.padding_side = "left"
        # if self.model_type == 'qwen':
        #     tokenizer.pad_token = '<unk>' # tokenizer.special_tokens['<extra_0>']
        input_sentences = examples[self.input_sentence_field]
        prompts = examples['inference_prompt']
        input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
        max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * max_new_tokens_factor)
        # if self.preprocessor.model_type == 'gemma':
        #     add_special_tokens = True
        encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
        model_inputs = encodeds.to(device)
        if len(stopping_criteria)>0:
            generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=max_new_tokens,  
                                        pad_token_id=tokenizer.pad_token_id,
                                        temperature = temperature,
                                        stopping_criteria = stopping_criteria
                                        ) # max_new_tokens=max_new_tokens,
        else:
            generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=max_new_tokens,  
                                        pad_token_id=tokenizer.pad_token_id,
                                        temperature = temperature) 
        #print('generated_ids: ', generated_ids)
        generated_ids = generated_ids[:, encodeds.input_ids.shape[1]:]
        decoded = tokenizer.batch_decode(generated_ids)
        # decoded = [self._postprocess_model_output(i) for i in decoded]
        return (decoded)
                
    def add_responses_column(self, model, tokenizer, batch_size:int, max_new_tokens_factor:float, stopping_criteria:list, temperature:float=1.0) -> None:
        """
        Adds a column with the response of the model to the actual query.
        
        params:
        model: the model to use to generate the response
        tokenizer: the tokenizer to use to generate the response
        batch_size: the batch size to use to process the examples. Increasing this makes it faster but requires more GPU. Default is 8.
        max_new_tokens_factor: the factor conotrolling the number of new tokens to generate. This is a factor of the length of the input sentence.
        """
        responses_col = []
        total_rows = len(self.test_data)
        indexes = [i for i in range(len(self.test_data)) if i % batch_size == 0]
        max_index = self.test_data.shape[0]


        with tqdm(total=total_rows, desc="generating responses") as pbar:
            for i, idx in enumerate(indexes[:-1]):
                indici = list(range(idx, indexes[i+1]))
                tmp = self._generate_model_response(self.test_data.select(indici), model, tokenizer, max_new_tokens_factor, stopping_criteria, temperature=temperature)
                responses_col.extend(tmp)
                pbar.update(batch_size)
            indici = list(range(indexes[len(indexes[:-1])], max_index))
            tmp = self._generate_model_response(self.test_data.select(indici), model, tokenizer, max_new_tokens_factor, stopping_criteria, temperature=temperature)
            responses_col.extend(tmp)
            pbar.update(batch_size)

        self.test_data = self.test_data.add_column('model_responses', responses_col)
    
    def _postprocess_model_output_deprecated(self, model_output: str) -> str:
        """
        Postprocess the model output to remove the instruction and return the model response.

        Args:
        model_output (str): the model output as it is returned by the model. The processing of the output is done in the function

        return:
        str: the model response, i.e. the model output without the instruction

        """
        end_of_prompt_string = self.preprocessor.special_tokens_instruction['user_end'] + self.preprocessor.special_tokens_instruction['model_start']
        return model_output.split(end_of_prompt_string, 1)[-1].strip()
    
    

class TestDataProcessSlovenian(TestDataProcessor):
    def __init__(self, test_data: Dataset, preprocessor:DataPreprocessor, n_shots_inference:int, language:str, tokenizer) -> None:
        """
        Initialize the TestDataProcessor class.
        pass to this the same DataPreprocessor used for the training data. This will ensure that the inference prompt is formatted in the same way as the training prompt.
        """
        super().__init__(test_data, preprocessor, n_shots_inference, language, tokenizer)
        self.input_sentence_field = 'sentence'
        

    def _extract_ground_truth(self, prompt:str) -> str:
        # print('PROMPT: ', prompt)
        end_of_prompt_string = self.preprocessor.special_tokens_instruction['user_end'] + self.preprocessor.special_tokens_instruction['model_start']
        # print('end_of_prompt_string: ', end_of_prompt_string)
        out = prompt.split(end_of_prompt_string, 1)
        out = out[1].strip().replace(self.preprocessor.special_tokens_instruction['model_start'], '').replace(self.preprocessor.special_tokens_instruction['model_end'], '')

        if out=='] </s>':
            out='[]'
        # print('OUT: ', out)
        return {'ground_truth': out}

In [30]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

from config import postprocessing_params_llama as postprocessing
from log import llama_7B_NoQuant_1epoch as models_params
adapters_list = generate_ft_adapters_list("llama_7B_NoQuant_1epoch", simplest_prompt=models_params.simplest_prompt)
print(adapters_list)
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']

max_new_tokens_factor_list = postprocessing.max_new_tokens_factor_list
n_shots_inference_list = postprocessing.n_shots_inference_list
layer = models_params.TRAIN_LAYER
language = layer.split('.')[0]

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, add_eos_token=False,
                                         token=LLAMA_TOKEN)
preprocessor = DataPreprocessor(model_checkpoint=models_params.BASE_MODEL_CHECKPOINT, 
                                tokenizer = tokenizer, clen=models_params.clent)
dataset = preprocessor.preprocess_data_one_layer(dataset,
                                                 models_params.instruction_on_response_format)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)




from transformers import StoppingCriteria
class EosListStoppingCriteria(StoppingCriteria):
    def __init__(self, eos_sequence = [518, 29914, 25580, 29962]):
        self.eos_sequence = eos_sequence

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
        print('last_ids:', last_ids)
        eos_cond = 2 in last_ids
        return (self.eos_sequence in last_ids) or eos_cond



print("val_data:", val_data[0:3])

for max_new_tokens_factor in [6]:#max_new_tokens_factor_list
    for n_shots_inference in [0]:#n_shots_inference_list:
        for adapters in tqdm(adapters_list[:1], desc="adapters_list"):
            if adapters.endswith("0.0008"):
                continue
            print("PROCESSING:", adapters, "n_shots_inference:", n_shots_inference, "max_new_tokens_factor:", max_new_tokens_factor)
            if False:#not models_params.quantization:
                print("NO QUANTIZATION")
                base_model = AutoModelForCausalLM.from_pretrained(
                    models_params.BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
                    return_dict=True,  
                    torch_dtype=postprocessing.torch_dtype,
                    device_map= "auto",
                    token=LLAMA_TOKEN)    
            else:
                print("QUANTIZATION")
                # load_in_8bit = not models_params.load_in_4bit[0]
                # load_in_4bit = models_params.load_in_4bit[0]
                # load_in_8bit = not load_in_4bit
                bnb_4bit_use_double_quant = models_params.bnb_4bit_use_double_quant
                bnb_4bit_quant_type = models_params.bnb_4bit_quant_type[0]
                bnb_4bit_compute_dtype = models_params.bnb_4bit_compute_dtype[0]
                # llm_int8_threshold = models_params.llm_int8_threshold[0]
                # llm_int8_has_fp16_weight = models_params.llm_int8_has_fp16_weight AVOID IT AT INFERENCE TIME!
                # llm_int8_skip_modules = models_params.llm_int8_skip_modules AVOID IT AT INFERENCE TIME!

                bnb_config = BitsAndBytesConfig(
                            load_in_4bit=True,
                            # load_in_8bit=load_in_8bit,
                            bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
                            bnb_4bit_quant_type=bnb_4bit_quant_type,
                            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
                            # llm_int8_threshold=llm_int8_threshold ,
                            # llm_int8_has_fp16_weight =True #,AVOID IT AT INFERENCE TIME!
                            # llm_int8_skip_modules=llm_int8_skip_modules AVOID IT AT INFERENCE TIME!
                            )
                base_model = AutoModelForCausalLM.from_pretrained(
                    models_params.BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
                    quantization_config = bnb_config,
                    return_dict=True,  
                    device_map= "auto",
                    cache_dir ='/data/disk1/share/pferrazzi/.cache',
                    token=LLAMA_TOKEN)
            merged_model = PeftModel.from_pretrained(base_model, 
                                                     adapters, 
                                                     token=HF_TOKEN, 
                                                     device_map='auto',
                                                     is_trainable = False)
            tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, 
                                                      add_eos_token=False,
                                                      token=LLAMA_TOKEN)
            tokenizer.pad_token = tokenizer.eos_token# "<pad>" #tokenizer.eos_token
            tokenizer.padding_side = "left"
#            tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, add_eos_token=True, token=LLAMA_TOKEN)
#            tokenizer.add_special_tokens({"pad_token":"<pad>"})
#            merged_model.resize_token_embeddings(len(tokenizer))
#            print('tokenizer.pad_token_id:', tokenizer.pad_token_id)
#            merged_model.config.pad_token_id = tokenizer.pad_token_id

            postprocessor = TestDataProcessor(test_data=val_data.select(range(8)), 
                                              preprocessor=preprocessor, 
                                              n_shots_inference=n_shots_inference, 
                                              language=language, 
                                              tokenizer=tokenizer)
            postprocessor.add_inference_prompt_column(simplest_prompt=False)

            # tmp = []
            # for example in postprocessor.test_data:
            #     tmp.append(example)
            # import pandas as pd
            # tmp = pd.DataFrame(tmp)
            # tmp = tmp.iloc[tmp['inference_prompt'].str.len().argsort()]
            # postprocessor.test_data = Dataset.from_pandas(tmp)

            postprocessor.add_ground_truth_column()
            #try:
            postprocessor.add_responses_column(model=merged_model, 
                                            tokenizer=tokenizer, 
                                            batch_size=4, 
                                            max_new_tokens_factor=max_new_tokens_factor,
                                            stopping_criteria = [EosListStoppingCriteria()],
                                            temperature=1)
            # postprocessor.test_data.to_csv(f"{postprocessing.save_directory}maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_{adapters.split('/')[1]}.csv", index=False)
            # except RuntimeError as e:
                # print("ERROR IN PROCESSING: ", e, adapters)
                # print(e.message)
            del merged_model
            if models_params.quantization: del base_model
            del tokenizer
            gc.collect()
            torch.cuda.empty_cache()





['ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_16_16_0.02_1_0.0002_clent', 'ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_16_32_0.02_1_0.0002_clent', 'ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_32_16_0.02_1_0.0002_clent', 'ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_32_32_0.02_1_0.0002_clent', 'ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_64_16_0.02_1_0.0002_clent', 'ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_64_32_0.02_1_0.0002_clent']
MODEL TYPE: llama
val_data: {'sentence': ['A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia.', 'Hormonal study and dynamic biochemical tests performed indicated ECS.', 'Imaging and cytological findings pointed toward a l

adapters_list:   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING: ferrazzipietro/llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_16_16_0.02_1_0.0002_clent n_shots_inference: 0 max_new_tokens_factor: 6
QUANTIZATION


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



last_ids: [[29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871]]
last_ids: [[25580, 29962, 29871, 518], [25580, 29962, 29871, 518], [25580, 29962, 29871, 518], [25580, 29962, 29871, 518]]
last_ids: [[29962, 29871, 518, 6377], [29962, 29871, 518, 6377], [29962, 29871, 518, 6377], [29962, 29871, 518, 6377]]
last_ids: [[29871, 518, 6377, 10041], [29871, 518, 6377, 10041], [29871, 518, 6377, 10041], [29871, 518, 6377, 10041]]
last_ids: [[518, 6377, 10041, 1115], [518, 6377, 10041, 1115], [518, 6377, 10041, 1115], [518, 6377, 10041, 1115]]
last_ids: [[6377, 10041, 1115, 5124], [6377, 10041, 1115, 5124], [6377, 10041, 1115, 376], [6377, 10041, 1115, 5124]]
last_ids: [[10041, 1115, 5124, 6525], [10041, 1115, 5124, 6525], [10041, 1115, 376, 16072], [10041, 1115, 5124, 6525]]
last_ids: [[1115, 5124, 6525, 2], [1115, 5124, 6525, 2], [1115, 376, 16072, 1492], [1115, 5124, 6525, 2]]
last_ids: [[5124, 6525, 2, 2], [5124, 6525, 2, 2],



last_ids: [[2, 2, 2, 2], [2, 2, 2, 2], [29908, 6525, 29871, 2], [2, 2, 2, 2]]
last_ids: [[29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871], [29914, 25580, 29962, 29871]]
last_ids: [[25580, 29962, 29871, 518], [25580, 29962, 29871, 518], [25580, 29962, 29871, 518], [25580, 29962, 29871, 518]]
last_ids: [[29962, 29871, 518, 6377], [29962, 29871, 518, 6377], [29962, 29871, 518, 6377], [29962, 29871, 518, 6377]]
last_ids: [[29871, 518, 6377, 10041], [29871, 518, 6377, 10041], [29871, 518, 6377, 10041], [29871, 518, 6377, 10041]]
last_ids: [[518, 6377, 10041, 1115], [518, 6377, 10041, 1115], [518, 6377, 10041, 1115], [518, 6377, 10041, 1115]]
last_ids: [[6377, 10041, 1115, 5124], [6377, 10041, 1115, 5124], [6377, 10041, 1115, 376], [6377, 10041, 1115, 376]]
last_ids: [[10041, 1115, 5124, 6525], [10041, 1115, 5124, 6525], [10041, 1115, 376, 29924], [10041, 1115, 376, 25379]]
last_ids: [[1115, 5124, 6525, 2], [1115, 5124, 6525, 2], [1115, 376, 29924, 931

generating responses: 100%|██████████| 8/8 [00:39<00:00,  4.95s/it]

last_ids: [[2, 2, 2, 2], [2, 2, 2, 2], [29908, 6525, 29871, 2], [6525, 29871, 2, 2]]



adapters_list: 100%|██████████| 1/1 [00:45<00:00, 45.08s/it]


In [16]:
tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, 
                                                      add_eos_token=False,
                                                      token=LLAMA_TOKEN)
tokenizer.decode([10041, 1115, 5124, 6525])

'entity": ""}]'

In [31]:
postprocessor.test_data['ground_truth'], postprocessor.test_data['model_responses'] 

(['[{"entity": "new-onset diabetes mellitus"}] </s>',
  '[{{"entity": ""}}]</s>',
  '[{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}] </s>',
  '[{"entity": "ACC"}] </s>',
  '[{"entity": "hypercortisolism"}] </s>',
  '[{"entity": "cervical mass"}] </s>',
  '[{"entity": "MNG"}] </s>',
  '[{"entity": "mass"}, {"entity": "cervical compression"}, {"entity": "respiratory, digestive, laryngeal, vascular or neurologic signs"}, {"entity": "digestive, laryngeal, vascular or neurologic signs"}, {"entity": "laryngeal, vascular or neurologic signs"}, {"entity": "vascular or neurologic signs"}, {"entity": "neurologic signs"}] </s>'],
 [' [{"entity": ""}]</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
  ' [{"entity": ""}]</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
  ' [{"entity": "primary right parotid malignancy with liver metastases"}] </s>',
  ' [{"entity": ""}]</s></s></s></s></s></s></s></s></s></s></s></s></s></s>

In [26]:
postprocessor.test_data['ground_truth'], postprocessor.test_data['model_responses'] 

(['[{"entity": "new-onset diabetes mellitus"}] </s>',
  '[{{"entity": ""}}]</s>',
  '[{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}] </s>',
  '[{"entity": "ACC"}] </s>'],
 [' [{"entity": "hypertension"}, {"entity',
  ' [{"entity": "ECS"}] </s>',
  ' [{"entity": "primary right parotid m',
  ' [{"entity": "ACC"}] </s>'])

In [35]:
postprocessor.test_data['ground_truth'], postprocessor.test_data['model_responses'] 

(['[{"entity": "new-onset diabetes mellitus"}] </s>',
  '[{{"entity": ""}}]</s>',
  '[{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}] </s>',
  '[{"entity": "ACC"}] </s>'],
 [' [{"entity": "hypertension"}, {"entity": "dyslipidemia"}, {"entity": "diabetes mellitus"}]  [{"entity": "hypokalemia"}]  [{"entity": "diabetes mellitus"}]  [{"entity": "hypokalemia"}]  [{"entity": "diabetes',
  ' [{"entity": "ECS"}] </s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
  ' [{"entity": "primary right parotid malignancy"}]  [{"entity": "liver metastases"}]  [{"entity": "malignancy"}]  [{"entity": "metastases"}]</s>',
  ' The result is: [{"entity": "ACC"}] </s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><

In [38]:
postprocessor.test_data['ground_truth'], postprocessor.test_data['model_responses'] 

(['[{"entity": "new-onset diabetes mellitus"}] </s>',
  '[{{"entity": ""}}]</s>',
  '[{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}] </s>',
  '[{"entity": "ACC"}] </s>'],
 [' [{"entity": "hypertension"}, {"entity": "dyslipidemia"}]  [{"entity":',
  ' The result of the test indicated ECS.\n\nReturned result: [{"entity": "ECS"}] </s>',
  ' [{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}] </s>',
  ' [{"entity": ""}]  return [{"entity": ""}]  return [{"entity": "ACC"}]  return'])

In [2]:
f"{postprocessing.save_directory}maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_{adapters.split('/')[1]}.csv"

'data/llama/7B_NoQuant_FT_cl_v2prompt/maxNewTokensFactor6_nShotsInference0_llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_64_64_0.01_1_0.0002_clent.csv'

### MISTRAL INSTRUCT simplest_prompt

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config import postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
adapters = 'ferrazzipietro/Mistral-7B-v0.1_simplest_prompt_adapters_en.layer1_4_torch.bfloat16_32_32_0.01_4_0.0002'
model_checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"

layer = 'en.layer1'
language = layer.split('.')[0]


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint=model_checkpoint, 
                                tokenizer = model_checkpoint)
dataset = preprocessor.preprocess_data_one_layer(dataset,
                                                 instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].',
                                                 simplest_prompt=True)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            #load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_has_fp16_weight = False,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, low_cpu_mem_usage=True,
    quantization_config = bnb_config,
    return_dict=True, 
    #torch_dtype=torch.float16,
    device_map= "auto")
merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_eos_token=False)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [6]:
postprocessor = TestDataProcessor(test_data=val_data,
                                  preprocessor=preprocessor, 
                                  n_shots_inference=0, 
                                  language=language, 
                                  tokenizer=tokenizer)
postprocessor.add_inference_prompt_column()
postprocessor.add_ground_truth_column()
#try:
postprocessor.add_responses_column(model=merged_model, 
                                tokenizer=tokenizer, 
                                batch_size=32, 
                                max_new_tokens_factor=2)
postprocessor.test_data.to_csv(f"data/test_data_processed/maxNewTokensFactor{6}_nShotsInference{0}_{adapters.split('/')[1]}.csv", index=False)
# except Exception as e:
#     print("ERROR IN PROCESSING: ", Exception, adapters)
del merged_model
del base_model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

generating responses:   0%|          | 0/681 [00:00<?, ?it/s]

generating responses: 704it [06:34,  1.78it/s]                         
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.74ba/s]


In [7]:
postprocessor.test_data.to_csv(f"data/test_data_processed/maxNewTokensFactor{6}_nShotsInference{0}_{adapters.split('/')[1]}.csv", index=False)


Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.10ba/s]


2928976

In [34]:
postprocessor.test_data['model_responses']

['[{"entity": "46-year-old man"}, {"entity": "hypertension"}, {"entity": "dysl',
 '[{"entity": "study"}, {"entity": "tests"}, {"entity": "indicated"}, {"entity": "ECS"',
 '[{"entity": "findings"}, {"entity": "malignancy"}, {"entity": "right parotid malignancy"},',
 '[{"entity": "parotidectomy"}, {"entity": "examination"}, {"entity": "confirmed"}, {"entity":',
 '[{"entity": "hypercortisolism"}, {"entity": " managed"}, {"entity": "metyrapone"}, {"',
 '[{"entity": "50-years-old woman"}, {"entity": "hypertensive"}, {"entity": "hospital',
 '[{"entity": "MNG"}, {"entity": "her mother"}, {"entity": "sisters"}, {"entity": "c',
 '[{"entity": "signs of cervical compression"}, {"entity": "respiratory signs"}, {"entity": "digest',
 '[{"entity": "thyroid dysfunction"}]</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
 '[{"entity": "mass"}, {"entity": "took"}, {"entity": "the front and the two sides of the neck"',
 '[{"entity": "surface"}, {"entity": "embossed"}, {"entity": "cov

In [28]:
postprocessor.test_data['model_responses']  

['[{"entity": "diagnosed"}, {"entity": "unveiled"}, {"entity": "referred"}, {"entity": "',
 '[{"entity": "study"}, {"entity": "tests"}, {"entity": "indicated"}, {"entity": "ECS"',
 '[{"entity": "findings"}, {"entity": "cavity"}, {"entity": "malignancy"}, {"entity": "met',
 '[{"entity": "parotidectomy"}, {"entity": "examination"}, {"entity": "confirmed"}, {"entity":',
 '][{"entity": "hypercortisolism"}, {"entity": "managed"}] ---------- ------------ The entities contained in the',
 '[{"entity": "hypertensive"}, {"entity": "hospitalized"}, {"entity": "mass"}, {"entity": "appe',
 '][{"entity": "history"}, {"entity": "surgery"}, {"entity": "MNG"}, {"entity": "her mother"},',
 '][{"entity": "compression"}, {"entity": "signs"}, {"entity": "cervical compression"}, {"entity": "',
 '[{"entity": "dysfunction"}, {"entity": "thyroid dysfunction"}] \n[{"entity": "she"',
 '[{"entity": "The mass"}, {"entity": "the neck"}] ][]][//][{"entity": "The',
 '[{"entity": "covered"}, {"entity": "surface"}, {"e

In [17]:
postprocessor.test_data['model_responses']  

['1. diagnosed: diagnose\n 2. mellitus: diabetes mellitus\n 3. hypertension',
 '{"Text": "study", "study": "Hormonal study"}][{"Entity": "study"}, {"Entity":',
 '1. findings\n 2. pointed\n 3. malignancy\n 4. metastases\n 5. liver',
 'Entities: "parotidectomy", "examination", "confirmed", "ACC", "The patient"\n Types:',
 'Entities: hypercortisolism, managed, metyrapone, ketoconazole, lanreotide,',
 '1. hypertensive \n2. hospitalized\n3. cervical mass\n4. appeared\n5. a',
 '{"family history", "surgery", "MNG", "mother", "sisters", "cousins"} ]]]>',
 '1. signs of cervical compression\n 2. signs of respiratory compression\n 3. signs of digestive compression',
 "{'She': 'PERSON'} [{'thyroid dysfunction': 'disorder'}]</s></s></s></s></s></s></s></s></s>",
 '{"The mass"} ]</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
 '{"surface": "embossed and covered by a thin normal skin"}][{"surface": "covered by a thin normal skin"}, {"',
 'Entities: {"Som

### ZEFIRO

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
model_checkpoint = "mii-community/zefiro-7b-base-ITA" 

layer = 'it.layer1'
language = layer.split('.')[0]

adapters = 'ferrazzipietro/zefiro-7b-base-ITA_adapters_it.layer1'
dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint=model_checkpoint, 
                                tokenizer = model_checkpoint)
dataset = preprocessor.preprocess_data_one_layer(dataset,
                                                 instruction_on_response_format='Estrai le entità contenute nel testo.\nRiporta i risultati in formato json: [{"entity":"nome_entità"}].',
                                                 simplest_prompt=True)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            #load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_has_fp16_weight = False,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

base_model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, low_cpu_mem_usage=True,
    quantization_config = bnb_config,
    return_dict=True, 
    #torch_dtype=torch.float16,
    device_map= "auto",
    cache_dir='/data/disk1/share/pferrazzi/.cache')


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.17it/s]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [2]:
merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_eos_token=False)
# tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"

In [9]:
postprocessor = TestDataProcessor(test_data=val_data.select(range(6)),
                                  preprocessor=preprocessor, 
                                  n_shots_inference=0, 
                                  language=language, 
                                  tokenizer=tokenizer)
postprocessor.add_inference_prompt_column(simplest_prompt=False)
postprocessor.add_ground_truth_column()
#try:
postprocessor.add_responses_column(model=merged_model, 
                                tokenizer=tokenizer, 
                                batch_size=3, 
                                max_new_tokens_factor=4)
#postprocessor.test_data.to_csv(f"data/TMPPmaxNewTokensFactor{4}_nShotsInference{0}_{adapters.split('/')[1]}.csv", index=False)
# except Exception as e:
#     print("ERROR IN PROCESSING: ", Exception, adapters)

Map: 100%|██████████| 6/6 [00:00<00:00, 965.72 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 1021.71 examples/s]
generating responses:   0%|          | 0/6 [00:00<?, ?it/s]

generating responses: 100%|██████████| 6/6 [00:41<00:00,  6.86s/it]


In [10]:
postprocessor.test_data[0]

{'sentence': 'Il caso riguarda un ragazzo di 12 anni, ricoverato presso l’UOC di Chirurgia Pediatrica di Treviso per addome acuto.',
 'entities': [{'id': '5347',
   'offsets': [103, 115],
   'role': '',
   'semantic_type_id': 'C0000727',
   'text': 'addome acuto',
   'type': 'CLINENTITY'}],
 'original_text': 'Il caso riguarda un ragazzo di 12 anni, ricoverato presso l’UOC di Chirurgia Pediatrica di Treviso per addome acuto. Il ragazzo manifestava da circa una settimana vomiti ripetuti accompagnati da coliche addominali, inappetenza e vistoso calo ponderale (4 kg circa in una settimana). Al ricovero il paziente si presentava molto sofferente, astenico, disidratato, apiretico, con addome globoso, trattabile ma dolente alla palpazione profonda elettivamente in fossa iliaca destra; all’ascoltazione si percepiva una peristalsi metallica. Un’ecografia eseguita in pronto soccorso poneva la diagnosi di una peritonite da verosimile appendicite acuta complicata. Il ragazzo era quindi sottoposto 

In [22]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat", add_eos_token=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen1.5-7B-Chat', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [23]:
tokenizer.decode([151646])

''

In [12]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from utils.output_cleaner import OutputCleaner
from peft import PeftModel

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [6]
n_shots_inference_list = [0]
layer = 'en.layer1'
language = layer.split('.')[0]

BASE_MODEL_CHECKPOINT = 'Qwen/Qwen1.5-7B-Chat'
dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(BASE_MODEL_CHECKPOINT, BASE_MODEL_CHECKPOINT)
dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)



bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            #load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
    quantization_config = bnb_config,
    return_dict=True,
    device_map= 'auto',
    cache_dir='/data/disk1/share/pferrazzi/.cache')


adapters = 'ferrazzipietro/qwen1.5-7b-chat__adapters_en.layer1_8_torch.bfloat16_16_32_0.01_2_0.0002'
merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
#tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 1520/1520 [00:00<00:00, 5638.07 examples/s]
Map: 100%|██████████| 170/170 [00:00<00:00, 7571.75 examples/s]
Loading checkpoint shards: 100%|██████████| 4/4 [02:02<00:00, 30.67s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
adapter_config.json: 100%|██████████| 538/538 [00:00<00:00, 2.83MB/s]
adapter_model.safetensors: 100%|██████████| 99.7M/99.7M [00:02<00:00, 39.9MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
tokenizer.add_special_tokens({"pad_token":"<pad>"})
#model.resize_token_embeddings(len(tokenizer))
print('tokenizer.pad_token_id:', tokenizer.pad_token_id)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.embed_tokens = nn.Embedding(model.config.vocab_size, model.config.hidden_size, model.config.padding_idx)
# tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
tokenizer.decode([151646])

tokenizer.pad_token_id: 151646


'<pad>'

In [18]:
import transformers
from typing import Dict
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    
    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg

smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token="<pad>"),
        tokenizer=tokenizer,
        model=merged_model,
    )

In [6]:
import pandas as pd 
d = pd.read_csv('data/TMP_maxNewTokensFactor6_nShotsInference0_llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0002.csv')
for i, ex in d.iterrows():
    print(ex['model_responses'])

 [{"entity": "hypertension"}, {"entity": "dyslipidemia"}, {"entity": "diabetes mellitus"}, {"entity": "hypokalemia"}, {"entity": "A 46-year-old man"}, {"entity": "1-month"}, {"entity": "4-months"}] </s>
 [{"entity": "study"}, {"entity": "tests"}, {"entity": "ECS"}, {"entity": "ECS"}, {"entity": "indicated"}] </s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
 [{"entity": "findings"}, {"entity": "parotid"}, {"entity": "malignancy"}, {"entity": "metastases"}, {"entity": "parotid"}, {"entity": "liver"}] </s>
 [{"entity": "parotidectomy"}, {"entity": "examination"}, {"entity": "confirmed"}, {"entity": "ACC"}, {"entity": "The patient"}] </s></s></s></s></s></s>


In [25]:
l = ["/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_64_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_64_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_64_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_64_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_64_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_64_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.01_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_64_32_0.05_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.01_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_2_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_8_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_8_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0002",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_2_0.0008",
    "/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_2_0.0002"]
ll = []
for el in l:
    if el.endswith('2') and '_16_' in el:
        ll.append(el)
len(set(l))
print(len(ll))
ll

9


['/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_8_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_4_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_64_0.01_2_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_8_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_4_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_2_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_8_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0002',
 '/ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_2_0.0002']

In [8]:
sorted_df = d.iloc[d['model_responses'].str.len().argsort()]
for i, ex in sorted_df.iterrows():
    print(ex['model_responses'])

 [{"entity": "findings"}, {"entity": "parotid"}, {"entity": "malignancy"}, {"entity": "metastases"}, {"entity": "parotid"}, {"entity": "liver"}] </s>
 [{"entity": "parotidectomy"}, {"entity": "examination"}, {"entity": "confirmed"}, {"entity": "ACC"}, {"entity": "The patient"}] </s></s></s></s></s></s>
 [{"entity": "hypertension"}, {"entity": "dyslipidemia"}, {"entity": "diabetes mellitus"}, {"entity": "hypokalemia"}, {"entity": "A 46-year-old man"}, {"entity": "1-month"}, {"entity": "4-months"}] </s>
 [{"entity": "study"}, {"entity": "tests"}, {"entity": "ECS"}, {"entity": "ECS"}, {"entity": "indicated"}] </s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>


### ZEFIRO 8bit

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config.finetuning import config
from config import postprocessing_params_mistral as postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

from config.finetuning_zefiro import model_loading_params as models_params
adapters = "ferrazzipietro/zefiro-7b-base-ITA__adapters_it.layer1_8_torch.bfloat16_32_64_0.01_2_0.0002" # "ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_NoQuant_torch.bfloat16_16_32_0.01_2_0.0002" # "ferrazzipietro/Mistral-7B-Instruct-v0.2__adapters_en.layer1_NoQuant_torch.bfloat16_64_32_0.01_8_0.0002"
print(adapters)
BASE_MODEL_CHECKPOINT = "mii-community/zefiro-7b-base-ITA"#"Qwen/Qwen1.5-7B-Chat"  # "meta-llama/Llama-2-7b-chat-hf"  # 'mistralai/Mistral-7B-Instruct-v0.2'
layer = 'it.layer1' # 'en.layer1'
quantization  = True


HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor = 6
n_shots_inference = 0
language = layer.split('.')[0]



dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint=BASE_MODEL_CHECKPOINT, 
                                tokenizer =BASE_MODEL_CHECKPOINT)
dataset = preprocessor.preprocess_data_one_layer(dataset,
                                                 simplest_prompt=False,
                                                 instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

if not quantization:
    print("NO QUANTIZATION")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
        return_dict=True,  
        torch_dtype=postprocessing.torch_dtype,
        device_map= "auto")    
else:
    print("QUANTIZATION")
    load_in_8bit = not models_params.load_in_4bit[0]
    bnb_config = BitsAndBytesConfig(
                load_in_4bit = False,# models_params.load_in_4bit[0],
                load_in_8bit = True,# load_in_8bit,
                # bnb_4bit_use_double_quant = models_params.bnb_4bit_use_double_quant,
                # bnb_4bit_quant_type = models_params.bnb_4bit_quant_type[0],
                # bnb_4bit_compute_dtype = models_params.bnb_4bit_compute_dtype[0],
                llm_int8_threshold = models_params.llm_int8_threshold[0],
                llm_int8_has_fp16_weight = False # models_params.llm_int8_has_fp16_weight,
                # llm_int8_skip_modules = # models_params.llm_int8_skip_modules
                )
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
        quantization_config = bnb_config,
        return_dict=True,  
        #torch_dtype=torch.float16,
        device_map= "auto",
        cache_dir='/data/disk1/share/pferrazzi/.cache'
        )
# merged_model = PeftModel.from_pretrained(base_model, adapters, 
#                                          token=HF_TOKEN, 
#                                          device_map='auto',
#                                          is_trainable = False)

  from .autonotebook import tqdm as notebook_tqdm


ferrazzipietro/zefiro-7b-base-ITA__adapters_it.layer1_8_torch.bfloat16_32_64_0.01_2_0.0002
QUANTIZATION


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.20it/s]


In [2]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT, add_eos_token=True)
#tokenizer.pad_token = "<unk>"
tokenizer.padding_side = "left"

input = """<s><|user|> Estrai le entità contenute nel testo.
Riporta i risultati in formato json: [{""entity"":""nome_entità""}]. <<Il caso riguarda un ragazzo di 12 anni, ricoverato presso l’UOC di Chirurgia Pediatrica di Treviso per addome acuto.>>> </s><|assistant|>"""

In [3]:
base_model.get_memory_footprint()

8041021440

In [4]:
encodeds = tokenizer(input, return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to('cuda')
generated_ids = base_model.generate(**model_inputs, do_sample=True, max_new_tokens=100,  
                                       pad_token_id=tokenizer.pad_token_id,
                                       temperature = 1.0) # max_new_tokens=max_new_tokens,
decoded = tokenizer.batch_decode(generated_ids)

In [2]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from config import postprocessing_params_mistral as postprocessing
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc
from peft import PeftModel
from tqdm import tqdm
import torch

from config.finetuning_llama2 import model_loading_params as models_params
adapters = "ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.01_4_0.0002"
BASE_MODEL_CHECKPOINT = "meta-llama/Llama-2-7b-chat-hf"#"mii-community/zefiro-7b-base-ITA"#"Qwen/Qwen1.5-7B-Chat"  # "meta-llama/Llama-2-7b-chat-hf"  # 'mistralai/Mistral-7B-Instruct-v0.2'
layer = 'en.layer1' # 'en.layer1'
quantization  = True


HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']

max_new_tokens_factor = 6
n_shots_inference = 0
language = layer.split('.')[0]

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint=BASE_MODEL_CHECKPOINT, 
                                tokenizer =BASE_MODEL_CHECKPOINT,
                                token_llama = LLAMA_TOKEN,)
dataset = preprocessor.preprocess_data_one_layer(dataset,
                                                 simplest_prompt=False,
                                                 instruction_on_response_format='Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

if not quantization:
    print("NO QUANTIZATION")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
        return_dict=True,  
        torch_dtype=postprocessing.torch_dtype,
        device_map= "auto",
        cache_dir='/data/disk1/share/pferrazzi/.cache')    
else:
    print("QUANTIZATION")
    load_in_8bit = not models_params.load_in_4bit[0]
    bnb_config = BitsAndBytesConfig(
                load_in_4bit = False,# models_params.load_in_4bit[0],
                load_in_8bit = True,# load_in_8bit,
                # bnb_4bit_use_double_quant = models_params.bnb_4bit_use_double_quant,
                # bnb_4bit_quant_type = models_params.bnb_4bit_quant_type[0],
                # bnb_4bit_compute_dtype = models_params.bnb_4bit_compute_dtype[0],
                llm_int8_threshold = models_params.llm_int8_threshold[0],
                # llm_int8_has_fp16_weight = False # models_params.llm_int8_has_fp16_weight,
                # llm_int8_skip_modules = models_params.llm_int8_skip_modules
                )
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
        quantization_config = bnb_config,
        return_dict=True,  
        # torch_dtype=torch.bf16,
        device_map= "auto",
        cache_dir='/data/disk1/share/pferrazzi/.cache',
        token = LLAMA_TOKEN
        )
merged_model = PeftModel.from_pretrained(base_model, adapters, 
                                         token=HF_TOKEN, 
                                         device_map='auto',
                                         is_trainable = False)
# merged_model = base_model.load_adapter(adapters)
# merged_model.enable_adapters()
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT, 
                                          add_eos_token=False,
                                          token = LLAMA_TOKEN)
#tokenizer.pad_token = "<unk>"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


  from .autonotebook import tqdm as notebook_tqdm


QUANTIZATION


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
adapter_config.json: 100%|██████████| 547/547 [00:00<00:00, 2.59MB/s]
adapter_model.safetensors: 100%|██████████| 98.1M/98.1M [00:02<00:00, 37.8MB/s]


In [3]:
postprocessor = TestDataProcessor(test_data=val_data.select(range(60)), 
                                  preprocessor=preprocessor, 
                                  n_shots_inference=n_shots_inference, 
                                  language=language, 
                                  tokenizer=tokenizer)
postprocessor.add_inference_prompt_column(simplest_prompt=False)
postprocessor.add_ground_truth_column()
#try:
postprocessor.add_responses_column(model=merged_model, 
                                        tokenizer=tokenizer, 
                                        batch_size=1, 
                                        max_new_tokens_factor=max_new_tokens_factor)
postprocessor.test_data.to_csv(f"data/TMP_maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_{adapters.split('/')[1]}.csv", index=False)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map: 100%|██████████| 60/60 [00:00<00:00, 4998.28 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 6228.86 examples/s]
generating responses:  68%|██████▊   | 41/60 [08:24<03:53, 12.30s/it]


KeyboardInterrupt: 

In [7]:
def check_nan_parameters(merged_model):
    nan_found = False
    for name, param in merged_model.named_parameters():
        if torch.isnan(param).any():
            print(f'Parameter {name} has NaN values!')
            nan_found = True
    if not nan_found:
        print('No parameters with NaN values found.')

check_nan_parameters(merged_model)

No parameters with NaN values found.


In [1]:
for name, param in merged_model.named_parameters():
    print(f'Parameter {name}')
    #display(param)

NameError: name 'merged_model' is not defined