In [1]:
# pip install bitsandbytes accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from dotenv import dotenv_values
import torch
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from config import base_model
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = base_model.max_new_tokens_factor_list
n_shots_inference_list = base_model.n_shots_inference_list
layer = base_model.TRAIN_LAYER
language = layer.split('.')[0]
save_directory = base_model.save_directory 


# quantization_config = BitsAndBytesConfig(load_in_4bit=True,
#                                          bnb_4bit_compute_type=torch.bfloat16,)
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # load_in_8bit=True,
            # bnb_4bit_use_double_quant=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=HF_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
            "google/gemma-7b-it", low_cpu_mem_usage=True,
            quantization_config = quantization_config,
            # return_dict=True, 
            #torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN)


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]

preprocessor = DataPreprocessor(model_checkpoint="google/gemma-7b-it", tokenizer=tokenizer)
instruction_on_response_format=' Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'
dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format=instruction_on_response_format)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

postprocessor = TestDataProcessor(test_data=val_data.select(range(24)), 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=0, 
                                          language=language, 
                                          tokenizer=tokenizer)
postprocessor.add_inference_prompt_column(simplest_prompt=False)
postprocessor.add_ground_truth_column()
print('TRY: ', f"{save_directory}/maxNewTokensFactor{8}_nShotsInference{0}_BaseModel.csv")
sorted_data = postprocessor.test_data.to_pandas().sort_values(by='inference_prompt', key=lambda x: x.str.len())
postprocessor.test_data = dataset.from_pandas(sorted_data)




  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.26s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


TRY:  data/llama/13B_8bit_base/maxNewTokensFactor8_nShotsInference0_BaseModel.csv


In [2]:
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.unk_token

In [3]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>user  Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model'

In [4]:
input_text = ['<bos><start_of_turn>user Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model',
              "<bos><start_of_turn>Extract the entities contained in this text: We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.  <end_of_turn> <start_of_turn>model"]

# input_ids = tokenizer.encode(input_text, return_tensors="pt", padding=True).to("cuda")

# outputs = model.generate(input_ids, max_new_tokens=10)
# print(tokenizer.batch_decode(outputs))

encodeds = tokenizer.encode(input_text[0], return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to('cuda')
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20,  pad_token_id=tokenizer.eos_token_id) # max_new_tokens=max_new_tokens,
decoded = tokenizer.batch_decode(generated_ids)
print(decoded)




['<bos><start_of_turn>user Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model```\n[{"entity":"Thyroid dysfunction"}, {"entity":"She"}, {"entity":"Thyroid']


In [25]:
def _generate_model_response(examples, model, tokenizer, max_new_tokens_factor:float) -> str:
    device = "cuda"
    tokenizer.padding_side = "left"
    input_sentences = examples['sentence']
    prompts = examples['inference_prompt']
    input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
    print(prompts)
    max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * max_new_tokens_factor)
    # if self.preprocessor.model_type == 'gemma':
    #     add_special_tokens = True
    encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
    model_inputs = encodeds.to(device)
    generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=max_new_tokens,  pad_token_id=tokenizer.eos_token_id) # max_new_tokens=max_new_tokens,
    decoded = tokenizer.batch_decode(generated_ids)
    #decoded = [self._postprocess_model_output(i) for i in decoded]
    return (decoded)

_generate_model_response(postprocessor.test_data.select(range(4)), model, tokenizer, 4.0)



['<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<The incision performed was a Kocher cervicotomy.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<Its 

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [12]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>userExtract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model'

In [5]:
postprocessor.add_responses_column(model=model, 
                                tokenizer=tokenizer, 
                                batch_size=1, 
                                max_new_tokens_factor=8)

generating responses:   0%|          | 0/24 [00:01<?, ?it/s]


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [17]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia.>>> <end_of_turn><start_of_turn>model'

### QWEN 7B 4bit

In [6]:
from dotenv import dotenv_values
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [2]
n_shots_inference_list = [0,2]
layer = 'en.layer1'
language = layer.split('.')[0]
save_directory = 'data/qwen'

# dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
# dataset = dataset[layer]
# 
# bnb_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             # load_in_8bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.bfloat16,
#             # llm_int8_threshold= 6.0,
#             # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
#             )

# model = AutoModelForCausalLM.from_pretrained(
#             "Qwen/Qwen1.5-14B-Chat", low_cpu_mem_usage=True,
#             quantization_config = bnb_config,
#             return_dict=True, 
#             #torch_dtype=torch.float16,
#             device_map= "auto",
#             token=HF_TOKEN,
#             cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-14B-Chat", add_eos_token=True, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 334kB/s]
vocab.json: 100%|██████████| 2.78M/2.78M [00:00<00:00, 4.52MB/s]
merges.txt: 100%|██████████| 1.67M/1.67M [00:00<00:00, 3.31MB/s]
tokenizer.json: 100%|██████████| 7.03M/7.03M [00:00<00:00, 28.2MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
preprocessor = DataPreprocessor(model_checkpoint="Qwen/Qwen1.5-14B-Chat", 
                                tokenizer="Qwen/Qwen1.5-14B-Chat")
instruction_on_response_format='Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                 instruction_on_response_format=instruction_on_response_format,
                                                 simplest_prompt=False)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 1520/1520 [00:00<00:00, 3159.28 examples/s]
Map: 100%|██████████| 170/170 [00:00<00:00, 6336.26 examples/s]


In [17]:
max_new_tokens_factor_list = [6]
n_shots_inference_list = [0,2]
for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        
        # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
        postprocessor = TestDataProcessor(test_data=val_data, 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=n_shots_inference, 
                                          language=language, 
                                          tokenizer=tokenizer)
        postprocessor.add_inference_prompt_column(simplest_prompt=False)
        postprocessor.add_ground_truth_column()
        print('TRY: ', f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv")
        # try:
        postprocessor.add_responses_column(model=model, 
                                        tokenizer=tokenizer, 
                                        batch_size=12, 
                                        max_new_tokens_factor=max_new_tokens_factor)
        postprocessor.test_data.to_csv(f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv", index=False)
        # except Exception as e: 
        #     print("ERROR IN PROCESSING: ", Exception)

Map: 100%|██████████| 681/681 [00:00<00:00, 8057.96 examples/s]
Map: 100%|██████████| 681/681 [00:00<00:00, 10820.38 examples/s]


TRY:  data/qwen/maxNewTokensFactor6_nShotsInference0_BaseModel.csv


generating responses:  16%|█▋        | 112/681 [06:43<34:08,  3.60s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 134.00 MiB (GPU 1; 10.75 GiB total capacity; 9.59 GiB already allocated; 87.25 MiB free; 10.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
import pandas as pd
d = pd.read_csv('data/qwen/maxNewTokensFactor2_nShotsInference0_BaseModel.csv')
d['model_responses'][0:10].to_list()

['[{"entity": "hypertension"}, {"entity": "dyslipidemia"}, {"entity": "diabetes mellitus"},',
 '[{"entity": "hormonal study"}, {"entity": "dynamic biochemical tests"}, {"entity": "ECS"}]<|im_end|><|im_end|>',
 '[{"entity": "primary right parotid malignancy"}, {"entity": "liver metastases"}]<|im_end|><|im_end|><|im_end|><|im_end|><|im_end|>',
 '[{"entity": "right parotidectomy"}, {"entity": "ACC"}]<|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|>',
 '[\n    {"entity": "metyrapone"},\n    {"entity": "ketoconazole"},\n    {"entity": "lanre',
 '```json\n[\n  {"entity": "50-years-old woman"},\n  {"entity": "hypertensive"},\n  {"entity',
 '[\n    {"entity": "mother"},\n    {"entity": "sisters"},\n    {"entity": "cousins"},\n    {"entity',
 '[{"entity": "patient"}, {"entity": "cervical compression"}, {"entity": "respiratory signs"}, {"entity":',
 '```json\n[\n  {"entity": "thyroid dysfunction"}\n]\n```<|im_end|><|im_end|><|im_end|><|im_end|><

### MISTRAL NO QUANT

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [2]
n_shots_inference_list = [0,2]
layer = 'en.layer1'
language = layer.split('.')[0]

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]


model = AutoModelForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.2", low_cpu_mem_usage=True,
            return_dict=True, 
            torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN,
            cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", add_eos_token=True, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]


In [13]:
from utils.data_preprocessor import DataPreprocessor
from datasets import Dataset
from tqdm import tqdm

class TestDataProcessor():
    def __init__(self, test_data: Dataset, preprocessor:DataPreprocessor, n_shots_inference:int, language:str, tokenizer) -> None:
        """
        Initialize the TestDataProcessor class.
        pass to this the same DataPreprocessor used for the training data. This will ensure that the inference prompt is formatted in the same way as the training prompt.
        """
        self.test_data = test_data
        self.preprocessor = preprocessor
        self.language = language
        self.tokenizer = tokenizer
        self.model_type = preprocessor.model_type
        self.few_shots_dict = {'en':{'questions':['We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.',
                                                   'Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.',
                                                   'There was no evidence of lung lesions.',
                                                   'Locally diminished actin coloration indicated atrophy of smooth muscle fibers.'],
                                        'responses':['[{"entity": "present"}, {"entity": "history"}, {"entity": "enlargement"}]',
                                                     '[{"entity": "presented"}, {"entity": "refusal"}, {"entity": "bear"}, {"entity": "peaks"}]',
                                                      '[{"entity": "evidence"}, {"entity": "lung lesions"]',
                                                      '[{"entity": "coloration"}, {"entity": "indicated"}, {"entity": "atrophy"}, {"entity": "atrophy of smooth muscle fibers"}, {"entity": "smooth muscle fibers"'],
                                        'responses_offset': ['[{"entity": "present", "offset": [3, 10]}, {"entity": "history", "offset": [48, 55]}, {"entity": "enlargement", "offset": [67, 78]}]',
                                                             '[{"entity": "presented", "offset": [39, 48]}, {"entity": "refusal", "offset": [95, 102]}, {"entity": "bear", "offset": [106, 110]}, {"entity": "peaks", "offset": [159, 164]}]',
                                                             '[{ "entity": "evidence", "offsets": [13, 21]}, {"entity": "lung lesions", "offsets": [25, 37]} ]',
                                                             '[{"entity": "coloration", "offsets": [25, 35]}, {"entity": "indicated", "offsets": [36, 45]}, {"entity": "atrophy","offsets": [46, 53]}, {"entity": "atrophy of smooth muscle fibers", "offsets": [46, 77]}, {"entity": "smooth muscle fibers", "offsets": [57, 77]} ]'],
                                    },
                                'it':{'questions':['In considerazione dell’inefficacia della terapia somministrata, in assenza di ulteriori opzioni terapeutiche standard potenzialmente efficaci e dopo colloquio con i genitori si decide di avviare la paziente a trapianto aploidentico, possibilmente NK allo reattivo, da genitore.',
                                                    'L’esame istologico dimostrava mucosa gastrica atrofica con flogosi cronica, marcato edema ed incremento del connettivo del corion, focale metaplasia intestinale, il tutto sovrastante un tessuto fibromuscolare.'],
                                       'responses':['[{"entity": "inefficacia"}, {"entity": "opzioni"}, {"entity": "colloquio"}, {"entity": "avviare"}, {"entity": "trapianto"}, {"entity": "genitori"}, {"entity": "paziente"}, {"entity": "genitore"}]',
                                                    '[{"entity": "mucosa gastrica atrofica"}, {"entity": "flogosi\r\cronica"}]'],
                                       'responses_offset':['[{"entity": "inefficacia", "offset": [23, 34]}, {"entity": "opzioni", "offset": [88,95]}, {"entity": "colloquio", "offset": [149,158]}, {"entity": "avviare", "offset": [187,194]}, {"entity": "trapianto", "offset": [209,218]}, {"entity": "genitori", "offset": [163,173]}, {"entity": "paziente", "offset": [195,106]}, {"entity": "genitore", "offset": [268,276]}]',
                                                           '[{"entity": "mucosa gastrica atrofica", "offset": [30,54]}, {"entity": "flogosi\r\cronica", "offset": [59,75]}]']}
                                }
        if len(self.few_shots_dict[self.language]['questions']) < n_shots_inference:
            raise ValueError(f'The number of shots for the inference prompt is greater than the number of examples available.')
        if len(self.few_shots_dict[self.language]['responses']) < n_shots_inference:
            raise ValueError(f'The number of shots for the inference prompt is greater than the number of responses available.')
        self.n_shots_inference = n_shots_inference
    
    def _extract_ground_truth(self, prompt:str) -> str:
        end_of_prompt_string = self.preprocessor.special_tokens_instruction['user_end'] + self.preprocessor.special_tokens_instruction['model_start']
        out = prompt.split(end_of_prompt_string, 1)
        return {'ground_truth': out[1][0:-4].strip()}
    
    def _format_prompt_inference(self, input: str, instruction_on_response_format:str, n_shots:int, offset: bool, simplest_prompt:bool, output:str='', list_of_examples: [str]=[], list_of_responses:[str]=[]) -> str:
        """
        Format the input and output into a prompt for the finetuning

        Args:
            task: the task for which the prompt is generated, either 'finetuning' or 'inference'
            input: the input text
            instruction_on_response_format: the instruction on the response format. E.g. "The response must be a list of dictionaries, where each dictionary contains the keys 'text' and 'offset'"
            n_shots: the number of examples to provide as few shot prompting
            offset: whether to require the offset in the response
            tokenizer: the tokenizer to use
            output: the output text
            list_of_examples: the list of examples to provide as few shot prompting
            list_of_responses: the list of responses to provide as few shot prompting

        Returns:
            the formatted prompt
        """
        if output != '':
            raise ValueError("The output must be an empty string when generating prompts for the inference")

        if len(list_of_examples) != len(list_of_responses):
            raise ValueError("The number of examples and responses must be the same")
        if n_shots != len(list_of_examples):
            raise ValueError("The number of examples and shots must be the same")
        if n_shots != len(list_of_responses):
            raise ValueError("The number of responses and shots must be the same")
        
        if simplest_prompt:
            base_prompt = self.preprocessor._simplest_base_prompt_input(input)
        elif not simplest_prompt:
            base_prompt = self.preprocessor._base_prompt_input(input, instruction_on_response_format)

        one_shot_example = self.preprocessor.one_shot_example_no_offset if not offset else self.preprocessor.one_shot_example
            
        prompt = ''
        for shot_example in range(n_shots):
            prompt += one_shot_example.format(
                instruction_on_response_format=instruction_on_response_format, 
                example_query=list_of_examples[shot_example], 
                example_response=list_of_responses[shot_example],
                user_start=self.preprocessor.special_tokens_instruction['user_start'],
                user_end=self.preprocessor.special_tokens_instruction['user_end'],
                model_start=self.preprocessor.special_tokens_instruction['model_start'],
                model_end=self.preprocessor.special_tokens_instruction['model_end'])
        
        bos_token = self.preprocessor.tokenizer.bos_token
        if self.model_type == 'qwen':
            bos_token = ''
        prompt = bos_token + prompt + base_prompt + output 
                            
        return prompt
    
    def _extract_inference_prompt(self, sentence:str, simplest_prompt:bool) -> str:
        if self.preprocessor.offset:
            few_shots_responses = self.few_shots_dict[self.language]['responses_offset']
        else:
            few_shots_responses = self.few_shots_dict[self.language]['responses']
        if self.n_shots_inference == 0:
            list_of_examples = []
            list_of_responses = []
        else:
            list_of_examples = self.few_shots_dict[self.language]['questions'][0:self.n_shots_inference]
            list_of_responses = few_shots_responses[0:self.n_shots_inference]
        inference_prompt = self._format_prompt_inference(input=sentence, 
                                                        instruction_on_response_format=self.preprocessor.instruction_on_response_format,
                                                        offset=self.preprocessor.offset,
                                                        output='',
                                                        n_shots=self.n_shots_inference,
                                                        simplest_prompt=simplest_prompt,
                                                        list_of_examples=list_of_examples,
                                                        list_of_responses=list_of_responses)
        return {'inference_prompt': inference_prompt}
    
    def add_inference_prompt_column(self, simplest_prompt:bool) -> None:
        """
        Add the inferencePrompt and groundTruth columns to the test_data dataframe.
        """
        self.test_data = self.test_data.map(lambda x: self._extract_inference_prompt(x['sentence'], simplest_prompt=simplest_prompt))
    
    def add_ground_truth_column(self) -> None:
        """
        Add the groundTruth column to the test_data dataframe.
        """
        self.test_data = self.test_data.map(lambda x: self._extract_ground_truth(x['prompt']))

    def _generate_model_response(self, examples, model, tokenizer, max_new_tokens_factor:float) -> str:
        device = "cuda"
        tokenizer.padding_side = "left"
        if self.model_type == 'qwen':
            tokenizer.pad_token = '<unk>' # tokenizer.special_tokens['<extra_0>']
        input_sentences = examples['sentence']
        prompts = examples['inference_prompt']
        input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
        max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * max_new_tokens_factor)
        # if self.preprocessor.model_type == 'gemma':
        #     add_special_tokens = True
        encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
        model_inputs = encodeds.to(device)
        generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=max_new_tokens,  pad_token_id=tokenizer.eos_token_id) # max_new_tokens=max_new_tokens,
        generated_ids = generated_ids[:, encoding.input_ids.shape[1]:]
        decoded = tokenizer.batch_decode(generated_ids)
        decoded = [self._postprocess_model_output(i) for i in decoded]
        return (decoded)
                
    def add_responses_column(self, model, tokenizer, batch_size:int, max_new_tokens_factor:float) -> None:
        """
        Adds a column with the response of the model to the actual query.
        
        params:
        model: the model to use to generate the response
        tokenizer: the tokenizer to use to generate the response
        batch_size: the batch size to use to process the examples. Increasing this makes it faster but requires more GPU. Default is 8.
        max_new_tokens_factor: the factor conotrolling the number of new tokens to generate. This is a factor of the length of the input sentence.
        """
        responses_col = []
        total_rows = len(self.test_data)
        indexes = [i for i in range(len(self.test_data)) if i % batch_size == 0]
        max_index = self.test_data.shape[0]


        with tqdm(total=total_rows, desc="generating responses") as pbar:
            for i, idx in enumerate(indexes[:-1]):
                indici = list(range(idx, indexes[i+1]))
                tmp = self._generate_model_response(self.test_data.select(indici), model, tokenizer, max_new_tokens_factor)
                responses_col.extend(tmp)
                pbar.update(batch_size)
            indici = list(range(indexes[i+1], max_index))
            tmp = self._generate_model_response(self.test_data.select(indici), model, tokenizer, max_new_tokens_factor)
            responses_col.extend(tmp)
            pbar.update(batch_size)

        self.test_data = self.test_data.add_column('model_responses', responses_col)
    
    def _postprocess_model_output(self, model_output: str) -> str:
        """
        Postprocess the model output to remove the instruction and return the model response.

        Args:
        model_output (str): the model output as it is returned by the model. The processing of the output is done in the function

        return:
        str: the model response, i.e. the model output without the instruction

        """
        end_of_prompt_string = self.preprocessor.special_tokens_instruction['user_end'] + self.preprocessor.special_tokens_instruction['model_start']
        return model_output.split(end_of_prompt_string, 1)[-1].strip()

In [12]:
n_shots_inference = 2
max_new_tokens_factor = 8


preprocessor = DataPreprocessor(model_checkpoint="mistralai/Mistral-7B-Instruct-v0.2", 
                                tokenizer=tokenizer)
instruction_on_response_format='Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                 instruction_on_response_format=instruction_on_response_format,
                                                 simplest_prompt=False)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

save_directory = 'data/mistral/noQuant'

postprocessor = TestDataProcessor(test_data=val_data, 
                                  preprocessor=preprocessor, 
                                  n_shots_inference=n_shots_inference, 
                                  language=language, 
                                  tokenizer=tokenizer)
postprocessor.add_inference_prompt_column(simplest_prompt=False)

postprocessor.add_ground_truth_column()
print('TRY: ', f"{save_directory}/maxNewTokensFactor{n_shots_inference}_nShotsInference{n_shots_inference}_BaseModel.csv")
# try:
postprocessor.add_responses_column(model=model, 
                                tokenizer=tokenizer, 
                                batch_size=16, 
                                max_new_tokens_factor=max_new_tokens_factor)
postprocessor.test_data.to_csv(f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv", index=False)

Map:  42%|████▏     | 640/1520 [00:00<00:00, 6315.52 examples/s]

Map: 100%|██████████| 1520/1520 [00:00<00:00, 5237.06 examples/s]
Map: 100%|██████████| 170/170 [00:00<00:00, 7351.45 examples/s]
Map: 100%|██████████| 681/681 [00:00<00:00, 8447.45 examples/s]
Map: 100%|██████████| 681/681 [00:00<00:00, 11770.93 examples/s]


TRY:  data/mistral/noQuant/maxNewTokensFactor2_nShotsInference2_BaseModel.csv


generating responses: 688it [05:45,  1.99it/s]                         
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.01ba/s]


4231264

### LLAMA 7B 4bit

In [2]:
from dotenv import dotenv_values
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [2]
n_shots_inference_list = [0,2]
layer = 'en.layer1'
language = layer.split('.')[0]
save_directory = 'data/llama/7B_4bit_base'

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint="meta-llama/Llama-2-7b-chat-hf", 
                                tokenizer="meta-llama/Llama-2-7b-chat-hf")

dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-2-7b-chat-hf", low_cpu_mem_usage=True,
            quantization_config = bnb_config,
            return_dict=True, 
            #torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", add_eos_token=True, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [7]:

for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        
        # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
        postprocessor = TestDataProcessor(test_data=val_data, 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=n_shots_inference, 
                                          language=language, 
                                          tokenizer=tokenizer)
        postprocessor.add_inference_prompt_column()
        postprocessor.add_ground_truth_column()
        print('TRY: ', f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv")
        # try:
        postprocessor.add_responses_column(model=model, 
                                        tokenizer=tokenizer, 
                                        batch_size=4, 
                                        max_new_tokens_factor=max_new_tokens_factor)
        postprocessor.test_data.to_csv(f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv", index=False)
        # except Exception as e: 
        #     print("ERROR IN PROCESSING: ", Exception)

TRY:  data/llama/7B_4bit_base/maxNewTokensFactor2_nShotsInference0_BaseModel.csv


generating responses:   0%|          | 0/681 [00:00<?, ?it/s]

generating responses:   1%|          | 4/681 [00:34<1:37:08,  8.61s/it]


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0