In [1]:
# pip install bitsandbytes accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from dotenv import dotenv_values
import torch
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from config import base_model
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = base_model.max_new_tokens_factor_list
n_shots_inference_list = base_model.n_shots_inference_list
layer = base_model.TRAIN_LAYER
language = layer.split('.')[0]
save_directory = base_model.save_directory 


# quantization_config = BitsAndBytesConfig(load_in_4bit=True,
#                                          bnb_4bit_compute_type=torch.bfloat16,)
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # load_in_8bit=True,
            # bnb_4bit_use_double_quant=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=HF_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
            "google/gemma-7b-it", low_cpu_mem_usage=True,
            quantization_config = quantization_config,
            # return_dict=True, 
            #torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN)


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]

preprocessor = DataPreprocessor(model_checkpoint="google/gemma-7b-it", tokenizer=tokenizer)
instruction_on_response_format=' Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'
dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format=instruction_on_response_format)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

postprocessor = TestDataProcessor(test_data=val_data.select(range(24)), 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=0, 
                                          language=language, 
                                          tokenizer=tokenizer)
postprocessor.add_inference_prompt_column(simplest_prompt=False)
postprocessor.add_ground_truth_column()
print('TRY: ', f"{save_directory}/maxNewTokensFactor{8}_nShotsInference{0}_BaseModel.csv")
sorted_data = postprocessor.test_data.to_pandas().sort_values(by='inference_prompt', key=lambda x: x.str.len())
postprocessor.test_data = dataset.from_pandas(sorted_data)




  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.26s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


TRY:  data/llama/13B_8bit_base/maxNewTokensFactor8_nShotsInference0_BaseModel.csv


In [2]:
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.unk_token

In [3]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>user  Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model'

In [4]:
input_text = ['<bos><start_of_turn>user Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model',
              "<bos><start_of_turn>Extract the entities contained in this text: We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.  <end_of_turn> <start_of_turn>model"]

# input_ids = tokenizer.encode(input_text, return_tensors="pt", padding=True).to("cuda")

# outputs = model.generate(input_ids, max_new_tokens=10)
# print(tokenizer.batch_decode(outputs))

encodeds = tokenizer.encode(input_text[0], return_tensors="pt", add_special_tokens=False, padding=True)
model_inputs = encodeds.to('cuda')
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20,  pad_token_id=tokenizer.eos_token_id) # max_new_tokens=max_new_tokens,
decoded = tokenizer.batch_decode(generated_ids)
print(decoded)




['<bos><start_of_turn>user Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model```\n[{"entity":"Thyroid dysfunction"}, {"entity":"She"}, {"entity":"Thyroid']


In [25]:
def _generate_model_response(examples, model, tokenizer, max_new_tokens_factor:float) -> str:
    device = "cuda"
    tokenizer.padding_side = "left"
    input_sentences = examples['sentence']
    prompts = examples['inference_prompt']
    input_sentences_tokenized = tokenizer(input_sentences, return_tensors="pt", padding=True)
    print(prompts)
    max_new_tokens = int(len(max(input_sentences_tokenized, key=len)) * max_new_tokens_factor)
    # if self.preprocessor.model_type == 'gemma':
    #     add_special_tokens = True
    encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True)
    model_inputs = encodeds.to(device)
    generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=max_new_tokens,  pad_token_id=tokenizer.eos_token_id) # max_new_tokens=max_new_tokens,
    decoded = tokenizer.batch_decode(generated_ids)
    #decoded = [self._postprocess_model_output(i) for i in decoded]
    return (decoded)

_generate_model_response(postprocessor.test_data.select(range(4)), model, tokenizer, 4.0)



['<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<She never suffered from thyroid dysfunction.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<The incision performed was a Kocher cervicotomy.>>> <end_of_turn><start_of_turn>model', '<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<Its 

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [12]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>userExtract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. <<The goiter measured 18 x 11 cm.>>> <end_of_turn><start_of_turn>model'

In [5]:
postprocessor.add_responses_column(model=model, 
                                tokenizer=tokenizer, 
                                batch_size=1, 
                                max_new_tokens_factor=8)

generating responses:   0%|          | 0/24 [00:01<?, ?it/s]


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [17]:
postprocessor.test_data['inference_prompt'][0]

'<bos><start_of_turn>user Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia.>>> <end_of_turn><start_of_turn>model'

### QWEN 7B 4bit

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [2]
n_shots_inference_list = [0,2]
layer = 'en.layer1'
language = layer.split('.')[0]
save_directory = 'data/qwen'

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-14B-Chat", low_cpu_mem_usage=True,
            quantization_config = bnb_config,
            return_dict=True, 
            #torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN,
            cache_dir='/data/disk1/share/pferrazzi/.cache')
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-14B-Chat", add_eos_token=True, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:10<00:00,  1.34s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
tokenizer.bos

AttributeError: 'Qwen2TokenizerFast' object has no attribute 'bos'

In [13]:
preprocessor = DataPreprocessor(model_checkpoint="Qwen/Qwen1.5-14B-Chat", 
                                tokenizer="Qwen/Qwen1.5-14B-Chat")
instruction_on_response_format='Extract the entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}].'
dataset = preprocessor.preprocess_data_one_layer(dataset, 
                                                 instruction_on_response_format=instruction_on_response_format,
                                                 simplest_prompt=False)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 1520/1520 [00:00<00:00, 3159.28 examples/s]
Map: 100%|██████████| 170/170 [00:00<00:00, 6336.26 examples/s]


In [17]:
max_new_tokens_factor_list = [6]
n_shots_inference_list = [0,2]
for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        
        # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
        postprocessor = TestDataProcessor(test_data=val_data, 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=n_shots_inference, 
                                          language=language, 
                                          tokenizer=tokenizer)
        postprocessor.add_inference_prompt_column(simplest_prompt=False)
        postprocessor.add_ground_truth_column()
        print('TRY: ', f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv")
        # try:
        postprocessor.add_responses_column(model=model, 
                                        tokenizer=tokenizer, 
                                        batch_size=12, 
                                        max_new_tokens_factor=max_new_tokens_factor)
        postprocessor.test_data.to_csv(f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv", index=False)
        # except Exception as e: 
        #     print("ERROR IN PROCESSING: ", Exception)

Map: 100%|██████████| 681/681 [00:00<00:00, 8057.96 examples/s]
Map: 100%|██████████| 681/681 [00:00<00:00, 10820.38 examples/s]


TRY:  data/qwen/maxNewTokensFactor6_nShotsInference0_BaseModel.csv


generating responses:  16%|█▋        | 112/681 [06:43<34:08,  3.60s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 134.00 MiB (GPU 1; 10.75 GiB total capacity; 9.59 GiB already allocated; 87.25 MiB free; 10.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### LLAMA 7B 4bit

In [2]:
from dotenv import dotenv_values
from datasets import load_dataset
from utils.data_preprocessor import DataPreprocessor
from utils.test_data_processor import TestDataProcessor
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = [2]
n_shots_inference_list = [0,2]
layer = 'en.layer1'
language = layer.split('.')[0]
save_directory = 'data/llama/7B_4bit_base'

dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint="meta-llama/Llama-2-7b-chat-hf", 
                                tokenizer="meta-llama/Llama-2-7b-chat-hf")

dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )

model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-2-7b-chat-hf", low_cpu_mem_usage=True,
            quantization_config = bnb_config,
            return_dict=True, 
            #torch_dtype=torch.float16,
            device_map= "auto",
            token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", add_eos_token=True, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [7]:

for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        
        # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
        postprocessor = TestDataProcessor(test_data=val_data, 
                                          preprocessor=preprocessor, 
                                          n_shots_inference=n_shots_inference, 
                                          language=language, 
                                          tokenizer=tokenizer)
        postprocessor.add_inference_prompt_column()
        postprocessor.add_ground_truth_column()
        print('TRY: ', f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv")
        # try:
        postprocessor.add_responses_column(model=model, 
                                        tokenizer=tokenizer, 
                                        batch_size=4, 
                                        max_new_tokens_factor=max_new_tokens_factor)
        postprocessor.test_data.to_csv(f"{save_directory}/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_BaseModel.csv", index=False)
        # except Exception as e: 
        #     print("ERROR IN PROCESSING: ", Exception)

TRY:  data/llama/7B_4bit_base/maxNewTokensFactor2_nShotsInference0_BaseModel.csv


generating responses:   0%|          | 0/681 [00:00<?, ?it/s]

generating responses:   1%|          | 4/681 [00:34<1:37:08,  8.61s/it]


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0