In [2]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config.finetuning import config
from utils.load_merged_model_tokenizer import load_mergedModel_tokenizer
from config import postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from log import enlayer1_3epochs_4bits__ft_params as models_params
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = postprocessing.max_new_tokens_factor_list
n_shots_inference_list = postprocessing.n_shots_inference_list
layer = models_params.TRAIN_LAYER
language = layer.split('.')[0]


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(config.BASE_MODEL_CHECKPOINT, config.BASE_MODEL_CHECKPOINT)
dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

# bnb_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.bfloat16)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=False,
            load_in_8bit=True,
            #bnb_4bit_use_double_quant=True,
            #bnb_4bit_quant_type="nf4",
            #bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_threshold= 6.0,
            llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )


adapters_list = generate_ft_adapters_list("enlayer1_3epochs_4bits__ft_params")



Map: 100%|██████████| 1520/1520 [00:00<00:00, 3633.82 examples/s]
Map: 100%|██████████| 170/170 [00:00<00:00, 6718.22 examples/s]


### LLAMA

In [1]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config.finetuning_llama2 import config
from utils.load_merged_model_tokenizer import load_mergedModel_tokenizer
from config import postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from log import enlayer1_3epochs_8bits__ft_params_llama as models_params
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']
LLAMA_TOKEN = dotenv_values(".env.base")['LLAMA_TOKEN']

max_new_tokens_factor_list = postprocessing.max_new_tokens_factor_list
n_shots_inference_list = postprocessing.n_shots_inference_list
layer = models_params.TRAIN_LAYER
language = layer.split('.')[0]


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor()
dataset = preprocessor.preprocess_data_one_layer(dataset)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=False,
            load_in_8bit=True,
            #bnb_4bit_use_double_quant=True,
            #bnb_4bit_quant_type="nf4",
            #bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_threshold= 6.0,
            load_in_8bit_fp32_cpu_offload=True,
            llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )


adapters_list = generate_ft_adapters_list("enlayer1_3epochs_8bits__ft_params_llama")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
adapters = "ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0002"
merged_model, tokenizer = load_mergedModel_tokenizer(adapters, "meta-llama/Llama-2-7b-chat-hf", llama_key=LLAMA_TOKEN)

postprocessor = TestDataProcessor(test_data=val_data.select(range(48)), preprocessor=preprocessor, n_shots_inference=2, language='en', tokenizer=tokenizer)
postprocessor.add_inference_prompt_column()
postprocessor.add_ground_truth_column()
postprocessor.add_responses_column(model=merged_model, tokenizer=tokenizer, batch_size=12, max_new_tokens_factor=5)
postprocessor.test_data.to_csv(f"data/test_data_processed/en_nShots{2}_maxNewTokensFactor{5}.csv", index=False)

In [3]:
postprocessor.test_data['model_responses']
postprocessor.test_data.to_csv(f"data/test_data_processed/en_nShots{2}_maxNewTokensFactor{5}.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 67.60ba/s]


183295

In [None]:

for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        for adapters in tqdm(adapters_list, desc="adapters_list"):
            print("PROCESSING:", adapters)
            base_model = AutoModelForCausalLM.from_pretrained(
                models_params.BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
                quantization_config = bnb_config,
                return_dict=True,  load_in_4bit=True, 
                token = LLAMA_TOKEN,
                #torch_dtype=torch.float16,
                device_map= "auto")
            merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
            tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, add_eos_token=True, token=LLAMA_TOKEN)
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.padding_side = "left"

            # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
            postprocessor = TestDataProcessor(test_data=val_data, 
                                              preprocessor=preprocessor, 
                                              n_shots_inference=n_shots_inference, 
                                              language=language, 
                                              tokenizer=tokenizer)
            postprocessor.add_inference_prompt_column()
            postprocessor.add_ground_truth_column()
            # try:
            postprocessor.add_responses_column(model=merged_model, 
                                            tokenizer=tokenizer, 
                                            batch_size=12, 
                                            max_new_tokens_factor=max_new_tokens_factor)
            postprocessor.test_data.to_csv(f"data/test_data_processed/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_{adapters.split('/')[1]}.csv", index=False)
            # except Exception as e:
            #     print("ERROR IN PROCESSING: ", Exception, adapters)

            del merged_model
            del base_model
            del tokenizer
            gc.collect()
            torch.cuda.empty_cache()



### ONE RUN MISTRAL

In [3]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config.finetuning import config
from utils.load_merged_model_tokenizer import load_mergedModel_tokenizer
from config import postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from log import enlayer1_3epochs_4bits__ft_params as models_params
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from utils.output_cleaner import OutputCleaner

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = postprocessing.max_new_tokens_factor_list
n_shots_inference_list = postprocessing.n_shots_inference_list
layer = models_params.TRAIN_LAYER
language = layer.split('.')[0]


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(config.BASE_MODEL_CHECKPOINT, config.BASE_MODEL_CHECKPOINT)
dataset = preprocessor.preprocess_data_one_layer(dataset, instruction_on_response_format='Return the result in a json format: [{"entity":"entity_name"}].')
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)



bnb_config = BitsAndBytesConfig(
            load_in_4bit=False,
            #load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # llm_int8_threshold= 6.0,
            # llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", low_cpu_mem_usage=True,
    quantization_config = bnb_config,
    return_dict=True,
    device_map= 'auto')




adapters = "ferrazzipietro/Mistral-7B-Instruct-v0.2_adapters_en.layer1_8_torch.bfloat16_32_32_0.05_2_0.0002"
merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

postprocessor = TestDataProcessor(test_data=val_data.select(range(6)), preprocessor=preprocessor, n_shots_inference=2, language='en', tokenizer=tokenizer)
postprocessor.add_inference_prompt_column()
postprocessor.add_ground_truth_column()
postprocessor.add_responses_column(model=merged_model, tokenizer=tokenizer, batch_size=3, max_new_tokens_factor=5)

output_cleaner = OutputCleaner()
similar_is_equal = False
similar_is_equal_threshold = 100
cleaned_data = output_cleaner.apply_cleaning(postprocessor.test_data, wrong_keys_to_entity=False)

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.42s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
Map: 100%|██████████| 6/6 [00:00<00:00, 902.03 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 984.96 examples/s]
generating responses:   0%|          | 0/6 [00:00<?, ?it/s]

['<s>[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<<We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.>>> [/INST] [{"entity": "present"}, {"entity": "history"}, {"entity": "enlargement"}] \n[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<<Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.>>> [/INST] [{"entity": "presented"}, {"entity": "refusal"}, {"entity": "bear"}, {"entity": "peaks"}] \n[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<

generating responses:  50%|█████     | 3/6 [00:27<00:27,  9.04s/it]

['<s>[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<<We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.>>> [/INST] [{"entity": "present"}, {"entity": "history"}, {"entity": "enlargement"}] \n[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<<Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.>>> [/INST] [{"entity": "presented"}, {"entity": "refusal"}, {"entity": "bear"}, {"entity": "peaks"}] \n[INST] Extract the entities contained in the text. Extract only entities contained in the text.\nReturn the result in a json format: [{"entity":"entity_name"}]. Text: <<

generating responses: 100%|██████████| 6/6 [00:53<00:00,  8.95s/it]
Map: 100%|██████████| 6/6 [00:00<00:00, 912.27 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 819.39 examples/s]

ORIGINAL MODEL OUTPUT:  [{"entity": "present"}, {"entity": "history"}, {"entity": "enlargement"}] 
[INST] Extract the entities contained in the text. Extract only entities contained in the text.
Return the result in a json format: [{"entity":"entity_name"}]. Text: <<<Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.>>> [/INST] [{"entity": "presented"}, {"entity": "refusal"}, {"entity": "bear"}, {"entity": "peaks"}] 
[INST] Extract the entities contained in the text. Extract only entities contained in the text.
Return the result in a json format: [{"entity":"entity_name"}]. Text: <<A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia.>>> [/INST] 1-month-onset diabetes mellitus, hypertension, dyslipidem




## LLAMA 7B 8bit

In [3]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config import postprocessing
from utils.test_data_processor import TestDataProcessor
import pandas as pd
from log import enlayer1_3epochs_8bits__ft_params_llama as models_params
from utils.generate_ft_adapters_list import generate_ft_adapters_list
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
from peft import PeftModel
from tqdm import tqdm

HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

max_new_tokens_factor_list = postprocessing.max_new_tokens_factor_list
n_shots_inference_list = postprocessing.n_shots_inference_list
layer = models_params.TRAIN_LAYER
language = layer.split('.')[0]


dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[layer]
preprocessor = DataPreprocessor(model_checkpoint=models_params.BASE_MODEL_CHECKPOINT, 
                                tokenizer = models_params.BASE_MODEL_CHECKPOINT)
dataset = preprocessor.preprocess_data_one_layer(dataset,models_params.instruction_on_response_format)
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, layer)

bnb_config = BitsAndBytesConfig(
            load_in_4bit=False,
            load_in_8bit=True,
            #bnb_4bit_use_double_quant=True,
            #bnb_4bit_quant_type="nf4",
            #bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_threshold= 6.0,
            llm_int8_has_fp16_weight = False,
            llm_int8_skip_modules= ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
            )


adapters_list = generate_ft_adapters_list("enlayer1_3epochs_8bits__ft_params_llama")


max_new_tokens_factor_list = [2]
n_shots_inference_list = [0]
val_data = val_data.select(range(12))


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

for max_new_tokens_factor in max_new_tokens_factor_list:
    for n_shots_inference in n_shots_inference_list:
        for adapters in tqdm(adapters_list, desc="adapters_list"):
            print("PROCESSING:", adapters)
            base_model = AutoModelForCausalLM.from_pretrained(
                models_params.BASE_MODEL_CHECKPOINT, low_cpu_mem_usage=True,
                quantization_config = bnb_config,
                return_dict=True, 
                #torch_dtype=torch.float16,
                device_map= "auto")
            merged_model = PeftModel.from_pretrained(base_model, adapters, token=HF_TOKEN, device_map='auto')
            tokenizer = AutoTokenizer.from_pretrained(models_params.BASE_MODEL_CHECKPOINT, add_eos_token=False)
            tokenizer.pad_token = tokenizer.unk_token
            tokenizer.padding_side = "left"

            # merged_model, tokenizer = load_mergedModel_tokenizer(adapters, base_model)
            postprocessor = TestDataProcessor(test_data=val_data, 
                                              preprocessor=preprocessor, 
                                              n_shots_inference=n_shots_inference, 
                                              language=language, 
                                              tokenizer=tokenizer)
            postprocessor.add_inference_prompt_column()
            postprocessor.add_ground_truth_column()
            #try:
            postprocessor.add_responses_column(model=merged_model, 
                                            tokenizer=tokenizer, 
                                            batch_size=6, 
                                            max_new_tokens_factor=max_new_tokens_factor)
            postprocessor.test_data.to_csv(f"data/test_data_processed/maxNewTokensFactor{max_new_tokens_factor}_nShotsInference{n_shots_inference}_{adapters.split('/')[1]}.csv", index=False)
            # except Exception as e:
            #     print("ERROR IN PROCESSING: ", Exception, adapters)
            del merged_model
            del base_model
            del tokenizer
            gc.collect()
            torch.cuda.empty_cache()



adapters_list:   0%|          | 0/36 [00:00<?, ?it/s]

PROCESSING: ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_2_0.0002


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
Map: 100%|██████████| 12/12 [00:00<00:00, 1604.91 examples/s]
Map: 100%|██████████| 12/12 [00:00<00:00, 1081.22 examples/s]
generating responses: 100%|██████████| 12/12 [01:10<00:00,  5.89s/it]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 97.74ba/s]
adapters_list:   3%|▎         | 1/36 [01:16<44:25, 76.17s/it]

PROCESSING: ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_2_0.0008


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
generating responses:   0%|          | 0/12 [00:19<?, ?it/s]
adapters_list:   3%|▎         | 1/36 [01:39<58:18, 99.97s/it]


KeyboardInterrupt: 

In [5]:
postprocessor.test_data

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'prompt', 'inference_prompt', 'ground_truth'],
    num_rows: 12
})

In [4]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default)

In [None]:
adapters = "ferrazzipietro/Llama-2-7b-chat-hf_adapters_en.layer1_8_torch.bfloat16_16_32_0.05_4_0.0002"

postprocessor = TestDataProcessor(test_data=val_data.select(range(48)), preprocessor=preprocessor, n_shots_inference=2, language='en', tokenizer=tokenizer)
postprocessor.add_inference_prompt_column()
postprocessor.add_ground_truth_column()
postprocessor.add_responses_column(model=merged_model, tokenizer=tokenizer, batch_size=12, max_new_tokens_factor=5)
postprocessor.test_data.to_csv(f"data/test_data_processed/en_nShots{2}_maxNewTokensFactor{5}.csv", index=False)