In [4]:

import os 
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from dotenv import dotenv_values
import torch
from tqdm.auto import tqdm
import gc


from utils import generate_adapters_list, OutputGenerator, DataPreprocessor
from utils.data_format_converter import  DatasetFormatConverter
from src.billm import LlamaForTokenClassification, MistralForTokenClassification


# --- Run configuration -------------------------------------------------------
batch_size = 24  # batch size used when generating model outputs
appendix = '6Epochs'  # run suffix; an earlier run used '5EpochsBestF1Train'
log_name_training = "llama_6Epochs"  # an earlier run used "llama_3EpochsLast"
clent = True  # forwarded to the dataset format converter
training_type = ''  # other values used in earlier runs: 'NoLora', 'unmasked'
dtype = torch.float16  # torch dtype requested when loading model weights

# Only the 'NoLora' setup fixes the base checkpoint here; in every other case
# it is assigned later, based on the adapters.
if training_type == 'NoLora':
    BASE_MODEL_CHECKPOINT = "meta-llama/Llama-2-7b-hf"

# --- Hugging Face authentication ---------------------------------------------
# Both tokens live in the local `.env.base` file; the write token is the one
# used to log in.
env_values = dotenv_values(".env.base")
HF_TOKEN = env_values['HF_TOKEN']
HF_TOKEN_WRITE = env_values['HF_TOKEN_WRITE']
login(token=HF_TOKEN_WRITE)

# --- CUDA allocator settings -------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
torch.cuda.empty_cache()


# Data preparation: load the dataset from the Hub, convert it to the
# token-classification format, tokenize it and keep one split for generation.
print('PREPROCESSING DATA...')
# Hub dataset id and the name of the dataset split ("layer") to work on.
DATASET_CHEKPOINT="ferrazzipietro/e3c-sentences" 
TRAIN_LAYER="en.layer1"
# Disabled: building the adapter list with generate_adapters_list; a single
# adapter repository is hard-coded below instead.
## adapters_list = generate_adapters_list(log_name_training, appendix=appendix, training_type=training_type)
adapters_list = ["ferrazzipietro/LS_Llama-2-7b-hf_adapters_en.layer1_NoQuant_32_64_0.01_1_0.0002_6Epochs_clent"]

# For adapter-based runs the base checkpoint name is read from the PEFT config
# of the first adapter in the list. For 'NoLora' runs BASE_MODEL_CHECKPOINT
# must already be defined before this point.
if training_type != 'NoLora':
    peft_config = PeftConfig.from_pretrained(adapters_list[0], token = HF_TOKEN_WRITE)
    BASE_MODEL_CHECKPOINT = peft_config.base_model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT,token =HF_TOKEN_WRITE)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(DATASET_CHEKPOINT, token=HF_TOKEN_WRITE) # optional: download_mode="force_redownload"
dataset = dataset[TRAIN_LAYER]
# Fixed seed so the shuffled order is the same on every run.
dataset = dataset.shuffle(seed=1234)  
# The converter is stateful: apply() must run before the converted dataset and
# the label mappings are read from it.
dataset_format_converter = DatasetFormatConverter(dataset, clent=clent)
dataset_format_converter.apply()
ds = dataset_format_converter.dataset
label2id = dataset_format_converter.label2id
id2label = dataset_format_converter.get_id2label()
label_list = dataset_format_converter.get_label_list()
# Tokenizer and maximum sequence length are configured on the converter before
# it is used to tokenize and align labels in batched mode.
dataset_format_converter.set_tokenizer(tokenizer)
dataset_format_converter.set_max_seq_length(256)
tokenized_ds = ds.map(lambda x: dataset_format_converter.tokenize_and_align_labels(x), batched=True)
preprocessor = DataPreprocessor()
# Only the second of the three returned splits is kept.
# NOTE(review): presumably the validation split, judging by the helper's name -
# confirm against DataPreprocessor.
_, data, _ = preprocessor.split_layer_into_train_val_test_(tokenized_ds, TRAIN_LAYER)
print('PREPROCESSING DATA...DONE')

# Sanity check: print the first 10 rows of the kept split.
# NOTE(review): this prints full sentences and produces a very large output.
print(data[0:10])



# Model loading: pick the token-classification class that matches the base
# checkpoint and, for adapter-based runs, load the quantized base model once.
print('LOADING MODEL...')
# The checkpoint name is the only signal used to choose the model family.
# Unknown families are rejected explicitly instead of silently being treated
# as Mistral.
if 'llama' in BASE_MODEL_CHECKPOINT.lower():
    model_type = 'llama'
    ModelForTokenClassification = LlamaForTokenClassification
elif 'mistral' in BASE_MODEL_CHECKPOINT.lower():
    model_type = 'mistral'
    ModelForTokenClassification = MistralForTokenClassification
else:
    raise ValueError('Model type not recognized')

# In the 'NoLora' setup no base model is loaded here (base_model stays undefined).
if training_type != 'NoLora':
    # 4-bit loading needs the `accelerate` and `bitsandbytes` packages to be
    # installed; transformers raises an ImportError otherwise.
    base_model = ModelForTokenClassification.from_pretrained(
        BASE_MODEL_CHECKPOINT,
        num_labels=len(label2id), id2label=id2label, label2id=label2id,
        token = HF_TOKEN_WRITE,
        quantization_config = BitsAndBytesConfig(load_in_4bit=True),
        device_map='auto',
        torch_dtype=dtype,
        )
print('LOADING MODEL...DONE')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/pietroferrazzi/.cache/huggingface/token
Login successful
PREPROCESSING DATA...




PREPROCESSING DATA...DONE
{'sentence': ['The results of the PCR were positive for RNA specific to SARS-CoV-2.', 'Flow cytometry of the CSF showed that blast cells were positive for a cluster of differentiation markers (CD) (CD34, CD19, CD10, CD22 and partially positive for CD45) confirming CNS extramedullary lymphoid blast infiltration.', 'She had also positive serum antibody against the Smith antigen and low serum level of C3 complement component: 67 mg/dL (serum normal range: 84 – 151).', 'Twenty five days later, she returned to obstetric emergencies for significant bleeding with severe anemia with hemoglobin 9 g/dL without signs of hypovolemic shock.', 'Nasosinusal MRI shows a progressive increase in size of the tissue mass compared to the previously CT occupying the right nasal fossa on almost all of its height, measuring 4 × 2.8 × 3.5cm without extension within the cavum with probable invasion of the lower part of the nasolacrimal duct and mass effect on both the sinusonasal bone 

ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:

# Generation loop: for every adapter repository, build the model, generate
# outputs on `data` and push the resulting dataset to the Hub.
print(adapters_list)
for adapters in adapters_list:
    print('GENERATING:', adapters, '...')
    if training_type != 'NoLora':
        peft_config = PeftConfig.from_pretrained(adapters, token = HF_TOKEN_WRITE)
        BASE_MODEL_CHECKPOINT = peft_config.base_model_name_or_path
        # NOTE(review): merge_and_unload() folds the adapter weights into the
        # wrapped base model. With more than one entry in adapters_list, later
        # adapters would presumably be applied on top of already-merged
        # weights - confirm, and reload base_model per adapter if so.
        model = PeftModel.from_pretrained(base_model, adapters, token = HF_TOKEN_WRITE)
        model = model.merge_and_unload()
    else:
        # Without adapters, `adapters` is itself the id of a full model.
        model = ModelForTokenClassification.from_pretrained(
                adapters,
                num_labels=len(label2id), id2label=id2label, label2id=label2id,
                token = HF_TOKEN_WRITE,
                torch_dtype=dtype,
                device_map='auto')
    generator = OutputGenerator(model, tokenizer, label2id, label_list)
    test_data = generator.generate(data, batch_size = batch_size)
    # The '_bf' suffix marks bf16 runs only. It is applied exactly once here,
    # and not at all for other dtypes.
    if dtype==torch.bfloat16:
        output_repo = adapters + '_bf'
        print('SSSSSSAVINGGGGGGG in bf16')
    else:
        output_repo = adapters
        print('NOooooo in bf16')
    test_data.push_to_hub(output_repo, token=HF_TOKEN_WRITE, split='test')
    print('GENERATING:', output_repo, '...DONE')
    # Drop the model reference and release cached GPU memory before the next
    # adapter is processed.
    # NOTE(review): `generator` presumably still references the model until it
    # is rebound in the next iteration - confirm whether it should be deleted too.
    del model
    gc.collect()
    torch.cuda.empty_cache()

