In [None]:
import argparse
from functools import partial
import os
import json
import random

os.environ['TRANSFORMERS_CACHE'] = "/data/hf/"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import bitsandbytes as bnb
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets.arrow_dataset import Dataset

%load_ext autoreload
%autoreload 2

## Define

In [None]:
def load_model(model_name, bnb_config):
    """Load a quantized causal LM together with its tokenizer.

    :param model_name: Model identifier or path handed to ``from_pretrained``.
    :param bnb_config: Quantization settings passed as ``quantization_config``.
    :return: ``(model, tokenizer)``; ``model.config.pad_token_id`` matches the
        tokenizer's pad token.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer, which ships without a pad token.
    # Only add [PAD] (and grow the embedding matrix to match) when the
    # tokenizer does not already define one, so tokenizers that do have a pad
    # token are left untouched and no needless resize happens.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer

def create_zeta_prompt(sample: dict, eos_token="</s>"):
    """Render one question/answer sample as a single training string.

    :param sample: Mapping providing string values under "question" and "answer".
    :param eos_token: Marker appended directly after the answer.
    :return: Dict with one key, "text", holding the rendered prompt.
    """
    pieces = (
        "Question: ",
        sample["question"],
        "\n\n",
        "Answer: ",
        sample["answer"],
        eos_token,
    )
    return {"text": "".join(pieces)}

def split_and_save_dataset(input_file, train_file, test_file, test_percentage=0.25, seed=None):
    """Shuffle a JSON-lines dataset and write it out as train and test files.

    :param input_file: Path of the JSON-lines file to read (one JSON value per line).
    :param train_file: Path the training split is written to.
    :param test_file: Path the test split is written to.
    :param test_percentage: Fraction of samples, in [0, 1], placed in the test split.
    :param seed: Optional seed for a private RNG so the split is reproducible.
        When None, the module-level ``random`` state is used (previous behavior),
        so a prior ``random.seed(...)`` call is still honored.
    :raises ValueError: If ``test_percentage`` is outside [0, 1].
    """
    if not 0 <= test_percentage <= 1:
        raise ValueError(f"test_percentage must be within [0, 1], got {test_percentage}")

    # Load the dataset, skipping blank lines (e.g. a trailing newline at EOF)
    # that would otherwise make json.loads raise.
    with open(input_file, "r", encoding="utf-8") as f:
        samples = [json.loads(line) for line in f if line.strip()]

    # Shuffle the dataset to randomize the order of samples
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(samples)

    # The first `split_index` shuffled samples form the test set, the rest the train set
    split_index = int(test_percentage * len(samples))
    splits = (
        (train_file, samples[split_index:]),
        (test_file, samples[:split_index]),
    )

    # Save each split as JSON lines
    for path, subset in splits:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(sample) + "\n" for sample in subset)

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length advertised by ``model.config``.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are probed in that order; the first truthy value wins.

    :param model: Object exposing a ``config`` attribute.
    :return: The discovered length, or 1024 when none of the attributes is set.
    """
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        # None of the known attributes was present (or it was falsy): use a default
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Run the tokenizer over the "text" entries of a batch.

    :param batch: Mapping whose "text" entry holds the strings to tokenize.
    :param tokenizer: Callable tokenizer; receives the texts plus length options.
    :param max_length: Passed to the tokenizer as ``max_length`` with truncation on.
    :return: Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    length_options = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **length_options)


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: Dataset):
    """Build prompts, tokenize, drop over-long samples and shuffle.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: Seed forwarded to ``dataset.shuffle``
    :param dataset (Dataset): Samples to prepare; each is passed to ``create_zeta_prompt``
    :return: The prepared dataset
    """
    print("Preprocessing dataset...")

    def _is_short_enough(sample):
        # Tokenization truncates at max_length, so anything that reaches
        # max_length tokens is discarded by this strict comparison.
        return len(sample["input_ids"]) < max_length

    tokenize_batch = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    # Step 1: attach the formatted prompt under "text" (one sample at a time)
    with_prompts = dataset.map(create_zeta_prompt)

    # Step 2: tokenize the prompts in batches
    tokenized = with_prompts.map(tokenize_batch, batched=True)

    # Step 3: keep only samples shorter than max_length, then shuffle
    return tokenized.filter(_is_short_enough).shuffle(seed=seed)


def create_bnb_config():
    """Build the quantization config used when loading the model.

    :return: ``BitsAndBytesConfig`` with 4-bit loading, double quantization,
        the "nf4" quantization type and bfloat16 as compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

def create_peft_config(modules):
    """
    Build the LoRA configuration used to wrap the model for fine-tuning.

    :param modules: Names of the modules to apply Lora to
    :return: ``LoraConfig`` for a causal-LM task
    """
    lora_options = dict(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        # saved in addition to the LoRA weights
        modules_to_save=["embed_tokens", "lm_head"],
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    return LoraConfig(**lora_options)

def find_all_linear_names(model):
    """List the distinct leaf names of all 4-bit linear layers in ``model``.

    :param model: Object exposing ``named_modules()``.
    :return: List of the last dotted component of each ``bnb.nn.Linear4bit``
        module name, without duplicates and never containing 'lm_head'.
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        # last component of "a.b.c" is "c"; an undotted name is kept whole
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)


def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: Object exposing ``named_parameters()``; each parameter must
        provide ``numel()`` and ``requires_grad``.
    :param use_4bit: When True, the trainable count is halved before printing.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Floor division keeps the count an int: `/=` produced a float, which
        # the `,d` format spec below rejects with ValueError.
        trainable_params //= 2
    # Guard against a model without parameters instead of dividing by zero
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

## Dataset

In [None]:
# Declare the dataset paths
rel_path = "../data/aug_4838_20230803/"

input_file = rel_path + "augmented.jsonl"
train_file = rel_path + "train.jsonl"
test_file = rel_path + "test.jsonl"

# Seed Python's RNG first: split_and_save_dataset shuffles with `random.shuffle`,
# so without a fixed seed the train/test membership changes on every run.
random.seed(0)

# Split the dataset into train and test
split_and_save_dataset(input_file, train_file, test_file, test_percentage=0.15)

# Load the dataset in the HuggingFace Dataset format
# NOTE(review): this relies on load_dataset mapping train.jsonl / test.jsonl in
# rel_path to the "train" / "test" splits by file name - confirm against the
# installed `datasets` version.
zeta_train = load_dataset(rel_path, split="train")
zeta_test = load_dataset(rel_path, split="test")

print(f"Train set size: {len(zeta_train)}")
print(f"Test set size: {len(zeta_test)}")

## Model

In [None]:
# Base checkpoint to fine-tune
model_name = "meta-llama/Llama-2-7b-hf"

# Quantization settings, then the quantized model and its tokenizer
bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name=model_name, bnb_config=bnb_config)

## Preprocess

In [None]:
# Token limit taken from the model config (or the helper's default)
max_length = get_max_length(model)

# Both splits go through the same pipeline with the same shuffle seed
train_ds = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=0, dataset=zeta_train
)
eval_ds = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=0, dataset=zeta_test
)

print(f"Train set size after preprocessing: {len(train_ds):,}")
print(f"Test set size after preprocessing: {len(eval_ds):,}")

## Train

In [None]:
def train(model, tokenizer, train_ds, eval_ds, output_dir, do_train=True):
    """Wrap ``model`` with LoRA adapters, fine-tune it and save the result.

    :param model: Quantized base model; it is prepared for k-bit training and
        wrapped with PEFT inside this function.
    :param tokenizer: Tokenizer used by the data collator and saved next to the model.
    :param train_ds: Tokenized training dataset.
    :param eval_ds: Tokenized evaluation dataset.
    :param output_dir: Directory for trainer output, the saved model and the tokenizer.
    :param do_train: When False, ``trainer.train()`` is skipped and the model is
        saved as-is. Defaults to True (the previous hard-coded behavior).
    """
    import gc

    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    trainer = Trainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=1,
            per_device_eval_batch_size=64,
            warmup_steps=2,
            num_train_epochs=3,
            learning_rate=2e-6,
            fp16=True,
            logging_steps=1,
            output_dir=output_dir,
            optim="paged_adamw_8bit",
            evaluation_strategy="steps",
            eval_steps=25,
            push_to_hub=False,
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training: element count per parameter dtype
    dtype_counts = {}
    for _, param in model.named_parameters():
        dtype_counts[param.dtype] = dtype_counts.get(param.dtype, 0) + param.numel()
    total = sum(dtype_counts.values())
    for dtype, count in dtype_counts.items():
        print(dtype, count, count / total)

    # Launch training
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Free memory for merging weights: drop this function's references, collect
    # garbage, then release cached CUDA blocks. Any reference the caller still
    # holds to the model it passed in keeps that object alive until the caller
    # deletes it as well.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

# Where the trainer output, saved model and tokenizer are written
output_dir = "trained/peft_finetuned"

# Arguments in order: model, tokenizer, train_ds, eval_ds, output_dir
train(model, tokenizer, train_ds, eval_ds, output_dir)

## Export

In [None]:
import os
from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoTokenizer

output_dir = "trained/peft_finetuned"
model_name = "meta-llama/Llama-2-7b-hf"
output_merged_dir = "results/llama2/sage_fire"
hub_repo_id = "cipher982/report_builder"

# Load the model saved by the training step and merge it into a single model
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

# Save model
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(
    output_merged_dir,
    safe_serialization=True,
    push_to_hub=True,
    repo_id=hub_repo_id
)

# Load the tokenizer that training saved to output_dir. It carries the [PAD]
# token added in load_model; the base tokenizer from `model_name` does not, so
# it must not replace this one or the exported tokenizer and model would disagree.
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Save tokenizer
tokenizer.save_pretrained(
    output_merged_dir,
    push_to_hub=True,
    repo_id=hub_repo_id
)