In [1]:
import argparse
import os

import pandas as pd
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    TrainingArguments,
    logging,
    set_seed,
)

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

In [None]:
model_name = "PROVIDE_MODEL"  # Placeholder: replace with the identifier of any causal language model from Hugging Face

In [2]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate how many characters correspond to one token, on average.

    At most ``nb_examples`` samples are read from ``dataset``; each is turned
    into text with ``prepare_sample_text`` and tokenized with ``tokenizer``.
    Returns total characters divided by total tokens over those samples.
    """
    char_count = 0
    token_count = 0
    # Pairing with range() caps the number of samples that get inspected.
    sampled = zip(range(nb_examples), iter(dataset))
    for _index, sample in tqdm(sampled, total=nb_examples):
        sample_text = prepare_sample_text(sample)
        char_count += len(sample_text)
        # Fast tokenizers expose tokens on the encoding; slow ones via tokenize().
        tokens = (
            tokenizer(sample_text).tokens()
            if tokenizer.is_fast
            else tokenizer.tokenize(sample_text)
        )
        token_count += len(tokens)

    return char_count / token_count


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Reports the trainable element count, the total element count, and the
    trainable share as a percentage, all on a single line.
    """
    # (element count, requires_grad) for every named parameter.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )



In [3]:

def prepare_sample_text(example):
    """Return the sample's ``prompt`` field rendered as a string."""
    return format(example["prompt"])

In [4]:

# Read the train and dev prompt CSVs into pandas DataFrames.
dataset = pd.read_csv('datasets/train_docred_prompt_wES.csv')
dev = pd.read_csv('datasets/dev_docred_prompt_wES.csv')

# Wrap each DataFrame in a `datasets.Dataset` for the training pipeline.
dataset_train = Dataset.from_pandas(dataset)
dataset_dev = Dataset.from_pandas(dev)

In [5]:
def create_datasets(tokenizer, dataset_train, dataset_dev, seq_length=1024):
    """
    Wrap the train and dev splits in ``ConstantLengthDataset`` objects.

    Args:
        tokenizer: Tokenizer used to estimate the characters-per-token ratio
            and passed to both ``ConstantLengthDataset`` instances.
        dataset_train: Training split; also used to estimate the ratio.
        dataset_dev: Validation split.
        seq_length: Sequence length passed to both ``ConstantLengthDataset``
            instances. Defaults to 1024 so callers that omit it keep working.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple. The training dataset is
        created with ``infinite=True``, the validation one with
        ``infinite=False``.
    """
    train_data = dataset_train
    valid_data = dataset_dev
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    # The ratio is estimated on the training split only and reused for both datasets.
    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset


In [9]:
# You can adjust the parameters below as needed
def run_training(train_data, val_data):
    """
    Fine-tune the model named by the global ``model_name`` with LoRA via ``SFTTrainer``.

    Args:
        train_data: Training dataset handed to the trainer. Its
            ``start_iteration`` attribute is set to 0 before training.
        val_data: Evaluation dataset handed to the trainer.

    Prints progress messages, runs ``trainer.train()``, and finally calls
    ``save_pretrained`` on ``trainer.model`` with the path
    ``OUTPUT_DIRECTORY/final_checkpoint/``.
    """
    print("Loading the model")

    peft_settings = LoraConfig(
        task_type="CAUSAL_LM",
        bias="none",
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
    )

    train_data.start_iteration = 0

    print("Starting main loop")

    # Grouped by concern; all values are passed to TrainingArguments unchanged.
    schedule_options = dict(
        max_steps=1000,
        eval_steps=500,
        save_steps=500,
        logging_steps=1,
        evaluation_strategy="steps",
    )
    optimizer_options = dict(
        learning_rate=1e-4,
        lr_scheduler_type='cosine',
        warmup_steps=100,
        weight_decay=0.05,
    )
    batching_options = dict(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1,
        dataloader_drop_last=True,
    )
    runtime_options = dict(
        gradient_checkpointing=False,
        fp16=True,
        bf16=False,
        ddp_find_unused_parameters=False,
    )
    hf_args = TrainingArguments(
        output_dir='OUTPUT_DIRECTORY_NAME',
        run_name="L3_18b-Insturct-02-ft",
        report_to="wandb",  # Turn this option off if you don't want to report to wandb
        **schedule_options,
        **optimizer_options,
        **batching_options,
        **runtime_options,
    )

    # Map the whole model onto the device index of this Accelerate process.
    process_index = Accelerator().process_index
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map={"": process_index},
    )

    sft_trainer = SFTTrainer(
        model=base_model,
        args=hf_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        peft_config=peft_settings,
        packing=True,
    )

    print_trainable_parameters(sft_trainer.model)

    print("Training...")
    sft_trainer.train()

    print("Saving last checkpoint of the model")
    checkpoint_dir = os.path.join("OUTPUT_DIRECTORY", "final_checkpoint/")
    sft_trainer.model.save_pretrained(checkpoint_dir)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required when `model_name` refers to a gated or private model — confirm.
from huggingface_hub import notebook_login

notebook_login()

In [8]:
# Load the tokenizer for `model_name`; it is passed to create_datasets and used by generate_output.
tokenizer = AutoTokenizer.from_pretrained(model_name) 

In [10]:
# Pass the sequence length explicitly; 1024 tokens is a starting point —
# adjust it to fit your model's context window and GPU memory.
train_dataset, eval_dataset = create_datasets(tokenizer, dataset_train, dataset_dev, seq_length=1024)

Size of the train set: 3053. Size of the validation set: 998


100%|██████████| 400/400 [00:00<00:00, 542.26it/s]

The character to token ratio of the dataset is: 3.47





In [None]:
# Seed before training so repeated runs start from the same seed.
set_seed(0)
# run_training saves its final checkpoint under this directory, so make sure it exists.
os.makedirs("OUTPUT_DIRECTORY", exist_ok=True)

# Set the transformers logging verbosity to the error level before the run.
logging.set_verbosity_error()
run_training( train_dataset, eval_dataset)

#### The following code will merge the trained model with the original parameters

In [None]:
# Merge step: reload the base model in float16, attach the adapter saved by
# run_training, and fold it into the base weights.
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model without 8-bit loading (training used load_in_8bit=True).
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

# Must match the path used by run_training when saving the final checkpoint.
model_path = "OUTPUT_DIRECTORY/final_checkpoint/"

# NOTE(review): `from_transformers=True` is not a documented PeftModel.from_pretrained argument — confirm it is needed.
peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

# Rebinds `model` to the result of merge_and_unload(); generate_output uses this global.
model = peft_model.merge_and_unload()

#### The following code will run the trained model to generate a response
#### You can change parameters such as temperature and top_k as needed

In [None]:
def formatted_prompt(doc)-> str:
    """Append the ``Example Output`` marker on a new line after ``doc``."""
    return "{}\nExample Output".format(doc)

In [None]:
def generate_output(user_input):
    """
    Generate a response for ``user_input`` with the global ``model``.

    The input is wrapped by ``formatted_prompt``, tokenized with the global
    ``tokenizer``, and passed to ``model.generate``.

    Args:
        user_input: Document text to build the prompt from.

    Returns:
        The part of the decoded output that follows the first
        ``Example Output`` marker, up to the next marker if there is one.
    """
    prompt = formatted_prompt(user_input)

    # Sampling settings; tune temperature / top_k here.
    generation_config = GenerationConfig(
        penalty_alpha=0.6,
        do_sample=True,
        top_k=5,
        temperature=0.3,
        repetition_penalty=1.2,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Tokenize once and move the tensors to the GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, generation_config=generation_config)

    # The decoded text contains the prompt, so keep only what follows the marker.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split('Example Output')[1]