# Creating a general-purpose fine-tuned LLM starts from two things
1. The structure of the data that we are going to use to train the LLM
2. The base LLM that we want to fine-tune for our specific use case

# The main workflow proceeds in the following steps
1. Data loading
2. Data preprocessing (AutoTokenizer)
3. PEFT (parameter-efficient fine-tuning) and LoRA
4. Adjusting the model's training arguments
5. Model training
6. Model saving
7. Optionally, merging the base model with the PEFT adapter into a new model
8. Saving the model so it can be used

# Terminology you should know beforehand
## Parameter-efficient transfer learning
## Parameter-efficient fine-tuning with LoRA
## The need for LoRA and QLoRA

In [None]:
import json
import os

from datasets import Dataset
from datasets import load_dataset
from peft import LoraConfig, PeftModel
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer

# Data loading function

In [None]:
def load_raw_data(file_path):
    """Return the complete contents of a UTF-8 text file as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content as a single ``str``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def load_question_answer(file_path):
    """Load question/answer records from a JSON file as training strings.

    The file must contain a JSON array of objects, each with a
    ``"question"`` and an ``"answer"`` key.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        A list with one ``"Question: ... Answer: ..."`` string per record,
        in file order.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks the ``"question"`` or ``"answer"`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Parse the JSON; iterating the raw file text would walk it
        # character by character instead of record by record.
        qa_data = json.load(file)
    return [
        f"Question: {qa['question']} Answer: {qa['answer']}"
        for qa in qa_data
    ]

def load_chunk(file_path, chunk_size):
    """Read the leading part of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Value forwarded to ``file.read``; the maximum number
            of characters to return from the start of the file.

    Returns:
        The text that was read, as a ``str``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        head = handle.read(chunk_size)
    return head


# PEFT and LoRA argument settings

In [None]:
class fine_tune_llm:
    """Settings and helper steps for QLoRA fine-tuning of a causal LM.

    Every hyper-parameter is stored as an instance attribute by
    ``__init__`` so that the helper methods (and calling code) can read
    or override it before the corresponding step is run.
    """

    def __init__(self, Model_to_be_used_name="NousResearch/Llama-2-7b-chat-hf", new_model_name="Llama-2-7b-chat-finetune"):
        """Store the model names and the default hyper-parameters.

        Args:
            Model_to_be_used_name: Identifier of the base model to load.
            new_model_name: Name/path under which the fine-tuned weights
                are saved.
        """
        # Model and dataset
        self.model_name = Model_to_be_used_name
        self.new_model = new_model_name

        # QLoRA parameters
        self.lora_r = 64
        self.lora_alpha = 16
        self.lora_dropout = 0.1

        # bitsandbytes parameters
        self.use_4bit = True
        self.bnb_4bit_compute_dtype = "float16"
        self.bnb_4bit_quant_type = "nf4"
        self.use_nested_quant = False

        # TrainingArguments parameters
        self.output_dir = "./results"
        self.num_train_epochs = 1
        self.fp16 = False
        self.bf16 = False
        self.per_device_train_batch_size = 4
        self.per_device_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.gradient_checkpointing = True
        self.max_grad_norm = 0.3
        self.learning_rate = 2e-4
        self.weight_decay = 0.001
        self.optim = "paged_adamw_32bit"
        self.lr_scheduler_type = "cosine"
        self.max_steps = -1
        self.warmup_ratio = 0.03
        self.group_by_length = True
        self.save_steps = 0
        self.logging_steps = 25

        # SFTTrainer parameters
        self.max_seq_length = None
        self.packing = False

        # Passed as device_map when loading the model
        self.device_map = {"": 0}

    def load_config(self):
        """Build the bitsandbytes quantization config.

        Returns:
            A ``(compute_dtype, bnb_config)`` tuple: the torch dtype named
            by ``self.bnb_4bit_compute_dtype`` and the
            ``BitsAndBytesConfig`` built from the bitsandbytes settings.
        """
        # Resolve the dtype name (e.g. "float16") to the torch dtype object.
        compute_dtype = getattr(torch, self.bnb_4bit_compute_dtype)
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=self.use_4bit,
            bnb_4bit_quant_type=self.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=self.use_nested_quant,
        )
        return compute_dtype, bnb_config

    def check_gpu_compatibility(self, compute_dtype):
        """Print a hint when the GPU could train with bf16 instead of fp16.

        Args:
            compute_dtype: The torch dtype returned by ``load_config``.
        """
        if not (compute_dtype == torch.float16 and self.use_4bit):
            return
        # Without a CUDA device there is no capability to query.
        if not torch.cuda.is_available():
            return
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16: accelerate training with bf16=True")
            print("=" * 80)

    def load_base_model(self, bnb_config=None):
        """Load the base model with the given quantization config.

        Args:
            bnb_config: Quantization config to load the model with. When
                omitted, the config produced by ``load_config`` is used.

        Returns:
            The loaded model, with ``use_cache`` set to ``False`` and
            ``pretraining_tp`` set to ``1`` on its config.
        """
        if bnb_config is None:
            _, bnb_config = self.load_config()
        base_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map=self.device_map,
        )
        base_model.config.use_cache = False
        base_model.config.pretraining_tp = 1
        return base_model

    def LLama_tokenizer(self):
        """Load the tokenizer of the base model.

        Returns:
            The tokenizer, with its pad token set to its EOS token and
            padding applied on the right side.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        return tokenizer

    def Lora_config(self):
        """Build the LoRA configuration from the stored QLoRA settings.

        Returns:
            A ``LoraConfig`` for the ``"CAUSAL_LM"`` task type.
        """
        peft_config = LoraConfig(
            lora_alpha=self.lora_alpha,
            lora_dropout=self.lora_dropout,
            r=self.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
        )
        return peft_config

    def set_training_parameter(self):
        """Build the ``TrainingArguments`` from the stored settings.

        Returns:
            A ``TrainingArguments`` instance that reports to TensorBoard.
        """
        training_arguments = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.per_device_train_batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            optim=self.optim,
            save_steps=self.save_steps,
            logging_steps=self.logging_steps,
            learning_rate=self.learning_rate,
            weight_decay=self.weight_decay,
            fp16=self.fp16,
            bf16=self.bf16,
            max_grad_norm=self.max_grad_norm,
            max_steps=self.max_steps,
            warmup_ratio=self.warmup_ratio,
            group_by_length=self.group_by_length,
            lr_scheduler_type=self.lr_scheduler_type,
            report_to="tensorboard",
        )
        return training_arguments

    def set_fine_tuning_parameters(self, model, dataset, tokenizer, peft_config, training_arguments):
        """Create the supervised fine-tuning trainer.

        Args:
            model: Model to fine-tune (e.g. from ``load_base_model``).
            dataset: Training dataset; the trainer reads its ``"text"``
                field.
            tokenizer: Tokenizer to use (e.g. from ``LLama_tokenizer``).
            peft_config: PEFT config (e.g. from ``Lora_config``).
            training_arguments: Arguments from ``set_training_parameter``.

        Returns:
            The configured ``SFTTrainer``.
        """
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
            packing=self.packing,
        )
        return trainer

    def model_save(self, Trainer):
        """Save the trainer's model under ``self.new_model``.

        Args:
            Trainer: Trainer whose ``model`` should be saved.

        Returns:
            Whatever ``Trainer.model.save_pretrained`` returns.
        """
        return Trainer.model.save_pretrained(self.new_model)



# Model training

# Fine-tuning with question-and-answer data

In [None]:
# End-to-end QLoRA fine-tuning on question/answer data, followed by merging
# the trained LoRA adapter into an FP16 copy of the base model.

# Create the helper object that holds every fine-tuning setting
a = fine_tune_llm()

# Load the question-answer examples
data_path = "your_data_path.json"  # Ensure this points to your JSON file
data = load_question_answer(data_path)

# The trainer is configured with dataset_text_field="text", so the examples
# must be exposed as a dataset with a "text" column (expects `data` to be a
# list of strings, one per training example).
dataset = Dataset.from_dict({"text": data})

# Load the configuration for compute_dtype and other related configurations
compute_dtype, bnb_config = a.load_config()

# Check GPU compatibility with the compute_dtype
a.check_gpu_compatibility(compute_dtype)

# Load the base model to be fine-tuned, quantized according to bnb_config
base_model = a.load_base_model(bnb_config)

# Initialize the tokenizer specific to LLaMA
tokenizer = a.LLama_tokenizer()

# Set up PEFT configuration (e.g., for LoRA)
peft_config = a.Lora_config()

# Set training parameters for fine-tuning
training_arguments = a.set_training_parameter()

# Set fine-tuning parameters with the loaded data
trainer = a.set_fine_tuning_parameters(base_model, dataset, tokenizer, peft_config, training_arguments)

# Start the training process
trainer.train()

# Save the fine-tuned model
trainer.model.save_pretrained(a.new_model)

# Reload the model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    a.model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=a.device_map,
)
model = PeftModel.from_pretrained(base_model, a.new_model)
model = model.merge_and_unload()

# Reload the tokenizer that matches the merged model
tokenizer = AutoTokenizer.from_pretrained(a.model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Fine-tuning with raw data

In [None]:
# End-to-end QLoRA fine-tuning on raw text, followed by merging the trained
# LoRA adapter into an FP16 copy of the base model.

# Initialize the fine-tuning class instance
a = fine_tune_llm()

# Path to your dataset (make sure it’s properly formatted for fine-tuning)
data_path = "your_data_path"

# Load the raw data from the specified path
data = load_raw_data(data_path)

# The trainer is configured with dataset_text_field="text", so the raw text
# must be exposed as a dataset with a "text" column. Each non-empty line
# becomes one training example; adjust the split to suit your corpus.
dataset = Dataset.from_dict(
    {"text": [line for line in data.splitlines() if line.strip()]}
)

# Load the configuration settings, including compute dtype and bitsandbytes config
compute_dtype, bnb_config = a.load_config()

# Check if the GPU is compatible with the selected compute dtype
a.check_gpu_compatibility(compute_dtype)

# Load the base model that you want to fine-tune, quantized according to bnb_config
base_model = a.load_base_model(bnb_config)

# Load the tokenizer associated with the LLaMA model
tokenizer = a.LLama_tokenizer()

# Load the LoRA (Low-Rank Adaptation) configuration for PEFT (Parameter-Efficient Fine-Tuning)
peft_config = a.Lora_config()

# Set the training parameters (e.g., learning rate, batch size, number of epochs)
training_arguments = a.set_training_parameter()

# Set up the fine-tuning trainer with the model, data, tokenizer, and training arguments
trainer = a.set_fine_tuning_parameters(base_model, dataset, tokenizer, peft_config, training_arguments)

# Start the training process
trainer.train()

# Save the fine-tuned model to the specified directory
trainer.model.save_pretrained(a.new_model)

# Reload the base model in FP16 (16-bit floating point) and merge it with the LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    a.model_name,  # The original base model's name
    low_cpu_mem_usage=True,  # Passed through to from_pretrained
    return_dict=True,  # Passed through to from_pretrained
    torch_dtype=torch.float16,  # Use FP16 precision
    device_map=a.device_map,  # Same device placement as used for training
)

# Load the fine-tuned model with LoRA weights
model = PeftModel.from_pretrained(base_model, a.new_model)

# Merge LoRA weights with the base model
model = model.merge_and_unload()

# Reload the tokenizer that matches the merged model
tokenizer = AutoTokenizer.from_pretrained(a.model_name, trust_remote_code=True)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Ensure padding is applied to the right side of the sequence
tokenizer.padding_side = "right"


# Model Evaluation

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/runs

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])