In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from huggingface_hub import HfApi, create_repo

# Hugging Face Hub credentials.
# SECURITY: an access token must never be committed to a notebook. It is read
# from the HF_TOKEN environment variable instead. When the variable is unset
# HF_TOKEN is None and huggingface_hub falls back to the token stored by
# `huggingface-cli login`. Any token that was ever committed in this cell
# should be revoked on the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN")
username = "hemanthkandimalla"
api = HfApi(token=HF_TOKEN)

# Model and dataset parameters
model_name = "NousResearch/Llama-2-7b-chat-hf"  # base checkpoint to fine-tune
dataset_name = "mlabonne/guanaco-llama2-1k"  # not read by this cell (it trains on the CSV below)
new_model_name = "/content/drive/MyDrive/HEMANTH_llms"  # local directory the trained model is saved to
csv_file_path = "/content/drive/MyDrive/training.csv"  # training data
text_column = "input"  # CSV column holding the first part of each example
id_column = "id"  # not read by this cell
question_column = "output"  # CSV column appended after the EOS separator
max_seq_length = 512  # examples are padded/truncated to this many tokens

# Ensure that the CSV file path is provided
if not os.path.exists(csv_file_path):
    raise ValueError("A valid CSV file path must be provided.")

# Load the CSV file as a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Fail early, with a readable message, when an expected column is absent
# instead of hitting a KeyError in the middle of `dataset.map`.
missing_columns = [col for col in (text_column, question_column) if col not in df.columns]
if missing_columns:
    raise ValueError(f"CSV file is missing required column(s): {missing_columns}")

# Empty CSV cells are read as NaN (a float), which cannot be concatenated with
# the EOS string during preprocessing. Drop those rows and make sure both
# columns really hold strings. The index is reset so that no extra index
# column is carried into the Hugging Face Dataset.
df = df.dropna(subset=[text_column, question_column]).reset_index(drop=True)
df = df.astype({text_column: str, question_column: str})

# Convert the cleaned DataFrame into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the tokenizer for the model; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Define the preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of rows as ``<input><eos><output>`` strings.

    Args:
        examples: Batch indexed by column name; the columns named by
            ``text_column`` and ``question_column`` are read and paired up
            element by element.

    Returns:
        The tokenizer output for the joined strings, padded and truncated to
        ``max_seq_length`` tokens.
    """
    separator = tokenizer.eos_token
    joined_texts = []
    for source_text, target_text in zip(examples[text_column], examples[question_column]):
        joined_texts.append(source_text + separator + target_text)
    return tokenizer(
        joined_texts,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )

# Tokenize the whole dataset, one batch of rows at a time
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Load the base causal-LM checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)

# Hyperparameters for the training run
training_args = TrainingArguments(
    # Where results and logs go; no checkpoints are written during training
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=25,
    save_strategy="no",
    report_to="none",
    # Schedule
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # Optimisation
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
)

# Initialize the Trainer
# The tokenized dataset only carries `input_ids` / `attention_mask`. Without a
# `labels` entry the model returns no loss and `Trainer.train()` aborts.
# The causal-LM collator (mlm=False) copies `input_ids` into `labels` for every
# batch and excludes padding positions from the loss. Because the pad token is
# set to the EOS token in this notebook, EOS positions are excluded as well.
from transformers import DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Train the model
trainer.train()

# Save the fine-tuned model to the local output directory
trainer.save_model(new_model_name)

# `new_model_name` is a filesystem path, but a Hub repo id has the form
# "<namespace>/<name>". Use only the last path component as the repo name so
# the id does not contain the directory separators of the local path.
repo_name = os.path.basename(os.path.normpath(new_model_name))
repo_id = f"{username}/{repo_name}-GGUF"

# Create the repository on the Hugging Face Hub (no-op if it already exists)
create_repo(
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="model",
    exist_ok=True
)

# Upload the saved model folder to that repository
api.upload_folder(
    folder_path=new_model_name,
    repo_id=repo_id,
    token=HF_TOKEN
)

print(f"Training complete. Model saved to: {new_model_name}")


In [None]:
import argparse
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import pandas as pd
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import create_repo, HfApi
import torch
# Hugging Face Hub credentials.
# SECURITY: the access token is read from the HF_TOKEN environment variable and
# must never be written into the notebook. When the variable is unset HF_TOKEN
# is None and huggingface_hub falls back to the token stored by
# `huggingface-cli login`. In Google Colab, export the value kept in the
# secrets tab to this environment variable before running the cell. Any token
# that was ever committed here should be revoked on the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN")
username = "hemanthkandimalla"

# Hub client used by main() to create the repository and upload files
api = HfApi(token=HF_TOKEN)

def _build_arg_parser():
    """Create the argument parser holding every fine-tuning option."""
    parser = argparse.ArgumentParser(description="Fine-tune a model from Hugging Face hub using specific parameters.")

    # Model parameters
    parser.add_argument("--model-name", type=str, default="NousResearch/Llama-2-7b-chat-hf", help="The model from the Hugging Face hub to train.")
    parser.add_argument("--dataset-name", type=str, default="mlabonne/guanaco-llama2-1k", help="The instruction dataset to use.")
    parser.add_argument("--new-model", type=str, default="Hemanth_LLMs", help="Fine-tuned model name.")
    parser.add_argument("--csv-file-path", type=str, default="/content/drive/MyDrive/training.csv", help="Path to the local CSV file.")
    parser.add_argument("--text-column", type=str, default="input", help="The column name for the input text in the CSV file.")
    parser.add_argument("--id-column", type=str, default="id", help="The column name for the ids in the CSV file.")
    parser.add_argument("--question-column", type=str, default="question", help="The column name for the questions in the CSV file.")

    # QLoRA parameters
    parser.add_argument("--lora-r", type=int, default=64, help="LoRA attention dimension.")
    parser.add_argument("--lora-alpha", type=int, default=16, help="Alpha parameter for LoRA scaling.")
    parser.add_argument("--lora-dropout", type=float, default=0.1, help="Dropout probability for LoRA layers.")

    # bitsandbytes parameters
    parser.add_argument("--use-4bit", action="store_true", help="Activate 4-bit precision base model loading.")
    parser.add_argument("--bnb-4bit-compute-dtype", type=str, default="float16", help="Compute dtype for 4-bit base models.")
    parser.add_argument("--bnb-4bit-quant-type", type=str, default="nf4", help="Quantization type (fp4 or nf4).")
    parser.add_argument("--use-nested-quant", action="store_true", help="Activate nested quantization for 4-bit base models (double quantization).")

    # TrainingArguments parameters
    parser.add_argument("--output-dir", type=str, default="./results", help="Output directory for model predictions and checkpoints.")
    parser.add_argument("--num-train-epochs", type=int, default=1, help="Number of training epochs.")
    parser.add_argument("--fp16", action="store_true", help="Enable fp16 training.")
    parser.add_argument("--bf16", action="store_true", help="Enable bf16 training.")
    parser.add_argument("--per-device-train-batch-size", type=int, default=4, help="Batch size per GPU for training.")
    parser.add_argument("--per-device-eval-batch-size", type=int, default=4, help="Batch size per GPU for evaluation.")
    parser.add_argument("--gradient-accumulation-steps", type=int, default=1, help="Number of update steps to accumulate gradients for.")
    parser.add_argument("--gradient-checkpointing", action="store_true", help="Enable gradient checkpointing.")
    parser.add_argument("--max-grad-norm", type=float, default=0.3, help="Maximum gradient norm (gradient clipping).")
    parser.add_argument("--learning-rate", type=float, default=2e-4, help="Initial learning rate (AdamW optimizer).")
    parser.add_argument("--weight-decay", type=float, default=0.001, help="Weight decay to apply to all layers except bias/LayerNorm weights.")
    parser.add_argument("--optim", type=str, default="paged_adamw_32bit", help="Optimizer to use.")
    parser.add_argument("--lr-scheduler-type", type=str, default="cosine", help="Learning rate schedule.")
    parser.add_argument("--max-steps", type=int, default=-1, help="Number of training steps (overrides num_train_epochs).")
    parser.add_argument("--warmup-ratio", type=float, default=0.03, help="Ratio of steps for a linear warmup (from 0 to learning rate).")
    parser.add_argument("--group-by-length", action="store_true", help="Group sequences into batches with same length.")
    parser.add_argument("--save-steps", type=int, default=0, help="Save checkpoint every X updates steps.")
    parser.add_argument("--logging-steps", type=int, default=25, help="Log every X updates steps.")

    # SFT parameters
    parser.add_argument("--max-seq-length", type=int, default=None, help="Maximum sequence length to use.")
    parser.add_argument("--packing", action="store_true", help="Pack multiple short examples in the same input sequence to increase efficiency.")
    parser.add_argument("--device-map", type=str, default='{"": 0}', help="Load the entire model on the GPU 0.")
    return parser


def _parse_device_map(raw_value):
    """Decode the ``--device-map`` string as JSON, or return it unchanged if it is not JSON."""
    import json

    try:
        return json.loads(raw_value)
    except ValueError:
        # Not JSON (for example a plain keyword): hand the string through as is.
        return raw_value


def main(argv=None):
    """Fine-tune a causal LM with LoRA on a local CSV file and upload the result.

    Steps: parse options, build a ``text`` column of ``<input><eos><output>``
    strings from the CSV, load the tokenizer and base model (4-bit quantized
    when ``--use-4bit`` is given), train with ``SFTTrainer``, save the trained
    model to ``--new-model`` and upload that folder to the Hub repository
    ``<username>/<new-model>-GGUF``.

    Args:
        argv: Optional list of command-line strings. ``None`` (the default)
            makes argparse read ``sys.argv``.

    Raises:
        ValueError: If the CSV path is empty or does not exist.
    """
    parser = _build_arg_parser()
    # A notebook kernel is started with its own command-line arguments, which
    # make `parse_args()` abort. Unknown arguments are reported and skipped so
    # the function runs both as a script and from a notebook cell.
    args, unknown_args = parser.parse_known_args(argv)
    if unknown_args:
        print(f"Ignoring unrecognized arguments: {unknown_args}")

    # Honour --max-seq-length; 512 remains the fallback when it is not given.
    max_seq_length = args.max_seq_length or 512

    # Ensure that the CSV file path is provided
    if not args.csv_file_path or not os.path.exists(args.csv_file_path):
        raise ValueError("A valid CSV file path must be provided.")

    # Load the local CSV file and wrap it in a Hugging Face Dataset
    df = pd.read_csv(args.csv_file_path)
    dataset = Dataset.from_pandas(df)

    # Load LLaMA tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

    def build_text(examples):
        """Join the two configured columns of a batch into one ``text`` column."""
        return {
            "text": [
                ex + tokenizer.eos_token + q
                for ex, q in zip(examples[args.text_column], examples[args.question_column])
            ]
        }

    # SFTTrainer tokenizes the column named by `dataset_text_field` itself, so
    # it is given raw text rather than a pre-tokenized `input_ids` column.
    text_dataset = dataset.map(build_text, batched=True)

    # Optional 4-bit (QLoRA) loading, driven by the bitsandbytes flags.
    # Without --use-4bit the model is loaded with library defaults.
    model_kwargs = {}
    if args.use_4bit:
        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.use_nested_quant,
        )
        model_kwargs["device_map"] = _parse_device_map(args.device_map)

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(args.model_name, **model_kwargs)
    model.config.use_cache = False

    # Load LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Set training parameters
    training_arguments = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        optim=args.optim,
        save_steps=args.save_steps,
        logging_steps=args.logging_steps,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        fp16=args.fp16,
        bf16=args.bf16,
        max_grad_norm=args.max_grad_norm,
        max_steps=args.max_steps,
        warmup_ratio=args.warmup_ratio,
        group_by_length=args.group_by_length,
        lr_scheduler_type=args.lr_scheduler_type,
        report_to="tensorboard"
    )
    trainer = SFTTrainer(
        model=model,
        train_dataset=text_dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=args.packing,
    )

    # Train model
    trainer.train()
    print(f"Fine-tuning of {args.model_name} completed")

    # Save trained model
    trainer.model.save_pretrained(args.new_model)
    print("Training complete. Model saved to:", args.new_model)

    # A Hub repo id is "<namespace>/<name>": keep only the last path component
    # in case --new-model was given as a directory path.
    repo_name = os.path.basename(os.path.normpath(args.new_model))
    repo_id = f"{username}/{repo_name}-GGUF"

    # Create the repo through the configured client so it uses the same
    # credentials as the upload below.
    api.create_repo(
        repo_id=repo_id,
        repo_type="model",
        exist_ok=True,
    )

    # Upload the saved folder. No filename filter is applied: the folder is
    # written by `save_pretrained` above, and restricting the upload to
    # "*.gguf" would skip those files.
    api.upload_folder(
        folder_path=args.new_model,
        repo_id=repo_id,
    )


if __name__ == "__main__":
    main()

In [None]:
# --- Model / data -------------------------------------------------------------
model_name = "NousResearch/Llama-2-7b-chat-hf"  # Hub checkpoint to fine-tune
dataset_name = "mlabonne/guanaco-llama2-1k"  # instruction dataset
new_model = "llama-2-7b-miniguanaco"  # name of the fine-tuned model

# --- QLoRA --------------------------------------------------------------------
lora_r = 64  # LoRA attention dimension
lora_alpha = 16  # alpha parameter for LoRA scaling
lora_dropout = 0.1  # dropout probability for LoRA layers

# --- bitsandbytes ---------------------------------------------------------------
use_4bit = True  # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4"  # quantization type (fp4 or nf4)
use_nested_quant = False  # nested (double) quantization for 4-bit base models

# --- TrainingArguments ----------------------------------------------------------
output_dir = "./results"  # where predictions and checkpoints are stored
num_train_epochs = 1  # number of training epochs
fp16 = False  # fp16 training
bf16 = False  # bf16 training (set to True with an A100)
per_device_train_batch_size = 4  # batch size per GPU for training
per_device_eval_batch_size = 4  # batch size per GPU for evaluation
gradient_accumulation_steps = 1  # update steps to accumulate gradients for
gradient_checkpointing = True  # enable gradient checkpointing
max_grad_norm = 0.3  # maximum gradient norm (gradient clipping)
learning_rate = 2e-4  # initial learning rate (AdamW optimizer)
weight_decay = 0.001  # applied to all layers except bias/LayerNorm weights
optim = "paged_adamw_32bit"  # optimizer to use
lr_scheduler_type = "cosine"  # learning rate schedule
max_steps = -1  # number of training steps (overrides num_train_epochs)
warmup_ratio = 0.03  # ratio of steps for a linear warmup (from 0 to learning rate)
group_by_length = True  # batch sequences of similar length: saves memory, speeds up training
save_steps = 0  # save a checkpoint every X update steps
logging_steps = 25  # log every X update steps

# --- SFT ------------------------------------------------------------------------
max_seq_length = None  # maximum sequence length to use
packing = False  # pack several short examples into one input sequence
device_map = {"": 0}  # load the entire model on GPU 0

In [None]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Build the bitsandbytes quantization settings from the configuration cell.
# The compute dtype is given there as a string and resolved to a torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16 (informational only).
# The capability query needs a CUDA device, so it is skipped when none is
# available rather than failing the cell on this hint.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Base model, loaded with the quantization settings and device placement
# prepared earlier in the notebook
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# LLaMA tokenizer: the EOS token is reused for padding
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training parameters, all taken from the configuration cell
training_arguments = TrainingArguments(
    # Output, checkpointing and logging
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    report_to="tensorboard",
    # Run length and batching
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # Optimizer and schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # Mixed precision
    fp16=fp16,
    bf16=bf16,
)
# Supervised fine-tuning: the trainer reads the "text" column of the dataset
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

# Run training
trainer.train()

# Write the trained model to the `new_model` directory
trainer.model.save_pretrained(new_model)

In [None]:
# Only let CRITICAL messages from transformers through
logging.set_verbosity(logging.CRITICAL)

# Generate text with the new model: the prompt is wrapped in [INST] ... [/INST]
# tags and generation stops at 200 tokens in total
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])

In [None]:
# Empty VRAM
import gc

# Drop the references that keep the training-time model alive
del model
del pipe
del trainer

# Collect twice, as the notebook already did
gc.collect()
gc.collect()

# Garbage collection only releases the Python objects; PyTorch keeps the freed
# GPU blocks in its cache. Release that cache too so the memory is actually
# returned before the model is reloaded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Load the base checkpoint again in FP16, attach the LoRA weights saved under
# `new_model`, and merge them into the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    return_dict=True,
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer instance so it can be saved alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)