<a href="https://colab.research.google.com/github/danishnaseer00/codeminx/blob/main/Fine_Tuning_with_Unsloth_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install Unsloth (Colab extra) straight from its GitHub repository, then the
# Hugging Face training stack. %%capture keeps the installer output out of the
# notebook.
# NOTE(review): none of these packages are version-pinned, so a later run may
# resolve different versions than the ones this notebook was developed with.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install transformers datasets peft accelerate bitsandbytes

In [None]:
# NOTE(review): unsloth is imported ahead of transformers here; presumably the
# order matters for its patches — confirm before regrouping these imports.
from unsloth import FastLanguageModel
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling , EarlyStoppingCallback
from datasets import load_dataset
import torch
import os
# Configure the PyTorch CUDA allocator through its environment variable.
# NOTE(review): this is set after `import torch`; presumably it only needs to
# be in place before the first CUDA allocation — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import huggingface_hub

# Connectivity probe: request the Hugging Face home page through the hub's own
# HTTP session and show the status code that comes back.
hub_session = huggingface_hub.utils.get_session()
hub_response = hub_session.get("https://huggingface.co")
print(hub_response.status_code)

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named 'HF-TOKEN'.
HF_TOKEN = userdata.get('HF-TOKEN')

# Only attempt the login when a non-empty token came back; otherwise tell the
# user which piece of setup is missing.
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Hugging Face token not found in Colab Secrets. Please add it.")

In [None]:
# Loader settings. `max_seq_length` is reused later for tokenization and for
# the generation helper, so it stays a notebook-level name.
max_seq_length = 1024
dtype = None          # None leaves the precision choice to the loader
load_in_4bit = True   # request the 4-bit variant of the checkpoint

# Gather the loader arguments in one place, then load model and tokenizer.
model_load_kwargs = dict(
    model_name="unsloth/phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_kwargs)

In [None]:
# Chat markers used by the prompt template in this notebook; make sure the
# tokenizer treats each of them as a special token.
special_tokens = ["<|user|>", "<|assistant|>", "<|end|>"]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

# Only ever grow the embedding matrix. A checkpoint may ship with more
# embedding rows than the tokenizer has entries; resizing unconditionally to
# len(tokenizer) would then shrink the matrix and make the model's vocabulary
# size diverge from the base checkpoint for no benefit.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyperparameters: rank 16, alpha 16, no dropout, no bias terms,
# Unsloth gradient checkpointing, fixed seed, no rsLoRA / LoftQ.
lora_settings = dict(
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Wrap the loaded model with the adapters described above.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
# Load the training split of the instruction dataset, shuffle it with a fixed
# seed and keep the first 7000 rows of the shuffled order.
raw_dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
sampled_dataset = raw_dataset.shuffle(seed=3407).select(range(7000))

# Hold out 15% of the sample as the "test" split (same seed for repeatability).
dataset = sampled_dataset.train_test_split(test_size=0.15, seed=3407)

In [None]:
# Display the split dataset (its "train" and "test" parts are used below).
dataset

In [None]:
def format_prompts(examples):
    """
    Turn a batch of instruction rows into chat-formatted training strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        ``{"text": [...]}`` with one string per row. The user turn holds the
        instruction, followed by an ``Input:`` line only when the row's input
        is non-blank; the assistant turn holds the output. A row whose
        formatting raises is reported on stdout and contributes an empty
        string, so the result always has one entry per row.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])

    texts = []
    for instruction, input_text, output in rows:
        try:
            # Blank or missing inputs are left out of the user turn entirely.
            has_input = bool(input_text and input_text.strip())
            user_turn = f"{instruction}\nInput: {input_text}" if has_input else f"{instruction}"
            texts.append(f"<|user|>\n{user_turn}<|end|>\n<|assistant|>\n{output}<|end|>")
        except Exception as e:
            print(f"Error formatting prompt: {e}")
            texts.append("")
    return {"text": texts}

# Add the formatted "text" column to both splits, in train/test order.
train_dataset, eval_dataset = (
    dataset[split_name].map(format_prompts, batched=True)
    for split_name in ("train", "test")
)

def tokenize_function(examples):
    """Tokenize the "text" column of a batch, truncating to ``max_seq_length``.

    No padding is applied here. The previous version padded every example to
    the full ``max_seq_length``, so each training step processed that many
    tokens regardless of how short the sample was. Padding is left to the
    data collator, which pads each batch only up to its longest member.

    Args:
        examples: Batch mapping that contains a "text" sequence of strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_length)

# Tokenize both splits and drop every pre-existing column, so only the
# tokenizer's output columns remain.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Collator configured for causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pick the mixed-precision mode from the GPU instead of hard-coding fp16.
# The model is loaded with dtype=None, i.e. the loader chooses the precision,
# so forcing fp16 on a bf16-capable GPU (compute capability >= 8) can conflict
# with the dtype the model ended up in. Older GPUs keep fp16 as before.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./Fine_tuned_LLM",
    # One sample per step, accumulated over 8 steps -> effective batch of 8.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    max_steps=100,
    learning_rate=1e-5,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    # Evaluate and checkpoint on the same 50-step cadence so the best
    # checkpoint (lowest eval loss) can be restored when training ends.
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none"
)

In [None]:
# Early stopping: give up after 5 evaluations in a row that fail to improve
# the monitored metric by more than 0.001.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.001,
)

# Wire the model, arguments, tokenized splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

In [None]:
# Release cached GPU blocks first so the "before" reading is meaningful.
torch.cuda.empty_cache()
print("Starting training...")
allocated_before_gb = torch.cuda.memory_allocated() / 1024**3
print(f"GPU memory before training: {allocated_before_gb:.2f} GB")

trainer.train()

print("Training completed!")
allocated_after_gb = torch.cuda.memory_allocated() / 1024**3
print(f"GPU memory after training: {allocated_after_gb:.2f} GB")

In [None]:
def validate_model(prompt, max_length=512):
    """Generate a reply to ``prompt`` with the current model and return only the reply.

    The prompt is wrapped in the same ``<|user|>`` / ``<|assistant|>`` markers
    used to build the training text.

    Args:
        prompt: The user message to answer.
        max_length: Maximum number of new tokens to generate. (Previously this
            argument was ignored in favour of a fixed limit of 100.)

    Returns:
        The decoded continuation with surrounding whitespace stripped. The
        prompt tokens are excluded.
    """
    inputs = tokenizer(
        f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n",
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length
    ).to("cuda")

    # Remember where the prompt ends so it can be cut off the output below.
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            # generate() needs the tokenizer to evaluate stop_strings.
            tokenizer=tokenizer,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            stop_strings=["```\n", "### Instruction:", "### Input:", "### Response:"]
        )

    # Decode only the newly generated tokens. Decoding the whole sequence with
    # skip_special_tokens=True drops the "<|assistant|>" marker, so splitting
    # on it never separated the prompt from the reply.
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Test the model
# Smoke test: send one sample coding prompt through validate_model and print
# whatever text it returns.
test_prompt = "Write function to check a string is palindrome or not "
print("\nTest Generation:")
print(validate_model(test_prompt))

In [None]:
# Collator configured for causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pick the mixed-precision mode from the GPU instead of hard-coding fp16.
# The model is loaded with dtype=None, i.e. the loader chooses the precision,
# so forcing fp16 on a bf16-capable GPU (compute capability >= 8) can conflict
# with the dtype the model ended up in. Older GPUs keep fp16 as before.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Arguments for the second training pass: same setup as the first pass, but
# with a higher learning rate (2e-4).
training_args = TrainingArguments(
    output_dir="./Fine_tuned_LLM",
    # One sample per step, accumulated over 8 steps -> effective batch of 8.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    max_steps=100,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    # Evaluate and checkpoint on the same 50-step cadence so the best
    # checkpoint (lowest eval loss) can be restored when training ends.
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none"
)

In [None]:
# Early stopping: give up after 5 evaluations in a row that fail to improve
# the monitored metric by more than 0.001.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.001,
)

# Rebuild the Trainer so it picks up the current `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

In [None]:
# Release cached GPU blocks first so the "before" reading is meaningful.
torch.cuda.empty_cache()
print("Starting training...")
print(f"GPU memory before training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

# Second training pass. `model` already holds the weights produced by the
# earlier training cell, so this continues from them with a fresh optimizer
# and learning-rate schedule built from the current `training_args`.
# The previous call resumed from a hard-coded ".../checkpoint-100" path. That
# checkpoint already sits at the max_steps limit of 100, leaving no step
# budget for this run, and its restored optimizer/scheduler state would take
# precedence over the learning rate configured for this pass.
trainer.train()

print("Training completed!")
print(f"GPU memory after training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

In [None]:
def validate_model(prompt, max_length=512):
    """Generate a reply to ``prompt`` with the current model and return only the reply.

    The prompt is wrapped in the same ``<|user|>`` / ``<|assistant|>`` markers
    used to build the training text.

    Args:
        prompt: The user message to answer.
        max_length: Maximum number of new tokens to generate.

    Returns:
        The decoded continuation with surrounding whitespace stripped. The
        prompt tokens are excluded.
    """
    inputs = tokenizer(
        f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n",
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length
    ).to("cuda")

    # Remember where the prompt ends so it can be cut off the output below.
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            # generate() needs the tokenizer to evaluate stop_strings.
            tokenizer=tokenizer,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            stop_strings=["```\n", "### Instruction:", "### Input:", "### Response:"]
        )

    # Decode only the newly generated tokens. Decoding the whole sequence with
    # skip_special_tokens=True drops the "<|assistant|>" marker, so splitting
    # on it never separated the prompt from the reply.
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Test the model
# Smoke test after the second training pass: send one sample coding prompt
# through validate_model and print whatever text it returns.
test_prompt = "Write a mergesort algorithm in python."
print("\nTest Generation:")
print(validate_model(test_prompt))

In [None]:
# Write the model and the tokenizer side by side into one local directory.
save_directory = "phi3-python-fine_tuned"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
!apt-get install git

In [None]:
# Set the global git author name and email used for commits made from this
# runtime.
!git config --global user.name "danishnaseer00"
!git config --global user.email "danishmughal.dev@gmail.com"

In [None]:
from google.colab import userdata

# Name of the Colab secret that holds the GitHub token. Kept in one place so
# the lookup and the help message cannot drift apart (the message used to
# name a different secret than the one actually read).
GITHUB_SECRET_NAME = 'GH-token'

# Fetch the token from Colab secrets
try:
    GITHUB_TOKEN = userdata.get(GITHUB_SECRET_NAME)
    # Report only the length so the token itself never lands in the output.
    print("Token fetched successfully (length: {} characters)".format(len(GITHUB_TOKEN)))
except Exception as e:
    print(f"Error fetching token: {e}")
    print(f"Make sure you added the secret named '{GITHUB_SECRET_NAME}' in Colab secrets.")
    GITHUB_TOKEN = None

In [None]:
%%bash
# Stage the Colab working files in a fresh git repository and push them.
# NOTE(review): no credentials are configured in this cell, so the final push
# relies on whatever git authentication already exists in the environment.

# Set up GitHub repository details
REPO_URL="https://github.com/danishnaseer00/codeminx.git"
BRANCH="main"

# Create the staging directory (-p keeps re-runs from failing) and enter it.
# Stop if that fails so git is never initialised in the wrong directory.
mkdir -p Fine_tuned_phi
cd Fine_tuned_phi || exit 1
STAGING_DIR="$(pwd -P)"

# Initialize git
git init
git checkout -b "$BRANCH"

# Copy all files from the Colab environment (adjust paths as needed), skipping
# the staging directory itself: when it lives under /content, `cp -r /content/*`
# would try to copy the directory into itself.
for item in /content/*; do
    [ -e "$item" ] || continue
    if [ "$(realpath "$item")" = "$STAGING_DIR" ]; then
        continue
    fi
    cp -r "$item" .
done

# Add all files including notebook, model, and tokenizer
git add .
git commit -m "Upload Colab notebook, fine-tuned model, and tokenizer"

# Add GitHub remote and push
git remote add origin "$REPO_URL"
git push -u origin "$BRANCH"

In [None]:
pip install nbformat==5.10.4 nbconvert==7.16.6

In [None]:
pip install --upgrade jupyter notebook nbformat nbconvert