<a href="https://colab.research.google.com/github/rakshitshah280701/InstructAware/blob/main/Option4_Training_DeepSeek_using_Unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment bootstrap: upgrade pip, remove any existing unsloth install, then
# install unsloth with its "colab-new" extra. No versions are pinned here.
!pip install --upgrade pip
!pip uninstall unsloth -y
!pip install unsloth[colab-new]

In [None]:
# Install the remaining libraries imported by the cells below (unpinned versions).
!pip install torch transformers datasets huggingface_hub


In [None]:
import torch
import json
import random
from datasets import load_dataset
from unsloth import FastLanguageModel
from huggingface_hub import login
from google.colab import userdata


In [None]:
# Read the Hugging Face access token from the Colab secret named "DeepSeek".
hf_token = userdata.get("DeepSeek")

# Reject a missing *or empty* secret here, with a message that points at the
# Colab secrets panel, instead of letting a blank token reach `login`.
if not hf_token:
    raise ValueError("Hugging Face token not found in Colab secrets. Make sure you added it correctly.")

# Login to Hugging Face
login(token=hf_token)


In [None]:
from unsloth import FastLanguageModel

# Loader settings. `max_seq_length` is reused by the trainer cell further down.
load_in_4bit = True
dtype = None  # NOTE(review): presumably lets Unsloth pick the dtype itself; confirm
max_seq_length = 2048

# Fetch the base model together with its tokenizer, authenticating with the
# token obtained in the login cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    token=hf_token,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
)

In [None]:
import pandas as pd
import json
import re  # Import regex for text cleaning
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict


# End-of-sequence marker appended to every training example.
EOS_TOKEN = tokenizer.eos_token

# The CSVs (and, later, the log/model folders) live on Google Drive. Nothing
# else in this notebook mounts Drive, so on a fresh runtime the reads below
# would fail with FileNotFoundError. Mount it here if it is not there yet.
import os
from google.colab import drive

if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

# Load cleaned CSVs
CSV_DIR = "/content/drive/MyDrive/InstructAware/REU/OpenAI/Data/InstructAware/CSV"
train_df = pd.read_csv(f"{CSV_DIR}/train_dataset_cleaned.csv")
val_df = pd.read_csv(f"{CSV_DIR}/validation_dataset_cleaned.csv")
test_df = pd.read_csv(f"{CSV_DIR}/test_dataset_cleaned.csv")

# Format to prompt-response style
def format_prompt_response(df, eos_token=None):
    """Render every row of ``df`` into one supervised fine-tuning example.

    The row's "INPUT TEXT" is placed under the "### Detected Text & Locations:"
    heading and its "OUTPUT TEXT" is wrapped in ``<think>...</think>`` under
    "### Response:", followed by the end-of-sequence token.

    Args:
        df: DataFrame providing the "INPUT TEXT" and "OUTPUT TEXT" columns.
        eos_token: String appended right after ``</think>``. When omitted, the
            notebook-level ``EOS_TOKEN`` is used.

    Returns:
        A ``Dataset`` with a single "text" column holding one rendered prompt
        per row of ``df``, in row order.

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = []
    # Walk the two columns directly. `iterrows` builds a Series per row, which
    # is slow and coerces the row's values to one common dtype (so an integer
    # cell could be rendered as "1.0").
    for input_text, output_text in zip(df["INPUT TEXT"], df["OUTPUT TEXT"]):
        # NOTE(review): the template ends with a newline *after* the EOS token;
        # kept as-is so the training text stays unchanged. Confirm it is intended.
        full_text = f"""Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the details and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are an AI vision assistant with advanced capabilities that help people with low vision to understand their surroundings by providing them natural language narratives based on detected signs, bounding box coordinates, and OCR text.

### Detected Text & Locations:
{input_text}

### Response:
<think>{output_text}</think>{eos_token}
"""
        texts.append(full_text)
    return Dataset.from_dict({"text": texts})

# Turn each split's DataFrame into a prompt-formatted HF Dataset
# (train first, then validation, then test).
train_dataset, val_dataset, test_dataset = (
    format_prompt_response(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their split names and persist the result.
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})
dataset.save_to_disk("/content/structured_split_dataset")


In [None]:
from datasets import load_from_disk

# Reload the split dataset from the directory it was written to by
# `dataset.save_to_disk(...)`; this rebinds `dataset` to the on-disk copy.
dataset_path = "/content/structured_split_dataset"
dataset = load_from_disk(dataset_path)


In [None]:
# Folder on Google Drive that the trainer cell uses for its output and logs.
output_dir = "/content/drive/MyDrive/InstructAware/Code/Option4TransformerDeepSeek/Deepseek_Logs"

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)


In [None]:
import inspect

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The installs at the top of this notebook are unpinned, so use
# whichever spelling the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Supervised fine-tuning on the "text" column of the train split, evaluated on
# the validation split every `eval_steps` optimizer steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_dir=output_dir,       # TensorBoard event files are written here
        logging_steps=10,             # log every 10 steps
        eval_steps=20,
        save_strategy="no",           # no checkpoints are written during training
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_dir,        # required by TrainingArguments; shares the log folder
        report_to="tensorboard",
        **{eval_strategy_kwarg: "steps"},
    ),
)

trainer_stats = trainer.train()


In [None]:
# Load the TensorBoard notebook extension and open it on the folder the training
# cell passes as `logging_dir` (the path is repeated literally here, so keep the
# two in sync if `output_dir` changes).
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/InstructAware/Code/Option4TransformerDeepSeek/Deepseek_Logs


In [None]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Example inference loop over test data.
# Bounded by the split size so a test set with fewer than 3 rows does not raise.
num_examples = min(3, len(dataset["test"]))  # Change to len(dataset["test"]) for full
for i in range(num_examples):
    # Keep everything up to the response header and drop the reference answer.
    test_prompt = dataset["test"][i]["text"].split("### Response:")[0] + "### Response:\n"

    inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        use_cache=True,
    )
    # The generated sequence starts with the prompt tokens. Slice them off by
    # length rather than splitting decoded text on "### Response:", which would
    # cut the answer short if the model emits that marker again.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"🔍 Prediction {i+1}:\n", response.strip())
    print("─" * 100)


In [None]:
# Destination folder on Google Drive for this run's artifacts.
save_model_path = "/content/drive/MyDrive/InstructAware/Code/Option4TransformerDeepSeek/7thAprilRun"

# Write the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_model_path)

print(f"✅ Model and tokenizer saved at: {save_model_path}")


In [None]:
from tqdm import tqdm
import pandas as pd

import os

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Load original test data with ground truth
test_df_raw = pd.read_csv("/content/drive/MyDrive/InstructAware/REU/OpenAI/Data/InstructAware/CSV/test_dataset_cleaned.csv")

# Only the lengths are checked; rows are assumed to be in the same order as
# dataset["test"], which was built from this same CSV.
assert len(test_df_raw) == len(dataset["test"])

# Create the destination folder *before* the long generation loop, so the run
# cannot fail at the final write because the folder does not exist yet.
output_csv_path = "/content/drive/MyDrive/InstructAware/Code/Option4TransformerDeepSeek/7thAprilRun/Output.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

predictions = []

for i in tqdm(range(len(dataset["test"]))):
    example = dataset["test"][i]["text"]
    # Keep everything up to the response header and drop the reference answer.
    prompt = example.split("### Response:")[0] + "### Response:\n"

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        use_cache=True,
    )
    # The generated sequence starts with the prompt tokens. Slice them off by
    # length rather than splitting decoded text on "### Response:", which would
    # cut the answer short if the model emits that marker again.
    prompt_length = inputs.input_ids.shape[1]
    response_clean = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    predictions.append({
        "INPUT TEXT": test_df_raw.iloc[i]["INPUT TEXT"],
        "ORIGINAL OUTPUT TEXT": test_df_raw.iloc[i]["OUTPUT TEXT"],
        "PREDICTED OUTPUT TEXT": response_clean
    })

# Convert to DataFrame
pred_df = pd.DataFrame(predictions)

# Save to CSV
pred_df.to_csv(output_csv_path, index=False)

print(f"✅ Predictions with ground truth saved to: {output_csv_path}")


In [None]:
import pandas as pd

# Load the CSV file
csv_path = "/content/drive/MyDrive/InstructAware/Code/Option4TransformerDeepSeek/7thAprilRun/Output.csv"  # Update the path if needed
df = pd.read_csv(csv_path)

# Display first few rows. A bare `df.head()` is only rendered when it is the
# last expression of a cell, so it has to be displayed explicitly here.
display(df.head())

from google.colab import data_table

# Display CSV as an interactive table
data_table.DataTable(df)