In [None]:
# Cell 1: Install Libraries
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install "transformers[torch]" datasets scikit-learn "trl>=0.8.0"
!pip install gdown

print("✅ All libraries installed.")



In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
!pip install --upgrade -qqq uv
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm
else:
    try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
    except: get_numpy = "numpy"
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    get_vllm, get_triton = ("vllm==0.10.1", "triton==3.2.0") if is_t4 else ("vllm", "triton")
    !uv pip install -qqq --upgrade \
        unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2

In [None]:
# Cell 2: Download Dataset Directly from Your Links
import gdown
import os

# Google Drive share links for the train / test CSV files.
train_url = "https://drive.google.com/file/d/1KX5H7QESPo72HE9RguDEfVuhQAwYZB5y/view?usp=drive_link"
test_url = "https://drive.google.com/file/d/1_8-E4cmA51y5kMfgUhWCihA1CfZ4DEvh/view?usp=drive_link"

# Local filenames the downloads are written to.
train_file_output = "train.csv"
test_file_output = "test.csv"


def _download_or_fail(url, output):
    """Download `url` to `output` with gdown and fail loudly if nothing was saved.

    Args:
        url: Google Drive share link (`fuzzy=True` lets gdown extract the file id).
        output: Local path to write the file to.

    Returns:
        Whatever `gdown.download` returned (presumably the output path — see
        the gdown docs).

    Raises:
        RuntimeError: if gdown returned None or `output` does not exist afterwards.
    """
    result = gdown.download(url=url, output=output, quiet=False, fuzzy=True)
    # Do not assume success: check both the return value and the file on disk
    # so the success message below is only printed when the data is really there.
    if result is None or not os.path.isfile(output):
        raise RuntimeError(f"Failed to download '{output}' from {url}")
    return result


print("Downloading training data...")
_download_or_fail(train_url, train_file_output)

print("\nDownloading test data...")
_download_or_fail(test_url, test_file_output)

print("\n✅ Successfully downloaded dataset files.")

Downloading training data...


Downloading...
From: https://drive.google.com/uc?id=1KX5H7QESPo72HE9RguDEfVuhQAwYZB5y
To: /content/train.csv
100%|██████████| 5.24M/5.24M [00:00<00:00, 235MB/s]



Downloading test data...


Downloading...
From: https://drive.google.com/uc?id=1_8-E4cmA51y5kMfgUhWCihA1CfZ4DEvh
To: /content/test.csv
100%|██████████| 1.31M/1.31M [00:00<00:00, 83.5MB/s]


✅ Successfully downloaded dataset files.





In [None]:
# Cell 3: Load Model, Tokenizer, and Format Data

# Import Unsloth first
from unsloth import FastLanguageModel
import torch

from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

# ========== MODEL CONFIGURATION ==========
# Checkpoint identifier to load, and a base name for output artifacts.
model_name, output_dir = (
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "fine_tuned_mistral7b_gda",
)
# ========================================

# Load the mistral model and tokenizer.
# Options are gathered in one place so they are easy to read and tweak.
_load_kwargs = dict(
    model_name=model_name,
    max_seq_length=1024,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

# --- Data Formatting (Alpaca Style) ---
# Template with three positional slots, filled in order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def format_data(examples):
    """Render a batch of rows as Alpaca-style training strings.

    Args:
        examples: batch mapping with a "text" column (the statement to judge)
            and a "label" column; a label equal to 1 is rendered as "true",
            any other value as "false".

    Returns:
        A dict {"text": [...]} holding one filled-in prompt per row, each
        terminated with `tokenizer.eos_token`.
    """
    instruction = "Determine if the following gene-disease association is true or false."
    rendered = [
        alpaca_prompt.format(
            instruction,
            statement,
            "true" if label == 1 else "false",
        ) + tokenizer.eos_token
        for statement, label in zip(examples["text"], examples["label"])
    ]
    return {"text": rendered}

# Read the training CSV (only the train split is needed for SFT) and render
# every row through format_data in batches.
_train_files = {"train": "train.csv"}
dataset = load_dataset("csv", data_files=_train_files).map(format_data, batched=True)

# Status messages, followed by the first formatted example as a sanity check.
for _message in (
    "\n✅ mistral7b model loaded.",
    "✅ Dataset formatted using the Alpaca instruction style.",
    "\nExample of new data format:",
):
    print(_message)
_first_example = dataset["train"][0]
print(_first_example["text"])

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-15 10:06:09 [__init__.py:241] Automatically detected platform cuda.
ERROR 09-15 10:06:11 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.4: Fast Mistral patching. Transformers: 4.55.4. vLLM: 0.10.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/54755 [00:00<?, ? examples/s]


✅ mistral7b model loaded.
✅ Dataset formatted using the Alpaca instruction style.

Example of new data format:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Determine if the following gene-disease association is true or false.

### Input:
CD86 is associated with Gastro-enteropancreatic neuroendocrine tumor

### Response:
true</s>


In [None]:
# Cell 4: Training Cell

from trl import SFTTrainer, SFTConfig

# --- LoRA adapters ---
# Attention projections plus the MLP projections receive adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=32,
    lora_dropout=0,  # dropout disabled as a speed optimization
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

# --- Trainer configuration ---
# Pick the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    warmup_steps=10,
    max_steps=2000,  # a longer run that is still feasible within a Colab session
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=100,  # log progress every 100 steps
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir="fine_tuned_mistral7b_gda_2000steps",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=128,
    tokenizer=tokenizer,
    args=training_args,
)

print("🚀 Starting the final, optimized fine-tuning run...")
trainer.train()
print("\n🎉 Fine-tuning complete!")

Unsloth 2025.9.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/54755 [00:00<?, ? examples/s]

🚀 Starting the final, optimized fine-tuning run...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 54,755 | Num Epochs = 1 | Total steps = 2,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,289,966,592 (0.58% trained)


Step,Training Loss
100,0.4249
200,0.3338
300,0.3323
400,0.322
500,0.3117
600,0.3103
700,0.3041
800,0.2913
900,0.2919
1000,0.2884


Unsloth: Will smartly offload gradients to save VRAM!

🎉 Fine-tuning complete!


In [None]:
# Cell (after training): Inference Test
from transformers import TextStreamer

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Same Alpaca template that was used to build the training examples.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# A known true association; the response slot is left empty so the model
# has to fill it in.
instruction = "Determine if the following gene-disease association is true or false."
association = "BRCA1 is associated with Breast Neoplasms"
prompt = alpaca_prompt.format(instruction, association, "")

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

print("--- Testing the Fine-tuned Model ---")
print("Prompt:\n", association)
print("\nModel Response:")
# Stream the generated tokens to the cell output as they are produced.
streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, max_new_tokens=5, streamer=streamer)

--- Testing the Fine-tuned Model ---
Prompt:
 BRCA1 is associated with Breast Neoplasms

Model Response:
<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Determine if the following gene-disease association is true or false.

### Input:
BRCA1 is associated with Breast Neoplasms

### Response:
true</s>


In [None]:
# Cell (after inference test): Save the LoRA Adapters

# Base name for the run; the adapters go into a sibling folder with an
# "_adapters" suffix to keep things clean.
output_dir = "fine_tuned_mistral7b_gda"
adapters_dir = f"{output_dir}_adapters"

print(f"Saving the fine-tuned LoRA adapters to '{adapters_dir}'...")
# Model first, then tokenizer, both into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapters_dir)

print("✅ Model adapters saved successfully.")

Saving the fine-tuned LoRA adapters to 'fine_tuned_mistral7b_gda_adapters'...
✅ Model adapters saved successfully.


In [None]:
# Cell (final): Zip and Download Your Model
from google.colab import files

output_dir_adapters = "fine_tuned_mistral7b_gda_adapters"
zip_filename = "fine_tuned_mistral7b_gda.zip"

print(f"Zipping the '{output_dir_adapters}' folder...")
!zip -r {zip_filename} {output_dir_adapters}

print(f"\n✅ Zipping complete. Your model is saved as '{zip_filename}'.")
print("Downloading now...")
files.download(zip_filename)