In [None]:
# Evaluation dimension for this run; it is part of new_folder_name and of the results CSV filename.
task = "Mistake_Identification"

In [None]:
# Base checkpoint passed to FastLanguageModel.from_pretrained.
model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"

In [None]:
# Short label for the base model; used as the prefix of new_folder_name.
model_to_be_trained = "mistral-7b-instruct"

In [None]:
# Free-form suffix appended to new_folder_name to tell training variants apart.
extra = '-training-subset-augmented'

In [5]:
# Run identifier: reused as the W&B project name, the local save folder,
# the Hub repo name and the results subfolder on Drive.
new_folder_name = f'{model_to_be_trained}-{task}{extra}'

In [None]:
# Display the resolved run identifier as the cell output.
new_folder_name

### Connect to Drive

In [7]:
# Mount Google Drive at /content/drive; the dataset files and the results folder live under that path.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Connect to Wandb

In [None]:
import os

import wandb

# The API key is a secret and must not be stored in the notebook. Read it from
# the WANDB_API_KEY environment variable; when it is unset, key=None lets
# wandb.login() fall back to stored credentials or an interactive prompt.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY)

# One W&B project per run identifier, so runs of different tasks/variants stay separate.
wandb_project = new_folder_name
wandb.init(project=wandb_project)

### Installation

In [9]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the pip output of this cell.
import os
# Colab is detected by looking for "COLAB_" in the names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # NOTE(review): --no-deps and the xformers pin presumably protect Colab's
    # preinstalled torch stack from being replaced — confirm before changing versions.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options. They stay module-level because later cells reuse them
# (max_seq_length is passed to the trainer as well).
max_seq_length = 2048  # maximum sequence length handed to the model and the trainer
dtype = None           # None lets Unsloth pick the dtype (float16 or bfloat16)
load_in_4bit = True    # 4-bit quantisation to reduce memory usage; may be False

# For gated checkpoints an additional `token="hf_..."` argument is required.
base_model_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [11]:
# Modules that receive LoRA adapters: the four attention projections
# followed by the three MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Note: this rebinds `model`, so the cell is not meant to be run twice on the same object.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank (any value > 0; 8-128 suggested)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value works, 0 is the optimised path
    bias="none",                           # any value works, "none" is the optimised path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (the latter for very long context)
    random_state=3407,
    use_rslora=False,                      # rank-stabilised LoRA is available but off
    loftq_config=None,                     # LoftQ is available but off
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
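This section replaces the template's Alpaca example (the [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html)) with our own data prep. We load chat-format JSONL files from Drive, take the user turn as the input and the assistant turn as the label, and render each record into the Alpaca-style prompt below with a fixed Mistake Identification instruction.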

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Alpaca.ipynb)

For text completions like novel writing, try this [notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_(7B)-Text_Completion.ipynb).

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import random
from sklearn.model_selection import train_test_split

# Alpaca-style template. The three slots are filled, in order, with the
# instruction, the input and the response.
alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Load dataset using Hugging Face Datasets
train_dataset = load_dataset("json", data_files=f"/content/drive/~/Mistake_identification_augm_train_split_chat_format.jsonl")

# EOS token appended to every formatted example so generation learns where to stop.
EOS_TOKEN = tokenizer.eos_token  # Ensure tokenizer is defined before this

# mistral instruction
# The "\\/" sequences produce a literal backslash followed by a slash. The
# previous spelling ("\/") is an invalid escape sequence that Python happened to
# keep verbatim while emitting a warning; writing it explicitly keeps the prompt
# byte-identical to the one already used for training. The backslashes look like
# JSON-escaping residue: removing them changes the prompt, so retrain if you do.
system_msg = "Evaluate the tutor's response based on whether they identified a mistake in the student's response or not. Mistake Identification: Has the tutor identified a mistake in the student's answer? Options:Yes, To some extent, No. Yes means the mistake is clearly identified\\/recognized in the tutor's response. No means the tutor does not recognize the mistake (e.g., they proceed to simply provide the answer to the asked question). To some extent means the tutor's response suggests that there may be a mistake, but it sounds as if the tutor is not certain. You should answer by Yes, No or To some extent strictly in the following format: `Evaluation: (Yes\\/No\\/To Some Extent).` It is very important to have the word `Evaluation:` before your answer, while also sticking to the criteria of evaluation."
# Convert OpenAI-style messages to alpaca-style format
def _first_message_content(messages, role):
    """Return the content of the first message whose "role" equals `role`.

    Raises:
        ValueError: if no message has that role. A bare `next()` would raise a
            message-less StopIteration instead, which is hard to diagnose when
            it surfaces from inside a dataset mapping.
    """
    for message in messages:
        if message["role"] == role:
            return message["content"]
    raise ValueError(f"example has no message with role {role!r}")


def formatting_prompts_func(example):
    """Convert one chat-style record into an Alpaca-style training example.

    Args:
        example: mapping with a "messages" list of {"role", "content"} dicts.
            The first "user" message becomes the input and the first
            "assistant" message becomes the label.

    Returns:
        dict with:
            "text": the full prompt (instruction, input, response) followed by EOS_TOKEN,
            "instruction": the fixed system_msg,
            "input": the user message,
            "output": the label prefixed with "Evaluation: ".

    Raises:
        ValueError: if the record has no "user" or no "assistant" message.
    """
    # Extract fields from messages
    user_msg = _first_message_content(example["messages"], "user")
    label = _first_message_content(example["messages"], "assistant")

    # The response carries the "Evaluation: " prefix that the instruction asks for.
    response = "Evaluation: " + label

    # Format the prompt
    text = alpaca_prompt.format(system_msg, user_msg, response) + EOS_TOKEN
    return {
        "text": text,
        "instruction": system_msg,
        "input": user_msg,
        "output": response
    }

# Render every training record into the prompt format and keep the single
# "train" split that the JSON loader creates.
train_dataset = train_dataset.map(formatting_prompts_func)["train"]

# Same treatment for the validation file (the JSON loader also names its only split "train").
validation_files = f"/content/drive/~/Mistake_identification_validation_split_chat_format.jsonl"
test_dataset = load_dataset("json", data_files=validation_files)
test_dataset = test_dataset.map(formatting_prompts_func)["train"]


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2180 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

In [32]:
# Show the formatted prompt of one evaluation example (row 10).
test_dataset[10]["text"]

"### Instruction:\nEvaluate the tutor's response based on whether they identified a mistake in the student's response or not. Mistake Identification: Has the tutor identified a mistake in the student's answer? Options:Yes, To some extent, No. Yes means the mistake is clearly identified\\/recognized in the tutor's response. No means the tutor does not recognize the mistake (e.g., they proceed to simply provide the answer to the asked question). To some extent means the tutor's response suggests that there may be a mistake, but it sounds as if the tutor is not certain. You should answer by Yes, No or To some extent strictly in the following format: `Evaluation: (Yes\\/No\\/To Some Extent).` It is very important to have the word `Evaluation:` before your answer, while also sticking to the criteria of evaluation.\n\n### Input:\nTutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Apple can run at a rate of 3 miles per hour. Mac can run at a r

In [13]:
# Count the labels in each split to see how the classes are balanced.
train_label_counts = train_dataset.to_pandas()["output"].value_counts()
test_label_counts = test_dataset.to_pandas()["output"].value_counts()

print("Train Class Distribution:\n", train_label_counts)
print("\nTest Class Distribution:\n", test_label_counts)

Train Class Distribution:
 output
Evaluation: Yes               1545
Evaluation: No                 396
Evaluation: To some extent     239
Name: count, dtype: int64

Test Class Distribution:
 output
Evaluation: Yes               387
Evaluation: No                 74
Evaluation: To some extent     35
Name: count, dtype: int64


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The original template ran only 60 steps (`max_steps = 60`) to speed things up; here `max_steps` is commented out and we do a full run with `num_train_epochs=3`. We also support TRL's `DPOTrainer`!

In [17]:
from transformers import DataCollatorForLanguageModeling
# Causal-LM collator: mlm=False disables masked-language-model masking,
# which is what GPT/LLaMA-style decoder models need.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [18]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimisation and logging settings, built separately from the trainer for readability.
use_bf16 = is_bfloat16_supported()
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # 4 x 4 gives 16 examples per optimiser step on one GPU
    warmup_steps=5,
    # max_steps = 60,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,               # exactly one of fp16 / bf16 is enabled
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to=["wandb"],             # Use this for WandB etc
)

# NOTE(review): eval_dataset is supplied but no evaluation strategy is set above,
# so periodic evaluation is presumably not run during training — confirm.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,                   # Can make training 5x faster for short sequences.
    data_collator=data_collator,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2180 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/496 [00:00<?, ? examples/s]

In [19]:
# @title Show current memory stats
# Baseline taken before training; the final-stats cell subtracts it from the peak.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes -> GB
max_memory = round(gpu_stats.total_memory / 1024**3, 3)                  # bytes -> GB

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.0 GB of memory reserved.


### Train:

In [20]:
# Run the fine-tuning; the returned object's `metrics` dict is read by the final-stats cell for the runtime.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,180 | Num Epochs = 3 | Total steps = 408
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.5593
2,1.7096
3,1.6674
4,1.4483
5,1.3197
6,1.1465
7,0.957
8,0.8197
9,0.7194
10,0.6114


Step,Training Loss
1,1.5593
2,1.7096
3,1.6674
4,1.4483
5,1.3197
6,1.1465
7,0.957
8,0.8197
9,0.7194
10,0.6114


In [21]:
# @title Show final memory and time stats
train_seconds = trainer_stats.metrics["train_runtime"]

# Peak reserved memory in GB, and the part of it added on top of the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a share of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

for line in (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(line)

1463.2841 seconds used for training.
24.39 minutes used for training.
Peak reserved memory = 7.0 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 17.696 %.
Peak reserved memory for training % of max memory = 0.0 %.


### Save Model:

In [None]:
import os
from getpass import getpass

from huggingface_hub import login, HfApi, HfFolder, upload_folder
from transformers import AutoModel, AutoTokenizer

# 1. Login to Hugging Face
# The token is a secret and must not be stored in the notebook: take it from
# the HF_TOKEN environment variable, or ask for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# 2. Save model & tokenizer locally under the run identifier.
# `model` is the PEFT-wrapped model at this point, so this presumably writes the
# LoRA adapter rather than the full base weights (the load cell reattaches it
# to the base model) — confirm.
model.save_pretrained(new_folder_name, safe_serialization=True)
tokenizer.save_pretrained(new_folder_name)

# 3. Create repo if it doesn’t exist (exist_ok=True makes re-running this cell safe)
repo_id = f"FDekmak/{new_folder_name}"  # Replace with your repo name
api = HfApi()
api.create_repo(repo_id=repo_id, private=True, exist_ok=True)

# 4. Push model to Hugging Face Hub
upload_folder(
    folder_path=new_folder_name,  # Path to saved model
    repo_id=repo_id,
    commit_message="Uploading fine-tuned model"
)

print(f"Model successfully uploaded to: https://huggingface.co/{repo_id}")


### Load Model if needed:

In [None]:
import os
from getpass import getpass

from huggingface_hub import login, HfApi, HfFolder
from peft import PeftModel
from unsloth import FastLanguageModel

# 1. Login to Hugging Face
# The token is a secret and must not be stored in the notebook: take it from
# the HF_TOKEN environment variable, or ask for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# 2. Hub repo that holds the pushed adapter (nothing is created here).
repo_id = f"FDekmak/{new_folder_name}"  # Replace with your repo name

# 3. Load base model using Unsloth
# Note: this rebinds `model` and `tokenizer`, replacing any model still in memory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,  # or whatever base you used
    max_seq_length=2048,
    load_in_4bit=True
)

# 4. Load your adapter (the one you pushed) on top of the base model
model = PeftModel.from_pretrained(model, repo_id)


<a name="Inference"></a>
### Inference


In [24]:
import time

# Start the wall-clock timer that a later cell reads via `time.time() - start`,
# and pick the split that the inference loop iterates over.
start = time.time()
inference_dataset = test_dataset

In [None]:
# Results directory: <base_path>/<run identifier>.
base_path = '/content/drive/MyDrive/Evaluators/V3'  # Replace with your desired base path
new_folder_path = os.path.join(base_path, new_folder_name)

# Create the directory on first use and report which case applied.
if os.path.exists(new_folder_path):
  print(f"Folder '{new_folder_name}' already exists at '{new_folder_path}'")
else:
  os.makedirs(new_folder_path)
  print(f"Folder '{new_folder_name}' created at '{new_folder_path}'")


In [36]:
import torch
import pandas as pd
import re
from sklearn.metrics import f1_score, classification_report

# Ensure the model is in inference mode
FastLanguageModel.for_inference(model)

# Label that follows "Evaluation:". The trailing \b keeps "No" from matching the
# start of a longer word (e.g. "Not sure" is no longer parsed as "No").
label_pattern = re.compile(r"Evaluation:\s*(Yes|No|To some extent)\b", re.IGNORECASE)


def extract_label(text):
    """Return the label found after "Evaluation:" in `text`, or "Unknown" if there is none."""
    found = label_pattern.search(text)
    return found.group(1) if found else "Unknown"


# Store results
results = []

for example in inference_dataset:
    # Build the prompt from its parts with an empty response slot. Cutting it out
    # of example["text"] (which ends with the gold label) is fragile: if the
    # separator is absent the label leaks into the prompt, and if it also occurs
    # inside the input the prompt is truncated. The trailing newline is removed
    # so the prompt ends at "### Response:", as it did before.
    prompt = alpaca_prompt.format(example["instruction"], example["input"], "").rstrip("\n")

    # Tokenize input
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. The returned sequence starts with
    # the prompt, so slicing by prompt length isolates the response even when
    # the input text itself contains "### Response:".
    prompt_length = inputs["input_ids"].shape[1]
    extracted_text = tokenizer.decode(outputs[0][prompt_length:]).strip()

    # Parse prediction and ground truth with the same rule so they are comparable.
    model_prediction = extract_label(extracted_text)
    gt = extract_label(example['output'])

    # Store results
    results.append({
        "Instruction": example['instruction'],
        "Input": example['input'],
        "Model Output": extracted_text,
        "Extracted Prediction": model_prediction,
        "Ground Truth": gt,
        "Match": model_prediction.lower() == gt.lower()
    })

# Convert results to DataFrame
df = pd.DataFrame(results)

# Compute accuracy (an "Unknown" prediction counts as a miss)
accuracy = df["Match"].mean() * 100

# Label-to-id mapping; not used by any code in this cell.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "To some extent": 2
}


# Save DataFrame to CSV
df.to_csv(f"{new_folder_path}/{task}_results.csv", index=False)

# Print evaluation results
print(f"Accuracy: {accuracy:.2f}%")

# Display sample of results
df.head()

Accuracy: 88.51%


Unnamed: 0,Instruction,Input,Model Output,Extracted Prediction,Ground Truth,Match
0,Evaluate the tutor's response based on whether...,"Tutor: Hi, could you please provide a step-by-...",Evaluation: To some extent</s>,To some extent,No,False
1,Evaluate the tutor's response based on whether...,"Tutor: Hi, could you please provide a step-by-...",Evaluation: Yes</s>,Yes,Yes,True
2,Evaluate the tutor's response based on whether...,"Tutor: Hi, could you please provide a step-by-...",Evaluation: Yes</s>,Yes,Yes,True
3,Evaluate the tutor's response based on whether...,"Tutor: Hi, could you please provide a step-by-...",Evaluation: Yes</s>,Yes,Yes,True
4,Evaluate the tutor's response based on whether...,Tutor: Please have a look at the steps on the ...,Evaluation: Yes</s>,Yes,Yes,True


In [None]:
# Seconds elapsed since `start` was set in the inference-setup cell.
time.time() - start

In [37]:
import pandas as pd
from sklearn.metrics import classification_report

# Normalise both label columns the same way: string type, lower case,
# surrounding whitespace removed, then surrounding periods removed.
for label_column in ("Extracted Prediction", "Ground Truth"):
    df[label_column] = (
        df[label_column]
        .astype(str)
        .str.lower()
        .str.strip()
        .str.strip(".")
    )

# Per-class precision / recall / F1 with 4 decimals; classes that are never
# predicted score 0 instead of raising a warning.
report = classification_report(
    df["Ground Truth"],
    df["Extracted Prediction"],
    digits=4,
    zero_division=0,
)

print(report)


                precision    recall  f1-score   support

            no     0.7917    0.7703    0.7808        74
to some extent     0.5882    0.2857    0.3846        35
           yes     0.9140    0.9612    0.9370       387

      accuracy                         0.8851       496
     macro avg     0.7646    0.6724    0.7008       496
  weighted avg     0.8728    0.8851    0.8747       496

