*Library and Installation*

In [None]:
%pip install unsloth
%pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo

*Initial Structure*

In [None]:
import psutil

In [None]:
from unsloth import tokenizer_utils
def do_nothing(*args, **kwargs):
    """Accept any positional/keyword arguments, perform no work, and return ``None``."""
    return None
# Replace Unsloth's `fix_untrained_tokens` with the no-op so any call to it does nothing.
# NOTE(review): presumably needed because the LM head is shrunk later in this notebook — confirm.
tokenizer_utils.fix_untrained_tokens = do_nothing

In [None]:
import os
import warnings
from typing import Any, Dict, List, Tuple, Union

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

from trl import SFTTrainer
from unsloth import FastLanguageModel


# ---------------------------
# GPU: report the CUDA compute capability of the current device
# ---------------------------
major_version, minor_version = torch.cuda.get_device_capability()
print(f"Major: {major_version}, Minor: {minor_version}")


# ---------------------------
# Checkpoints used in the experiments (4-bit Unsloth builds of Qwen3 base models)
# ---------------------------
used_models = [
    "unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-Base-unsloth-bnb-4bit",
]


# ---------------------------
# Run configuration
# ---------------------------
NUM_CLASSES = 2
max_seq_length = 2048
dtype = None  # None lets the loader pick the dtype
load_in_4bit = True

# The first entry of the list is the checkpoint used for this run.
model_name = used_models[0]


# ---------------------------
# Load the model and its tokenizer
# ---------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# First token id produced for each digit string "0" .. str(NUM_CLASSES).
number_token_ids = [
    tokenizer.encode(str(class_id), add_special_tokens=False)[0]
    for class_id in range(NUM_CLASSES + 1)
]

# Remember the shape of the full LM head before its weight is replaced.
old_shape = model.lm_head.weight.shape
old_size = old_shape[0]

# Keep only the LM-head rows that belong to the digit tokens.
par = torch.nn.Parameter(model.lm_head.weight[number_token_ids, :])
print(par.shape)
print(old_shape)

model.lm_head.weight = par

# Maps a token id of the original vocabulary to its row in the reduced LM head
# (used later by the data collator to rewrite the label token).
reverse_map = dict((token_id, row) for row, token_id in enumerate(number_token_ids))
reverse_map

In [None]:
from peft import LoftQConfig

# LoRA settings passed to Unsloth: rank 16, alpha 16, no dropout, no bias terms, rank-stabilised
# scaling, adapters on the LM head and on the attention / MLP projection modules.
lora_settings = dict(
    r=16,
    target_modules=[
        "lm_head",
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Count the elements of every parameter that will receive gradients.
trainable_count = 0
for weight in model.parameters():
    if weight.requires_grad:
        trainable_count += weight.numel()
print("trainable parameters:", trainable_count)

*Data*

In [None]:
# Load the labelled corpus and hold out 20% of the rows (fixed seed) for validation.
data = pd.read_csv("/content/Boamente_Treino_Atualizado.csv")
train_df, val_df = train_test_split(data, random_state=42, test_size=0.2)
print(len(train_df))

# Token count of every training text, shown as a 30-bin histogram.
token_counts = []
for sample_text in train_df.text:
    token_counts.append(len(tokenizer.encode(sample_text)))
a = plt.hist(token_counts, bins=30)

*Prompt and Data Collator*

In [None]:
from datasets import Dataset

# Prompt template with two positional `{}` slots: first the sentence to classify, then the answer.
# The template ends right after an opening double quote followed by the answer slot, so whatever
# fills the second slot is the very last text of a formatted example.
# Rough English meaning: 'Here is a sentence. <text> If the sentence contains suicidal ideation,
# answer "Yes". Otherwise, answer "No": The correct answer is: "<answer>'.
# NOTE(review): the accented characters in these literals look mis-encoded (UTF-8 bytes decoded
# with a single-byte code page). Confirm the notebook's real encoding before touching them:
# another cell matches on the literal text of the last prompt line.
prompt = """Aqui estÃ¡ uma sentenÃ§a.
{}

Se a sentenÃ§a contÃ©m ideaÃ§Ã£o suicida, responda "Sim". Caso contrÃ¡rio, responda "NÃ£o":

A resposta correta Ã©: "{}"""

# Human-readable answer words; not referenced by any other cell visible in this notebook.
positivelabel = "Sim"  # "Yes"
negativelabel = "NÃ£o"  # "No"

def formatting_prompts_func(dataset_, template=None):
    """Fill the prompt template with every (text, label) pair of ``dataset_``.

    The function only relies on ``dataset_['text']`` and ``dataset_['label']`` being
    iterable in row order, so it works on a pandas DataFrame as well as on a batch that
    maps column names to lists (the form in which a trainer hands over Hugging Face
    dataset batches). A single example whose ``'text'`` is a plain string is supported too.

    Args:
        dataset_: Object indexable by ``'text'`` and ``'label'``.
        template: ``str.format`` template with two positional slots (text, label).
            Defaults to the notebook-level ``prompt``.

    Returns:
        A list with one formatted string per row, in input order; or a single string
        when ``dataset_['text']`` is itself a string.
    """
    if template is None:
        template = prompt

    texts_in = dataset_['text']
    # The label column must hold exactly the class value the prompt should end with,
    # because the collator looks up the last token of each formatted example.
    labels_in = dataset_['label']

    # Single (non-batched) example: format it directly instead of iterating characters.
    if isinstance(texts_in, str):
        return template.format(texts_in, labels_in)

    # Iterating (instead of positional `.iloc`) keeps pandas behaviour and also accepts lists.
    return [template.format(text_, label_) for text_, label_ in zip(texts_in, labels_in)]

# Build the prompt-formatted training texts from the untouched rows of `data` (selected through
# train_df's index) instead of from train_df['text'] itself: re-running this cell then gives the
# same result rather than wrapping already-formatted text in the prompt a second time.
# The explicit copy makes train_df an independent frame before a column is assigned to it
# (it was derived from `data` by train_test_split).
train_df = train_df.copy()
train_df['text'] = formatting_prompts_func(data.loc[train_df.index])
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

In [None]:
class DataCollatorForLastTokenLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that keeps a label only on the last labelled token.

    After the parent collator has built the batch, every label before the last
    non-ignored position of a row is set to ``ignore_index``; the label at that last
    position is replaced by ``reverse_map[token_id]`` (the notebook-level mapping from
    original token ids to rows of the reduced LM head).
    """

    def __init__(self, *args, mlm: bool = False, ignore_index: int = -100, **kwargs):
        """Forward everything to the parent collator and remember ``ignore_index``."""
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and rewrite ``batch["labels"]`` in place.

        Raises:
            KeyError: if the last labelled token of a row is not a key of ``reverse_map``.
            IndexError: if a row has no labelled position at all.
        """
        batch = super().torch_call(examples)
        labels = batch["labels"]  # same tensor object as in the batch; edited in place

        for row in range(len(examples)):
            labelled_positions = (labels[row] != self.ignore_index).nonzero()
            answer_pos = labelled_positions[-1].item()
            answer_token = labels[row, answer_pos].item()

            # Everything before the answer is excluded from the loss ...
            labels[row, :answer_pos] = self.ignore_index
            # ... and the answer token id becomes its index in the reduced LM head.
            labels[row, answer_pos] = reverse_map[answer_token]
        return batch
collator = DataCollatorForLastTokenLM(tokenizer=tokenizer)

*Training with Cross-Validation*

In [None]:
# Import before use so the cell does not depend on an earlier cell having imported psutil,
# and keep the call as the last expression so the notebook displays the CPU count.
import psutil
psutil.cpu_count()

In [None]:
# =====================================
# IMPORTS (ORDER MATTERS!)
# =====================================
import psutil  # imported first; the original author marks it as mandatory for Unsloth
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import TrainingArguments
from trl import SFTTrainer

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve

from datasets import Dataset
from scipy.special import softmax


# =====================================
# CONFIG
# =====================================
NUM_FOLDS = 5
SEED = 3407
OUTPUT_DIR = "/content/roc_cv_results"
EVAL_BATCH_SIZE = 64

os.makedirs(OUTPUT_DIR, exist_ok=True)

torch.manual_seed(SEED)
np.random.seed(SEED)


def score_positive_class(prompts, batch_size=EVAL_BATCH_SIZE):
    """Return P(class 1) for each prompt, in the order of ``prompts``.

    Uses the notebook-level ``model`` and ``tokenizer``. Prompts are only batched with
    prompts of identical token length, so no padding is involved and position ``-1`` of
    the logits is the next-token position for every row. The probability is a softmax
    over the first two outputs of the reduced LM head (classes 0 and 1).

    Args:
        prompts: List of prompt strings that end right before the answer token.
        batch_size: Maximum number of equally long prompts per forward pass.

    Returns:
        ``numpy.ndarray`` of shape ``(len(prompts),)`` with float64 probabilities.
    """
    encoded = [
        tokenizer(text, return_tensors="pt", add_special_tokens=False)
        for text in prompts
    ]

    # Group prompt positions by token length.
    positions_by_length = {}
    for position, item in enumerate(encoded):
        positions_by_length.setdefault(item["input_ids"].shape[1], []).append(position)

    scores = np.empty(len(prompts), dtype=np.float64)
    device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        for positions in positions_by_length.values():
            for start in range(0, len(positions), batch_size):
                chunk = positions[start:start + batch_size]
                input_ids = torch.cat([encoded[i]["input_ids"] for i in chunk]).to(device)
                attention_mask = torch.cat([encoded[i]["attention_mask"] for i in chunk]).to(device)

                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                # Cast to float32 first: half/bfloat16 tensors cannot be turned into NumPy arrays.
                class_logits = logits[:, -1, :2].float()
                scores[chunk] = torch.softmax(class_logits, dim=-1)[:, 1].cpu().numpy()
    return scores


# =====================================
# DATASET
# =====================================
# train_df['text'] already holds the prompt-formatted examples (prompt + label).
full_dataset = Dataset.from_pandas(train_df, preserve_index=False)

skf = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=SEED
)


# =====================================
# STORAGE
# =====================================
fold_results = []
roc_curves = []   # (fpr, tpr, auc)

# Snapshot of every trainable parameter as it is when this cell starts. Each fold starts from
# this state; otherwise fold k would continue training a model that has already seen fold k's
# validation rows while training on earlier folds.
# NOTE(review): this is only an "untrained" state if no training cell ran before this one.
initial_trainable_state = {
    name: param.detach().cpu().clone()
    for name, param in model.named_parameters()
    if param.requires_grad
}


# =====================================
# CROSS-VALIDATION
# =====================================
for fold, (train_idx, val_idx) in enumerate(
    skf.split(full_dataset["text"], full_dataset["label"])
):

    print(f"\nðŸš€ Starting Fold {fold+1}/{NUM_FOLDS}")

    # Reset the trainable weights so folds are independent of each other.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in initial_trainable_state:
                param.copy_(initial_trainable_state[name])

    train_dataset = full_dataset.select(train_idx)
    val_dataset   = full_dataset.select(val_idx)

    training_args = TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        learning_rate=1e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=50,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=SEED,
        output_dir=f"/content/outputs/fold_{fold+1}",
        num_train_epochs=1,
        report_to="none",
        group_by_length=True,
    )

    # The "text" column is already formatted, so it is used as-is. Passing
    # formatting_prompts_func here would apply the prompt a second time (and that function
    # indexes with pandas `.iloc`, which dataset batches do not provide).
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
        dataset_text_field="text",
        data_collator=collator,
    )

    # -----------------------------
    # TRAIN
    # -----------------------------
    trainer.train()

    # -----------------------------
    # PREDICT
    # -----------------------------
    # A formatted example ends with `"<label>`: cut after the last double quote so the model
    # has to predict the label token itself, then read the class logits at the last position.
    # (trainer.predict returns per-position logits of inputs that still contain the answer,
    # so a softmax over axis 1 of that output is not a class probability.)
    val_prompts = [text.rsplit('"', 1)[0] + '"' for text in val_dataset["text"]]
    probs = score_positive_class(val_prompts)
    y_true = np.array(val_dataset["label"])

    # -----------------------------
    # AUC + ROC
    # -----------------------------
    auc = roc_auc_score(y_true, probs)
    fpr, tpr, _ = roc_curve(y_true, probs)

    # -----------------------------
    # SAVE PER FOLD
    # -----------------------------
    np.save(f"{OUTPUT_DIR}/y_true_fold_{fold+1}.npy", y_true)
    np.save(f"{OUTPUT_DIR}/y_score_fold_{fold+1}.npy", probs)
    np.save(f"{OUTPUT_DIR}/fpr_fold_{fold+1}.npy", fpr)
    np.save(f"{OUTPUT_DIR}/tpr_fold_{fold+1}.npy", tpr)

    fold_results.append({
        "fold": fold + 1,
        "auc": auc
    })

    roc_curves.append((fpr, tpr, auc))

    print(f"ðŸ“Š Fold {fold+1} | AUC = {auc:.4f}")

# Leave the model as it was found, so later cells do not inherit the last fold's weights.
with torch.no_grad():
    for name, param in model.named_parameters():
        if name in initial_trainable_state:
            param.copy_(initial_trainable_state[name])


# =====================================
# RESULTS TABLE
# =====================================
df_results = pd.DataFrame(fold_results)
df_results.to_csv(f"{OUTPUT_DIR}/auc_per_fold.csv", index=False)

mean_auc = df_results["auc"].mean()
std_auc  = df_results["auc"].std()  # pandas default: sample standard deviation (ddof=1)

print("\nðŸ“Œ AUC por fold:")
display(df_results)

print(f"\nâœ… Mean AUC: {mean_auc:.4f} Â± {std_auc:.4f}")


# =====================================
# MEAN ROC CURVE
# =====================================
mean_fpr = np.linspace(0, 1, 100)
tprs = []

for fpr, tpr, _ in roc_curves:
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    # Every ROC curve starts at (0, 0); pin it because `fpr` contains repeated zeros.
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0

np.save(f"{OUTPUT_DIR}/mean_fpr.npy", mean_fpr)
np.save(f"{OUTPUT_DIR}/mean_tpr.npy", mean_tpr)


# =====================================
# PLOT
# =====================================
plt.figure(figsize=(7, 6))
plt.plot(
    mean_fpr,
    mean_tpr,
    linewidth=2,
    label=f"Mean ROC (AUC = {mean_auc:.4f} Â± {std_auc:.4f})"
)
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Mean ROC Curve â€“ Stratified 5-Fold CV")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
!pip install psutil

In [None]:
import os
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

num_folds = 5
batch_size = 64
seed = 3407

output_dir = "/home/joaopedro/joaopedro/llm/roc"
os.makedirs(output_dir, exist_ok=True)

# Full dataset. `data` holds the raw CSV text (only train_df was prompt-formatted earlier), so
# the prompt is applied here on a copy; training on the raw text would leave the collator
# without a class token at the end of each example.
raw_texts = data["text"].tolist()
formatted_df = data.copy()
formatted_df["text"] = formatting_prompts_func(data)
full_dataset = Dataset.from_pandas(formatted_df, preserve_index=False)

skf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed
)

all_fold_aucs = []

# Snapshot of every trainable parameter as it is when this cell starts. Each fold starts from
# this state; otherwise a fold would continue training a model that has already seen that
# fold's validation rows while training on earlier folds.
# NOTE(review): this is only an "untrained" state if no training cell ran before this one.
initial_trainable_state = {
    name: param.detach().cpu().clone()
    for name, param in model.named_parameters()
    if param.requires_grad
}

for fold, (train_idx, val_idx) in enumerate(
    skf.split(full_dataset["text"], full_dataset["label"])
):
    print(f"\nðŸš€ Starting Fold {fold + 1}/{num_folds}")

    # Reset the trainable weights so folds are independent of each other.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in initial_trainable_state:
                param.copy_(initial_trainable_state[name])

    train_dataset = full_dataset.select(train_idx)
    val_dataset   = full_dataset.select(val_idx)

    training_args = TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        learning_rate=1e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=seed,
        output_dir=f"outputs/fold_{fold}",
        num_train_epochs=1,
        report_to="none",
        group_by_length=True,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
        dataset_text_field="text",
        data_collator=collator,
    )

    trainer.train()

    # ===== Inference =====
    val_labels = val_dataset["label"]

    fold_probabilities = []
    fold_labels = []

    # Build the inference prompt from the raw text with an empty answer slot. It then ends with
    # the same opening double quote that precedes the label during training, so the last
    # position predicts the label token.
    tokenized_inputs = []
    for row, label in zip(val_idx, val_labels):
        test_str = prompt.format(raw_texts[row], "")

        tokenized = tokenizer(
            test_str,
            return_tensors="pt",
            add_special_tokens=False
        )
        tokenized_inputs.append((tokenized, label))

    # Sort by token length
    tokenized_inputs.sort(
        key=lambda x: x[0]["input_ids"].shape[1]
    )

    # Only equally long prompts share a batch, so no padding is needed.
    grouped_inputs = defaultdict(list)
    for tokenized, label in tokenized_inputs:
        length = tokenized["input_ids"].shape[1]
        grouped_inputs[length].append((tokenized, label))

    model.eval()
    for length, group in tqdm(grouped_inputs.items()):
        for i in range(0, len(group), batch_size):
            batch = group[i:i + batch_size]

            input_ids = torch.cat(
                [item[0]["input_ids"] for item in batch]
            ).to("cuda")

            attention_mask = torch.cat(
                [item[0]["attention_mask"] for item in batch]
            ).to("cuda")

            labels = [item[1] for item in batch]

            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

            # First two outputs of the reduced LM head = classes 0 and 1. Cast to float32
            # first: half/bfloat16 tensors cannot be converted to NumPy arrays.
            logits = outputs.logits[:, -1, :2].float()
            probs = F.softmax(logits, dim=-1)[:, 1].cpu().numpy()

            fold_probabilities.extend(probs)
            fold_labels.extend(labels)

    fpr, tpr, _ = roc_curve(fold_labels, fold_probabilities)
    fold_auc = auc(fpr, tpr)
    all_fold_aucs.append(fold_auc)

    np.savetxt(os.path.join(output_dir, f"fpr_modelX_fold{fold}.txt"), fpr)
    np.savetxt(os.path.join(output_dir, f"tpr_modelX_fold{fold}.txt"), tpr)

    with open(os.path.join(output_dir, f"auc_modelX_fold{fold}.txt"), "w") as f:
        f.write(f"{fold_auc:.6f}\n")

    print(f"ðŸ“Š Fold {fold + 1} â€” AUC: {fold_auc:.4f}")

# Leave the model as it was found, so later cells do not inherit the last fold's weights
# (these folds are drawn from all of `data`, which includes the hold-out rows of val_df).
with torch.no_grad():
    for name, param in model.named_parameters():
        if name in initial_trainable_state:
            param.copy_(initial_trainable_state[name])

# ===== Final results =====
mean_auc = np.mean(all_fold_aucs)
std_auc  = np.std(all_fold_aucs)  # NumPy default: population standard deviation (ddof=0)

print("\nâœ… Final Cross-Validation Results")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Std AUC:  {std_auc:.4f}")

with open(os.path.join(output_dir, "auc_summary_modelX.txt"), "w") as f:
    f.write(f"Mean AUC: {mean_auc:.6f}\n")
    f.write(f"Std AUC:  {std_auc:.6f}\n")

In [None]:
import torch
import numpy as np
from transformers import TrainingArguments
from trl import SFTTrainer
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

#Number of folds for cross-validation
num_folds = 5

#Convert the pandas training frame (its "text" column is already prompt-formatted) to a
#Hugging Face dataset
full_dataset = Dataset.from_pandas(train_df, preserve_index=False)

#Create fold indices for cross-validation
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=3407)

#Store metrics from each fold
all_results = []

#Snapshot of every trainable parameter as it is when this cell starts. Each fold starts from
#this state; otherwise a fold would continue training a model that has already seen that
#fold's validation rows while training on earlier folds.
#NOTE(review): this is only an "untrained" state if no training cell ran before this one.
initial_trainable_state = {
    name: param.detach().cpu().clone()
    for name, param in model.named_parameters()
    if param.requires_grad
}

#Iterate over folds
for fold, (train_idx, val_idx) in enumerate(skf.split(full_dataset["text"], full_dataset["label"])):
    print(f"\nðŸš€ Starting Fold {fold+1}/{num_folds}...")

    #Reset the trainable weights so folds are independent of each other
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in initial_trainable_state:
                param.copy_(initial_trainable_state[name])

    #Create training and validation subsets
    train_dataset = full_dataset.select(train_idx)
    val_dataset = full_dataset.select(val_idx)

    #Define training arguments
    training_args = TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        learning_rate=1e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir=f"outputs/fold_{fold+1}",  #Separate directory for each fold
        num_train_epochs=1,
        report_to="none",  #Disable logging to external services
        group_by_length=True,
    )

    #Create the Trainer. The "text" column is used as-is: it is already formatted, and
    #formatting_prompts_func indexes with pandas `.iloc`, which dataset batches do not provide.
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,  #Adding validation
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
        dataset_text_field="text",
        data_collator=collator,
    )

    #Train the model
    trainer.train()

    #Evaluate the model on this fold's validation set
    metrics = trainer.evaluate()
    print(f"ðŸ“Š Results for Fold {fold+1}: {metrics}")

    # Store the metrics
    all_results.append(metrics)

#Leave the model as it was found, so later cells do not inherit the last fold's weights
with torch.no_grad():
    for name, param in model.named_parameters():
        if name in initial_trainable_state:
            param.copy_(initial_trainable_state[name])

#Compute the mean of every metric reported by the first fold, across all folds
final_results = {
    metric: np.mean([result[metric] for result in all_results]) for metric in all_results[0]
}

print("\nâœ… Final average metrics after cross-validation:")
print(final_results)

#Note: the output is in Portuguese because it was the old version of the code. In the new version, the code and comments are in English.

*Training with 80/20 (Hold-out)*

In [None]:
# Rebuild the training split explicitly. The cross-validation cells rebind `train_dataset` to
# one fold's subset, so relying on the global here would silently train on only the last
# fold's rows instead of the whole (prompt-formatted) training split.
# NOTE(review): `model` still carries whatever fine-tuning earlier cells applied to it; for a
# clean hold-out run, confirm it is in the intended starting state.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

# Single training run on the 80% split: one epoch, batch size 32, cosine schedule with
# 10 warm-up steps, 8-bit AdamW; bf16 when the GPU supports it, fp16 otherwise.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 32,
        gradient_accumulation_steps = 1,
        warmup_steps = 10,
        learning_rate = 1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        num_train_epochs = 1,
        report_to = "none",
        group_by_length = True,
    ),
    data_collator=collator,
    dataset_text_field="text",
)

In [None]:
# Run the hold-out training and keep the object returned by `train()` for later inspection.
trainer_stats = trainer.train()

*Inference*

In [None]:
# Call Unsloth's `for_inference` on the model before the evaluation cell runs forward passes.
FastLanguageModel.for_inference(model)
# A bare print() makes the cell end with a statement, so the return value of the call above
# is not echoed as cell output.
print()

*Evaluation with metrics*

In [None]:
import os
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

#Define output directory
output_dir = "/home/joaopedro/joaopedro/llm/roc"
os.makedirs(output_dir, exist_ok=True)

#Load validation data (raw text; the prompt is applied below)
val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()

#Define storage
all_probabilities = []
all_labels = []

#Step 1: Tokenization and sorting by token length
tokenized_inputs = []
for text, label in zip(val_texts, val_labels):
    #Empty answer slot: the prompt ends right before the label token the model must predict
    test_str = prompt.format(text, "")
    tokenized_input = tokenizer(test_str, return_tensors="pt", add_special_tokens=False)
    tokenized_inputs.append((tokenized_input, test_str, label))

#Sort by tokenized length
tokenized_inputs.sort(key=lambda x: x[0]['input_ids'].shape[1])

#Step 2: Group by tokenized length (equally long prompts can be stacked without padding)
grouped_inputs = defaultdict(list)
for tokenized_input, test_str, label in tokenized_inputs:
    length = tokenized_input['input_ids'].shape[1]
    grouped_inputs[length].append((tokenized_input, test_str, label))

#Step 3: Process batches
batch_size = 64
#Run on whatever device the model lives on instead of assuming "cuda"
eval_device = next(model.parameters()).device

for length, group in tqdm(grouped_inputs.items()):
    for i in range(0, len(group), batch_size):
        batch = group[i:i + batch_size]
        batch_inputs = [item[0] for item in batch]
        batch_labels = [item[2] for item in batch]

        #Concatenate batch inputs
        input_ids = torch.cat([item['input_ids'] for item in batch_inputs], dim=0).to(eval_device)
        attention_mask = torch.cat([item['attention_mask'] for item in batch_inputs], dim=0).to(eval_device)

        #Forward pass
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        #Extract logits for classification: last position, first two outputs of the reduced
        #LM head (classes 0 and 1). Cast to float32 first: half/bfloat16 tensors cannot be
        #converted to NumPy arrays.
        logits = outputs.logits[:, -1, :2].float()
        probabilities = F.softmax(logits, dim=-1)[:, 1].cpu().numpy()  #Probability of class 1

        #Store results
        all_probabilities.extend(probabilities)
        all_labels.extend(batch_labels)

#Work on arrays from here on. Comparing the Python list with a threshold only worked because
#NumPy's reflected comparison operator happened to take over.
all_probabilities = np.asarray(all_probabilities, dtype=np.float64)
all_labels = np.asarray(all_labels)

#Step 4: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(all_labels, all_probabilities)
roc_auc = auc(fpr, tpr)

#Save ROC data
#NOTE(review): the file names below say "Qwen3_14B" regardless of `model_name` — confirm they
#match the checkpoint that was actually loaded.
np.savetxt(os.path.join(output_dir, "fpr_Qwen3_14B.txt"), fpr)
np.savetxt(os.path.join(output_dir, "tpr_Qwen3_14B.txt"), tpr)
np.savetxt(os.path.join(output_dir, "thresholds_Qwen3_14B.txt"), thresholds)
with open(os.path.join(output_dir, "auc_Qwen3_14B.txt"), "w") as f:
    f.write(f"AUC: {roc_auc:.4f}\n")

#Step 5: Find optimal threshold (maximises TPR - FPR, i.e. Youden's J statistic)
#NOTE(review): the threshold is chosen on the same rows the metrics below are computed on, so
#those metrics are optimistic.
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

#Step 6: Convert probabilities to binary predictions
all_outputs = (all_probabilities >= optimal_threshold).astype(int)

#Step 7: Compute evaluation metrics
cm = confusion_matrix(all_labels, all_outputs)
accuracy = accuracy_score(all_labels, all_outputs)
precision = precision_score(all_labels, all_outputs)
recall = recall_score(all_labels, all_outputs)
f1 = f1_score(all_labels, all_outputs)

#Step 8: Save all metrics + confusion matrix to a single file
metrics_file = os.path.join(output_dir, "metrics_Qwen3_14B.txt")
with open(metrics_file, "w") as f:
    f.write("===== Model Evaluation Metrics =====\n")
    f.write(f"Accuracy: {accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1-Score: {f1:.4f}\n")
    f.write(f"AUC: {roc_auc:.4f}\n")

    f.write("\n===== Confusion Matrix =====\n")
    np.savetxt(f, cm, fmt="%d")

print(f"\nMetrics saved to: {metrics_file}")

#Step 9: Save confusion matrix separately
np.savetxt(os.path.join(output_dir, "confusion_matrix_Qwen3_14B.txt"), cm, fmt='%d')
print("\nConfusion Matrix:")
print(cm)
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {roc_auc:.4f}")