In [4]:
# Obrigatório para testes no Google Colab, senão ignorar
from google.colab import userdata

github_token = userdata.get('GITHUB_TOKEN')
!git clone https://{github_token}@github.com/erickrribeiro/ufam_nlp_01_2025.git
!mv ./ufam_nlp_01_2025/scripts/** ./
!cp ./ufam_nlp_01_2025/models/* ./
!unzip ./mistral-sql-lora-v3.zip -d ./
!unzip ./mistral-sql-lora-v4.zip -d ./

fatal: destination path 'ufam_nlp_01_2025' already exists and is not an empty directory.
mv: cannot stat './ufam_nlp_01_2025/scripts/**': No such file or directory
Archive:  ./mistral-sql-lora-v3.zip
replace ./mistral-sql-lora-v3/adapter_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: Archive:  ./mistral-sql-lora-v4.zip
replace ./mistral-sql-lora-v4/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!rm -rf ./ufam_nlp_01_2025
!rm -rf ./spider
!rm -rf mistral-sql-lora-*
!rm *.py
!rm *.ipynb
!rm *.csv
# !rm *.zip

In [None]:
!pip install -U gdown==5.2.0 \
                transformers==4.52.4 \
                trl==0.18.2 \
                peft==0.15.2 \
                datasets==3.6.0 \
                accelerate==1.8.0 \
                huggingface_hub==0.33.0 \
                fsspec==2025.3.0 \
                deepeval==3.1.6 --quiet

In [None]:
import datasets, transformers, trl, peft, accelerate, huggingface_hub, fsspec, deepeval

# Report the installed version of every pinned library, one per line, so the
# environment used for the run is recorded alongside the results.
_versioned_libs = [
  ("datasets", datasets),
  ("transformers", transformers),
  ("peft", peft),
  ("trl", trl),
  ("accelerate", accelerate),
  ("huggingface_hub", huggingface_hub),
  ("fsspec", fsspec),
  ("deepeval", deepeval),
]

for _label, _module in _versioned_libs:
  print(f"{_label}:", _module.__version__)

# Fase 4: Análise Quantitativa de Regressão de Capacidade



- 4.1. Metodologia de Avaliação MMLU: Este notebook avalia o modelo base e os modelos fine-tuned na suíte de 150 questões do [MMLU](https://github.com/hendrycks/test) (Massive Multitask Language Understanding). A avaliação será feita em modo 4-shot, com questões de múltipla escolha de quatro alternativas (A, B, C ou D).
- 4.2. Cálculo de Acurácia: A métrica utilizada será a acurácia, isto é, a proporção de respostas corretas.
- 4.3. Análise de Regressão: Será calculada a variação percentual de acurácia entre o modelo base e os dois modelos fine-tuned. Esta análise será reportada de forma agregada e também por categoria (STEM, Humanidades, Ciências Sociais) para identificar se a regressão de capacidade afeta os domínios de forma heterogênea.

## Notas de Implementação
- **Ajuste no parâmetro max_new_tokens**: Ajustei o `max_new_tokens` para 10 tokens com o objetivo de acelerar a geração, porque o benchmark MMLU espera respostas extremamente curtas (apenas uma das alternativas: A, B, C ou D). Esse valor é suficiente para capturar a resposta sem gerar texto desnecessário.

- **Pós-processamento da saída do modelo**: Na inferência, o modelo base `mistralai/Mistral-7B-Instruct-v0.2` retorna `prompt+resposta`. Logo, foi necessário implementar um filtro `completion = output[len(prompt):].strip()` para remover o texto do prompt da saída gerada.

- **Extração da alternativa**: Na classe `Mistral7B(DeepEvalBaseLLM)` adicionei o método `extract_choice` para extrair apenas a letra da alternativa escolhida pelo modelo (A, B, C ou D) a partir da resposta gerada. Isso garante compatibilidade com o formato de avaliação do MMLU, independentemente de como o modelo expressa a resposta (ex.: "Answer: B", "B. Montesquieu" ou "B").

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import set_seed
from custom_model import Mistral7B

# Fix the random seed once, up front, so repeated runs are comparable.
SEED = 42
set_seed(SEED)

In [3]:
# Smoke test of the Mistral7B wrapper implemented in ./custom_model.

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the baseline with the same options used for the fine-tuned models
# further down (device_map="auto", torch_dtype="auto"), so all three models are
# evaluated under identical numeric conditions. A default `from_pretrained`
# call materialises the 7B weights with default dtype/placement, which is far
# heavier and is not comparable with the adapter runs.
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map="auto",
  torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# A pad token is set explicitly (reusing EOS) so batched inputs can be padded.
tokenizer.pad_token = tokenizer.eos_token
baseline = Mistral7B(model=model, tokenizer=tokenizer, device=device)

# Sample MMLU-style multiple-choice question (passage + question + options A-D)
# used to exercise the wrapper by hand.
prompt = """This question refers to the following information.
The following excerpt is from a pamphlet.
You will do me the justice to remember, that I have always strenuously supported the Right of every man to his own opinion, however different that opinion might be to mine. He who denies to another this right, makes a slave of himself to his present opinion, because he precludes himself the right of changing it.
The most formidable weapon against errors of every kind is Reason. I have never used any other, and I trust I never shall.
The circumstance that has now taken place in France of the total abolition of the whole national order of priesthood, and of everything appertaining to compulsive systems of religion, and compulsive articles of faith, has not only precipitated my intention, but rendered a work of this kind exceedingly necessary, lest in the general wreck of superstition, of false systems of government, and false theology, we lose sight of morality, of humanity, and of the theology that is true.
I believe in one God, and no more; and I hope for happiness beyond this life.
I believe in the equality of man; and I believe that religious duties consist in doing justice, loving mercy, and endeavoring to make our fellow-creatures happy.
I do not believe in the creed professed by the Jewish church, by the Roman church, by the Greek church, by the Turkish church, by the Protestant church, nor by any church that I know of. My own mind is my own church.
All national institutions of churches, whether Jewish, Christian or Turkish, appear to me no other than human inventions, set up to terrify and enslave mankind, and monopolize power and profit.
I do not mean by this declaration to condemn those who believe otherwise; they have the same right to their belief as I have to mine.
—Thomas Paine, The Age of Reason, 1794–1795
Which of the following Enlightenment philosophes designed a system of checks and balances for government to avoid abuses of power?
A. Jean Jacques Rousseau
B. Baron Montesquieu
C. Mary Wollstonecraft
D. Adam Smith
"""
# Exercise both entry points of the wrapper: single-prompt generation and the
# batched variant (here with a one-element batch).
print(baseline.generate(prompt))
print(baseline.batch_generate([prompt]))

KeyboardInterrupt: 

In [None]:
from deepeval.benchmarks.mmlu.task import MMLUTask
from utils import run_mmlu

# Quick end-to-end check of the evaluation loop on two MMLU tasks only,
# before launching the full run.
mm_tasks = [
  MMLUTask.ABSTRACT_ALGEBRA,
  MMLUTask.ASTRONOMY,
]
batch_size = 16

df_base = run_mmlu(baseline, mm_tasks, batch_size)
df_base

In [None]:
import os
import pandas as pd
import torch
from utils import run_mmlu
from constants import CATEGORY_MAPPING

# Evaluate the base model on every task of the three categories, caching the
# resulting frame on disk so the expensive run happens only once.
baseline_csv_path = "./baseline-mmlu-predictions.csv"

if not os.path.exists(baseline_csv_path):
  mm_tasks = []
  mm_tasks.extend(CATEGORY_MAPPING["STEM"])
  mm_tasks.extend(CATEGORY_MAPPING["Humanidades"])
  mm_tasks.extend(CATEGORY_MAPPING["Ciências Sociais"])
  batch_size = 16
  model_name = "mistralai/Mistral-7B-Instruct-v0.2"
  device = "cuda" if torch.cuda.is_available() else "cpu"
  # Same loading options as the fine-tuned cells (device_map/torch_dtype "auto")
  # so the baseline and the adapters are compared under identical conditions.
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer.pad_token = tokenizer.eos_token
  baseline = Mistral7B(model=model, tokenizer=tokenizer, device=device)
  df_base = run_mmlu(baseline, mm_tasks, batch_size)
  df_base.to_csv(baseline_csv_path)
else:
  # `to_csv` above stores the index as the first (unnamed) column; reading it
  # back as the index makes the cached frame match the freshly computed one
  # instead of carrying a spurious "Unnamed: 0" column.
  df_base = pd.read_csv(baseline_csv_path, index_col=0)
df_base

In [None]:
import os

import pandas as pd

# Persist the baseline results only when no cache exists yet. Unconditionally
# re-saving a frame that was itself loaded from this CSV writes its index back
# as an extra column on every run, and could also overwrite the full-run cache
# with a partial (smoke-test) `df_base`.
if not os.path.exists("./baseline-mmlu-predictions.csv"):
  df_base.to_csv("./baseline-mmlu-predictions.csv")

- Avalia o modelo `./mistral-sql-lora-v3` originado por fine-tuning

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel, PeftConfig

# Evaluate the fine-tuned adapter ./mistral-sql-lora-v3, caching the result on disk.
v3_csv_path = "./mistral-sql-lora-v3-mmlu-predictions.csv"

if not os.path.exists(v3_csv_path):
  lora_path = "./mistral-sql-lora-v3"
  device = "cuda" if torch.cuda.is_available() else "cpu"
  # The adapter config records which base checkpoint it was trained on.
  config = PeftConfig.from_pretrained(lora_path)
  base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype="auto"
  )
  peft_model_v3 = PeftModel.from_pretrained(base_model, lora_path)

  tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
  tokenizer.pad_token = tokenizer.eos_token

  ministral_sql_lora_v3 = Mistral7B(model=peft_model_v3, tokenizer=tokenizer, device=device)
  # NOTE(review): no task list / batch size is passed here, unlike the baseline
  # run — presumably `run_mmlu` has defaults; confirm they cover the same tasks.
  df_ministral_sql_lora_v3 = run_mmlu(model=ministral_sql_lora_v3)
  df_ministral_sql_lora_v3.to_csv(v3_csv_path)
else:
  # The CSV was written with its index as the first column; restore it as the
  # index so the cached frame has the same columns as a freshly computed one.
  df_ministral_sql_lora_v3 = pd.read_csv(v3_csv_path, index_col=0)

df_ministral_sql_lora_v3

- Avalia o modelo `./mistral-sql-lora-v4` originado por fine-tuning

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel, PeftConfig

# Evaluate the fine-tuned adapter ./mistral-sql-lora-v4, caching the result on disk.
v4_csv_path = "./mistral-sql-lora-v4-mmlu-predictions.csv"

if not os.path.exists(v4_csv_path):
  lora_path = "./mistral-sql-lora-v4"
  device = "cuda" if torch.cuda.is_available() else "cpu"
  # The adapter config records which base checkpoint it was trained on.
  config = PeftConfig.from_pretrained(lora_path)
  base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype="auto"
  )
  peft_model_v4 = PeftModel.from_pretrained(base_model, lora_path)

  tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
  tokenizer.pad_token = tokenizer.eos_token

  # Pass `device` explicitly, exactly as the v3 evaluation does, so both
  # adapters are wrapped the same way.
  ministral_sql_lora_v4 = Mistral7B(model=peft_model_v4, tokenizer=tokenizer, device=device)
  # NOTE(review): no task list / batch size is passed here, unlike the baseline
  # run — presumably `run_mmlu` has defaults; confirm they cover the same tasks.
  df_ministral_sql_lora_v4 = run_mmlu(model=ministral_sql_lora_v4)
  # Persist the results; without this the cache check above can never succeed
  # and the evaluation is repeated on every run.
  df_ministral_sql_lora_v4.to_csv(v4_csv_path)
else:
  # The CSV is written with its index as the first column; restore it as the
  # index so the cached frame has the same columns as a freshly computed one.
  df_ministral_sql_lora_v4 = pd.read_csv(v4_csv_path, index_col=0)

df_ministral_sql_lora_v4

- Comparação dos modelos base e fine-tuned

In [None]:
# Build the capability-regression report: per-task accuracies of the three
# models, the percentage variation of each fine-tuned model against the
# baseline, and the same figures aggregated per category and overall.
col_base = "accuracy_baseline"
col_v3 = "accuracy_mistral-sql-lora-v3"
col_v4 = "accuracy_mistral-sql-lora-v4"

df_baseline = df_base.rename(columns={"accuracy": col_base})
df_ministral_sql_lora_v3 = df_ministral_sql_lora_v3.rename(columns={"accuracy": col_v3})
df_ministral_sql_lora_v4 = df_ministral_sql_lora_v4.rename(columns={"accuracy": col_v4})

df = df_baseline.merge(df_ministral_sql_lora_v3, on=["task", "category"])
df = df.merge(df_ministral_sql_lora_v4, on=["task", "category"])

# Percentage variation relative to the baseline. A baseline accuracy of 0 has
# no defined relative change; it is mapped to NaN so the division does not
# yield +/-inf, which would otherwise poison every mean computed below
# (pandas skips NaN when averaging).
baseline_denominator = df[col_base].replace(0, float("nan"))

df["variation_percent_v3"] = ((df[col_v3] - df[col_base]) / baseline_denominator) * 100
df["variation_percent_v4"] = ((df[col_v4] - df[col_base]) / baseline_denominator) * 100

report_columns = [
  col_base,
  col_v3,
  col_v4,
  "variation_percent_v3",
  "variation_percent_v4",
]

# Aggregate per category (mean over the tasks of each category).
agg = df.groupby("category").agg({column: "mean" for column in report_columns}).reset_index()

# Overall aggregate (mean over all tasks).
agg_total = pd.DataFrame([{
  "category": "Overall",
  **{column: df[column].mean() for column in report_columns},
}])

final_report = pd.concat([agg_total, agg], ignore_index=True)

print("\n=== Variação por Categoria ===\n")
print(final_report.to_markdown(index=False))

# Per-task table. The selected columns are the ones actually created above;
# the previous selection referenced "accuracy_base", "accuracy_finetuned" and
# "variation_percent", which do not exist in `df` and raised a KeyError.
print("\n=== Variação por Tarefa ===\n")
print(df[["task", "category"] + report_columns].to_markdown(index=False))