In [13]:
!rm -rf M-IFEval
!git clone git@github.com:ManuPassinato/M-IFEvalFork.git M-IFEval
%cd M-IFEval

# Instala√ß√£o de depend√™ncias
!cd M-IFEval && pip install -q -r requirements.txt
!pip install -q vllm==0.7.1 bitsandbytes==0.45.1 hf-transfer==0.1.9 langdetect janome ja_sentence_segmenter

# Setup do Spacy e NLTK
import nltk
nltk.download('punkt')

import sys
import os
sys.path.append(os.getcwd())
!touch instruction_utils/__init__.py

Cloning into 'M-IFEval'...
remote: Enumerating objects: 734, done.[K
remote: Counting objects: 100% (734/734), done.[K
remote: Compressing objects: 100% (415/415), done.[K
remote: Total 734 (delta 320), reused 722 (delta 313), pack-reused 0 (from 0)[K
Receiving objects: 100% (734/734), 30.32 MiB | 22.55 MiB/s, done.
Resolving deltas: 100% (320/320), done.
/content/M-IFEval
/bin/bash: line 1: cd: M-IFEval: No such file or directory


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
%%writefile universal_inference.py
import os
import argparse
import torch
import gc
import sys
import traceback
from datasets import load_dataset
from vllm import LLM, SamplingParams

def run_model_inference(model_name):
    """Generate one response per benchmark prompt with a vLLM model.

    Loads `model_name` with vLLM, reads the first available input file under
    ./data, generates a greedy completion (temperature 0.0, up to 2048 tokens)
    for every prompt and writes the prompt/response pairs to
    ./data/pt_input_response_data_<model>_new.jsonl, where `/` in the model
    name is replaced by `__`.

    Args:
        model_name: Model identifier forwarded to `vllm.LLM(model=...)`.

    Raises:
        SystemExit: with status 1 when the model cannot be loaded, no input
            file or prompt column is found, or generation/saving fails, so the
            parent process can detect the failure from the exit code.
    """
    print(f"\n[WORKER] Iniciando: {model_name}")

    # 1. Free whatever Python / CUDA memory can be released before loading.
    gc.collect()
    torch.cuda.empty_cache()

    # 2. Load the model.
    try:
        print(f"[WORKER] Carregando vLLM...")
        llm = LLM(
            model=model_name,
            trust_remote_code=True,
            gpu_memory_utilization=0.90,
            max_model_len=4096,
            enforce_eager=True,
            tensor_parallel_size=1,
            device="cuda"
        )
    except Exception:
        print("‚ùå [ERRO FATAL] Falha ao carregar o modelo.")
        traceback.print_exc()
        sys.exit(1)

    sampling_params = SamplingParams(temperature=0.0, max_tokens=2048)

    # 3. Pick the input file: the cleaned files take priority over the raw one.
    data_dir = "./data"
    candidate_files = (
        "pt_input_data_FINAL_CLEAN.jsonl",
        "pt_input_data_clean.jsonl",
        "pt_input_data.jsonl",
    )
    input_file = next(
        (name for name in candidate_files
         if os.path.exists(os.path.join(data_dir, name))),
        None,
    )

    if not input_file:
        print(f"‚ùå Nenhum arquivo de input encontrado em {data_dir}")
        sys.exit(1)

    input_path = os.path.join(data_dir, input_file)
    print(f"[WORKER] Usando arquivo de entrada: {input_file}")

    # 4. Generate and save.
    try:
        ds = load_dataset("json", data_files={"train": input_path}, split="train")

        # Detect the prompt column; "prompt" wins, then the known substitutes.
        col_names = ds.column_names
        prompt_col = next(
            (c for c in ("prompt", "instruction", "pergunta", "input")
             if c in col_names),
            None,
        )
        if prompt_col is None:
            # Fail with an explicit message instead of an opaque KeyError later.
            # sys.exit raises SystemExit, which `except Exception` does not catch.
            print(f"‚ùå [ERRO] Nenhuma coluna de prompt encontrada. Colunas: {col_names}")
            sys.exit(1)

        print(f"[WORKER] Coluna de prompt detectada: '{prompt_col}'")
        prompts = [item[prompt_col] for item in ds]

        # NOTE(review): prompts are passed as raw text, without the model's chat
        # template - confirm this is the intended setup for instruct models.
        outputs = llm.generate(prompts, sampling_params)
        generated_text = [output.outputs[0].text for output in outputs]

        # Save the output next to the input data.
        safe_model = model_name.replace('/', '__')
        output_filename = os.path.join(data_dir, f"pt_input_response_data_{safe_model}_new.jsonl")

        ds = ds.add_column("response", generated_text)
        # NOTE(review): the prompt column keeps its original name in the output;
        # verify the evaluation step accepts it when it is not "prompt".
        ds.select_columns([prompt_col, "response"]).to_json(output_filename)
        print(f"‚úÖ [SUCESSO] Arquivo salvo: {output_filename}")

    except Exception as e:
        print(f"‚ùå [ERRO] Falha durante gera√ß√£o: {e}")
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    # Command-line entry point: --model_name is mandatory and is handed
    # straight to the inference routine.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_name", type=str, required=True)
    run_model_inference(cli.parse_args().model_name)

Overwriting universal_inference.py


In [7]:
# Instala os modelos lingu√≠sticos do Spacy necess√°rios para as 4 l√≠nguas
print("Instalando modelos do Spacy para EN, ES, FR e JA...")

!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download ja_core_news_sm
!python -m spacy download pt_core_news_sm
!python -m spacy download xx_sent_ud_sm

print("\n‚úÖ Instala√ß√£o conclu√≠da. Pronto para rodar o benchmark completo.")

Instalando modelos do Spacy para EN, ES, FR e JA...
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m‚îÅ‚îÅ

In [14]:
import os

def rename_json_to_jsonl(old_path="data/pt_input_data.json",
                         new_path="data/pt_input_data.jsonl"):
    """Rename the benchmark input file from .json to .jsonl.

    The defaults are relative to the repository root (the notebook's working
    directory after `%cd M-IFEval`), matching the paths the other cells use.
    The function never overwrites an existing target and is safe to re-run:
    once the rename has happened, later calls only report that the target
    already exists.

    Args:
        old_path: Current location of the file.
        new_path: Desired location of the file.
    """
    # Check the target first so a re-run after a successful rename is reported
    # as "already exists" rather than as a missing source file.
    if os.path.exists(new_path):
        print(f"‚ö†Ô∏è O arquivo j√° existe: {new_path}")
        return

    if not os.path.exists(old_path):
        print(f"‚ùå Arquivo n√£o encontrado: {old_path}")
        return

    os.rename(old_path, new_path)
    print("‚úÖ Arquivo renomeado com sucesso.")

rename_json_to_jsonl()

‚úÖ Arquivo renomeado com sucesso.


Retirei os exemplos com a instrução "pt:detectable_format:constrained_response" do arquivo "data/pt_input_data.jsonl", pois estavam causando erro ao rodar o benchmark.

In [15]:
import json
import os

print("üßπ INICIANDO LIMPEZA CIR√öRGICA...")

# Exact instruction ids whose examples must be dropped from the input data.
KILL_LIST = [
    "pt:detectable_format:constrained_response",
]

input_path = "data/pt_input_data.jsonl"
output_path = "data/pt_input_data_FINAL_CLEAN.jsonl"


def filter_jsonl_by_instruction_ids(src_path, dst_path, banned_ids):
    """Copy a JSONL file, dropping records that use a banned instruction id.

    A record is dropped when any entry of `banned_ids` appears in its
    "instruction_id_list". Lines that are not valid JSON are not copied
    either, but they are counted separately instead of vanishing silently.

    Args:
        src_path: JSONL file to read.
        dst_path: JSONL file to write (overwritten).
        banned_ids: Instruction ids that disqualify a record.

    Returns:
        A tuple (total, kept, removed, skipped) of line counts, where
        total == kept + removed + skipped.
    """
    total = kept = removed = skipped = 0
    with open(src_path, "r", encoding="utf-8") as fin, \
         open(dst_path, "w", encoding="utf-8") as fout:
        for line in fin:
            total += 1
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            ids = record.get("instruction_id_list", [])
            if any(bad_id in ids for bad_id in banned_ids):
                removed += 1
            else:
                # Write the original line untouched to keep the file format.
                fout.write(line)
                kept += 1
    return total, kept, removed, skipped


total = 0
kept = 0
removed = 0
skipped = 0

if not os.path.exists(input_path):
    print("‚ùå Arquivo original n√£o encontrado!")
else:
    total, kept, removed, skipped = filter_jsonl_by_instruction_ids(
        input_path, output_path, KILL_LIST
    )

    print("-" * 40)
    print(f"üìä RESULTADO:")
    print(f"   Total lido:      {total}")
    print(f"   Linhas APAGADAS: {removed}")
    print(f"   Linhas MANTIDAS: {kept}")
    if skipped:
        # Surface unparsable lines so the totals above always add up.
        print(f"   Linhas com ERRO de JSON (ignoradas): {skipped}")
    print(f"   Arquivo salvo:   {output_path}")
    print("-" * 40)

    if kept > 0:
        print("‚úÖ Arquivo limpo gerado com sucesso.")
    else:
        print("‚ùå ALERTA: O arquivo resultante ficou vazio.")

üßπ INICIANDO LIMPEZA CIR√öRGICA...
----------------------------------------
üìä RESULTADO:
   Total lido:      129
   Linhas APAGADAS: 3
   Linhas MANTIDAS: 126
   Arquivo salvo:   data/pt_input_data_FINAL_CLEAN.jsonl
----------------------------------------
‚úÖ Arquivo limpo gerado com sucesso.


Altera o arquivo instruction_utils/pt_instructions_util.py.

Adiciona `nlp.add_pipe("sentencizer")` na função `_get_sentence_tokenizer()`.

Antes, o parser era desativado nessa função, mas, no spaCy, `doc.sents` depende do parser OU de um sentencizer; sem nenhum dos dois, a contagem de frases falha.


In [26]:
%%writefile instruction_utils/pt_instructions_util.py
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utility library of instructions."""

import functools
import random
import re
from typing import List
import spacy
import immutabledict
import nltk

# Pool of Portuguese words that generate_keywords() samples from.
WORD_LIST = ["saudade", "amanh√£", "cidad√£o", "trabalho", "escola", "sa√∫de", "privado", "justi√ßa", "cultura", "verdade"]  # pylint: disable=line-too-long

# ISO 639-1 codes to language names.
# Not referenced by the functions in this file.
LANGUAGE_CODES = immutabledict.immutabledict({
    "en": "English",
    "es": "Spanish",
    "pt": "Portuguese",
    "ar": "Arabic",
    "hi": "Hindi",
    "fr": "French",
    "ru": "Russian",
    "de": "German",
    "ja": "Japanese",
    "it": "Italian",
    "bn": "Bengali",
    "uk": "Ukrainian",
    "th": "Thai",
    "ur": "Urdu",
    "ta": "Tamil",
    "te": "Telugu",
    "bg": "Bulgarian",
    "ko": "Korean",
    "pl": "Polish",
    "he": "Hebrew",
    "fa": "Persian",
    "vi": "Vietnamese",
    "ne": "Nepali",
    "sw": "Swahili",
    "kn": "Kannada",
    "mr": "Marathi",
    "gu": "Gujarati",
    "pa": "Punjabi",
    "ml": "Malayalam",
    "fi": "Finnish",
    })

# Regex fragments used by split_into_sentences() to tell sentence-ending
# periods apart from periods that belong to abbreviations, numbers or URLs.
# A single letter, including accented Portuguese letters.
_ALPHABETS = "([A-Za-z√Å-√ö√°-√∫√Ä-√π√¢√™√Æ√¥√ª√É-√ï√£-√µ√á√ß])"
# Titles followed by a period (the period is protected, not a sentence end).
_PREFIXES = "(Sr|Sra|Srta|Dr|Dra|Prof|Profa)[.]"
# Name/company suffixes that may be followed by a period.
_SUFFIXES = "(Ltda|ME|SA|Jr|Filho|Neto|Co)"
# Words treated as the start of a new sentence after an acronym or suffix.
_STARTERS = r"(Sr|Sra|Srta|Dr|Dra|Prof|Ele\s|Ela\s|Eles\s|Elas\s|Isso\s|Aquilo\s|Aquele\s|Aquela\s|Mas\s|Por√©m\s|Contudo\s|Entretanto\s|Assim\s|Ent√£o\s|Onde\s|Quando\s|Enquanto\s|Se\s|Caso\s)"
# Two- or three-letter dotted upper-case acronyms such as "A.B." or "A.B.C.".
_ACRONYMS = "([A-Z√Å-√ö][.][A-Z√Å-√ö][.](?:[A-Z√Å-√ö][.])?)"
# Domain endings preceded by a period.
_WEBSITES = "[.](com|net|org|io|gov|edu|me|br)"
# A single decimal digit (used to protect the period in numbers like 3.14).
_DIGITS = "([0-9])"
# Two or more consecutive periods (ellipsis).
_MULTIPLE_DOTS = r"\.{2,}"

def split_into_sentences(text):
  """Split the text into sentences using regex heuristics.

  Periods that do not end a sentence (titles, websites, decimals, initials,
  acronyms, suffixes) are temporarily replaced by a "<prd>" marker; the
  remaining ".", "?" and "!" are followed by a "<stop>" marker, on which the
  text is finally split. The substitutions are order-dependent.

  Args:
    text: A string that consists of one or more sentences.

  Returns:
    A list of strings where each string is a stripped sentence. A trailing
    empty entry produced by the final split is removed.
  """
  # Pad with spaces so the patterns that require surrounding whitespace also
  # match at the very beginning and end of the text.
  text = " " + text + "  "
  text = text.replace("\n", " ")
  # Protect periods in titles, website endings and decimal numbers.
  text = re.sub(_PREFIXES, "\\1<prd>", text)
  text = re.sub(_WEBSITES, "<prd>\\1", text)
  text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
  # An ellipsis keeps all of its dots and ends the sentence once.
  text = re.sub(
      _MULTIPLE_DOTS,
      lambda match: "<prd>" * len(match.group(0)) + "<stop>",
      text,
  )
  if "Ph.D" in text:
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
  # Single-letter initial followed by a period and a space.
  text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
  # An acronym followed by a sentence starter does end the sentence.
  text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
  # Protect the periods inside three- and two-letter dotted abbreviations.
  text = re.sub(
      _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
      "\\1<prd>\\2<prd>\\3<prd>",
      text,
  )
  text = re.sub(
      _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text
  )
  # A suffix ends the sentence only when a sentence starter follows it.
  text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
  text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
  text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
  # Move terminal punctuation outside closing quotes so the quote stays with
  # the sentence it closes.
  if "‚Äù" in text:
    text = text.replace(".‚Äù", "‚Äù.")
  if '"' in text:
    text = text.replace('."', '".')
  if "!" in text:
    text = text.replace('!"', '"!')
  if "?" in text:
    text = text.replace('?"', '"?')
  # Every period still present is a real sentence end; mark the boundaries,
  # then restore the protected periods.
  text = text.replace(".", ".<stop>")
  text = text.replace("?", "?<stop>")
  text = text.replace("!", "!<stop>")
  text = text.replace("<prd>", ".")
  sentences = text.split("<stop>")
  sentences = [s.strip() for s in sentences]
  if sentences and not sentences[-1]:
    sentences = sentences[:-1]
  return sentences

def count_words(text):
  """Returns how many non-punctuation tokens spaCy finds in the text."""
  doc = _get_sentence_tokenizer()(text)
  # Tally every token that is not flagged as punctuation.
  return sum(1 for token in doc if not token.is_punct)

@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Builds the Portuguese spaCy pipeline once and caches it.

    The model is loaded with the "tagger", "parser" and "ner" components
    disabled, and a rule-based "sentencizer" is appended so that sentence
    boundaries (`doc.sents`) are available without the parser.
    """
    disabled_components = ["tagger", "parser", "ner"]
    pipeline = spacy.load("pt_core_news_sm", disable=disabled_components)
    pipeline.add_pipe("sentencizer")
    return pipeline

def tokenize_words(text):
  """Returns the text's tokens as strings, leaving out punctuation tokens.

  Tokenization is done by the cached Portuguese spaCy pipeline.
  """
  doc = _get_sentence_tokenizer()(text)
  words = []
  for token in doc:
    if token.is_punct:
      continue
    words.append(token.text)
  return words

def count_sentences(text) -> int:
    """Returns the number of sentences the cached spaCy pipeline finds in text."""
    doc = _get_sentence_tokenizer()(text)
    # `doc.sents` is an iterator, so count its items one by one.
    return sum(1 for _ in doc.sents)

def generate_keywords(num_keywords):
  """Picks `num_keywords` distinct words at random from WORD_LIST."""
  chosen = random.sample(WORD_LIST, k=num_keywords)
  return chosen

Overwriting instruction_utils/pt_instructions_util.py


In [28]:
import os
import subprocess
import shutil
from huggingface_hub import scan_cache_dir
import time
from datetime import timedelta

# --- SCALE CONFIGURATION ---
# Models to run, in order. Uncomment an entry to include it in the benchmark.
# NOTE(review): the gpt-*/o1-*/claude-*/gemini-* ids look like hosted-API
# models; the worker passes every id to vllm.LLM, so confirm they are usable
# there before enabling them.
MODELS_TO_BENCHMARK = [
    # 'gpt-4o-mini-2024-07-18',
    # 'gpt-4o-2024-08-06',
    # 'o1-preview-2024-09-12',
    # 'o1-mini-2024-09-12',
    # 'claude-3-haiku-20240307',
    # 'claude-3-5-sonnet-20240620',
    # 'claude-3-opus-20240229',
    # 'gemini-1.5-pro-002',
    # 'gemini-1.5-flash-002',
    # 'CohereForAI/c4ai-command-r-plus-4bit',
    # 'CohereForAI/c4ai-command-r-v01-4bit',
    # 'CohereForAI/aya-23-8B',
     'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4',
    # 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4',
    # 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4',
    # 'hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4',
    # 'mistralai/Mistral-7B-Instruct-v0.3',
    # 'deepseek-ai/deepseek-llm-7b-chat',
]

def delete_model_cache(model_id):
    """Remove every Hugging Face cache entry whose repo id equals `model_id`.

    Best effort: any error raised while scanning or deleting is printed and
    swallowed so the caller can carry on.
    """
    print(f"üßπ Limpando cache para liberar espa√ßo: {model_id}...")
    try:
        cached_repos = [
            repo for repo in scan_cache_dir().repos if repo.repo_id == model_id
        ]
        for repo in cached_repos:
            shutil.rmtree(repo.repo_path)

        if cached_repos:
            print("   -> Cache removido.")
        else:
            print("   -> Nada no cache para remover.")
    except Exception as e:
        print(f"   -> Erro n√£o fatal na limpeza: {e}")

def format_time(seconds):
    """Render a duration given in seconds as text, dropping fractional seconds."""
    whole_seconds = int(seconds)
    elapsed = timedelta(seconds=whole_seconds)
    return str(elapsed)

# --- START ---
# For every configured model: run inference in an isolated subprocess,
# evaluate the generated responses, then delete the model from the HF cache.
benchmark_start_time = time.time()

for model in MODELS_TO_BENCHMARK:
    model_start_time = time.time()
    safe_model_name = model.replace('/', '__')

    print(f"{'='*60}")
    print(f"üöÄ INICIANDO: {model}")
    print(f"{'='*60}")

    # --- STEP 1: INFERENCE ---
    print(">> Passo 1: Infer√™ncia (Processo Isolado)")
    t0_inf = time.time()

    try:
        # check=True raises CalledProcessError when the worker exits with a
        # non-zero status (it calls sys.exit(1) on every failure path).
        subprocess.run(
            ["python", "universal_inference.py", "--model_name", model],
            check=True,
            text=True
        )
    except subprocess.CalledProcessError as e:
        print(f"\n‚ùå FALHA NO MODELO {model}")
        print(f"   Motivo: O processo de infer√™ncia retornou erro (C√≥digo {e.returncode}).")
        print("   Diagn√≥stico: Provavelmente falta de mem√≥ria (OOM) ou erro de biblioteca.")
        print("   A√ß√£o: Pulando avalia√ß√£o deste modelo e limpando recursos.")

        # Free the disk space and move on to the next model.
        delete_model_cache(model)
        continue

    inference_time = time.time() - t0_inf
    print(f"   ‚è±Ô∏è Tempo de Infer√™ncia: {format_time(inference_time)}")

    # --- STEP 2: EVALUATION (only reached when inference succeeded) ---
    print("\n>> Passo 2: Avalia√ß√£o")
    t0_eval = time.time()
    langs = ["pt"]

    for lang in langs:
        resp_file = f"data/{lang}_input_response_data_{safe_model_name}_new.jsonl"
        out_dir = f"evaluations/{lang}_input_response_data_{safe_model_name}_new"

        if not os.path.exists(resp_file):
            print(f"   ‚ö†Ô∏è {lang.upper()}: Arquivo de resposta n√£o encontrado.")
            continue

        os.makedirs(out_dir, exist_ok=True)
        try:
            subprocess.run([
                "python", "-m", "evaluation_main",
                "--input_data", f"data/{lang}_input_data_FINAL_CLEAN.jsonl",
                "--input_response_data", resp_file,
                "--output_dir", out_dir
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
            print(f"   ‚úÖ {lang.upper()}: OK")
        except (subprocess.CalledProcessError, OSError) as e:
            # Catch only process failures (not KeyboardInterrupt) and show the
            # end of the captured stderr so the failure can be diagnosed.
            print(f"   ‚ùå {lang.upper()}: Falhou na etapa de c√°lculo de m√©tricas.")
            captured = getattr(e, "stderr", None) or str(e)
            for err_line in captured.strip().splitlines()[-15:]:
                print(f"      {err_line}")

    print(f"   ‚è±Ô∏è Tempo Avalia√ß√£o: {format_time(time.time() - t0_eval)}")

    # --- END-OF-CYCLE CLEANUP ---
    print(f"\n>> Passo 3: Limpeza P√≥s-Ciclo")
    delete_model_cache(model)

    total_model_time = time.time() - model_start_time
    print(f"‚úÖ Ciclo finalizado para {model}")
    print(f"‚è±Ô∏è Tempo total deste modelo: {format_time(total_model_time)}\n")

# --- OVERALL END ---
total_benchmark_time = time.time() - benchmark_start_time
print(f"\n{'='*60}")
print(f"üéâ BENCHMARK COMPLETO! Tempo total: {format_time(total_benchmark_time)}")

üöÄ INICIANDO: Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4
>> Passo 1: Infer√™ncia (Processo Isolado)
   ‚è±Ô∏è Tempo de Infer√™ncia: 0:03:00

>> Passo 2: Avalia√ß√£o
   ‚úÖ PT: OK
   ‚è±Ô∏è Tempo Avalia√ß√£o: 0:00:24

>> Passo 3: Limpeza P√≥s-Ciclo
üßπ Limpando cache para liberar espa√ßo: Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4...
   -> Cache removido.
‚úÖ Ciclo finalizado para Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4
‚è±Ô∏è Tempo total deste modelo: 0:03:25


üéâ BENCHMARK COMPLETO! Tempo total: 0:03:25


In [30]:
# BAIXA OS JSONs

from google.colab import files
from datetime import datetime

# Package the response file of the first configured model; fall back to the
# default model id when MODELS_TO_BENCHMARK is not defined in this kernel.
if 'MODELS_TO_BENCHMARK' in globals() and MODELS_TO_BENCHMARK:
    model_name = MODELS_TO_BENCHMARK[0]
else:
    model_name = "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4"

safe_model_name = model_name.replace('/', '__')

# Zip name: model plus the current hour/minute.
timestamp = datetime.now().strftime("%H%M")
nome_zip = f"novos_jsons_{safe_model_name}_{timestamp}"
pasta_temp = "download_temp_jsons"

# Start from an empty staging folder (same approach as the evaluations
# download cell) so files left behind by an interrupted run never end up in
# the new zip.
if os.path.exists(pasta_temp):
    shutil.rmtree(pasta_temp)
os.makedirs(pasta_temp)

print(f"üì¶ Preparando pacote de download para: {model_name}")
print("-" * 60)

arquivos_zipados = 0
langs = ["pt"]

for lang in langs:
    # Exact path of the "_new" response file written by the inference worker.
    file_path = f"data/{lang}_input_response_data_{safe_model_name}_new.jsonl"

    if os.path.exists(file_path):
        # Stage the file for zipping.
        shutil.copy(file_path, pasta_temp)
        print(f"‚úÖ Adicionado ao pacote: {os.path.basename(file_path)}")
        arquivos_zipados += 1
    else:
        print(f"‚ö†Ô∏è  Arquivo n√£o encontrado (n√£o ser√° baixado): {file_path}")

print("-" * 60)

if arquivos_zipados > 0:
    print(f"üìö Compactando {arquivos_zipados} arquivos...")
    shutil.make_archive(nome_zip, 'zip', pasta_temp)

    print(f"‚¨áÔ∏è Iniciando download de {nome_zip}.zip ...")
    files.download(f"{nome_zip}.zip")

    # The staging folder is no longer needed once the zip exists.
    shutil.rmtree(pasta_temp)
else:
    print("‚ùå Nenhum arquivo encontrado para baixar.")

üì¶ Preparando pacote de download para: Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4
------------------------------------------------------------
‚úÖ Adicionado ao pacote: pt_input_response_data_Qwen__Qwen2.5-0.5B-Instruct-GPTQ-Int4_new.jsonl
------------------------------------------------------------
üìö Compactando 1 arquivos...
‚¨áÔ∏è Iniciando download de novos_jsons_Qwen__Qwen2.5-0.5B-Instruct-GPTQ-Int4_0133.zip ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
# DOWNLOAD THE EVALUATION RESULTS

# Use the first configured model, or the default id when the global list is
# missing or empty.
model_name = (
    MODELS_TO_BENCHMARK[0]
    if 'MODELS_TO_BENCHMARK' in globals() and MODELS_TO_BENCHMARK
    else "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4"
)
safe_model_name = model_name.replace('/', '__')

# Zip name carries the current hour/minute.
timestamp = datetime.now().strftime("%H%M")
nome_zip = f"resultados_evaluation_new_{timestamp}"
pasta_temp = "download_temp_evals"

# Recreate the staging folder from scratch.
if os.path.exists(pasta_temp):
    shutil.rmtree(pasta_temp)
os.makedirs(pasta_temp)

print(f"üì¶ Empacotando resultados de avalia√ß√£o para: {model_name}")
print("-" * 60)

pastas_encontradas = 0
langs = ["pt"]

for lang in langs:
    # Folder written by the evaluation step, and its place inside the zip.
    src_dir = f"evaluations/{lang}_input_response_data_{safe_model_name}_new"
    dst_dir = os.path.join(pasta_temp, f"{lang}_results")

    if not os.path.exists(src_dir):
        print(f"‚ö†Ô∏è  Pasta n√£o encontrada (ignorada): {src_dir}")
        continue

    # Copy the whole result folder into the staging area.
    shutil.copytree(src_dir, dst_dir)
    print(f"‚úÖ Adicionada pasta: {src_dir}")
    pastas_encontradas += 1

print("-" * 60)

if pastas_encontradas == 0:
    print("‚ùå Nenhuma pasta de avalia√ß√£o encontrada para baixar.")
else:
    print(f"üìö Compactando {pastas_encontradas} pastas de avalia√ß√£o...")
    shutil.make_archive(nome_zip, 'zip', pasta_temp)

    print(f"‚¨áÔ∏è Iniciando download de {nome_zip}.zip ...")
    files.download(f"{nome_zip}.zip")

    # Cleanup of the staging folder.
    shutil.rmtree(pasta_temp)

üì¶ Empacotando resultados de avalia√ß√£o para: Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4
------------------------------------------------------------
‚úÖ Adicionada pasta: evaluations/pt_input_response_data_Qwen__Qwen2.5-0.5B-Instruct-GPTQ-Int4_new
------------------------------------------------------------
üìö Compactando 1 pastas de avalia√ß√£o...
‚¨áÔ∏è Iniciando download de resultados_evaluation_new_0133.zip ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>