<a href="https://colab.research.google.com/github/galenzo17/AI-personal-test/blob/main/fine_tune_on_fly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys
import json
import glob
import shutil
import logging
import subprocess
from tqdm.auto import tqdm

# Configuración
class cfg:
    # Modelo
    model_path = '/kaggle/input/qwen2.5/transformers/0.5b-instruct/1'
    input_lora_path = '/kaggle/input/loras/transformers/qwen2.5-0.5b-instruct/8'
    prompt_version = 'output-from-examples-v1'
    merged_model_path = '/kaggle/tmp/qwen_merged_model'
    grid_encoder = 'GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))'
    max_model_len = 10240
    # Conjunto de datos
    dataset_path = '/kaggle/input/arc-prize-2024/arc-agi_test_challenges.json'
    n_splits = 100
    split_size = 100 // n_splits
    # Parámetros de ajuste fino
    total_train_steps = 32000
    max_steps = total_train_steps // n_splits
    learning_rate = 8e-5
    lr_scheduler_type = "linear"
    batch_size = 1
    max_seq_len = 5120
    # Parámetros de inferencia
    predictions_per_task = 96
    inference_timeout = "12m"
    # Ensamble
    ensemble_with_2020 = True

# Verificación de ejecución de prueba
is_dry_run = (cfg.dataset_path == '/kaggle/input/arc-prize-2024/arc-agi_test_challenges.json' and
              not os.getenv('KAGGLE_IS_COMPETITION_RERUN'))
if is_dry_run:
    print('Esta es una ejecución de prueba; no se realizará inferencia ni instalación de paquetes.')

# Validación de la versión del prompt
if int(cfg.input_lora_path.split('/')[-1]) < 18 and cfg.input_lora_path.startswith('/kaggle/input/loras/transformers/qwen2-0.5b'):
    assert cfg.prompt_version == 'output-from-examples-v0'
else:
    assert cfg.prompt_version == 'output-from-examples-v1'

# Configuración de registros
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

# Lanzamiento en segundo plano de la solución 2020
if not is_dry_run and cfg.ensemble_with_2020:
    print('Lanzando la solución 2020 en segundo plano')
    args = [
        'python',
        '/kaggle/input/arc24-source-code/full_2020_solution.py',
        f'--dataset_filepath={cfg.dataset_path}',
        '--icecuber_output_filepath=icecuber_submission.json',
        '--dsl_output_filepath=submission_program_search.json'
    ]
    full_2020_solution_process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Instalación de bibliotecas y monitor de recursos
if not is_dry_run:
    subprocess.run(['bash', '/kaggle/input/arc24-source-code/install_libraries.sh'], check=True)
    from arc24.utils import ResourceMonitor
    monitor = ResourceMonitor(interval=1)
    monitor.start()

# Preparación de datos para entrenamiento
if not is_dry_run:
    single_task_datasets_path = 'single_task_datasets'
    os.makedirs(single_task_datasets_path, exist_ok=True)
    with open(cfg.dataset_path, 'r') as f:
        items = list(json.load(f).items())
    assert len(items) % cfg.split_size == 0
    for batch_idx in tqdm(range(len(items) // cfg.split_size), desc='Creando conjuntos de datos de una sola tarea'):
        data = dict(items[batch_idx * cfg.split_size: (batch_idx + 1) * cfg.split_size])
        assert len(data) == cfg.split_size
        task_id = list(data.keys())[0]
        with open(os.path.join(single_task_datasets_path, f'{task_id}.json'), 'w') as f:
            json.dump(data, f)
    print(f'Contenido de {single_task_datasets_path}:')
    print(os.listdir(single_task_datasets_path))

    training_datasets_path = 'single_task_training_datasets'
    os.makedirs(training_datasets_path, exist_ok=True)
    dataset_filepaths = glob.glob(os.path.join(single_task_datasets_path, '*.json'))
    for dataset_filepath in tqdm(dataset_filepaths, desc='Creando conjuntos de datos de entrenamiento ttft'):
        subprocess.run([
            'python', '/kaggle/input/arc24-source-code/create_n-1_dataset.py',
            dataset_filepath,
            os.path.join(training_datasets_path, os.path.basename(dataset_filepath))
        ], check=True)

# Función para limpiar salidas de entrenamiento excepto el adaptador
def clean_train_output_except_adapter(output_dir):
    patterns = [
        '*/*.pth', '*/*.pt', '*/*.md', '*/*.txt', '*/*.bin', '*/token*',
        '*/added_tokens.json', '*/special_tokens_map.json', '*/vocab.json', '*/trainer_state.json'
    ]
    for pattern in patterns:
        for file in glob.glob(os.path.join(output_dir, pattern)):
            os.remove(file)

# Ajuste fino en tiempo de prueba
if not is_dry_run:
    dataset_filepaths = sorted(glob.glob(os.path.join(training_datasets_path, '*.json')))
    checkpoints_folder = '/kaggle/tmp/checkpoints'
    os.makedirs(checkpoints_folder, exist_ok=True)
    for dataset_filepath in tqdm(dataset_filepaths, desc='Ajustando modelos'):
        output_dir = os.path.join(checkpoints_folder, os.path.splitext(os.path.basename(dataset_filepath))[0])
        subprocess.run([
            'python', '/kaggle/input/arc24-source-code/fine-tuning.py',
            f'--model_path={cfg.model_path}',
            f'--adapter_path={cfg.input_lora_path}',
            f'--output_dir={output_dir}',
            '--train_datasets', dataset_filepath, cfg.prompt_version,
            '--val_dataset', dataset_filepath, cfg.prompt_version,
            f'--max_steps={cfg.max_steps}',
            f'--eval_steps={cfg.max_steps * 2}',
            f'--max_seq_len={cfg.max_seq_len}',
            f'--learning_rate={cfg.learning_rate}',
            f'--lr_scheduler_type={cfg.lr_scheduler_type}',
            f'--batch_size={cfg.batch_size}',
            '--report_to=tensorboard',
            f'--grid_encoder={cfg.grid_encoder}',
            '--remove_train_samples_to_fit_max_seq_len',
            '--torch_dtype=float16',
            '--no-verbose'
        ], check=True)
        clean_train_output_except_adapter(output_dir)
        logging.info(f'Finalizado el ajuste fino para la división {dataset_filepaths.index(dataset_filepath) + 1}/{len(dataset_file
::contentReference[oaicite:0]{index=0}



SyntaxError: unterminated string literal (detected at line 133) (<ipython-input-1-70dc2ab7a449>, line 133)