In [None]:
# ============================================================
# 1. SETUP Y CONFIGURACIÓN
# ============================================================
import sys
import os
import json
import random
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime

# Detect the runtime: treated as Colab when the google.colab package is
# already loaded in sys.modules.
IN_COLAB = 'google.colab' in sys.modules

print(f"üñ•Ô∏è  Entorno: {'Google Colab' if IN_COLAB else 'Local (VS Code)'}")

# Resolve BASE_DIR (the folder holding seed data and pipeline scripts)
# according to the environment.
if IN_COLAB:
    # -- Colab: the local project files do NOT exist in the runtime --
    # Mount Google Drive so a previously stored copy can be used.
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive', force_remount=False)

    # Drive is tried first; BASE_DIR is the working copy inside the runtime.
    DRIVE_FASE2 = Path('/content/drive/MyDrive/OKLA/chatbot-llm/FASE_2_DATASET')
    BASE_DIR = Path('/content/FASE_2_DATASET')
    BASE_DIR.mkdir(exist_ok=True)

    # Files needed to run the pipeline.
    REQUIRED_FILES = [
        'seed_vehicles.json',
        'seed_dealers.json',
        'conversation_templates.py',
        'generate_dataset.py',
        'validate_dataset.py',
    ]
    # Sub-directories, each mapped to the files expected inside it.
    REQUIRED_DIRS = {
        'augmentation': ['paraphrase_variants.py'],
    }

    # seed_vehicles.json is used as the marker that the Drive folder is populated.
    if DRIVE_FASE2.exists() and (DRIVE_FASE2 / 'seed_vehicles.json').exists():
        print(f"‚úÖ Archivos encontrados en Drive: {DRIVE_FASE2}")
        import shutil
        # Copy every required file into the runtime, reporting the ones that are absent.
        for f in REQUIRED_FILES:
            src = DRIVE_FASE2 / f
            if src.exists():
                shutil.copy2(src, BASE_DIR / f)
                print(f"   ‚úÖ {f}")
            else:
                print(f"   ‚ö†Ô∏è  {f} no encontrado en Drive")
        # Copy the files of each required sub-directory.
        # NOTE(review): a file missing here is skipped silently (no warning branch).
        for dirname, dir_files in REQUIRED_DIRS.items():
            src_dir = DRIVE_FASE2 / dirname
            dest_dir = BASE_DIR / dirname
            dest_dir.mkdir(exist_ok=True)
            for df in dir_files:
                src = src_dir / df
                if src.exists():
                    shutil.copy2(src, dest_dir / df)
                    print(f"   ‚úÖ {dirname}/{df}")
    else:
        # The files are not in Drive: fall back to a manual upload dialog.
        print(f"‚ö†Ô∏è  Archivos de FASE 2 no encontrados en Drive.")
        print(f"   Esperado en: {DRIVE_FASE2}")
        print()
        print("   üì§ Subiendo archivos desde tu m√°quina...")
        print("   Selecciona TODOS estos archivos cuando se abra el di√°logo:")
        print("   (est√°n en docs/chatbot-llm/FASE_2_DATASET/)")
        print()
        for f in REQUIRED_FILES:
            print(f"   üìÑ {f}")
        for dirname, dir_files in REQUIRED_DIRS.items():
            for df in dir_files:
                print(f"   üìÑ {dirname}/{df}")
        print()

        from google.colab import files  # type: ignore
        print("üîº Selecciona los archivos (seed_vehicles.json, seed_dealers.json,")
        print("   conversation_templates.py, generate_dataset.py, validate_dataset.py):")
        uploaded = files.upload()

        # Persist each uploaded payload under BASE_DIR using its uploaded name.
        for fname, content in uploaded.items():
            dest = BASE_DIR / fname
            dest.parent.mkdir(parents=True, exist_ok=True)
            with open(dest, 'wb') as f:
                f.write(content)
            print(f"   ‚úÖ {fname} ‚Üí {dest}")

        # If the augmentation module is still absent, ask for it in a second dialog.
        aug_dir = BASE_DIR / 'augmentation'
        if not (aug_dir / 'paraphrase_variants.py').exists():
            aug_dir.mkdir(exist_ok=True)
            print()
            print("üîº Ahora selecciona: augmentation/paraphrase_variants.py")
            uploaded2 = files.upload()
            for fname, content in uploaded2.items():
                with open(aug_dir / fname, 'wb') as f:
                    f.write(content)
                print(f"   ‚úÖ augmentation/{fname}")

    # Final readiness check: abort when any required file is missing.
    # NOTE(review): only REQUIRED_FILES is checked; entries of REQUIRED_DIRS are not.
    missing = [f for f in REQUIRED_FILES if not (BASE_DIR / f).exists()]
    if missing:
        print(f"\n‚ùå Archivos faltantes: {missing}")
        raise FileNotFoundError(f"Faltan archivos: {missing}")

else:
    # -- Local: use a path relative to the current working directory --
    BASE_DIR = Path('.').resolve()
    if not (BASE_DIR / 'seed_vehicles.json').exists():
        # NOTE(review): the first candidate is BASE_DIR itself, which was just
        # checked above; only the sibling FASE_2_DATASET folder is a new probe.
        for candidate in [BASE_DIR, BASE_DIR.parent / 'FASE_2_DATASET']:
            if (candidate / 'seed_vehicles.json').exists():
                BASE_DIR = candidate
                break

OUTPUT_DIR = BASE_DIR / 'output'
OUTPUT_DIR.mkdir(exist_ok=True)

# Put BASE_DIR on sys.path so the pipeline modules can be imported by name.
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

print(f"\nüìÇ Base:   {BASE_DIR}")
print(f"üìÇ Output: {OUTPUT_DIR}")

# Report the presence of the key files (informational only; nothing is raised here).
for f in ['seed_vehicles.json', 'seed_dealers.json', 'conversation_templates.py', 'generate_dataset.py']:
    status = '‚úÖ' if (BASE_DIR / f).exists() else '‚ùå'
    print(f"   {status} {f}")

print(f"\n‚úÖ Setup completo")

: 

In [None]:
# Drive mounting is handled by the setup cell, so nothing is remounted here.
# This cell only reports where Drive is available, or that the run is local.
if not IN_COLAB:
    print("‚ö†Ô∏è No est√°s en Colab ‚Äî usando archivos locales")
else:
    # Already mounted by the setup cell.
    drive_path = Path('/content/drive/MyDrive')
    print(f"‚úÖ Drive disponible en: {drive_path}")

In [None]:
# ============================================================
# 1b. SUBIR DATASET A GOOGLE DRIVE (desde Colab)
# ============================================================
# Crea la estructura OKLA/dataset/ en Drive y sube los JSONL
# que generaste localmente en FASE 2.
# ============================================================
from pathlib import Path

def _count_lines(path):
    """Return the number of lines in the file at *path*.

    The file is opened in binary mode inside a context manager, so the handle
    is always closed and no text decoding (and therefore no platform-dependent
    default encoding) is involved.
    """
    with open(path, 'rb') as fh:
        return sum(1 for _ in fh)


drive_base = Path('/content/drive/MyDrive')
drive_dataset = drive_base / 'OKLA' / 'dataset'

# Create the target folder if it does not exist yet.
drive_dataset.mkdir(parents=True, exist_ok=True)
print(f"üìÇ Carpeta creada/verificada: {drive_dataset}")

# Check which of the expected split files are already present.
JSONL_FILES = ['okla_train.jsonl', 'okla_eval.jsonl', 'okla_test.jsonl']
existing = [f for f in JSONL_FILES if (drive_dataset / f).exists()]

if len(existing) == len(JSONL_FILES):
    # Everything is already in Drive: just report line counts and sizes.
    print(f"\n‚úÖ Dataset ya est√° en Drive:")
    for f in JSONL_FILES:
        fp = drive_dataset / f
        lines = _count_lines(fp)
        size_mb = fp.stat().st_size / 1024 / 1024
        print(f"   ‚úÖ {f}: {lines} conv. ({size_mb:.1f} MB)")
    print(f"\nüöÄ Listo ‚Äî puedes ir a FASE_3 directamente.")
else:
    if existing:
        # Partial upload: show which files are present and which are missing.
        print(f"\n‚ö†Ô∏è Solo {len(existing)}/{len(JSONL_FILES)} archivos encontrados:")
        for f in existing:
            print(f"   ‚úÖ {f}")
        missing = [f for f in JSONL_FILES if f not in existing]
        for f in missing:
            print(f"   ‚ùå {f}")

    # Upload the files through the Colab upload dialog.
    print(f"\nüì§ Selecciona los 3 archivos JSONL de tu m√°quina:")
    print(f"   (est√°n en docs/chatbot-llm/FASE_2_DATASET/output/)")
    print()

    from google.colab import files  # type: ignore
    uploaded = files.upload()

    # Write each uploaded payload into the Drive dataset folder.
    for fname, content in uploaded.items():
        dest = drive_dataset / fname
        with open(dest, 'wb') as fh:
            fh.write(content)
        size_mb = len(content) / 1024 / 1024
        print(f"   ‚úÖ {fname} ‚Üí {dest} ({size_mb:.1f} MB)")

    # Final verification: every expected split must now exist.
    print(f"\nüìä Verificaci√≥n final:")
    all_ok = True
    for f in JSONL_FILES:
        fp = drive_dataset / f
        if fp.exists():
            lines = _count_lines(fp)
            print(f"   ‚úÖ {f}: {lines} conversaciones")
        else:
            print(f"   ‚ùå {f}: FALTA ‚Äî vuelve a ejecutar esta celda")
            all_ok = False

    if all_ok:
        print(f"\n‚úÖ Dataset completo en Drive: {drive_dataset}")
        print(f"üöÄ Ahora abre FASE_3_TRAINING/okla_finetune_llama3.ipynb")
    else:
        print(f"\n‚ö†Ô∏è Faltan archivos. Ejecuta esta celda otra vez.")

---
## 2️⃣ Cargar Datos Semilla

Carga los 50 vehículos y 5 dealers del mercado dominicano.

In [None]:
# ============================================================
# 2. CARGAR SEED DATA
# ============================================================

# Load the seed vehicles. The encoding is given explicitly so the JSON is
# decoded as UTF-8 regardless of the platform's default locale encoding.
vehicles_path = BASE_DIR / 'seed_vehicles.json'
with open(vehicles_path, encoding='utf-8') as fh:
    vehicles = json.load(fh)

print(f"üöó {len(vehicles)} veh√≠culos cargados")
print(f"\nüìä Distribuci√≥n por marca:")
# Text histogram of the ten most frequent makes (one block character per vehicle).
makes = Counter(v['make'] for v in vehicles)
for make, count in makes.most_common(10):
    bar = '‚ñà' * count
    print(f"   {make:15s} {bar} ({count})")

print(f"\nüí∞ Rango de precios:")
prices = [v['price'] for v in vehicles]
print(f"   Min: RD${min(prices):,.0f}")
print(f"   Max: RD${max(prices):,.0f}")
print(f"   Avg: RD${sum(prices)/len(prices):,.0f}")

# Load the seed dealers, again decoding explicitly as UTF-8.
dealers_path = BASE_DIR / 'seed_dealers.json'
with open(dealers_path, encoding='utf-8') as fh:
    dealers = json.load(fh)

print(f"\nüè™ {len(dealers)} dealers cargados:")
for d in dealers:
    print(f"   ‚Ä¢ {d['name']} (Bot: {d['botName']}) ‚Äî {d['location']}")

print(f"\n‚úÖ Seed data lista")

---
## 3️⃣ Configuración de Generación

Ajusta los parámetros para la generación del dataset.

In [None]:
# ============================================================
# 3. CONFIGURACIÓN
# ============================================================

# -- Tunable parameters --
NUM_CONVERSATIONS = 3000      # Total number of conversations to generate
TRAIN_RATIO = 0.80            # 80% train
EVAL_RATIO = 0.10             # 10% eval
TEST_RATIO = 0.10             # 10% test
SEED = 42                     # Seed for reproducibility

# Conversation-type distribution
SINGLE_TURN_PCT = 0.15        # 15% single-turn conversations
SHORT_MULTI_PCT = 0.60        # 60% short multi-turn (2-4 turns)
LONG_MULTI_PCT = 0.25         # 25% long multi-turn (5-8 turns)


def _require_unit_sum(label, *fractions):
    """Raise ValueError unless *fractions* add up to 1 (within float tolerance)."""
    total = sum(fractions)
    if abs(total - 1.0) > 1e-9:
        raise ValueError(f"{label} deben sumar 1.0 (suma actual: {total:.4f})")


# The test split below is computed as "whatever is left", so TEST_RATIO is
# only reported, never applied. Validating the sums keeps the printed
# percentages truthful when a single ratio is edited.
_require_unit_sum("TRAIN_RATIO + EVAL_RATIO + TEST_RATIO",
                  TRAIN_RATIO, EVAL_RATIO, TEST_RATIO)
_require_unit_sum("SINGLE_TURN_PCT + SHORT_MULTI_PCT + LONG_MULTI_PCT",
                  SINGLE_TURN_PCT, SHORT_MULTI_PCT, LONG_MULTI_PCT)

random.seed(SEED)

# Compute split sizes; the test split absorbs any rounding remainder.
n_train = int(NUM_CONVERSATIONS * TRAIN_RATIO)
n_eval = int(NUM_CONVERSATIONS * EVAL_RATIO)
n_test = NUM_CONVERSATIONS - n_train - n_eval

print(f"‚öôÔ∏è Configuraci√≥n:")
print(f"   Conversaciones: {NUM_CONVERSATIONS:,}")
print(f"   Train: {n_train:,} ({TRAIN_RATIO*100:.0f}%)")
print(f"   Eval:  {n_eval:,} ({EVAL_RATIO*100:.0f}%)")
print(f"   Test:  {n_test:,} ({TEST_RATIO*100:.0f}%)")
print(f"   Seed:  {SEED}")
print(f"\n   Single-turn: {SINGLE_TURN_PCT*100:.0f}%")
print(f"   Multi-turn short (2-4): {SHORT_MULTI_PCT*100:.0f}%")
print(f"   Multi-turn long (5-8): {LONG_MULTI_PCT*100:.0f}%")

---
## 4️⃣ Importar Templates y Generar

Importa `conversation_templates.py` y ejecuta la generación.

In [None]:
# ============================================================
# 4. IMPORTAR TEMPLATES Y GENERAR DATASET
# ============================================================
from conversation_templates import (
    INTENT_REGISTRY,
    MULTI_TURN_CHAINS,
    BODY_TYPE_SLANG,
    PRICE_EXPRESSIONS,
    AFFIRMATIVES,
)

# Summarise the imported templates: the number of intents, the number of
# user templates per intent, and the size of each auxiliary table.
print(f"üìã Templates cargados:")
print(f"   Intents: {len(INTENT_REGISTRY)}")
for intent_name in INTENT_REGISTRY:
    intent_data = INTENT_REGISTRY[intent_name]
    # An intent without a 'user_templates' entry is reported as having 0.
    user_templates = intent_data.get('user_templates', [])
    n_templates = len(user_templates)
    print(f"   ‚Ä¢ {intent_name}: {n_templates} templates")
print(f"   Multi-turn chains: {len(MULTI_TURN_CHAINS)}")
print(f"   Body type slang: {len(BODY_TYPE_SLANG)} entries")
print(f"   Price expressions: {len(PRICE_EXPRESSIONS)} entries")

In [None]:
# ============================================================
# 4b. EJECUTAR GENERACIÓN
# ============================================================
# Importar el generador
from generate_dataset import (
    generate_single_turn,
    generate_multi_turn,
    build_system_prompt,
)

conversations = []
intent_counts = Counter()
turn_counts = []

print(f"üîÑ Generando {NUM_CONVERSATIONS:,} conversaciones...")
print()

for i in range(NUM_CONVERSATIONS):
    # Pick a random dealer and a random inventory of up to 5-15 vehicles.
    # The order of the random calls is kept unchanged so a given seed keeps
    # producing the same dataset.
    dealer = random.choice(dealers)
    dealer_vehicles = random.sample(vehicles, min(random.randint(5, 15), len(vehicles)))
    system_prompt = build_system_prompt(dealer, dealer_vehicles)

    # Choose the conversation type from the configured distribution.
    r = random.random()
    if r < SINGLE_TURN_PCT:
        conv = generate_single_turn(system_prompt, dealer, dealer_vehicles)
        conv_type = 'single'
    elif r < SINGLE_TURN_PCT + SHORT_MULTI_PCT:
        conv = generate_multi_turn(system_prompt, dealer, dealer_vehicles,
                                   min_turns=2, max_turns=4)
        conv_type = 'short_multi'
    else:
        conv = generate_multi_turn(system_prompt, dealer, dealer_vehicles,
                                   min_turns=5, max_turns=8)
        conv_type = 'long_multi'

    conversations.append(conv)

    # Track stats: a "turn" is counted per user message.
    n_turns = sum(1 for m in conv['messages'] if m['role'] == 'user')
    turn_counts.append(n_turns)
    for m in conv['messages']:
        if m['role'] != 'assistant':
            continue
        try:
            parsed = json.loads(m['content'])
        except (json.JSONDecodeError, KeyError):
            # Replies that are not JSON (or lack content) carry no intent to count.
            continue
        # Valid JSON that is not an object (a bare string, number or list) has
        # no .get(); skip it instead of aborting the whole generation run.
        if isinstance(parsed, dict):
            intent_counts[parsed.get('intent', 'unknown')] += 1

    if (i + 1) % 500 == 0:
        print(f"   [{i+1:,}/{NUM_CONVERSATIONS:,}] generadas...")

# Guard the average so NUM_CONVERSATIONS = 0 does not divide by zero.
avg_turns = sum(turn_counts) / len(turn_counts) if turn_counts else 0.0

print(f"\n‚úÖ {len(conversations):,} conversaciones generadas")
print(f"   Turnos promedio: {avg_turns:.1f}")
print(f"   Intents √∫nicos: {len(intent_counts)}")

---
## 5️⃣ Visualizar Estadísticas

Distribución de intents, turnos y longitudes.

In [None]:
# ============================================================
# 5. ESTADÍSTICAS Y VISUALIZACIÓN
# ============================================================
import statistics

print("=" * 60)
print("üìä ESTAD√çSTICAS DEL DATASET")
print("=" * 60)

# -- Intent distribution --
print(f"\nüéØ Distribuci√≥n de Intents ({len(intent_counts)} √∫nicos):")
max_count = max(intent_counts.values()) if intent_counts else 1
# Hoisted: the grand total does not change while iterating.
total_intents = sum(intent_counts.values())
for intent, count in intent_counts.most_common():
    bar_len = int(30 * count / max_count)
    bar = '‚ñà' * bar_len
    pct = 100 * count / total_intents
    # str() keeps the ':25s' format spec valid for a non-string intent (e.g. None).
    print(f"   {str(intent):25s} {bar:30s} {count:4d} ({pct:5.1f}%)")

# -- Turn distribution --
print(f"\nüí¨ Distribuci√≥n de Turnos por Conversaci√≥n:")
turn_dist = Counter(turn_counts)
# Hoisted: the tallest bucket is the scale for every bar.
max_turn_count = max(turn_dist.values(), default=1)
for turns in sorted(turn_dist):
    count = turn_dist[turns]
    bar = '‚ñà' * int(30 * count / max_turn_count)
    print(f"   {turns} turnos: {bar:30s} {count:4d}")

# statistics.stdev needs at least two data points; report 0.0 for a single
# conversation instead of raising StatisticsError.
turn_std = statistics.stdev(turn_counts) if len(turn_counts) > 1 else 0.0

print(f"\n   Media:   {statistics.mean(turn_counts):.1f} turnos")
print(f"   Mediana: {statistics.median(turn_counts):.0f} turnos")
print(f"   Std:     {turn_std:.1f}")

# -- Message lengths (in characters), split by speaker role --
user_lengths = []
bot_lengths = []
for conv in conversations:
    for m in conv['messages']:
        if m['role'] == 'user':
            user_lengths.append(len(m['content']))
        elif m['role'] == 'assistant':
            bot_lengths.append(len(m['content']))

print(f"\nüìè Longitudes de Mensajes (caracteres):")
print(f"   User    ‚Äî Media: {statistics.mean(user_lengths):.0f}, Min: {min(user_lengths)}, Max: {max(user_lengths)}")
print(f"   Bot     ‚Äî Media: {statistics.mean(bot_lengths):.0f}, Min: {min(bot_lengths)}, Max: {max(bot_lengths)}")

---
## 6️⃣ Split y Guardar Dataset

Divide en train/eval/test y guarda como JSONL.

In [None]:
# ============================================================
# 6. SPLIT Y GUARDAR
# ============================================================

# Shuffle in place so each split is drawn from a random ordering.
random.shuffle(conversations)

# Carve three consecutive slices: the first n_train items are train, the
# next n_eval items are eval, and everything after that is test.
eval_end = n_train + n_eval
train_data = conversations[:n_train]
eval_data = conversations[n_train:eval_end]
test_data = conversations[eval_end:]

# Guardar JSONL
def save_jsonl(data, path):
    """Write *data* to *path* as UTF-8 JSON Lines and return the item count.

    Each item is serialised with ``ensure_ascii=False`` (non-ASCII text is
    stored verbatim) and written as one JSON document per line.
    """
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )
    return len(data)

# Imported here as well so this cell does not depend on the statistics cell
# having been executed first.
import statistics

# Output file name -> the slice of conversations it receives.
splits = {
    'okla_train.jsonl': train_data,
    'okla_eval.jsonl': eval_data,
    'okla_test.jsonl': test_data,
}

print("üíæ Guardando dataset...")
for fname, data in splits.items():
    path = OUTPUT_DIR / fname
    n = save_jsonl(data, path)
    size_kb = path.stat().st_size / 1024
    print(f"   ‚úÖ {fname}: {n:,} conversaciones ({size_kb:.0f} KB)")

# Summary of the run, stored next to the splits.
stats = {
    'generated_at': datetime.now().isoformat(),
    'total': len(conversations),
    'train': len(train_data),
    'eval': len(eval_data),
    'test': len(test_data),
    'intents': dict(intent_counts.most_common()),
    'avg_turns': round(statistics.mean(turn_counts), 1),
    'seed': SEED,
}
# ensure_ascii=False can emit non-ASCII characters, so the file is opened
# explicitly as UTF-8 (matching save_jsonl) rather than with the platform default.
with open(OUTPUT_DIR / 'stats.json', 'w', encoding='utf-8') as fh:
    json.dump(stats, fh, indent=2, ensure_ascii=False)
print(f"   ‚úÖ stats.json")

print(f"\nüìÇ Output: {OUTPUT_DIR}")

---
## 7️⃣ Validación

Valida la estructura y calidad del dataset generado.

In [None]:
# ============================================================
# 7. VALIDACIÓN
# ============================================================
from validate_dataset import validate_conversation, check_quality

print("üîç Validando dataset...")
print()

total_valid = 0
total_errors = 0
error_types = Counter()

for split_name, split_file in [('train', 'okla_train.jsonl'), ('eval', 'okla_eval.jsonl'), ('test', 'okla_test.jsonl')]:
    path = OUTPUT_DIR / split_file
    valid = 0
    errors = 0

    # The splits are written as UTF-8 with ensure_ascii=False, so they must be
    # read back as UTF-8 instead of the platform's default encoding.
    with open(path, encoding='utf-8') as fh:
        for line_num, line in enumerate(fh, 1):
            try:
                conv = json.loads(line)
                result = validate_conversation(conv, line_num)
                if result['valid']:
                    valid += 1
                else:
                    errors += 1
                    # Tally each reported error so the summary can group them.
                    for err in result.get('errors', []):
                        error_types[err] += 1
            except json.JSONDecodeError:
                # A line that is not valid JSON counts as one invalid conversation.
                errors += 1
                error_types['invalid_json'] += 1

    total_valid += valid
    total_errors += errors
    status = '‚úÖ' if errors == 0 else '‚ö†Ô∏è'
    print(f"   {status} {split_name}: {valid}/{valid+errors} v√°lidas")

print(f"\n{'='*40}")
total = total_valid + total_errors
pct = 100 * total_valid / total if total > 0 else 0
print(f"üìä Total: {total_valid:,}/{total:,} v√°lidas ({pct:.1f}%)")

if total_errors > 0:
    print(f"\n‚ùå Errores encontrados:")
    for err, count in error_types.most_common():
        print(f"   ‚Ä¢ {err}: {count}")
else:
    print(f"\n‚úÖ DATASET 100% V√ÅLIDO ‚Äî Listo para FASE 3 (Fine-tuning)")

---
## 8️⃣ Subir a Google Drive (para FASE 3)

Si ejecutaste localmente, sube los JSONL a Drive para usarlos en el notebook de fine-tuning.

> 💡 Si ejecutaste en Colab con Drive montado, los archivos ya están accesibles.

In [None]:
# ============================================================
# 8. SUBIR A GOOGLE DRIVE (si ejecutaste localmente)
# ============================================================
import shutil

if not IN_COLAB:
    # Local run: nothing is copied; print where the files are and how to
    # move them to Drive by hand.
    print("üìã Ejecutaste LOCALMENTE. Los archivos est√°n en:")
    print(f"   {OUTPUT_DIR}")
    print()
    print("   Para usar en FASE 3 (fine-tuning con Colab):")
    print("   1. Sube los JSONL a Google Drive > OKLA > dataset/")
    print("   2. O arrastra los archivos a Google Drive desde Finder")
    print()
    for jsonl_path in sorted(OUTPUT_DIR.glob('*.jsonl')):
        size_kb = jsonl_path.stat().st_size / 1024
        print(f"   üìÑ {jsonl_path.name} ({size_kb:.0f} KB)")
else:
    # Colab run with Drive mounted: copy every JSONL split plus stats.json
    # into the Drive dataset folder.
    DRIVE_DEST = Path('/content/drive/MyDrive/OKLA/dataset')
    DRIVE_DEST.mkdir(parents=True, exist_ok=True)

    for jsonl_path in OUTPUT_DIR.glob('*.jsonl'):
        shutil.copy2(jsonl_path, DRIVE_DEST / jsonl_path.name)
        print(f"   ‚úÖ {jsonl_path.name} ‚Üí Drive")
    shutil.copy2(OUTPUT_DIR / 'stats.json', DRIVE_DEST / 'stats.json')
    print(f"\nüìÅ Dataset en Drive: {DRIVE_DEST}")

---
## ✅ Resumen

### Artefactos Generados
| Archivo | Contenido |
|---------|----------|
| `okla_train.jsonl` | Datos de entrenamiento |
| `okla_eval.jsonl` | Datos de evaluación |
| `okla_test.jsonl` | Datos de test |
| `stats.json` | Estadísticas del dataset |

### 🔜 Siguiente: FASE 3
Abre `FASE_3_TRAINING/okla_finetune_llama3.ipynb` → Select Kernel → Colab → GPU T4