# 🔥 AION GPU Worker - Kaggle (REAL LoRA Training & Inference)

**Sistema de Auto-Evolução com Treino e Inferência REAIS**

✅ Keep-Alive ultra-robusto (8 estratégias simultâneas)  
✅ LoRA fine-tuning REAL com TinyLlama 1.1B  
✅ Inferência REAL usando modelos treinados  
✅ Auto-shutdown (8.5h)  
✅ Sistema de preempção (treino pausa quando chega inferência)

**Setup:**
1. Settings > Accelerator > GPU T4 x2
2. Preencher variáveis na Célula 3
3. Run > Run All Cells
4. O worker se registra automaticamente e começa a trabalhar

In [None]:
# ============================================================================
# 🔥 ULTRA-ROBUST KEEP-ALIVE - 8 SIMULTANEOUS STRATEGIES
# ============================================================================

from IPython.display import Javascript, display
import time
import threading

# Inject a browser-side script into the notebook page. Once immediately and
# then every 20 000 ms it:
#   * clicks the "#connect" button inside the <kaggle-reconnect-button> shadow
#     root, when that element exists and the button is not disabled (any error
#     in this step is swallowed by the empty catch);
#   * dispatches five mousemove/mousedown/mouseup triples at random coordinates;
#   * scrolls 50 px down and, 500 ms later, 50 px back up;
#   * sends keydown/keyup events for Shift, Control and Alt;
#   * appends a space to document.title and trims it again;
#   * fires "resize" and "focus" events on window.
# NOTE(review): the selectors reflect the page DOM as the author saw it —
# confirm they still match the current Kaggle UI.
# NOTE(review): every execution of this cell registers another setInterval;
# the script never clears a previous one.
display(Javascript('''
function keepKaggleAlive() {
    console.log("[AION] 🔄 Keep-alive executando...");
    
    try {
        const connectButton = document.querySelector("kaggle-reconnect-button");
        if (connectButton && connectButton.shadowRoot) {
            const button = connectButton.shadowRoot.querySelector("#connect");
            if (button && !button.disabled) button.click();
        }
    } catch(e) {}
    
    for (let i = 0; i < 5; i++) {
        const x = Math.floor(Math.random() * window.innerWidth);
        const y = Math.floor(Math.random() * window.innerHeight);
        document.dispatchEvent(new MouseEvent('mousemove', {view: window, bubbles: true, clientX: x, clientY: y}));
        document.dispatchEvent(new MouseEvent('mousedown', {view: window, bubbles: true, clientX: x, clientY: y}));
        document.dispatchEvent(new MouseEvent('mouseup', {view: window, bubbles: true, clientX: x, clientY: y}));
    }
    
    window.scrollBy({top: 50, behavior: 'smooth'});
    setTimeout(() => window.scrollBy({top: -50, behavior: 'smooth'}), 500);
    
    ['Shift', 'Control', 'Alt'].forEach(key => {
        document.dispatchEvent(new KeyboardEvent('keydown', {key: key, bubbles: true}));
        document.dispatchEvent(new KeyboardEvent('keyup', {key: key, bubbles: true}));
    });
    
    document.title = document.title + ' ';
    document.title = document.title.trim();
    window.dispatchEvent(new Event('resize'));
    window.dispatchEvent(new Event('focus'));
    
    console.log("[AION] ✅ Keep-alive concluído (8 estratégias)");
}

setInterval(keepKaggleAlive, 20000);
keepKaggleAlive();
console.log("[AION] ✅ Keep-Alive ATIVADO! (20s interval)");
'''))

def python_keepalive():
    """Emit a heartbeat line on stdout every 20 seconds, forever.

    Each cycle sleeps 20 s, bumps a tick counter, performs a trivial
    computation and overwrites the current output line (leading ``\\r``) with
    the tick number and wall-clock time. Any exception inside a cycle is
    printed and followed by a 5 s pause before the loop continues, so the
    function never returns on its own.
    """
    import sys
    from datetime import datetime

    ticks = 0
    while True:
        try:
            time.sleep(20)
            ticks += 1
            # Throwaway arithmetic; the result is intentionally discarded.
            sum(range(1000))
            now = datetime.now().strftime("%H:%M:%S")
            sys.stdout.write(f"\r[AION Keep-Alive] 🟢 ATIVO - #{ticks:04d} - {now}    ")
            sys.stdout.flush()
        except Exception as err:
            print(f"\n⚠️ Keep-alive error: {err}")
            time.sleep(5)

# Run the Python-side heartbeat in a daemon thread: the cell returns
# immediately and the thread does not keep the interpreter alive at exit.
threading.Thread(target=python_keepalive, daemon=True).start()

# Banner summarising what was started (user-facing text stays in Portuguese).
print("=" * 70)
print("🎯 KEEP-ALIVE ULTRA-ROBUSTO - MODO MÁXIMO")
print("=" * 70)
print("✅ JavaScript: 8 estratégias a cada 20s")
print("✅ Python: Loop a cada 20s")
print("⚠️  DEIXE ESTA ABA ABERTA!")
print("")

In [None]:
# ============================================================================
# 📦 LIBRARY INSTALLATION
# ============================================================================
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook kernel, not as a plain Python script.

print("📦 Instalando bibliotecas...\n")

# PyTorch wheels from the CUDA 11.8 package index.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Model loading, 4-bit quantisation, LoRA and dataset handling.
!pip install -q transformers accelerate bitsandbytes peft datasets
# HTTP server, CORS, HTTP client and tunnel packages
# (python-dotenv is installed but not referenced in the visible cells).
!pip install -q flask flask-cors requests pyngrok python-dotenv

print("\n✅ Todas as bibliotecas instaladas!\n")

In [None]:
# ============================================================================
# ⚙️ CONFIGURATION - FILL IN YOUR OWN VALUES!
# ============================================================================

import os
from datetime import datetime, timezone, timedelta

# Placeholders that the user must replace before running the notebook.
AION_URL = "COLE_SUA_URL_REPLIT_AQUI"  # base URL; "/api/gpu/..." paths are appended to it later
ACCOUNT_EMAIL = "seu-email@gmail.com"
WORKER_NAME = "Kaggle-Account1-T4"
NGROK_TOKEN = "cole_seu_token_ngrok_aqui"

# Mirror the settings into the process environment; later cells read
# WORKER_NAME, ACCOUNT_EMAIL and NGROK_TOKEN back from os.environ.
os.environ['AION_URL'] = AION_URL
os.environ['WORKER_NAME'] = WORKER_NAME
os.environ['ACCOUNT_EMAIL'] = ACCOUNT_EMAIL
os.environ['NGROK_TOKEN'] = NGROK_TOKEN

# Fixed UTC-03:00 offset used for every session timestamp.
TZ = timezone(timedelta(hours=-3))
SESSION_START = datetime.now(TZ)
# NOTE(review): 9.0 presumably mirrors the platform's session cap — confirm.
MAX_SESSION_HOURS = 9.0
SHUTDOWN_MARGIN_HOURS = 0.5
# 9.0 - 0.5 = 8.5 hours of runtime before the worker shuts itself down.
EFFECTIVE_LIMIT_HOURS = MAX_SESSION_HOURS - SHUTDOWN_MARGIN_HOURS

print("=" * 70)
print("✅ CONFIGURAÇÃO CARREGADA")
print("=" * 70)
print(f"🌐 AION URL: {AION_URL}")
print(f"👤 Email: {ACCOUNT_EMAIL}")
print(f"🏷️  Worker: {WORKER_NAME}")
print(f"🕐 Sessão iniciada: {SESSION_START.strftime('%Y-%m-%d %H:%M:%S %Z')}")
print(f"⏰ Auto-shutdown em: {EFFECTIVE_LIMIT_HOURS}h")
print("")

In [None]:
# ============================================================================
# 🌐 CREATE NGROK TUNNEL
# ============================================================================
# Opens a public HTTPS tunnel to local port 5000 (the port the Flask server
# binds to) and publishes its URL as WORKER_URL / os.environ['WORKER_URL'].

from pyngrok import ngrok, conf
import time

print("🌐 Configurando ngrok...")
conf.get_default().auth_token = os.environ['NGROK_TOKEN']

print("🌐 Criando túnel público...")
public_url = ngrok.connect(5000, bind_tls=True)

# Current pyngrok returns an NgrokTunnel object whose str() is a descriptive
# repr ('NgrokTunnel: "https://..." -> "http://localhost:5000"'), not the URL.
# Read the `public_url` attribute instead; fall back to str() for old pyngrok
# releases where connect() returned the URL string directly.
WORKER_URL = getattr(public_url, "public_url", None) or str(public_url)
# Upgrade only the scheme prefix, never an "http://" occurring later in the text.
if WORKER_URL.startswith("http://"):
    WORKER_URL = "https://" + WORKER_URL[len("http://"):]

print("=" * 70)
print("✅ TÚNEL NGROK CRIADO")
print("=" * 70)
print(f"🌍 URL pública: {WORKER_URL}")
print("")

os.environ['WORKER_URL'] = WORKER_URL

In [None]:
# ============================================================================
# 🤖 REGISTER WORKER WITH AION
# ============================================================================
# POSTs this worker's metadata to {AION_URL}/api/gpu/register and prints the
# outcome. Failures are reported but do not stop the notebook.

import requests
import json

print("🤖 Registrando worker no AION...")

# Registration payload sent as JSON.
# NOTE(review): "type" is "colab" while "platform" is "kaggle" — confirm this
# is what the AION API expects.
worker_data = {
    "name": os.environ['WORKER_NAME'],
    "url": os.environ['WORKER_URL'],
    "type": "colab",
    "gpuType": "T4",
    "platform": "kaggle",
    "accountEmail": os.environ['ACCOUNT_EMAIL'],
    "maxSessionHours": EFFECTIVE_LIMIT_HOURS,
    "sessionStart": SESSION_START.isoformat(),
    "capabilities": ["training", "inference"],
}

try:
    response = requests.post(
        f"{AION_URL}/api/gpu/register",
        json=worker_data,
        headers={"Content-Type": "application/json"},
        timeout=10
    )
    
    # Only an exact 200 is treated as success; any other status is reported
    # together with the raw response body.
    if response.status_code == 200:
        result = response.json()
        print("=" * 70)
        print("✅ WORKER REGISTRADO COM SUCESSO!")
        print("=" * 70)
        print(f"🆔 ID: {result.get('id', 'N/A')}")
        print(f"🏷️  Nome: {result.get('name', 'N/A')}")
        print(f"🌍 URL: {result.get('url', 'N/A')}")
        print(f"⚡ Status: {result.get('status', 'N/A')}")
        print("")
    else:
        print(f"⚠️  Erro ao registrar: {response.status_code}")
        print(f"   Resposta: {response.text}")
        
except Exception as e:
    # Covers connection errors, timeouts and a non-JSON success body alike.
    print(f"❌ Erro na comunicação com AION: {str(e)}")
    print("   Verifique se a URL do AION está correta!")

In [None]:
# ============================================================================
# 🚀 SERVIDOR GPU WORKER - TREINO E INFERÊNCIA REAIS
# ============================================================================

from flask import Flask, request, jsonify
from flask_cors import CORS
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from datasets import Dataset
from threading import Thread, Lock
import signal
import sys
import json
import os

# Flask application with CORS enabled through flask_cors.
app = Flask(__name__)
CORS(app)

# Detect the GPU; fall back to CPU when CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is in bytes; convert to GiB for display.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"✅ GPU detectada: {gpu_name} ({gpu_memory:.1f} GB)")
else:
    print("⚠️  GPU não encontrada! Usando CPU")
    gpu_name = "CPU"
    gpu_memory = 0

print("")

# In-memory worker state shared by the request handlers and the training thread.
worker_state = {
    "status": "idle",          # values written in this file: idle / training / inferencing / error
    "current_job": None,       # job id of the running training job, if any
    "jobs_completed": 0,
    "training_paused": False,  # flag set by /inference while a training job is marked as running
}

# Module-level lock and (initially empty) cache shared by the handlers.
training_lock = Lock()
model_cache = {}

# Base model identifier and on-disk locations under /kaggle/working.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
CACHE_DIR = "/kaggle/working/cache/hf"
MODELS_DIR = "/kaggle/working/aion/models"
STATE_FILE = "/kaggle/working/aion/state.json"

# Creating MODELS_DIR also creates /kaggle/working/aion, the parent of STATE_FILE.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

# Load the base model (once per process)
print("📦 Carregando modelo base TinyLlama...")

# bfloat16 compute needs hardware support; when CUDA is present but reports no
# bf16 support (e.g. pre-Ampere GPUs) fall back to float16. Without CUDA the
# original bfloat16 choice is kept.
COMPUTE_DTYPE = (
    torch.float16
    if torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
    else torch.bfloat16
)

# 4-bit NF4 quantisation with double quantisation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE
)

# NOTE(review): trust_remote_code=True allows code from the model repository
# to run locally — confirm it is actually required for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir=CACHE_DIR)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

print("✅ Modelo base carregado!\n")

# ═════════════════════════════════════════════════════════════
# FUNÇÕES AUXILIARES
# ═════════════════════════════════════════════════════════════

def load_state():
    """Return the persisted worker state (which adapter is active).

    Reads STATE_FILE as JSON when it exists; otherwise returns a fresh dict
    with ``active_adapter`` and ``last_updated`` both set to None.
    """
    if not os.path.exists(STATE_FILE):
        return {"active_adapter": None, "last_updated": None}

    with open(STATE_FILE, 'r') as handle:
        return json.load(handle)

def save_state(state):
    """Persist the worker state to STATE_FILE as JSON.

    The data is written to a sibling temporary file first and then moved over
    STATE_FILE with os.replace. Opening STATE_FILE directly in 'w' mode would
    truncate it before the new content is written, so a concurrent
    load_state() call (from a request handler while the training thread saves)
    could read an empty or partial document.

    Args:
        state: JSON-serialisable dict to store.
    """
    tmp_path = f"{STATE_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(state, f)
    # Atomic rename: readers see either the old file or the complete new one.
    os.replace(tmp_path, STATE_FILE)

def download_dataset(url):
    """Fetch a JSONL dataset over HTTP and return it as a list of parsed rows.

    Performs a GET with a 30 s timeout, raises on a non-success HTTP status,
    then parses every non-blank line of the body with json.loads.
    """
    import requests

    reply = requests.get(url, timeout=30)
    reply.raise_for_status()

    records = []
    for raw_line in reply.text.strip().split('\n'):
        # Skip blank lines between records.
        if raw_line.strip():
            records.append(json.loads(raw_line))
    return records

def format_instruction(example):
    """Render one dataset row as a single instruction-tuning string.

    Reads the optional keys ``instruction``, ``input`` and ``output`` (each
    defaulting to an empty string). A truthy ``input`` is placed on its own
    line after the instruction inside the user turn.

    NOTE(review): the turn is closed with the literal "<|end|>" — confirm this
    matches the end-of-turn marker the base model's chat template expects.
    """
    instruction = example.get('instruction', '')
    extra_context = example.get('input', '')
    answer = example.get('output', '')

    user_turn = f"{instruction}\n{extra_context}" if extra_context else instruction
    return f"<|user|>\n{user_turn}<|assistant|>\n{answer}<|end|>"

# ═════════════════════════════════════════════════════════════
# ENDPOINT: /health
# ═════════════════════════════════════════════════════════════

@app.route('/health', methods=['GET'])
def health():
    """Health check: report device info, session timing and worker counters."""
    elapsed = (datetime.now(TZ) - SESSION_START).total_seconds()
    budget = EFFECTIVE_LIMIT_HOURS * 3600
    shutdown_at = SESSION_START + timedelta(hours=EFFECTIVE_LIMIT_HOURS)

    payload = {
        "status": "healthy",
        "worker_name": os.environ['WORKER_NAME'],
        "device": device,
        "gpu_name": gpu_name,
        "runtime_seconds": elapsed,
        # Never report a negative remaining time once the limit has passed.
        "remaining_seconds": max(0, budget - elapsed),
        "session_start": SESSION_START.isoformat(),
        "auto_shutdown_at": shutdown_at.isoformat(),
        "worker_status": worker_state["status"],
        "jobs_completed": worker_state["jobs_completed"],
    }
    return jsonify(payload)

# ═════════════════════════════════════════════════════════════
# ENDPOINT: /inference (REAL - USING TRAINED LORA)
# ═════════════════════════════════════════════════════════════

@app.route('/inference', methods=['POST'])
def inference():
    """Generate text with the active LoRA adapter, or the base model if none.

    Request JSON object:
        prompt      -- text to complete (default '')
        max_tokens  -- maximum number of new tokens (default 256)
        temperature -- sampling temperature (default 0.7); a value <= 0 or a
                       non-number selects greedy decoding instead of sampling

    Responses:
        200 -- {"status": "success", "response", "model", "tokens_generated"}
        400 -- body is not a JSON object
        500 -- any failure while loading the adapter or generating

    NOTE(review): "response" is the decoded full output sequence; for a causal
    LM this presumably includes the prompt text — confirm callers expect that.
    NOTE(review): the "preemption" below only flips flags in worker_state;
    nothing in this handler actually suspends the training thread.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"status": "error", "message": "Request body must be a JSON object"}), 400

    prompt = data.get('prompt', '')
    max_tokens = data.get('max_tokens', 256)
    temperature = data.get('temperature', 0.7)

    # Flag a running training job as paused while this request is served.
    if worker_state["status"] == "training":
        print("\n🔴 PREEMPÇÃO: Pausando treino para responder inferência")
        worker_state["training_paused"] = True

    worker_state["status"] = "inferencing"
    # Status to publish when the request ends; stays "error" unless generation succeeds.
    final_status = "error"

    try:
        # Load the active adapter (if any).
        state = load_state()
        adapter_path = state.get("active_adapter")

        if adapter_path and os.path.exists(adapter_path):
            print(f"📦 Usando adapter treinado: {adapter_path}")
            # Wrap the base model once per adapter version instead of on every
            # request. last_updated is part of the key so that a retrained
            # adapter saved to the same path is reloaded.
            cache_key = (adapter_path, state.get("last_updated"))
            model = model_cache.get(cache_key)
            if model is None:
                model = PeftModel.from_pretrained(base_model, adapter_path)
                # Keep only the most recent adapter wrapper.
                model_cache.clear()
                model_cache[cache_key] = model
        else:
            print("📦 Usando modelo base (nenhum adapter treinado ainda)")
            model = base_model

        # Tokenize the prompt (truncated to 512 tokens).
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Sampling requires a strictly positive temperature; otherwise decode greedily.
        if isinstance(temperature, (int, float)) and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
        else:
            generation_kwargs["do_sample"] = False

        # Generate the response.
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        worker_state["jobs_completed"] += 1
        final_status = "idle"

        return jsonify({
            "status": "success",
            "response": response_text,
            "model": adapter_path or "base",
            "tokens_generated": len(outputs[0]) - len(inputs['input_ids'][0]),
        })
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    finally:
        # Runs on success and on failure, so the worker never stays stuck in
        # "inferencing" or with training_paused left set.
        torch.cuda.empty_cache()
        if worker_state["training_paused"]:
            worker_state["training_paused"] = False
            # Only go back to "training" if the job is still running; the
            # training thread clears current_job when it finishes or fails.
            if worker_state["current_job"] is not None:
                print("🟢 Retomando treino após inferência")
                final_status = "training"
        worker_state["status"] = final_status

# ═════════════════════════════════════════════════════════════
# ENDPOINT: /train (REAL - LORA FINE-TUNING)
# ═════════════════════════════════════════════════════════════

@app.route('/train', methods=['POST'])
def train():
    """Start a LoRA fine-tuning job in a background thread.

    Request JSON object:
        dataset  -- URL of a JSONL dataset (required)
        jobId    -- job identifier (default 'unknown')
        lora     -- optional dict: 'r' (16), 'alpha' (32), 'dropout' (0.05)
        training -- optional dict: 'epochs' (3), 'batchSize' (2),
                    'learningRate' (2e-4)

    Responses:
        200 -- {"status": "success", "message": "Training started", "job_id"}
        400 -- body is not a JSON object, or 'dataset' is missing
        409 -- another training job is still running
        500 -- unexpected failure while starting the job

    The handler returns as soon as the thread is started; progress and errors
    of the job itself are only printed and reflected in worker_state.

    NOTE(review): prepare_model_for_kbit_training/get_peft_model are applied
    to the shared module-level base_model; whether a previous job's LoRA
    layers persist on it for later jobs and for "base" inference should be
    verified against the PEFT documentation.
    """
    try:
        # Local imports keep this block self-contained.
        import re
        from transformers import DataCollatorForLanguageModeling

        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"status": "error", "message": "Request body must be a JSON object"}), 400

        dataset_url = data.get('dataset', '')
        job_id = data.get('jobId', 'unknown')
        lora_config_data = data.get('lora') or {}
        training_args_data = data.get('training') or {}

        if not dataset_url:
            return jsonify({"status": "error", "message": "Missing 'dataset' URL"}), 400

        # jobId comes from the request and ends up in a filesystem path:
        # replace anything that is not a plain filename character.
        safe_job_id = re.sub(r"[^A-Za-z0-9_.-]", "_", str(job_id))

        # Training thread (does not block the server).
        def train_thread():
            try:
                # Download dataset
                print("📥 Baixando dataset...")
                raw_data = download_dataset(dataset_url)
                print(f"✅ {len(raw_data)} exemplos baixados")
                if not raw_data:
                    raise ValueError("Dataset is empty")

                # Format rows into single training strings.
                formatted_data = [{"text": format_instruction(example)} for example in raw_data]
                dataset = Dataset.from_list(formatted_data)

                # Tokenize (truncated to 512 tokens, no padding here).
                def tokenize_function(examples):
                    return tokenizer(examples["text"], truncation=True, max_length=512)

                tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

                # Prepare the quantised model for training.
                model = prepare_model_for_kbit_training(base_model)

                # LoRA config
                lora_config = LoraConfig(
                    r=lora_config_data.get('r', 16),
                    lora_alpha=lora_config_data.get('alpha', 32),
                    lora_dropout=lora_config_data.get('dropout', 0.05),
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                    bias="none",
                    task_type="CAUSAL_LM",
                )

                model = get_peft_model(model, lora_config)

                print("✅ Modelo LoRA preparado")

                # bf16 only where the GPU supports it; otherwise use fp16
                # mixed precision. Requesting bf16 on an unsupported GPU makes
                # TrainingArguments raise.
                cuda_available = torch.cuda.is_available()
                use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
                use_fp16 = cuda_available and not use_bf16

                # Training arguments
                output_dir = f"{MODELS_DIR}/job_{safe_job_id}"

                training_args = TrainingArguments(
                    output_dir=output_dir,
                    num_train_epochs=training_args_data.get('epochs', 3),
                    per_device_train_batch_size=training_args_data.get('batchSize', 2),
                    gradient_accumulation_steps=8,
                    learning_rate=training_args_data.get('learningRate', 2e-4),
                    bf16=use_bf16,
                    fp16=use_fp16,
                    logging_steps=10,
                    save_steps=100,
                    save_total_limit=2,
                    optim="paged_adamw_8bit",
                    warmup_steps=10,
                    # No external experiment trackers: they may prompt for
                    # credentials, which would hang a background thread.
                    report_to="none",
                )

                # The tokenised rows have variable length and no "labels"
                # column. The causal-LM collator pads each batch and derives
                # labels from input_ids, which the Trainer needs for the loss.
                data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

                # Trainer
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_dataset,
                    data_collator=data_collator,
                )

                print("🚀 Iniciando treino LoRA...")

                # TRAIN!
                trainer.train()

                print("✅ Treino concluído!")

                # Save adapter
                adapter_path = f"{output_dir}/adapter"
                model.save_pretrained(adapter_path)

                print(f"💾 Adapter salvo em: {adapter_path}")

                # Mark the new adapter as the active one.
                state = load_state()
                state["active_adapter"] = adapter_path
                state["last_updated"] = datetime.now(TZ).isoformat()
                save_state(state)

                worker_state["status"] = "idle"
                worker_state["current_job"] = None
                worker_state["jobs_completed"] += 1

                print(f"\n🎉 JOB {job_id} CONCLUÍDO COM SUCESSO!")

            except Exception as e:
                print(f"❌ Erro no treino: {str(e)}")
                worker_state["status"] = "error"
                worker_state["current_job"] = None
            finally:
                # Free cached GPU memory and allow the next job to start.
                torch.cuda.empty_cache()
                training_lock.release()

        # Only one training job at a time: concurrent jobs would modify the
        # same shared base model. The lock is released by the training thread.
        if not training_lock.acquire(blocking=False):
            return jsonify({
                "status": "error",
                "message": "A training job is already running",
                "current_job": worker_state["current_job"],
            }), 409

        print(f"\n🏋️ Iniciando treino REAL - Job {job_id}")

        worker_state["status"] = "training"
        worker_state["current_job"] = job_id

        try:
            Thread(target=train_thread, daemon=True).start()
        except Exception:
            # The thread never ran, so undo the bookkeeping here.
            worker_state["status"] = "error"
            worker_state["current_job"] = None
            training_lock.release()
            raise

        return jsonify({
            "status": "success",
            "message": "Training started",
            "job_id": job_id,
        })
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500

# ═════════════════════════════════════════════════════════════
# ENDPOINT: /status
# ═════════════════════════════════════════════════════════════

@app.route('/status', methods=['GET'])
def status():
    """Return the in-memory worker state plus the persisted adapter info."""
    persisted = load_state()

    # Copy so the shared worker_state dict itself is not modified.
    payload = dict(worker_state)
    payload["active_adapter"] = persisted.get("active_adapter")
    payload["last_updated"] = persisted.get("last_updated")
    return jsonify(payload)

# ═════════════════════════════════════════════════════════════
# AUTO-SHUTDOWN MONITOR
# ═════════════════════════════════════════════════════════════

def auto_shutdown_monitor():
    """Terminate the process once the session time budget is used up.

    Checks once a minute whether EFFECTIVE_LIMIT_HOURS have elapsed since
    SESSION_START. When the limit is reached it prints a notice, sends a
    best-effort shutdown notification to {AION_URL}/api/gpu/shutdown and then
    sends SIGTERM to the current process.
    """
    while True:
        runtime_seconds = (datetime.now(TZ) - SESSION_START).total_seconds()
        remaining_seconds = (EFFECTIVE_LIMIT_HOURS * 3600) - runtime_seconds

        if remaining_seconds <= 0:
            print("\n" + "=" * 70)
            print("⏰ AUTO-SHUTDOWN: Limite de tempo atingido!")
            print("=" * 70)
            print(f"Runtime: {runtime_seconds/3600:.2f}h")
            print(f"Limite: {EFFECTIVE_LIMIT_HOURS}h")
            print("\n🛑 Desligando worker...")

            try:
                requests.post(
                    f"{AION_URL}/api/gpu/shutdown",
                    json={"worker_name": os.environ['WORKER_NAME']},
                    timeout=5
                )
            except Exception as e:
                # Best effort: a failed notification must not prevent the
                # shutdown, but it is reported instead of being hidden. A bare
                # `except:` would also have swallowed SystemExit and
                # KeyboardInterrupt.
                print(f"⚠️  Falha ao notificar o AION sobre o shutdown: {e}")

            os.kill(os.getpid(), signal.SIGTERM)
            break

        time.sleep(60)

# Run the shutdown monitor in a daemon thread alongside the server.
Thread(target=auto_shutdown_monitor, daemon=True).start()

# ═════════════════════════════════════════════════════════════
# START SERVER
# ═════════════════════════════════════════════════════════════

# Startup banner (user-facing text stays in Portuguese).
print("\n" + "=" * 70)
print("🚀 SERVIDOR GPU WORKER COM TREINO E INFERÊNCIA REAIS")
print("=" * 70)
print(f"🌍 URL: {os.environ['WORKER_URL']}")
print(f"🎮 Device: {device} ({gpu_name})")
print(f"🤖 Modelo base: {BASE_MODEL}")
print(f"⏰ Auto-shutdown: {EFFECTIVE_LIMIT_HOURS}h")
print("")
print("✅ Endpoints REAIS:")
print("   /health - Health check")
print("   /train - LoRA fine-tuning REAL")
print("   /inference - Inferência REAL com modelos treinados")
print("   /status - Status do worker")
print("")
print("⚠️  NÃO FECHE ESTA ABA!")
print("=" * 70)
print("")

# Blocking call: serves on all interfaces, port 5000 (the port the ngrok
# tunnel forwards to), with the debugger and auto-reloader disabled.
app.run(host='0.0.0.0', port=5000, debug=False, use_reloader=False)