In [None]:
# Log extractor for failed GitHub Actions (GHA) steps

import requests
import json
import os
from datetime import datetime, timedelta, timezone
import time
from dotenv import load_dotenv
import re
import base64

# GitHub configuration: access token read from the environment and the
# repository (owner / name) whose failed workflow runs are analyzed.
load_dotenv()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
OWNER, REPO = "vercel", "next.js"

def sanitize_filename(name, max_length=100):
    """Return *name* reduced to characters that are safe in a file name.

    Every character other than ASCII letters, digits, space, '-' and '_'
    is removed. Surrounding spaces are stripped, the result is cut to at
    most *max_length* characters, and any space left dangling at the end
    by the cut is removed as well (a trailing space in a file or directory
    name is troublesome on some file systems).

    Args:
        name: Text to turn into a file-name fragment.
        max_length: Maximum number of characters to keep.

    Returns:
        The sanitized string; it may be empty.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9 \-_]', '', name).strip()
    # Truncation can land right after a space, so strip the tail again.
    return cleaned[:max_length].rstrip()

# Step-start patterns and failure detection.
#
# A step header is a '##[section]' or '##[group]' tag, optionally followed by
# 'Starting:' or the word 'Run', then the (optionally quoted) step name up to
# the end of the line; the name is capture group 1. The '\b' after 'Run'
# keeps a header such as '##[group]Runner Image' from being read as the
# prefix 'Run' followed by the name 'ner Image'.
STEP_MARKERS = [
    re.compile(
        r"##\[(?:section|group)\]\s*(?:Starting:|Run\b)?\s*['\"]?(.*?)['\"]?$",
        re.I,
    )
]

# Captures the numeric code in 'exit code <n>' / 'completed with exit code <n>'.
EXIT_CODE_RE = re.compile(r"(?:exit code|completed with exit code)\s+(\d+)", re.I)

def extract_steps(logs: str):
    """
    Split the full log of a job into individual steps and flag failures.

    A step opens on a line matched by any pattern in STEP_MARKERS: the
    captured group, stripped, becomes the step name and the marker line is
    the first line of the step's log. A marker found on the line right
    after the one that opened the current step does not open a new step;
    it is kept as an ordinary log line. Lines that come before the first
    marker are not attached to any step.

    A step is reported as failed when one of the lines following its
    opening line:
      • contains '##[error]', or
      • matches EXIT_CODE_RE ('exit code <n>' / 'completed with exit
        code <n>') with n different from 0.

    Returns a list of dicts, in log order, each with the keys 'step_name',
    'logs' (the step's lines joined by newlines) and 'failed'.
    """

    def nonzero_exit(text):
        # True when the line reports an exit code other than 0.
        found = EXIT_CODE_RE.search(text)
        return found is not None and int(found.group(1)) != 0

    finished = []
    open_name = None
    open_lines = []
    open_failed = False
    # Start far enough back that the very first marker is never "adjacent".
    opened_at = -10

    for line_no, text in enumerate(logs.splitlines()):
        for marker in STEP_MARKERS:
            hit = marker.search(text)
            if hit is None:
                continue

            if open_name is not None:
                # A marker immediately after a step start is not a new step.
                if line_no - opened_at <= 1:
                    continue
                # Close the step that was open until now.
                finished.append({
                    "step_name": open_name,
                    "logs": "\n".join(open_lines),
                    "failed": open_failed,
                })

            # Open the new step, seeded with its marker line.
            open_name = hit.group(1).strip()
            open_lines = [text]
            open_failed = False
            opened_at = line_no
            break
        else:
            # No marker opened a step on this line: it is ordinary content.
            open_lines.append(text)
            if "##[error]" in text or nonzero_exit(text):
                open_failed = True

    # Close the last step, if any was ever opened.
    if open_name is not None:
        finished.append({
            "step_name": open_name,
            "logs": "\n".join(open_lines),
            "failed": open_failed,
        })

    return finished

class GitHubWorkflowLogger:
    """Collect diagnostics for failed GitHub Actions workflow runs.

    For every failed run that is processed, the following is written under
    ``failure_workflows/<owner>_<repo>/<run_id>_<run name>/``: the run JSON,
    the jobs JSON, the workflow YAML file and, for each failed job, its
    JSON, its raw log and the log of every step that ``extract_steps``
    flags as failed.
    """

    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the whole analysis indefinitely.
    REQUEST_TIMEOUT = 30
    # Page size requested from paginated list endpoints.
    PAGE_SIZE = 100

    def __init__(self, token):
        """Store the request headers (token auth) and the API base URL."""
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

    def _get(self, url, params=None):
        """GET *url* with the auth headers and a timeout; raise on HTTP errors.

        Raises:
            requests.exceptions.RequestException: on connection problems,
                timeouts or a non-success status code.
        """
        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response

    def get_workflow_runs(self, owner, repo, max_runs=None):
        """Return the failed workflow runs of a repository, page by page.

        Args:
            owner: Repository owner.
            repo: Repository name.
            max_runs: Stop as soon as this many runs were collected; a
                falsy value (None, 0) means no limit.

        Returns:
            A list of run dicts. On a request error the runs collected so
            far are returned.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs"
        all_runs = []
        page = 1

        while True:
            params = {'status': 'failure', 'per_page': self.PAGE_SIZE, 'page': page}
            try:
                runs = self._get(url, params=params).json().get('workflow_runs', [])
            except requests.exceptions.RequestException as e:
                print(f"Error obteniendo workflow runs (página {page}): {e}")
                break

            if not runs:
                break

            for run in runs:
                all_runs.append(run)
                if max_runs and len(all_runs) >= max_runs:
                    return all_runs

            print(f"Página {page}: {len(runs)} workflows fallidos encontrados.")
            page += 1

        return all_runs

    def get_workflow_run_json(self, owner, repo, run_id):
        """Return the JSON document of one workflow run, or None on error."""
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs/{run_id}"
        try:
            return self._get(url).json()
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo detalles del workflow run {run_id}: {e}")
            return None

    def get_job_logs(self, owner, repo, job_id):
        """Return the plain-text log of a job, or None on error."""
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/jobs/{job_id}/logs"
        try:
            return self._get(url).text
        except requests.exceptions.RequestException as e:
            # Report the failure like the sibling methods do instead of
            # dropping it silently.
            print(f"Error obteniendo logs del job {job_id}: {e}")
            return None

    def get_jobs_json(self, owner, repo, run_id):
        """Return the jobs document of a run with all pages merged.

        The endpoint is paginated, so a single request misses jobs of runs
        with large matrices. Every page is fetched and its 'jobs' entries
        are appended to the first page's document, which keeps the shape
        callers expect (a dict with a 'jobs' list).

        Returns:
            The merged dict, or None when the first page cannot be
            fetched. If a later page fails, the jobs gathered so far are
            returned.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs/{run_id}/jobs"
        merged = None
        page = 1
        try:
            while True:
                params = {'per_page': self.PAGE_SIZE, 'page': page}
                data = self._get(url, params=params).json()
                jobs = data.get('jobs', [])
                if merged is None:
                    merged = data
                else:
                    merged['jobs'].extend(jobs)
                # A short (or empty) page is the last one.
                if len(jobs) < self.PAGE_SIZE:
                    break
                page += 1
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo jobs JSON para run {run_id}: {e}")
        return merged

    def get_file_content(self, owner, repo, path, ref=None):
        """Return the content of a repository file as text.

        Uses the /contents endpoint. When the response declares base64
        encoding the content is decoded to UTF-8 text (undecodable bytes
        are replaced); otherwise the 'content' field is returned as is.

        Args:
            owner: Repository owner.
            repo: Repository name.
            path: File path inside the repository.
            ref: Optional commit SHA / branch / tag to read the file from.

        Returns:
            The file text, or None on a request error.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/contents/{path}"
        params = {'ref': ref} if ref else {}
        try:
            data = self._get(url, params=params).json()
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo contenido de {path} @ {ref}: {e}")
            return None

        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8", errors="replace")
        return data.get("content", "")

    def create_output_structure(self, owner, repo, run_id, run_name):
        """Create the output directory of a run (with 'jobs' and 'steps'
        subdirectories) and return its path."""
        base = os.path.join('failure_workflows', f"{owner}_{repo}", f"{run_id}_{sanitize_filename(run_name)}")
        os.makedirs(os.path.join(base, 'jobs'), exist_ok=True)
        os.makedirs(os.path.join(base, 'steps'), exist_ok=True)
        return base

    def save_json(self, data, path):
        """Write *data* to *path* as indented JSON (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def save_text(self, content, path):
        """Write *content* to *path* as UTF-8 text."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(content)

    # Main flow
    def analyze_repository(self, owner, repo, max_runs=None):
        """Download and store the artifacts of the repository's failed runs.

        Args:
            owner: Repository owner.
            repo: Repository name.
            max_runs: Upper bound on the number of failed runs to fetch;
                a falsy value means no limit.
        """
        print(f"Analizando repositorio: {owner}/{repo}")
        runs = self.get_workflow_runs(owner, repo, max_runs=max_runs)

        if not runs:
            print("No se encontraron workflows fallidos.")
            return

        # Runs created before this instant are skipped.
        # NOTE(review): 90 days presumably mirrors log retention — confirm.
        ninety_days_ago = datetime.now(timezone.utc) - timedelta(days=90)

        for run in runs:
            self._process_run(owner, repo, run, ninety_days_ago)

        print("\nAnálisis completado.")

    def _process_run(self, owner, repo, run, cutoff):
        """Store run JSON, jobs JSON, workflow YAML and failed-job data of one run."""
        run_id = run['id']
        # 'name' may be null in the API response; fall back to an empty
        # string so sanitize_filename() does not fail on None.
        run_name = run['name'] or ''

        created_at = datetime.strptime(run['created_at'], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
        if created_at < cutoff:
            print(f"  Ignorado (más de 90 días): {run_name} (ID: {run_id})")
            return

        print(f"\nProcesando run {run_name} (ID: {run_id})")

        run_detail = self.get_workflow_run_json(owner, repo, run_id)
        if not run_detail:
            return

        run_dir = self.create_output_structure(owner, repo, run_id, run_name)
        self.save_json(run_detail, os.path.join(run_dir, 'workflow_run.json'))

        jobs_data = self.get_jobs_json(owner, repo, run_id)
        if not jobs_data:
            return
        self.save_json(jobs_data, os.path.join(run_dir, 'jobs.json'))

        # Download and store the workflow YAML as of the run's head commit.
        workflow_path = run_detail.get("path")
        head_sha = run_detail.get("head_sha")
        if workflow_path:
            yaml_content = self.get_file_content(owner, repo, workflow_path, ref=head_sha)
            if yaml_content:
                yaml_filename = os.path.basename(workflow_path)
                self.save_text(yaml_content, os.path.join(run_dir, yaml_filename))
                print(f"  YAML guardado: {yaml_filename}")

        # Only jobs whose conclusion is 'failure' are processed.
        for job in jobs_data.get('jobs', []):
            if job.get('conclusion') == 'failure':
                self._process_failed_job(owner, repo, run_dir, job)

    def _process_failed_job(self, owner, repo, run_dir, job):
        """Store the JSON, the raw log and the failed-step logs of one failed job."""
        job_id = job['id']
        job_name = job['name']
        print(f"  Job fallido: {job_name} (ID: {job_id})")

        job_stem = f"{job_id}_{sanitize_filename(job_name)}"
        jobs_dir = os.path.join(run_dir, 'jobs')
        self.save_json(job, os.path.join(jobs_dir, f"{job_stem}.json"))

        job_logs = self.get_job_logs(owner, repo, job_id)
        if not job_logs:
            return
        self.save_text(job_logs, os.path.join(jobs_dir, f"{job_stem}.txt"))

        step_dir = os.path.join(run_dir, 'steps')
        for step in extract_steps(job_logs):
            if not step['failed']:
                continue  # only failed steps are stored

            step_filename = f"{job_id}_{sanitize_filename(step['step_name'])}"
            self.save_text(step['logs'], os.path.join(step_dir, f"{step_filename}.txt"))

        # Pause between log downloads to go easy on the API.
        time.sleep(1)

# Execution: run the analysis only when the configuration is complete.
if GITHUB_TOKEN and OWNER and REPO:
    logger = GitHubWorkflowLogger(GITHUB_TOKEN)
    # max_runs caps how many failed runs are fetched; omit it for no limit.
    logger.analyze_repository(OWNER, REPO, max_runs=10)
elif not GITHUB_TOKEN:
    print("ERROR: configurar token de GitHub en la variable GITHUB_TOKEN")
else:
    print("ERROR: configurar OWNER y REPO")


Analizando repositorio: vercel/next.js

Procesando run Generate Pull Request Stats (ID: 16040602554)
  YAML guardado: pull_request_stats.yml
  Job fallido: PR Stats (ID: 45261240385)

Procesando run build-and-test (ID: 16039435259)
  YAML guardado: build_and_test.yml

Procesando run test-e2e-deploy-release (ID: 16038663582)
  YAML guardado: test_e2e_deploy_release.yml
  Job fallido: test deploy (5/6) / build (ID: 45256145353)
  Job fallido: test deploy (2/6) / build (ID: 45256145360)
  Job fallido: test deploy (4/6) / build (ID: 45256145361)
  Job fallido: test deploy (3/6) / build (ID: 45256145362)
  Job fallido: test deploy (1/6) / build (ID: 45256145364)
  Job fallido: test deploy (6/6) / build (ID: 45256145365)

Procesando run Test examples (ID: 16038559798)
  YAML guardado: test_examples.yml
  Job fallido: Test Examples (20) (ID: 45255522839)
  Job fallido: Test Examples (18) (ID: 45255522848)

Procesando run Update Font Data (ID: 16038505226)
  YAML guardado: update_fonts_data.ym