In [None]:
# Extracts job and step logs from failed GitHub Actions (GHA) workflow runs

import requests
import json
import os
from datetime import datetime
import time
from dotenv import load_dotenv
import re

# Configuration
load_dotenv()  # python-dotenv: pull settings from a local .env file, if one exists
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')  # GitHub API token (None when unset)
OWNER, REPO = "vercel", "next.js"  # repository owner and repository name

def sanitize_filename(name, max_length=100):
    """Return *name* reduced to characters that are safe in a file name.

    Only ASCII letters, digits, spaces, hyphens and underscores are kept.
    The result is stripped of surrounding whitespace and cut to at most
    ``max_length`` characters.

    Args:
        name: Text to sanitize. ``None`` is accepted and yields ``''``
            (GitHub reports a workflow run's ``name`` as nullable).
        max_length: Maximum length of the returned string.

    Returns:
        The sanitized, possibly empty, string.
    """
    if name is None:
        return ''
    cleaned = re.sub(r'[^a-zA-Z0-9 \-_]', '', name)
    # Strip again after truncating: the cut may land right after a space, and
    # a trailing space is troublesome when the result names a directory.
    return cleaned.strip()[:max_length].rstrip()

def extract_error_lines(logs):
    """Return only the lines of *logs* that look like error messages.

    A line is kept when it contains ``##[error]``, ``Error:`` or ``error:``.
    The kept lines are joined with newlines; when nothing matches, a fixed
    placeholder message is returned instead.
    """
    markers = ('##[error]', 'Error:', 'error:')
    matches = [
        entry
        for entry in logs.split('\n')
        if any(marker in entry for marker in markers)
    ]
    if not matches:
        return '[No se detectó un mensaje de error explícito]'
    return '\n'.join(matches)

class GitHubWorkflowLogger:
    """Download and store the logs of failed GitHub Actions workflow runs.

    For every failed run of a repository the class writes a summary file,
    the raw log of each failed job, and one file per step found in that log
    (plus an extra ``*_errors.txt`` file for steps that look failed).
    """

    def __init__(self, token, timeout=30):
        """Store the credentials and request settings.

        Args:
            token: GitHub token sent in the ``Authorization`` header.
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection would block forever.
        """
        self.token = token
        self.timeout = timeout
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

    def get_workflow_runs(self, owner, repo, per_page=100):
        """Return the failed workflow runs of a repository, following pagination.

        Pages are requested until an empty page comes back. If a request
        fails, the error is printed and the runs collected so far are returned.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs"
        all_runs = []
        page = 1

        while True:
            params = {
                # The query parameter is named "status", but the value
                # "failure" selects runs by their conclusion.
                'status': 'failure',
                'per_page': per_page,
                'page': page
            }
            try:
                response = requests.get(
                    url, headers=self.headers, params=params, timeout=self.timeout
                )
                response.raise_for_status()
                runs = response.json().get('workflow_runs', [])
            except requests.exceptions.RequestException as e:
                print(f"Error obteniendo workflow runs (página {page}): {e}")
                break

            if not runs:
                break
            all_runs.extend(runs)
            print(f"Página {page}: {len(runs)} workflows fallidos encontrados.")
            page += 1

        return all_runs

    def get_jobs_for_run(self, owner, repo, run_id, per_page=100):
        """Return all jobs of a workflow run, following pagination.

        The jobs endpoint is paginated, so a single request can miss jobs of
        runs with a large matrix. If a request fails, the error is printed and
        the jobs collected so far (possibly none) are returned.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs/{run_id}/jobs"
        all_jobs = []
        page = 1

        while True:
            params = {'per_page': per_page, 'page': page}
            try:
                response = requests.get(
                    url, headers=self.headers, params=params, timeout=self.timeout
                )
                response.raise_for_status()
                jobs = response.json().get('jobs', [])
            except requests.exceptions.RequestException as e:
                print(f"Error obteniendo jobs para run {run_id}: {e}")
                break

            all_jobs.extend(jobs)
            # A short page is the last one; stopping here avoids spending an
            # extra request per run just to receive an empty page.
            if len(jobs) < per_page:
                break
            page += 1

        return all_jobs

    def get_job_logs(self, owner, repo, job_id):
        """Return the plain-text log of a job, or ``None`` if the request fails."""
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/jobs/{job_id}/logs"
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo logs para job {job_id}: {e}")
            return None

    @staticmethod
    def _is_error_line(line):
        """Tell whether a log line carries one of the known error markers."""
        return '##[error]' in line or 'Error:' in line or 'error:' in line

    @classmethod
    def _build_step(cls, step_name, step_lines):
        """Assemble the dict that describes one step and its log lines."""
        return {
            'step_name': step_name,
            'logs': '\n'.join(step_lines),
            'failed': any(cls._is_error_line(entry) for entry in step_lines)
        }

    def extract_steps(self, logs):
        """Split a job log into steps.

        A step starts at every line containing both ``##[group]`` and
        ``Run ``; that line, minus the ``##[group]`` marker, becomes the step
        name. Lines before the first such header belong to no step and are
        dropped.

        Returns:
            A list of dicts with keys ``step_name``, ``logs`` (the step's
            lines joined by newlines, header included) and ``failed`` (True
            when any line carries an error marker).
        """
        steps = []
        current_step = None
        step_logs = []
        for line in logs.split('\n'):
            if '##[group]' in line and 'Run ' in line:
                if current_step:
                    steps.append(self._build_step(current_step, step_logs))
                current_step = line.replace('##[group]', '').strip()
                step_logs = []
            step_logs.append(line)
        if current_step:
            steps.append(self._build_step(current_step, step_logs))
        return steps

    def create_base_output_directory(self, owner, repo):
        """Create (if needed) and return ``github_failure_logs/<owner>_<repo>``."""
        output_dir = os.path.join('github_failure_logs', f"{owner}_{repo}")
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def save_workflow_run_log(self, run, run_dir):
        """Write a short summary of *run* to ``workflow_run_log.txt`` in *run_dir*."""
        filename = os.path.join(run_dir, 'workflow_run_log.txt')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"Workflow: {run['name']}\n")
            f.write(f"Run ID: {run['id']}\n")
            f.write(f"Status: {run['status']}\n")
            f.write(f"Conclusion: {run['conclusion']}\n")
            f.write(f"Creado: {run['created_at']}\n")
            f.write(f"Actualizado: {run['updated_at']}\n")
            f.write(f"URL: {run['html_url']}\n")

    def save_job_log(self, job_id, job_name, logs, job_dir):
        """Write the full log of a job to ``<job_id>_<job name>.txt`` in *job_dir*."""
        filename = os.path.join(job_dir, f"{job_id}_{sanitize_filename(job_name)}.txt")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(logs)

    def save_step_log(self, job_id, step, step_dir):
        """Write the log of one step, and its error lines when it failed.

        File names are prefixed with ``FAILED_`` or ``OK_`` according to
        ``step['failed']``.
        """
        prefix = "FAILED_" if step['failed'] else "OK_"
        base_filename = f"{job_id}_{prefix}{sanitize_filename(step['step_name'])}"

        # Full log of the step
        log_path = os.path.join(step_dir, f"{base_filename}.txt")
        with open(log_path, 'w', encoding='utf-8') as f:
            f.write(step['logs'])

        # Error lines only, and only when the step failed
        if step['failed']:
            error_path = os.path.join(step_dir, f"{base_filename}_errors.txt")
            with open(error_path, 'w', encoding='utf-8') as ef:
                ef.write(extract_error_lines(step['logs']))

    def analyze_repository(self, owner, repo):
        """Fetch every failed run of ``owner/repo`` and save its logs to disk.

        For each run a directory ``<run_id>_<workflow name>`` is created with
        the run summary; failed jobs get their raw log under ``jobs/`` and
        their per-step logs under ``steps/``. Jobs whose log cannot be
        downloaded are skipped.
        """
        print(f"Analizando repositorio: {owner}/{repo}")
        base_output_dir = self.create_base_output_directory(owner, repo)
        workflow_runs = self.get_workflow_runs(owner, repo)

        if not workflow_runs:
            print(f"No se encontraron workflow runs fallidos en {owner}/{repo}")
            return

        for run in workflow_runs:
            run_id = run['id']
            workflow_name = run['name']
            # A run's name may be null in the API response; fall back to an
            # empty string so the directory name can still be built.
            safe_name = sanitize_filename(workflow_name or '')
            run_dir = os.path.join(base_output_dir, f"{run_id}_{safe_name}")
            os.makedirs(run_dir, exist_ok=True)

            print(f"Procesando workflow run: {workflow_name} (ID: {run_id})")
            self.save_workflow_run_log(run, run_dir)

            for job in self.get_jobs_for_run(owner, repo, run_id):
                if job['conclusion'] != 'failure':
                    continue

                job_id = job['id']
                job_name = job['name']
                print(f"  Analizando job fallido: {job_name} (ID: {job_id})")

                logs = self.get_job_logs(owner, repo, job_id)
                if not logs:
                    continue

                job_dir = os.path.join(run_dir, "jobs")
                step_dir = os.path.join(run_dir, "steps")
                os.makedirs(job_dir, exist_ok=True)
                os.makedirs(step_dir, exist_ok=True)

                self.save_job_log(job_id, job_name, logs, job_dir)

                for step in self.extract_steps(logs):
                    self.save_step_log(job_id, step, step_dir)

            # Brief pause between runs to go easy on the API.
            time.sleep(0.5)

        print("\nAnálisis completado.")

# Entry point: run the analysis only when token, owner and repo are all set;
# otherwise report which setting is missing (the token is checked first).
if GITHUB_TOKEN and OWNER and REPO:
    logger = GitHubWorkflowLogger(GITHUB_TOKEN)
    logger.analyze_repository(OWNER, REPO)
elif not GITHUB_TOKEN:
    print("ERROR: configurar token de GitHub en la variable GITHUB_TOKEN")
else:
    print("ERROR: configurar OWNER y REPO")


Analizando repositorio: vercel/next.js
Página 1: 100 workflows fallidos encontrados.
Página 2: 100 workflows fallidos encontrados.
Página 3: 100 workflows fallidos encontrados.
Página 4: 100 workflows fallidos encontrados.
Página 5: 100 workflows fallidos encontrados.
Página 6: 100 workflows fallidos encontrados.
Página 7: 100 workflows fallidos encontrados.
Página 8: 100 workflows fallidos encontrados.
Página 9: 100 workflows fallidos encontrados.
Página 10: 100 workflows fallidos encontrados.
Procesando workflow run: Test examples (ID: 15960952131)
  Analizando job fallido: Test Examples (20) (ID: 45013363976)
  Analizando job fallido: Test Examples (18) (ID: 45013363981)
Procesando workflow run: Lock Threads (ID: 15960814962)
  Analizando job fallido: action (ID: 45013014784)
Procesando workflow run: Notify new Next.js release (ID: 15960558653)
  Analizando job fallido: notify (ID: 45012416756)
  Analizando job fallido: front-sync (ID: 45012416759)
Procesando workflow run: build-and