In [6]:
# Extracts the logs of failed GitHub Actions (GHA) workflow steps

import requests
import json
import os
from datetime import datetime, timedelta, timezone
import time
from dotenv import load_dotenv
import re

# Configuration: call python-dotenv's load_dotenv() first, then read the
# GitHub token from the environment and name the repository to analyze.
load_dotenv()
OWNER, REPO = "davidibanezibanez", "example-failing-workflows"
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')

def sanitize_filename(name, max_length=100):
    """Return *name* reduced to characters that are safe in a file name.

    Only ASCII letters, digits, spaces, hyphens and underscores are kept;
    everything else is dropped. The result is trimmed of surrounding
    whitespace and limited to *max_length* characters.

    Args:
        name: Text to sanitize. ``None`` is treated as an empty string.
        max_length: Maximum number of characters in the result.

    Returns:
        The sanitized text, possibly empty, never ending in whitespace.
    """
    text = '' if name is None else str(name)
    text = re.sub(r'[^a-zA-Z0-9 \-_]', '', text)
    # Trim again after truncating: cutting in the middle of the text can leave
    # a trailing space, which is not a usable path component on Windows.
    return text.strip()[:max_length].rstrip()

def extract_error_lines(logs):
    """Return only the lines of *logs* that look like error reports.

    A line is kept when it contains ``##[error]``, ``Error:`` or ``error:``.
    The kept lines are joined with newlines; when none match, a fixed
    placeholder message is returned instead.
    """
    markers = ('##[error]', 'Error:', 'error:')
    matching = []
    for entry in logs.split('\n'):
        if any(marker in entry for marker in markers):
            matching.append(entry)

    if not matching:
        return '[No se detectó un mensaje de error explícito]'
    joined = '\n'.join(matching)
    # A single matching empty-joined result cannot occur (every match contains
    # a marker), but keep the original "empty means placeholder" contract.
    return joined or '[No se detectó un mensaje de error explícito]'

class GitHubWorkflowLogger:
    """Download failed GitHub Actions workflow runs and save their logs.

    For every failed run of a repository the class stores, under
    ``github_failure_logs/<owner>_<repo>/<run_id>_<run_name>/``:

    * ``workflow_run.json`` - the run detail returned by the API,
    * ``jobs.json`` - the job listing of the run,
    * ``jobs/`` - one JSON file and one raw log file per failed job,
    * ``steps/`` - one log file per step found in each failed job's log,
      plus an ``_errors.txt`` file for steps that contain error lines.
    """

    # Seconds to wait for a single API response. Without a timeout,
    # ``requests`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Substrings that mark a log line as an error report.
    ERROR_MARKERS = ('##[error]', 'Error:', 'error:')

    def __init__(self, token):
        """Store the request headers built from the API *token*."""
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

    def get_workflow_runs(self, owner, repo, max_runs=None):
        """Return the failed workflow runs of ``owner/repo``.

        Pages through the runs endpoint (100 per page) until an empty page
        is returned, a request fails, or *max_runs* runs were collected.

        Args:
            owner: Repository owner.
            repo: Repository name.
            max_runs: Optional cap on the number of runs; falsy means no cap.

        Returns:
            A list of run dicts (possibly partial if a request failed).
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs"
        all_runs = []
        page = 1

        while True:
            params = {'status': 'failure', 'per_page': 100, 'page': page}
            try:
                response = requests.get(
                    url, headers=self.headers, params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                runs = response.json().get('workflow_runs', [])
                if not runs:
                    break

                for run in runs:
                    all_runs.append(run)
                    if max_runs and len(all_runs) >= max_runs:
                        return all_runs

                print(f"Página {page}: {len(runs)} workflows fallidos encontrados.")
                page += 1
            except requests.exceptions.RequestException as e:
                # Best effort: keep whatever was collected so far.
                print(f"Error obteniendo workflow runs (página {page}): {e}")
                break

        return all_runs

    def get_run_detail_json(self, owner, repo, run_id):
        """Return the detail JSON of one workflow run, or ``None`` on error."""
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs/{run_id}"
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo detalles del workflow run {run_id}: {e}")
            return None

    def get_job_logs(self, owner, repo, job_id):
        """Return the raw log text of one job, or ``None`` on error."""
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/jobs/{job_id}/logs"
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # Report the failure like the sibling methods do instead of
            # silently dropping the job's logs.
            print(f"Error obteniendo logs del job {job_id}: {e}")
            return None

    def get_jobs_json(self, owner, repo, run_id):
        """Return the job listing JSON of one run, or ``None`` on error.

        Requests 100 jobs per page (the API default is smaller), so runs
        with many jobs are not cut short. Only the first page is fetched.
        """
        url = f"{self.base_url}/repos/{owner}/{repo}/actions/runs/{run_id}/jobs"
        try:
            response = requests.get(
                url, headers=self.headers, params={'per_page': 100},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error obteniendo jobs JSON para run {run_id}: {e}")
            return None

    @classmethod
    def _make_step(cls, step_name, step_lines):
        """Build the step dict for *step_name* from its log lines."""
        return {
            'step_name': step_name,
            'logs': '\n'.join(step_lines),
            'failed': any(
                marker in entry
                for entry in step_lines
                for marker in cls.ERROR_MARKERS
            ),
        }

    @staticmethod
    def _run_display_name(run):
        """Return the run's name, or ``run_<id>`` when the name is missing."""
        return run.get('name') or f"run_{run['id']}"

    def extract_steps(self, logs):
        """Split a job log into steps.

        A new step starts at every line containing both ``##[group]`` and
        ``Run ``; lines before the first such line are discarded.

        Args:
            logs: Full log text of a job.

        Returns:
            A list of dicts with keys ``step_name`` (the header line without
            ``##[group]``, stripped), ``logs`` (the step's lines, header
            included, joined with newlines) and ``failed`` (whether any line
            contains an error marker).
        """
        steps = []
        current_step = None
        step_logs = []

        for line in logs.split('\n'):
            if '##[group]' in line and 'Run ' in line:
                if current_step:
                    steps.append(self._make_step(current_step, step_logs))
                current_step = line.replace('##[group]', '').strip()
                step_logs = []
            step_logs.append(line)

        if current_step:
            steps.append(self._make_step(current_step, step_logs))
        return steps

    def create_output_structure(self, owner, repo, run_id, run_name):
        """Create the output directories of a run and return the base path."""
        base = os.path.join('github_failure_logs', f"{owner}_{repo}", f"{run_id}_{sanitize_filename(run_name)}")
        os.makedirs(os.path.join(base, 'jobs'), exist_ok=True)
        os.makedirs(os.path.join(base, 'steps'), exist_ok=True)
        return base

    def save_json(self, data, path):
        """Write *data* to *path* as indented JSON (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def save_text(self, content, path):
        """Write the string *content* to *path* (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(content)

    def _save_failed_job(self, owner, repo, run_dir, job):
        """Save the JSON, raw log and per-step logs of one failed job."""
        job_id = job['id']
        job_name = job['name']
        print(f"  Job fallido: {job_name} (ID: {job_id})")

        job_stem = f"{job_id}_{sanitize_filename(job_name)}"
        self.save_json(job, os.path.join(run_dir, 'jobs', f"{job_stem}.json"))

        job_logs = self.get_job_logs(owner, repo, job_id)
        if not job_logs:
            return

        self.save_text(job_logs, os.path.join(run_dir, 'jobs', f"{job_stem}.txt"))

        step_dir = os.path.join(run_dir, 'steps')
        for step in self.extract_steps(job_logs):
            prefix = "FAILED_" if step['failed'] else "OK_"
            step_filename = f"{job_id}_{prefix}{sanitize_filename(step['step_name'])}"

            self.save_text(step['logs'], os.path.join(step_dir, f"{step_filename}.txt"))

            if step['failed']:
                self.save_text(
                    extract_error_lines(step['logs']),
                    os.path.join(step_dir, f"{step_filename}_errors.txt")
                )

    def analyze_repository(self, owner, repo, max_runs=None):
        """Fetch the failed runs of ``owner/repo`` and save their logs.

        Runs created more than 90 days ago are skipped. Runs whose detail
        or job listing cannot be fetched are skipped as well.

        Args:
            owner: Repository owner.
            repo: Repository name.
            max_runs: Optional cap on the number of runs fetched.
        """
        print(f"Analizando repositorio: {owner}/{repo}")
        runs = self.get_workflow_runs(owner, repo, max_runs=max_runs)

        if not runs:
            print("No se encontraron workflows fallidos.")
            return

        # NOTE(review): presumably matches GitHub's default log retention
        # period - confirm against the repository settings.
        ninety_days_ago = datetime.now(timezone.utc) - timedelta(days=90)

        for run in runs:
            run_id = run['id']
            # The API may return a null name; fall back so that printing and
            # directory creation still work.
            run_name = self._run_display_name(run)

            created_at = datetime.strptime(run['created_at'], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
            if created_at < ninety_days_ago:
                print(f"  Ignorado (más de 90 días): {run_name} (ID: {run_id})")
                continue

            print(f"\nProcesando run {run_name} (ID: {run_id})")

            run_detail = self.get_run_detail_json(owner, repo, run_id)
            if not run_detail:
                continue

            run_dir = self.create_output_structure(owner, repo, run_id, run_name)

            self.save_json(run_detail, os.path.join(run_dir, 'workflow_run.json'))

            jobs_data = self.get_jobs_json(owner, repo, run_id)
            if not jobs_data:
                continue

            self.save_json(jobs_data, os.path.join(run_dir, 'jobs.json'))

            for job in jobs_data.get('jobs', []):
                if job.get('conclusion') == 'failure':
                    self._save_failed_job(owner, repo, run_dir, job)

            # Short pause between runs (presumably to stay clear of API
            # rate limits).
            time.sleep(0.5)

        print("\nAnálisis completado.")

# Entry point: validate the configuration, then analyze the repository.
if GITHUB_TOKEN and OWNER and REPO:
    logger = GitHubWorkflowLogger(GITHUB_TOKEN)
    # No cap on the number of workflow runs. To limit it to n runs, call
    # analyze_repository(OWNER, REPO, max_runs=n) instead.
    logger.analyze_repository(OWNER, REPO)
elif not GITHUB_TOKEN:
    print("ERROR: configurar token de GitHub en la variable GITHUB_TOKEN")
else:
    print("ERROR: configurar OWNER y REPO")


Analizando repositorio: davidibanezibanez/example-failing-workflows
Página 1: 4 workflows fallidos encontrados.

Procesando run Fail Workflow 1 (ID: 15814985792)
  Job fallido: intentional-fail-job-1 (ID: 44572169286)

Procesando run Fail Workflow 4 (ID: 15814985795)
  Job fallido: intentional-fail-job-4 (ID: 44572169298)
  Job fallido: intentional-fail-job-4_1 (ID: 44572169299)

Procesando run Fail Workflow 3 (ID: 15814985796)
  Job fallido: intentional-fail-job-3 (ID: 44572169313)

Procesando run Fail Workflow 2 (ID: 15814985801)
  Job fallido: intentional-fail-job-2 (ID: 44572169303)

Análisis completado.
