# 📊 NIC ETL - Report API

## 📋 O que este notebook faz

Este notebook **expõe o relatório de execução do pipeline** via REST API:

- 📥 **Lê o report.json** gerado pelo pipeline ETL
- 🔍 **Valida existência** do relatório
- 🌐 **Retorna via HTTP** para sistemas externos
- 📈 **Fornece status** da última execução do pipeline

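## 🔗 Endpoints disponíveis

```
GET /nic/v1/pipelines/gitlab-qdrant/runs/last
GET /nic/v1/pipelines/gitlab-qdrant/runs/last/summary
GET /nic/v1/pipelines/gitlab-qdrant/runs/last/stages
GET /nic/v1/pipelines/gitlab-qdrant/health
```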

## 📊 Resposta esperada

O endpoint `GET /nic/v1/pipelines/gitlab-qdrant/runs/last` retorna o JSON completo do relatório com:
- Contexto de execução
- Status de cada etapa
- Validação do fluxo de dados
- Métricas de performance

---

## 🌐 Endpoint REST

In [None]:
# +GET /nic/v1/pipelines/gitlab-qdrant/runs/last

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone


def build_last_run_response(report_path):
    """Build the response body for ``GET /nic/v1/pipelines/gitlab-qdrant/runs/last``.

    Args:
        report_path: Path of the ``report.json`` file to serve.

    Returns:
        A JSON-serialisable dict, one of:

        * the parsed report with an added ``api_metadata`` section;
        * a ``NOT_EXECUTED`` placeholder when the file does not exist;
        * an error payload with ``status_code`` 500 when the file cannot be
          read, is not valid JSON, or does not hold a JSON object.
    """
    endpoint = "/nic/v1/pipelines/gitlab-qdrant/runs/last"
    # The trailing "Z" means UTC, so the clock has to be read in UTC; a naive
    # local datetime.now() would be mislabelled on any non-UTC host.
    served_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Open directly instead of checking exists() first: a file that vanishes
    # between the check and the read is then still reported as NOT_EXECUTED
    # rather than as a 500.
    try:
        with open(report_path, "r", encoding="utf-8") as f:
            report = json.load(f)
    except FileNotFoundError:
        # No report yet: return an empty skeleton with the same top-level keys
        return {
            "pipeline_info": {
                "version": "1.0.0",
                "last_execution": None,
                "environment": "unknown",
            },
            "context": {},
            "stages": [],
            "summary": {
                "pipeline_status": "NOT_EXECUTED",
                "message": "Pipeline has not been executed yet. Please run the ETL pipeline first.",
                "total_duration_seconds": 0,
                "data_flow": {
                    "input_files": 0,
                    "processed_documents": 0,
                    "total_chunks": 0,
                    "embeddings_generated": 0,
                    "vectors_stored": 0,
                },
                "validation": {
                    "overall": "NOT_AVAILABLE",
                },
            },
            "api_metadata": {
                "endpoint": endpoint,
                "served_at": served_at,
                "report_exists": False,
            },
        }
    except json.JSONDecodeError as e:
        return {
            "error": "Invalid report format",
            "message": f"The report file exists but contains invalid JSON: {e}",
            "status_code": 500,
            "timestamp": served_at,
        }
    except Exception as e:
        # Endpoint boundary: any other read failure (permissions, bad
        # encoding, ...) becomes an error payload instead of a crash.
        return {
            "error": "Internal server error",
            "message": f"An error occurred while reading the report: {e}",
            "status_code": 500,
            "timestamp": served_at,
        }

    # Metadata can only be attached to a JSON object; say so explicitly
    # instead of surfacing an opaque TypeError message.
    if not isinstance(report, dict):
        return {
            "error": "Invalid report format",
            "message": (
                "The report file must contain a JSON object, "
                f"found {type(report).__name__}"
            ),
            "status_code": 500,
            "timestamp": served_at,
        }

    report["api_metadata"] = {
        "endpoint": endpoint,
        "served_at": served_at,
        "report_file": str(report_path),
        "report_exists": True,
    }
    return report


# Report location, relative to the working directory
report_path = Path("pipeline-data/report.json")

# Whatever is printed here is the body served for this endpoint
response = build_last_run_response(report_path)
print(json.dumps(response, indent=2, ensure_ascii=False))

## 📊 Endpoint de Estatísticas Resumidas

In [None]:
# +GET /nic/v1/pipelines/gitlab-qdrant/runs/last/summary

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone


def build_summary_response(report_path):
    """Build the response body for ``GET .../runs/last/summary``.

    Args:
        report_path: Path of the ``report.json`` file to summarise.

    Returns:
        A JSON-serialisable dict: the condensed summary of the report, a
        ``NOT_EXECUTED`` placeholder when the file does not exist, or an
        error payload with ``status_code`` 500 on any other failure.
    """
    # The trailing "Z" means UTC, so the clock has to be read in UTC; a naive
    # local datetime.now() would be mislabelled on any non-UTC host.
    served_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    api_metadata = {
        "endpoint": "/nic/v1/pipelines/gitlab-qdrant/runs/last/summary",
        "served_at": served_at,
    }

    try:
        # Open directly instead of checking exists() first, so a file removed
        # in between is still reported as NOT_EXECUTED.
        with open(report_path, "r", encoding="utf-8") as f:
            report = json.load(f)

        # "or {}" also covers sections stored as JSON null, which would
        # otherwise break the chained lookups below.
        run_summary = report.get("summary") or {}
        pipeline_info = report.get("pipeline_info") or {}
        context = report.get("context") or {}

        summary = {
            "pipeline_status": run_summary.get("pipeline_status", "UNKNOWN"),
            "last_execution": pipeline_info.get("last_execution", "N/A"),
            "total_duration_seconds": run_summary.get("total_duration_seconds", 0),
            "data_flow": run_summary.get("data_flow", {}),
            "validation": run_summary.get("validation", {}),
            "commit": context.get("commit", "unknown"),
            "api_metadata": api_metadata,
        }

        # Failure details are included only when there is at least one
        failed_stages = run_summary.get("failed_stages", [])
        if failed_stages:
            summary["failed_stages"] = failed_stages
            summary["failure_count"] = len(failed_stages)

        return summary

    except FileNotFoundError:
        # No report yet: return an empty summary
        return {
            "pipeline_status": "NOT_EXECUTED",
            "last_execution": None,
            "message": "Pipeline has not been executed yet.",
            "total_duration_seconds": 0,
            "data_flow": {
                "input_files": 0,
                "processed_documents": 0,
                "total_chunks": 0,
                "embeddings_generated": 0,
                "vectors_stored": 0,
            },
            "validation": {
                "overall": "NOT_AVAILABLE",
            },
            "api_metadata": api_metadata,
        }
    except Exception as e:
        # Endpoint boundary: report the failure as a payload, not a crash
        return {
            "error": "Internal server error",
            "message": str(e),
            "status_code": 500,
            "timestamp": served_at,
        }


# Report location, relative to the working directory
report_path = Path("pipeline-data/report.json")

# Whatever is printed here is the body served for this endpoint
response = build_summary_response(report_path)
print(json.dumps(response, indent=2, ensure_ascii=False))

## 🔍 Endpoint de Status por Etapa

In [None]:
# +GET /nic/v1/pipelines/gitlab-qdrant/runs/last/stages

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone


def build_stages_response(report_path):
    """Build the response body for ``GET .../runs/last/stages``.

    Args:
        report_path: Path of the ``report.json`` file to read stages from.

    Returns:
        A JSON-serialisable dict with one simplified entry per stage plus
        total/successful/failed counts, an empty listing when the file does
        not exist, or an error payload with ``status_code`` 500 on any other
        failure (including a stage missing ``stage``, ``name`` or ``status``).
    """
    # The trailing "Z" means UTC, so the clock has to be read in UTC; a naive
    # local datetime.now() would be mislabelled on any non-UTC host.
    served_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    api_metadata = {
        "endpoint": "/nic/v1/pipelines/gitlab-qdrant/runs/last/stages",
        "served_at": served_at,
    }

    # Stage number -> (key exposed in the response, key read from "results")
    headline_metrics = {
        2: ("files_downloaded", "files_downloaded"),          # GitLab
        3: ("documents_processed", "total_processed"),        # Docling
        4: ("chunks_created", "total_chunks"),                # Chunks
        5: ("embeddings_generated", "embeddings_generated"),  # Embeddings
        6: ("vectors_stored", "chunks_inserted"),             # Qdrant
    }

    try:
        # Open directly instead of checking exists() first, so a file removed
        # in between is still reported as "not executed".
        with open(report_path, "r", encoding="utf-8") as f:
            report = json.load(f)

        stages = []
        for stage in report.get("stages", []):
            stage_info = {
                "stage": stage["stage"],
                "name": stage["name"],
                "status": stage["status"],
                "duration_seconds": stage.get("duration_seconds", 0),
            }

            metric = headline_metrics.get(stage["stage"])
            if metric is not None:
                response_key, results_key = metric
                # A stage entry without "results" must not turn the whole
                # listing into a 500; its metric falls back to 0, the same
                # default used when the individual key is missing.
                results = stage.get("results") or {}
                stage_info[response_key] = results.get(results_key, 0)

            stages.append(stage_info)

        return {
            "stages": stages,
            "total_stages": len(stages),
            "successful_stages": sum(1 for s in stages if s["status"] == "SUCCESS"),
            "failed_stages": sum(1 for s in stages if s["status"] == "FAILED"),
            "api_metadata": api_metadata,
        }

    except FileNotFoundError:
        # No report yet: return an empty stage listing
        return {
            "stages": [],
            "total_stages": 0,
            "successful_stages": 0,
            "failed_stages": 0,
            "message": "Pipeline has not been executed yet.",
            "api_metadata": api_metadata,
        }
    except Exception as e:
        # Endpoint boundary: report the failure as a payload, not a crash
        return {
            "error": "Internal server error",
            "message": str(e),
            "status_code": 500,
            "timestamp": served_at,
        }


# Report location, relative to the working directory
report_path = Path("pipeline-data/report.json")

# Whatever is printed here is the body served for this endpoint
response = build_stages_response(report_path)
print(json.dumps(response, indent=2, ensure_ascii=False))

## 🏥 Health Check

In [None]:
# +GET /nic/v1/pipelines/gitlab-qdrant/health

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone


def build_health_response(report_path, pipeline_data_dir):
    """Build the response body for ``GET /nic/v1/pipelines/gitlab-qdrant/health``.

    Args:
        report_path: Path of the pipeline ``report.json`` file.
        pipeline_data_dir: Directory expected to contain the ``documents``,
            ``chunks`` and ``embeddings`` sub-directories.

    Returns:
        A JSON-serialisable dict with the individual existence checks, an
        overall ``status`` (``"healthy"`` only when every check passes,
        otherwise ``"degraded"``) and a ``last_pipeline_run`` section.
    """
    # Evaluate once so the "checks" entry and the branch below cannot disagree
    report_exists = report_path.exists()

    health = {
        "status": "healthy",
        "checks": {
            "report_exists": report_exists,
            "pipeline_data_exists": pipeline_data_dir.exists(),
            "documents_dir": (pipeline_data_dir / "documents").exists(),
            "chunks_dir": (pipeline_data_dir / "chunks").exists(),
            "embeddings_dir": (pipeline_data_dir / "embeddings").exists(),
        },
        # The trailing "Z" means UTC, so the clock has to be read in UTC; a
        # naive local datetime.now() would be mislabelled on a non-UTC host.
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    }

    # When a report is present, add details of the last run
    if report_exists:
        try:
            with open(report_path, "r", encoding="utf-8") as f:
                report = json.load(f)

            health["last_pipeline_run"] = {
                "execution_date": report.get("pipeline_info", {}).get("last_execution", "unknown"),
                "status": report.get("summary", {}).get("pipeline_status", "unknown"),
                "validation": report.get("summary", {}).get("validation", {}).get("overall", "unknown"),
            }
        except Exception:
            # A health check must always answer, so any read/parse failure is
            # reported in the payload. Unlike a bare "except:", this lets
            # KeyboardInterrupt and SystemExit propagate.
            health["last_pipeline_run"] = {"status": "error_reading_report"}
    else:
        health["last_pipeline_run"] = {"status": "no_runs_yet"}

    # Any failed check downgrades the overall status
    if not all(health["checks"].values()):
        health["status"] = "degraded"

    return health


# Locations inspected by the health check, relative to the working directory
report_path = Path("pipeline-data/report.json")
pipeline_data_dir = Path("pipeline-data")

# Whatever is printed here is the body served for this endpoint
health = build_health_response(report_path, pipeline_data_dir)
print(json.dumps(health, indent=2, ensure_ascii=False))