# Great Expectations Rule Generator con IA

Este notebook implementa un motor completo para generar reglas de Great Expectations usando un agente de IA. El sistema procesa datos CSV y documentación markdown para generar automáticamente contratos de calidad de datos con fundamento y razonamiento.


## 1. Environment Setup and Dependencies

In [None]:
# Imports principales
import os
import json
import pandas as pd
from pathlib import Path
from datetime import datetime

# Environment variables
from dotenv import load_dotenv
load_dotenv()  # Cargar variables de entorno desde .env

# LLM y MCP
import openai
from pydantic import BaseModel, Fieltrf
import instructor
from fastmcp import FastMCP

# Configurar OpenAI con variables de entorno
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    openai.api_key = openai_api_key
    print("✅ OpenAI API key configurada desde .env")
else:
    print("⚠️ OPENAI_API_KEY no encontrada en .env - configúrala manualmente si es necesario")

# Configuración
import warnings
warnings.filterwarnings('ignore')


✅ OpenAI API key configurada desde .env


## 2. Data Extraction Layer - CSV and Documentation Loading

Cargamos los datos CSV y la documentación markdown para procesamiento.

In [2]:
# Configuración de rutas usando variables de entorno con fallbacks
DATA_DIR = Path(os.getenv('DATA_DIR', './data'))
CSV_FILE = Path(os.getenv('CSV_FILE', DATA_DIR / "orders.csv"))
DOC_FILE = Path(os.getenv('DOC_FILE', DATA_DIR / "orders.md"))

print(f"📁 Directorio de datos: {DATA_DIR}")
print(f"📊 Archivo CSV: {CSV_FILE}")
print(f"📋 Archivo de documentación: {DOC_FILE}")

class DataExtractor:
    def __init__(self, csv_path: Path, doc_path: Path):
        self.csv_path = csv_path
        self.doc_path = doc_path
        self.df = None
        self.documentation = None

    def load_csv(self):
        """Carga el archivo CSV y extrae información básica"""
        try:
            self.df = pd.read_csv(self.csv_path)
            print(f"✅ CSV cargado: {len(self.df)} filas, {len(self.df.columns)} columnas")
            print(f"Columnas: {list(self.df.columns)}")
            return self.df
        except Exception as e:
            print(f"❌ Error cargando CSV: {e}")
            return None

    def load_documentation(self):
        """Carga la documentación markdown"""
        try:
            with open(self.doc_path, 'r', encoding='utf-8') as f:
                self.documentation = f.read()
            print(f"✅ Documentación cargada: {len(self.documentation)} caracteres")
            return self.documentation
        except Exception as e:
            print(f"❌ Error cargando documentación: {e}")
            return None

    def get_data_profile(self, max_unique_values_to_show=10):
        """
        Genera un perfil básico de los datos

        Args:
            max_unique_values_to_show (int): Número máximo de valores únicos para mostrar todos los valores posibles
        """
        if self.df is None:
            return None

        profile = {
            "shape": self.df.shape,
            "columns": {}
        }

        for col in self.df.columns:
            unique_count = int(self.df[col].nunique())
            null_count = int(self.df[col].isnull().sum())

            # Información básica de la columna
            col_info = {
                "dtype": str(self.df[col].dtype),
                "null_count": null_count,
                "unique_count": unique_count,
                "sample_values": [str(val) for val in self.df[col].dropna().head(5).tolist()]
            }

            # Si tiene pocos valores únicos, mostrar todos los valores posibles
            if unique_count <= max_unique_values_to_show and unique_count > 0:
                unique_values = self.df[col].dropna().unique()
                col_info["unique_values"] = [str(val) for val in sorted(unique_values)]
                col_info["is_categorical"] = True

                # Contar frecuencias de cada valor
                value_counts = self.df[col].value_counts()
                col_info["value_frequencies"] = {
                    str(val): int(count) for val, count in value_counts.items()
                }
            else:
                col_info["is_categorical"] = False

            profile["columns"][col] = col_info

        return profile

# Inicializar extractor
extractor = DataExtractor(CSV_FILE, DOC_FILE)
df = extractor.load_csv()
documentation = extractor.load_documentation()
data_profile = extractor.get_data_profile()

# Mostrar vista previa
print("\n📊 Vista previa de los datos:")
if df is not None:
    print(df.head())
    print(f"\n🔍 Perfil de datos:")
    if data_profile:
        try:
            print(json.dumps(data_profile, indent=2, ensure_ascii=False))
        except Exception as e:
            print(f"Error mostrando perfil completo: {e}")
            print(f"Forma del dataset: {data_profile['shape']}")
            print(f"Columnas: {list(data_profile['columns'].keys())}")

            # Mostrar información categórica específica
            print(f"\n📋 Columnas categóricas detectadas (≤10 valores únicos):")
            for col, info in data_profile['columns'].items():
                if info.get('is_categorical', False):
                    print(f"   • {col}: {info['unique_values']} (total: {info['unique_count']} valores)")
                    print(f"     Frecuencias: {info['value_frequencies']}")

📁 Directorio de datos: data
📊 Archivo CSV: data/orders.csv
📋 Archivo de documentación: data/orders.md
✅ CSV cargado: 20 filas, 5 columnas
Columnas: ['order_id', 'status', 'amount', 'event_time', 'country_code']
✅ Documentación cargada: 1748 caracteres

📊 Vista previa de los datos:
   order_id    status  amount                 event_time country_code
0      1001   PENDING   49.90  2025-09-20T09:15:00+02:00           ES
1      1002   SHIPPED  120.00  2025-09-20T10:05:00+02:00           FR
2      1003  CANCELED    0.00  2025-09-20T11:22:00+02:00           DE
3      1004   SHIPPED   75.50  2025-09-20T12:48:00+02:00           IT
4      1005   PENDING   15.99  2025-09-20T14:03:00+02:00           PT

🔍 Perfil de datos:
{
  "shape": [
    20,
    5
  ],
  "columns": {
    "order_id": {
      "dtype": "int64",
      "null_count": 0,
      "unique_count": 20,
      "sample_values": [
        "1001",
        "1002",
        "1003",
        "1004",
        "1005"
      ],
      "is_categorical": f

In [None]:
from src.expectations import (
    # Importar todos los modelos de expectativas
    ExpectColumnToExist, ExpectColumnValuesToNotBeNull, ExpectColumnValuesToBeUnique,
    ExpectCompoundColumnsToBeUnique, ExpectColumnValuesToBeInSet, ExpectColumnValuesToMatchRegex,
    ExpectColumnValuesToBeBetween, ExpectColumnValuesToBeOfType, #ExpectColumnValuesToMatchStrftimeFormat,
    ExpectColumnMeanToBeBetween, ExpectTableRowCountToBeBetween, ExpectColumnMinToBeBetween,
    ExpectColumnMaxToBeBetween, ExpectColumnSumToBeBetween, GreatExpectation
)

In [None]:
# Preparar toda esta información para un prompt de LLM

import yaml
from typing import get_origin, get_args, Union
import inspect

from src.expectations import (
    # Importar todos los modelos de expectativas
    ExpectColumnToExist, ExpectColumnValuesToNotBeNull, ExpectColumnValuesToBeUnique,
    ExpectCompoundColumnsToBeUnique, ExpectColumnValuesToBeInSet, ExpectColumnValuesToMatchRegex,
    ExpectColumnValuesToBeBetween, ExpectColumnValuesToBeOfType, #ExpectColumnValuesToMatchStrftimeFormat,
    ExpectColumnMeanToBeBetween, ExpectTableRowCountToBeBetween, ExpectColumnMinToBeBetween,
    ExpectColumnMaxToBeBetween, ExpectColumnSumToBeBetween, GreatExpectation, convert_model_to_str
)

def get_pydantic_expectations_info():
    """
    Extrae información de los modelos Pydantic de expectativas
    en lugar de usar la configuración YAML
    """
    # Lista de todos los modelos de expectativas
    expectation_models = [
        ExpectColumnToExist, ExpectColumnValuesToNotBeNull, ExpectColumnValuesToBeUnique,
        ExpectCompoundColumnsToBeUnique, ExpectColumnValuesToBeInSet, ExpectColumnValuesToMatchRegex,
        ExpectColumnValuesToBeBetween, ExpectColumnValuesToBeOfType, #ExpectColumnValuesToMatchStrftimeFormat,
        ExpectColumnMeanToBeBetween, ExpectTableRowCountToBeBetween, ExpectColumnMinToBeBetween,
        ExpectColumnMaxToBeBetween, ExpectColumnSumToBeBetween
    ]

    expectations_info = []

    for model in expectation_models:
        expectations_info.append(convert_model_to_str(model))

    return expectations_info

def format_pydantic_expectations_for_prompt(expectations_info):
    """
    Formatea la información de expectativas extraída de Pydantic para el prompt del LLM
    """
    if not expectations_info:
        return "No hay información de expectativas disponible."

    formatted_text = "EXPECTATIVAS DE GREAT EXPECTATIONS PERMITIDAS (desde modelos Pydantic):\n\n"

    for exp in expectations_info:
        formatted_text += f"{exp}\n"

    return formatted_text

def prepare_data_analysis_prompt():
    """
    Prepara toda la información extraída (datos, documentación, perfil)
    para ser enviada a un LLM para análisis de calidad de datos
    Ahora usa información extraída de modelos Pydantic en lugar de YAML
    """

    prompt_data = {
        "context": {
            "task": "Análisis de calidad de datos para generar reglas de Great Expectations",
            "dataset_name": "orders",
            "analysis_date": datetime.now().isoformat()
        },

        "dataset_info": {
            "file_path": str(CSV_FILE),
            "shape": data_profile["shape"] if data_profile else None,
            "total_rows": data_profile["shape"][0] if data_profile else None,
            "total_columns": data_profile["shape"][1] if data_profile else None,
            "sample_data": df.head(3).to_dict('records') if df is not None else None
        },

        "column_analysis": data_profile["columns"] if data_profile else {},

        "documentation": {
            "content": documentation,
            "length": len(documentation) if documentation else 0,
            "available": documentation is not None
        },

        "data_quality_observations": {
            "columns_with_nulls": [
                col for col, info in (data_profile["columns"] if data_profile else {}).items()
                if info["null_count"] > 0
            ],
            "potential_id_columns": [
                col for col, info in (data_profile["columns"] if data_profile else {}).items()
                if data_profile and info["unique_count"] == data_profile["shape"][0]
            ],
            "categorical_columns": [
                col for col, info in (data_profile["columns"] if data_profile else {}).items()
                if info.get("is_categorical", False)
            ],
            "numeric_columns": [
                col for col, info in (data_profile["columns"] if data_profile else {}).items()
                if info["dtype"] in ["int64", "float64"]
            ]
        },

        # Nueva sección: información de expectativas desde modelos Pydantic
        "pydantic_expectations_info": get_pydantic_expectations_info()
    }

    return prompt_data

def create_llm_prompt(prompt_data):
    """
    Crea un prompt estructurado para el LLM basado en los datos analizados
    Ahora incluye información de expectativas extraída de modelos Pydantic
    """

    # Formatear expectativas Pydantic para el prompt
    expectations_text = format_pydantic_expectations_for_prompt(
        prompt_data.get('pydantic_expectations_info', {})
    )

    # Incluir información específica de columnas categóricas en el prompt
    categorical_info = ""
    if prompt_data['data_quality_observations']['categorical_columns']:
        categorical_info = "\n## INFORMACIÓN DETALLADA DE COLUMNAS CATEGÓRICAS\n"
        for col in prompt_data['data_quality_observations']['categorical_columns']:
            col_info = prompt_data['column_analysis'].get(col, {})
            if col_info.get('is_categorical'):
                categorical_info += f"- {col}: valores posibles = {col_info.get('unique_values', [])}\n"
                categorical_info += f"  frecuencias = {col_info.get('value_frequencies', {})}\n"

    prompt = f"""
Eres un experto en calidad de datos y Great Expectations. Tu tarea es analizar la documentación y el resumen de datos proporcionados
para identificar un conjunto de reglas de calidad iniciales que sean críticas para el negocio y detectables con Great Expectations.

IMPORTANTE: Solo puedes usar las expectativas de Great Expectations que están definidas en los modelos Pydantic proporcionados.

## INFORMACIÓN DEL DATASET
- Nombre: {prompt_data['context']['dataset_name']}
- Filas: {prompt_data['dataset_info']['total_rows']}
- Columnas: {prompt_data['dataset_info']['total_columns']}

## MUESTRA DE DATOS
{json.dumps(prompt_data['dataset_info']['sample_data'], indent=2, ensure_ascii=False)}

## ANÁLISIS POR COLUMNA
{json.dumps(prompt_data['column_analysis'], indent=2, ensure_ascii=False)}

## OBSERVACIONES DE CALIDAD
- Columnas con nulos: {prompt_data['data_quality_observations']['columns_with_nulls']}
- Posibles columnas ID: {prompt_data['data_quality_observations']['potential_id_columns']}
- Columnas categóricas: {prompt_data['data_quality_observations']['categorical_columns']}
- Columnas numéricas: {prompt_data['data_quality_observations']['numeric_columns']}{categorical_info}

## DOCUMENTACIÓN DISPONIBLE
{prompt_data['documentation']['content'] if prompt_data['documentation']['available'] else 'No hay documentación disponible'}

## {expectations_text}

## INSTRUCCIONES
Por favor, analiza esta información y genera un GreatExpectationsSuite completo con:

1. **Reglas de validación críticas para este dataset**
   - Usa SOLO las expectativas definidas en los modelos Pydantic listados arriba
   - Especifica exactamente qué expectativa usar y con qué parámetros
   - Prioriza reglas críticas para la integridad del negocio

2. **Expectativas específicas recomendadas**
   - Para columnas categóricas, usa los valores exactos observados en expect_column_values_to_be_in_set
   - Para columnas numéricas, define rangos apropiados con expect_column_values_to_be_between
   - Para columnas ID, usa expect_column_values_to_be_unique
   - Para completitud, usa expect_column_values_to_not_be_null con parámetro 'mostly' apropiado

3. **Metadatos detallados**
   - Proporciona IDs descriptivos para cada expectativa
   - Incluye descripciones claras de por qué cada expectativa es importante
   - Especifica la fuente de la recomendación (datos observados, documentación, etc.)

4. **Uso de información categórica específica**
   - Para las columnas categóricas detectadas, usa los valores únicos exactos observados
   - Considera las frecuencias para determinar si usar parámetro 'mostly'

RESTRICCIONES CRÍTICAS:
- NUNCA inventes expectativas que no estén en los modelos Pydantic
- SIEMPRE usa los nombres exactos de parámetros definidos en los modelos
- USA los valores categóricos exactos observados en los datos
- Considera los parámetros requeridos vs opcionales según los modelos Pydantic
- Usa parámetros 'mostly' cuando sea apropiado para tolerancia realista
"""

    return prompt



In [5]:
# Crear estructura de datos para el prompt
prompt_data = prepare_data_analysis_prompt()

# Generar el prompt final
llm_prompt = create_llm_prompt(prompt_data)

# Guardar en archivos para referencia
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)

with open(output_dir / "prompt_data.json", "w", encoding="utf-8") as f:
    json.dump(prompt_data, f, indent=2, ensure_ascii=False, default=str)

# Guardar prompt final
with open(output_dir / "llm_prompt.txt", "w", encoding="utf-8") as f:
    f.write(llm_prompt)



In [6]:
import src.expectations as expectations

In [7]:
print(llm_prompt)


Eres un experto en calidad de datos y Great Expectations. Tu tarea es analizar la documentación y el resumen de datos proporcionados
para identificar un conjunto de reglas de calidad iniciales que sean críticas para el negocio y detectables con Great Expectations.

IMPORTANTE: Solo puedes usar las expectativas de Great Expectations que están definidas en los modelos Pydantic proporcionados.

## INFORMACIÓN DEL DATASET
- Nombre: orders
- Filas: 20
- Columnas: 5

## MUESTRA DE DATOS
[
  {
    "order_id": 1001,
    "status": "PENDING",
    "amount": 49.9,
    "event_time": "2025-09-20T09:15:00+02:00",
    "country_code": "ES"
  },
  {
    "order_id": 1002,
    "status": "SHIPPED",
    "amount": 120.0,
    "event_time": "2025-09-20T10:05:00+02:00",
    "country_code": "FR"
  },
  {
    "order_id": 1003,
    "status": "CANCELED",
    "amount": 0.0,
    "event_time": "2025-09-20T11:22:00+02:00",
    "country_code": "DE"
  }
]

## ANÁLISIS POR COLUMNA
{
  "order_id": {
    "dtype": "int64",


In [8]:
import instructor
from openai import OpenAI


client = instructor.from_openai(
    OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
)


resp = client.chat.completions.create(
    model="gpt-5",
    messages=[
        {"role": "system", "content": llm_prompt},
        {"role": "user", "content": "Generate a complete great expectations suite with the information you have."}
    ],
    response_model=expectations.GreatExpectationsSuite,
    max_retries=10
)


In [11]:
# Print the structured response
resp

GreatExpectationsSuite(expectations=[ExpectationWithMetadata(id='orders.columns.order_id.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='order_id'), description="La columna 'order_id' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.status.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='status'), description="La columna 'status' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.amount.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='amount'), description="La columna 'amount' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.event_time.exists', expectation=ExpectColumnToExist(expectation_t

In [12]:
# Imprime la respuesta estructurada
for exp in resp.expectations:
    print(exp)

id='orders.columns.order_id.exists' expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='order_id') description="La columna 'order_id' debe existir según el esquema esperado del dataset." source='Documentation - Esquema esperado'
id='orders.columns.status.exists' expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='status') description="La columna 'status' debe existir según el esquema esperado del dataset." source='Documentation - Esquema esperado'
id='orders.columns.amount.exists' expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='amount') description="La columna 'amount' debe existir según el esquema esperado del dataset." source='Documentation - Esquema esperado'
id='orders.columns.event_time.exists' expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='event_time') description="La columna 'event_time' debe existir según el esquema esperado del dataset." source='

In [33]:
print(resp.model_dump_json(indent=2))

{
  "expectations": [
    {
      "id": "orders.columns.order_id.exists",
      "expectation": {
        "expectation_type": "expect_column_to_exist",
        "column": "order_id"
      },
      "description": "La columna 'order_id' debe existir según el esquema esperado del dataset.",
      "source": "Documentation - Esquema esperado"
    },
    {
      "id": "orders.columns.status.exists",
      "expectation": {
        "expectation_type": "expect_column_to_exist",
        "column": "status"
      },
      "description": "La columna 'status' debe existir según el esquema esperado del dataset.",
      "source": "Documentation - Esquema esperado"
    },
    {
      "id": "orders.columns.amount.exists",
      "expectation": {
        "expectation_type": "expect_column_to_exist",
        "column": "amount"
      },
      "description": "La columna 'amount' debe existir según el esquema esperado del dataset.",
      "source": "Documentation - Esquema esperado"
    },
    {
      "id": "or

In [None]:
ExpectationWithMetadata()

# Herramientas de la Suite con ExpectationManager

In [15]:
# Importar la librería ExpectationManager
from src.expectation_manager import (
    ExpectationManager,
    create_manager,
    pydantic_to_gx,
    save_suite_yaml,
    load_suite_yaml
)

print("✅ ExpectationManager library imported successfully")

✅ ExpectationManager library imported successfully


In [16]:
# Crear el manager y obtener resumen de la suite generada por el LLM
manager = create_manager()

# Obtener resumen detallado de la suite
summary = manager.get_suite_summary(resp)

summary

INFO:great_expectations.data_context.data_context.context_factory:Could not find local file-backed GX project
INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpy_b7cxh2' for ephemeral docs site
INFO:great_expectations.datasource.fluent.config:Loading 'datasources' ->
[]
INFO:src.expectation_manager:Using existing Great Expectations context


{'total_expectations': 26,
 'expectation_types': {'expect_column_to_exist': 5,
  'expect_table_row_count_to_be_between': 1,
  'expect_column_values_to_be_of_type': 5,
  'expect_column_values_to_not_be_null': 5,
  'expect_column_values_to_be_unique': 1,
  'expect_column_values_to_be_in_set': 3,
  'expect_column_values_to_match_regex': 3,
  'expect_column_values_to_be_between': 1,
  'expect_column_mean_to_be_between': 1,
  'expect_column_max_to_be_between': 1},
 'expectation_ids': ['orders.columns.order_id.exists',
  'orders.columns.status.exists',
  'orders.columns.amount.exists',
  'orders.columns.event_time.exists',
  'orders.columns.country_code.exists',
  'orders.table.row_count.daily_range',
  'orders.order_id.type.int',
  'orders.status.type.str',
  'orders.amount.type.float',
  'orders.event_time.type.str',
  'orders.country_code.type.str',
  'orders.order_id.not_null',
  'orders.status.not_null',
  'orders.amount.not_null.mostly_95',
  'orders.event_time.not_null',
  'orders.cou

Validación de la suite

In [17]:
is_valid, validation_report = manager.validate_pydantic_suite(resp)
is_valid

INFO:src.expectation_manager:Converted Pydantic suite to GX suite 'suite_20250925_090008' with 26 expectations
INFO:src.expectation_manager:Validated GX suite 'suite_20250925_090008': PASS


Validando suite


True

In [18]:
validation_report

{'suite_name': 'suite_20250925_090008',
 'total_expectations': 26,
 'validation_errors': [],
 'expectation_details': [{'index': 0,
   'expectation_type': 'expect_column_to_exist',
   'is_valid': True,
   'errors': [],
  {'index': 1,
   'expectation_type': 'expect_column_to_exist',
   'is_valid': True,
   'errors': [],
  {'index': 2,
   'expectation_type': 'expect_column_to_exist',
   'is_valid': True,
   'errors': [],
  {'index': 3,
   'expectation_type': 'expect_column_to_exist',
   'is_valid': True,
   'errors': [],
  {'index': 4,
   'expectation_type': 'expect_column_to_exist',
   'is_valid': True,
   'errors': [],
  {'index': 5,
   'expectation_type': 'expect_table_row_count_to_be_between',
   'is_valid': True,
   'errors': [],
  {'index': 6,
   'expectation_type': 'expect_column_values_to_be_of_type',
   'is_valid': True,
   'errors': [],
  {'index': 7,
   'expectation_type': 'expect_column_values_to_be_of_type',
   'is_valid': True,
   'errors': [],
  {'index': 8,
   'expectation

Conversión a GX

In [19]:
gx_suite = manager.pydantic_to_gx_suite(resp, "llm_generated_suite")
gx_suite

INFO:src.expectation_manager:Converted Pydantic suite to GX suite 'llm_generated_suite' with 26 expectations


{
  "name": "llm_generated_suite",
  "id": null,
  "expectations": [
    {
      "type": "expect_column_to_exist",
      "kwargs": {
        "column": "order_id"
      },
      "meta": {
        "expectation_id": "orders.columns.order_id.exists",
        "description": "La columna 'order_id' debe existir seg\u00fan el esquema esperado del dataset.",
        "source": "Documentation - Esquema esperado"
      },
      "severity": "critical"
    },
    {
      "type": "expect_column_to_exist",
      "kwargs": {
        "column": "status"
      },
      "meta": {
        "expectation_id": "orders.columns.status.exists",
        "description": "La columna 'status' debe existir seg\u00fan el esquema esperado del dataset.",
        "source": "Documentation - Esquema esperado"
      },
      "severity": "critical"
    },
    {
      "type": "expect_column_to_exist",
      "kwargs": {
        "column": "amount"
      },
      "meta": {
        "expectation_id": "orders.columns.amount.exists",
 

Serialización

In [20]:
yaml_path = output_dir / "validated_llm_suite.yaml"
yaml_str = manager.serialize_to_yaml(resp, str(yaml_path))

INFO:src.expectation_manager:Saved Pydantic suite to YAML: output/validated_llm_suite.yaml


Deserialización

In [21]:
suite_pydantic_read = manager.deserialize_from_yaml(str(yaml_path))

manager.get_suite_summary(suite_pydantic_read)

INFO:src.expectation_manager:Loaded Pydantic suite from YAML: output/validated_llm_suite.yaml


{'total_expectations': 26,
 'expectation_types': {'expect_column_to_exist': 5,
  'expect_table_row_count_to_be_between': 1,
  'expect_column_values_to_be_of_type': 5,
  'expect_column_values_to_not_be_null': 5,
  'expect_column_values_to_be_unique': 1,
  'expect_column_values_to_be_in_set': 3,
  'expect_column_values_to_match_regex': 3,
  'expect_column_values_to_be_between': 1,
  'expect_column_mean_to_be_between': 1,
  'expect_column_max_to_be_between': 1},
 'expectation_ids': ['orders.columns.order_id.exists',
  'orders.columns.status.exists',
  'orders.columns.amount.exists',
  'orders.columns.event_time.exists',
  'orders.columns.country_code.exists',
  'orders.table.row_count.daily_range',
  'orders.order_id.type.int',
  'orders.status.type.str',
  'orders.amount.type.float',
  'orders.event_time.type.str',
  'orders.country_code.type.str',
  'orders.order_id.not_null',
  'orders.status.not_null',
  'orders.amount.not_null.mostly_95',
  'orders.event_time.not_null',
  'orders.cou

In [34]:
suite_pydantic_read

GreatExpectationsSuite(expectations=[ExpectationWithMetadata(id='orders.columns.order_id.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='order_id'), description="La columna 'order_id' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.status.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='status'), description="La columna 'status' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.amount.exists', expectation=ExpectColumnToExist(expectation_type='expect_column_to_exist', column='amount'), description="La columna 'amount' debe existir según el esquema esperado del dataset.", source='Documentation - Esquema esperado'), ExpectationWithMetadata(id='orders.columns.event_time.exists', expectation=ExpectColumnToExist(expectation_t

In [19]:
manager.validate_pydantic_suite(suite_pydantic_read)

INFO:src.expectation_manager:Converted Pydantic suite to GX suite 'suite_20250924_172446' with 24 expectations
INFO:src.expectation_manager:Validated GX suite 'suite_20250924_172446': PASS


Validando suite


(True,
 {'suite_name': 'suite_20250924_172446',
  'total_expectations': 24,
  'validation_errors': [],
  'expectation_details': [{'index': 0,
    'expectation_type': 'expect_column_to_exist',
    'is_valid': True,
    'errors': [],
   {'index': 1,
    'expectation_type': 'expect_column_to_exist',
    'is_valid': True,
    'errors': [],
   {'index': 2,
    'expectation_type': 'expect_column_to_exist',
    'is_valid': True,
    'errors': [],
   {'index': 3,
    'expectation_type': 'expect_column_to_exist',
    'is_valid': True,
    'errors': [],
   {'index': 4,
    'expectation_type': 'expect_column_to_exist',
    'is_valid': True,
    'errors': [],
   {'index': 5,
    'expectation_type': 'expect_table_row_count_to_be_between',
    'is_valid': True,
    'errors': [],
   {'index': 6,
    'expectation_type': 'expect_column_values_to_be_of_type',
    'is_valid': True,
    'errors': [],
   {'index': 7,
    'expectation_type': 'expect_column_values_to_not_be_null',
    'is_valid': True,
    '

Conversión a GX (utilidad)

In [22]:
gx_suite = pydantic_to_gx(resp, "llm_generated_suite")

INFO:great_expectations.data_context.data_context.context_factory:Could not find local file-backed GX project
INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpmfcj8fxt' for ephemeral docs site
INFO:great_expectations.datasource.fluent.config:Loading 'datasources' ->
[]
INFO:src.expectation_manager:Using existing Great Expectations context
INFO:src.expectation_manager:Converted Pydantic suite to GX suite 'llm_generated_suite' with 26 expectations


# Validación sobre datos

In [23]:
import src.gx_utils as gx_utils

In [24]:
import logging
logging.getLogger('great_expectations').setLevel(logging.ERROR)
validation_result = gx_utils.run_gx_dataset_validation(gx_suite, df)
validation_result["success"]

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpt3hfc2my' for ephemeral docs site


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

True

Validación con datos inválidos

In [25]:
CSV_INVALID_FILE = Path(os.getenv('CSV_FILE', DATA_DIR / "orders_invalid.csv"))
df_invalid = pd.read_csv(CSV_INVALID_FILE)
validation_result_invalid = gx_utils.run_gx_dataset_validation(gx_suite, df_invalid)
validation_result_invalid["success"]

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpsb29cr7e' for ephemeral docs site


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

False

Demostración de tooling para inspeccionar resultados de validación

In [26]:
gx_utils.get_failed_expectations_summary(validation_result_invalid)

{'total_failures': 4,
 'failed_expectations': [{'expectation_id': 'orders.order_id.type.int',
   'expectation_type': 'expect_column_values_to_be_of_type',
   'column': 'order_id',
   'description': "El tipo de 'order_id' se valida como entero para alinear con el perfilado (int64). Nota: la documentación indica string; revisar contrato si procede.",
   'severity': 'critical',
   'source': 'Data Profiling + Documentation',
   'success': False,
   'status': 'FAIL',
   'failure_info': {'total_elements': 0,
    'invalid_count': 0,
    'invalid_percentage': 0.0,
    'invalid_values': [],
    'invalid_indices': [],
    'value_counts': []},
   'expectation_kwargs': {'batch_id': 'direct_datasource-direct_asset',
    'column': 'order_id',
    'type_': 'int'}},
  {'expectation_id': 'orders.order_id.not_null',
   'expectation_type': 'expect_column_values_to_not_be_null',
   'column': 'order_id',
   'description': "'order_id' no debe ser nulo; es clave para identidad y joins.",
   'severity': 'crit

In [27]:
# Test severity functionality
print("=== Testing Severity Support ===")

# First, let's check the original LLM-generated suite
print("\n1. Original LLM suite severity distribution:")
original_summary = manager.get_suite_summary(resp)
print("Severity distribution:", original_summary.get('severity_distribution', {}))

# Display the first few expectations with their severities
print("\nFirst 3 expectations with severity:")
for exp in resp.expectations[:3]:
    print(f"- {exp.id}: {exp.expectation.expectation_type} -> Severity: {exp.severity.value}")

# Import the ExpectationSeverity enum to test different severities
from src.expectations import ExpectationSeverity

# Create a test expectation with WARNING severity
from src.expectations import ExpectColumnToExist, ExpectationWithMetadata

test_warning_exp = ExpectationWithMetadata(
    id="test_warning",
    expectation=ExpectColumnToExist(column="order_id"),
    description="Test expectation with WARNING severity",
    source="Manual Test",
    severity=ExpectationSeverity.WARNING
)

# Create a test expectation with INFO severity
test_info_exp = ExpectationWithMetadata(
    id="test_info",
    expectation=ExpectColumnToExist(column="customer_id"),
    description="Test expectation with INFO severity",
    source="Manual Test",
    severity=ExpectationSeverity.INFO
)

# Create a new suite with mixed severities
from src.expectations import GreatExpectationsSuite

mixed_severity_suite = GreatExpectationsSuite(
    expectations=[
        resp.expectations[0],  # Keep one original (CRITICAL)
        test_warning_exp,      # WARNING
        test_info_exp          # INFO
    ]
)

print("\n2. Mixed severity test suite:")
mixed_summary = manager.get_suite_summary(mixed_severity_suite)
print("Severity distribution:", mixed_summary.get('severity_distribution', {}))

print("\nExpectation details:")
for detail in mixed_summary['expectation_details']:
    print(f"- {detail['id']}: {detail['type']} -> Severity: {detail['severity']}")

print("\n3. Testing GX conversion with severities:")
# Convert to GX and back to verify severity preservation
gx_suite_mixed = manager.pydantic_to_gx_suite(mixed_severity_suite, "mixed_severity_test")

# Check the GX suite severities
print("GX Suite expectations with severities:")
for i, gx_exp in enumerate(gx_suite_mixed.expectations):
    print(f"- Expectation {i+1}: {gx_exp.type} -> GX Severity: {gx_exp.severity}")

# Convert back to Pydantic
pydantic_suite_converted_back = manager.gx_suite_to_pydantic(gx_suite_mixed)

print("\n4. Round-trip conversion test:")
back_summary = manager.get_suite_summary(pydantic_suite_converted_back)
print("Severity distribution after round-trip:", back_summary.get('severity_distribution', {}))

print("\nRound-trip expectation details:")
for detail in back_summary['expectation_details']:
    print(f"- {detail['id']}: {detail['type']} -> Severity: {detail['severity']}")

print("\n✅ Severity functionality test completed!")

=== Testing Severity Support ===

1. Original LLM suite severity distribution:
Severity distribution: {}

First 3 expectations with severity:


AttributeError: 'ExpectationWithMetadata' object has no attribute 'severity'

In [28]:
# Reload modules to get the updated expectations with severity support
import importlib
import src.expectations
import src.expectation_manager

# Reload the modules
importlib.reload(src.expectations)
importlib.reload(src.expectation_manager)

# Re-import the updated classes
from src.expectations import ExpectationSeverity, ExpectationWithMetadata, GreatExpectationsSuite
from src.expectation_manager import ExpectationManager, create_manager

print("✅ Modules reloaded with severity support!")

# Create a new manager with the updated modules
manager = create_manager()

print("✅ New manager created with severity support!")

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmp9q6havwd' for ephemeral docs site
INFO:src.expectation_manager:Using existing Great Expectations context


✅ Modules reloaded with severity support!
✅ New manager created with severity support!


In [None]:
# Test severity functionality with new updated models
print("=== Testing Updated Severity Support ===")

# Import the required classes
from src.expectations import (
    ExpectColumnToExist, ExpectColumnValuesToNotBeNull, ExpectColumnValuesToBeInSet,
    ExpectationWithMetadata, GreatExpectationsSuite, ExpectationSeverity
)

# Create test expectations with different severities
test_expectations = [
    ExpectationWithMetadata(
        id="critical_column_exists",
        expectation=ExpectColumnToExist(column="order_id"),
        description="Critical: Order ID column must exist",
        source="Business Requirements",
        severity=ExpectationSeverity.CRITICAL
    ),
    ExpectationWithMetadata(
        id="warning_nulls_check",
        expectation=ExpectColumnValuesToNotBeNull(column="customer_id", mostly=0.95),
        description="Warning: Customer ID should mostly not be null",
        source="Data Quality Analysis",
        severity=ExpectationSeverity.WARNING
    ),
    ExpectationWithMetadata(
        id="info_status_values",
        expectation=ExpectColumnValuesToBeInSet(column="status", value_set=["pending", "shipped", "delivered"]),
        description="Info: Status should be one of expected values",
        source="Data Documentation",
        severity=ExpectationSeverity.INFO
    )
]

# Create a test suite
test_suite = GreatExpectationsSuite(expectations=test_expectations)

print("1. Test suite created with mixed severities:")
summary = manager.get_suite_summary(test_suite)
print("Severity distribution:", summary.get('severity_distribution', {}))

print("\nExpectation details:")
for detail in summary['expectation_details']:
    print(f"- {detail['id']}: {detail['type']} -> Severity: {detail['severity']}")

print("\n2. Testing conversion to GX:")
# Convert to GX suite
gx_test_suite = manager.pydantic_to_gx_suite(test_suite, "severity_test_suite")

print("GX expectations with severities:")
for i, gx_exp in enumerate(gx_test_suite.expectations):
    print(f"- {gx_exp.type}: {gx_exp.severity}")

print("\n3. Testing round-trip conversion:")
# Convert back to Pydantic
pydantic_back = manager.gx_suite_to_pydantic(gx_test_suite)

print("Round-trip summary:")
back_summary = manager.get_suite_summary(pydantic_back)
print("Severity distribution:", back_summary.get('severity_distribution', {}))

print("\n4. Testing validation with severities:")
is_valid, validation_report = manager.validate_pydantic_suite(test_suite)
print(f"Suite validation: {'PASS' if is_valid else 'FAIL'}")
print("Validation report summary:", {
    'total_expectations': validation_report.get('total_expectations', 0),
    'severity_info': validation_report.get('pydantic_summary', {}).get('severity_distribution', {})
})

print("\n✅ Severity functionality test completed successfully!")