In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluación de Datos Sintéticos\n",
    "\n",
    "Este notebook permite evaluar y visualizar en detalle la calidad de los datos sintéticos generados con el modelo Chronos-T5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Importar bibliotecas necesarias\n",
    "import os\n",
    "import sys\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import yaml\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.decomposition import PCA\n",
    "from scipy.stats import ks_2samp, wasserstein_distance\n",
    "\n",
    "# Añadir el directorio principal al path para importar módulos del proyecto\n",
    "sys.path.append('..')\n",
    "from src.evaluation import SyntheticDataEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load the project configuration.\n",
    "# A single path constant keeps the YAML read and the evaluator pointed at the same file.\n",
    "CONFIG_PATH = '../config/config.yaml'\n",
    "\n",
    "# Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.\n",
    "with open(CONFIG_PATH, 'r', encoding='utf-8') as file:\n",
    "    config = yaml.safe_load(file)\n",
    "\n",
    "# Initialise the synthetic-data evaluator from the same configuration file\n",
    "evaluator = SyntheticDataEvaluator(CONFIG_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Carga de Datos\n",
    "\n",
    "A continuación, cargaremos los datos reales y sintéticos para su evaluación."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# File names to compare; change these to match your own data files\n",
    "real_data_file = \"test_data.csv\"\n",
    "synthetic_data_file = \"synthetic_data.csv\"  # name of the generated synthetic file\n",
    "\n",
    "# Load both datasets through the evaluator\n",
    "try:\n",
    "    real_df, synthetic_df = evaluator.load_datasets(real_data_file, synthetic_data_file)\n",
    "except FileNotFoundError as e:\n",
    "    print(f\"Error: {str(e)}\")\n",
    "    print(\"Asegúrate de que los archivos existen en los directorios correctos.\")\n",
    "    # Every later cell needs real_df / synthetic_df. Re-raise so \"Run All\" stops here\n",
    "    # instead of failing further down with a NameError, or silently reusing frames\n",
    "    # left in the kernel by a previous run.\n",
    "    raise\n",
    "else:\n",
    "    print(f\"Datos reales: {real_df.shape[0]} filas, {real_df.shape[1]} columnas\")\n",
    "    print(f\"Datos sintéticos: {synthetic_df.shape[0]} filas, {synthetic_df.shape[1]} columnas\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Preview the first rows of each dataset, real first and synthetic second\n",
    "samples = [\n",
    "    (\"Muestra de datos reales:\", real_df),\n",
    "    (\"\\nMuestra de datos sintéticos:\", synthetic_df),\n",
    "]\n",
    "for heading, frame in samples:\n",
    "    print(heading)\n",
    "    display(frame.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluación Estadística\n",
    "\n",
    "Evaluaremos la similitud estadística entre los datos reales y sintéticos."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Compute the statistical-similarity metrics for the two datasets\n",
    "statistical_results = evaluator.evaluate_statistical_similarity(real_df, synthetic_df)\n",
    "\n",
    "# Flatten the nested results into one record per (column, statistic) pair\n",
    "stats_columns = ['Columna', 'Estadística', 'Diferencia (%)', 'Valor Real', 'Valor Sintético']\n",
    "stats_records = [\n",
    "    {\n",
    "        'Columna': col,\n",
    "        'Estadística': stat,\n",
    "        'Diferencia (%)': diff,\n",
    "        'Valor Real': results['real_stats'][stat],\n",
    "        'Valor Sintético': results['synth_stats'][stat],\n",
    "    }\n",
    "    for col, results in statistical_results.items()\n",
    "    for stat, diff in results['percent_diff'].items()\n",
    "]\n",
    "\n",
    "# Passing the column list explicitly keeps the expected columns even when no metrics\n",
    "# were produced, so the sort below cannot fail with a KeyError on an empty frame.\n",
    "stats_df = pd.DataFrame(stats_records, columns=stats_columns)\n",
    "\n",
    "# Show the results, largest difference first\n",
    "display(stats_df.sort_values('Diferencia (%)', ascending=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Per-column summary: mean/std percentage differences plus the KS and Wasserstein values\n",
    "column_rows = [\n",
    "    {\n",
    "        'Columna': name,\n",
    "        'Diferencia Media (%)': metrics['percent_diff']['mean'],\n",
    "        'Diferencia Std (%)': metrics['percent_diff']['std'],\n",
    "        'Test KS (p-valor)': metrics['ks_pvalue'],\n",
    "        'Distancia Wasserstein': metrics['wasserstein_distance'],\n",
    "    }\n",
    "    for name, metrics in statistical_results.items()\n",
    "]\n",
    "diff_df = pd.DataFrame(column_rows).sort_values('Diferencia Media (%)')\n",
    "\n",
    "# Grouped bar chart: two bars per column, placed either side of the tick position\n",
    "bar_width = 0.35\n",
    "x = np.arange(len(diff_df))\n",
    "half = bar_width / 2\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 8))\n",
    "ax.bar(x - half, diff_df['Diferencia Media (%)'], bar_width, label='Diferencia Media (%)')\n",
    "ax.bar(x + half, diff_df['Diferencia Std (%)'], bar_width, label='Diferencia Std (%)')\n",
    "\n",
    "ax.set_xlabel('Columna')\n",
    "ax.set_ylabel('Diferencia (%)')\n",
    "ax.set_title('Diferencias Estadísticas entre Datos Reales y Sintéticos')\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(diff_df['Columna'], rotation=45, ha='right')\n",
    "ax.legend()\n",
    "ax.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Show the full table underneath the chart\n",
    "display(diff_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparación de Distribuciones\n",
    "\n",
    "Visualizaremos y compararemos las distribuciones de los datos reales y sintéticos."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Numeric columns of the real data that also exist in the synthetic data\n",
    "numeric_cols = real_df.select_dtypes(include=['number']).columns.intersection(synthetic_df.columns)\n",
    "cols_to_plot = numeric_cols[:5]  # at most 5 columns; slicing clamps to the available length\n",
    "\n",
    "# Ask the evaluator to render the distribution plots\n",
    "evaluator.visualize_distributions(real_df, synthetic_df, cols_to_plot)\n",
    "\n",
    "# Display each distribution image found in the evaluator's results directory\n",
    "for col in cols_to_plot:\n",
    "    img_path = os.path.join(evaluator.results_path, f'distribution_{col}.png')\n",
    "    try:\n",
    "        img = plt.imread(img_path)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Archivo de imagen para columna '{col}' no encontrado\")\n",
    "        continue\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    plt.imshow(img)\n",
    "    plt.axis('off')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Compare categorical distributions for columns present in both datasets\n",
    "categorical_cols = real_df.select_dtypes(include=['object', 'category']).columns.intersection(synthetic_df.columns)\n",
    "\n",
    "for col in categorical_cols[:3]:  # at most 3 columns\n",
    "    # Skip columns with 20 or more distinct values\n",
    "    if real_df[col].nunique() >= 20:\n",
    "        continue\n",
    "\n",
    "    # Share of each category, in percent\n",
    "    real_counts = real_df[col].value_counts(normalize=True) * 100\n",
    "    synth_counts = synthetic_df[col].value_counts(normalize=True) * 100\n",
    "\n",
    "    # One panel per dataset: real on the left, synthetic on the right\n",
    "    fig, ax = plt.subplots(1, 2, figsize=(14, 6))\n",
    "    panels = (\n",
    "        (ax[0], real_counts, 'blue', f'Distribución Real: {col}'),\n",
    "        (ax[1], synth_counts, 'green', f'Distribución Sintética: {col}'),\n",
    "    )\n",
    "    for panel, counts, color, title in panels:\n",
    "        counts.plot(kind='bar', ax=panel, color=color, alpha=0.7)\n",
    "        panel.set_title(title)\n",
    "        panel.set_ylabel('Porcentaje (%)')\n",
    "        panel.tick_params(axis='x', rotation=45)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Comparison table; a category absent from one dataset counts as 0 %\n",
    "    comp_df = pd.DataFrame({'Real (%)': real_counts, 'Sintético (%)': synth_counts})\n",
    "    comp_df = comp_df.fillna(0).sort_values('Real (%)', ascending=False)\n",
    "    comp_df['Diferencia (pp)'] = (comp_df['Sintético (%)'] - comp_df['Real (%)']).abs()\n",
    "    display(comp_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualización de Embeddings\n",
    "\n",
    "Visualizaremos los datos reales y sintéticos en un espacio de menor dimensión para comparar sus estructuras."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Build the t-SNE comparison through the evaluator\n",
    "evaluator.visualize_embedding_comparison(real_df, synthetic_df)\n",
    "\n",
    "# Display the t-SNE image from the evaluator's results directory, if it exists\n",
    "img_path = os.path.join(evaluator.results_path, 'tsne_comparison.png')\n",
    "try:\n",
    "    img = plt.imread(img_path)\n",
    "except FileNotFoundError:\n",
    "    print(\"Archivo de imagen t-SNE no encontrado\")\n",
    "else:\n",
    "    plt.figure(figsize=(10, 8))\n",
    "    plt.imshow(img)\n",
    "    plt.axis('off')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Alternative embedding view using PCA\n",
    "numeric_cols = real_df.select_dtypes(include=['number']).columns.intersection(synthetic_df.columns)\n",
    "\n",
    "if len(numeric_cols) >= 2:\n",
    "    # Missing values become 0; each dataset is capped at its first 1000 rows for speed\n",
    "    max_samples = 1000\n",
    "    real_data = real_df[numeric_cols].fillna(0).values[:max_samples]\n",
    "    synth_data = synthetic_df[numeric_cols].fillna(0).values[:max_samples]\n",
    "\n",
    "    # Fit a single PCA on the stacked rows so both datasets share one projection\n",
    "    n_real = len(real_data)\n",
    "    combined_data = np.vstack([real_data, synth_data])\n",
    "    pca = PCA(n_components=2, random_state=42)\n",
    "    embedded_data = pca.fit_transform(combined_data)\n",
    "\n",
    "    # Split the projection back into its real and synthetic parts\n",
    "    real_embedded = embedded_data[:n_real]\n",
    "    synth_embedded = embedded_data[n_real:]\n",
    "\n",
    "    # Scatter both sets on the same axes\n",
    "    plt.figure(figsize=(10, 8))\n",
    "    groups = (\n",
    "        (real_embedded, 'Real', 'blue'),\n",
    "        (synth_embedded, 'Sintético', 'green'),\n",
    "    )\n",
    "    for points, label, color in groups:\n",
    "        plt.scatter(points[:, 0], points[:, 1], alpha=0.5, label=label, color=color)\n",
    "\n",
    "    var_pc1 = pca.explained_variance_ratio_[0]\n",
    "    var_pc2 = pca.explained_variance_ratio_[1]\n",
    "    plt.title('Comparación de datos reales vs sintéticos (PCA)')\n",
    "    plt.xlabel(f'Componente Principal 1 ({var_pc1:.2%} var)')\n",
    "    plt.ylabel(f'Componente Principal 2 ({var_pc2:.2%} var)')\n",
    "    plt.legend()\n",
    "    plt.grid(alpha=0.3)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluación de Privacidad\n",
    "\n",
    "Evaluaremos el riesgo de divulgación de información confidencial en los datos sintéticos."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Run the evaluator's privacy check\n",
    "privacy_results = evaluator.evaluate_privacy(real_df, synthetic_df)\n",
    "\n",
    "# Print every metric it returned, one per line\n",
    "print(\"Evaluación de Privacidad:\")\n",
    "for metric, score in privacy_results.items():\n",
    "    line = f\"  - {metric}: {score}\"\n",
    "    print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Visualise the privacy risk as a single horizontal bar\n",
    "risk_percentage = privacy_results['duplicate_percentage']\n",
    "\n",
    "# Colour the bar using the same 1 % / 5 % thresholds drawn as guide lines below\n",
    "if risk_percentage < 1:\n",
    "    bar_color = 'green'\n",
    "elif risk_percentage < 5:\n",
    "    bar_color = 'orange'\n",
    "else:\n",
    "    bar_color = 'red'\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.barh(['Riesgo de Privacidad'], [risk_percentage], color=bar_color)\n",
    "\n",
    "# Keep the 0-10 % window for small values, but widen it when duplication exceeds 10 %:\n",
    "# a fixed limit would clip the bar and push the value label off the chart.\n",
    "plt.xlim(0, max(10, risk_percentage * 1.1))\n",
    "plt.xlabel('Porcentaje de Duplicación (%)')\n",
    "plt.title('Evaluación de Riesgo de Privacidad')\n",
    "\n",
    "# Threshold guide lines\n",
    "plt.axvline(x=1, color='orange', linestyle='--', alpha=0.7, label='Umbral Medio (1%)')\n",
    "plt.axvline(x=5, color='red', linestyle='--', alpha=0.7, label='Umbral Alto (5%)')\n",
    "plt.legend()\n",
    "\n",
    "# Label the end of the bar with the exact value\n",
    "plt.annotate(f'{risk_percentage:.2f}%',\n",
    "             xy=(risk_percentage, 0),\n",
    "             xytext=(5, 5),\n",
    "             textcoords='offset points',\n",
    "             ha='center',\n",
    "             fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Análisis Adicional: Correlaciones\n",
    "\n",
    "Compararemos la matriz de correlación entre datos reales y sintéticos para evaluar si se mantienen las relaciones entre variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Compare the correlation matrices of the real and synthetic data\n",
    "numeric_cols = real_df.select_dtypes(include=['number']).columns.intersection(synthetic_df.columns)\n",
    "\n",
    "if len(numeric_cols) > 1:\n",
    "    # Keep full precision here. Rounding before subtracting would distort the\n",
    "    # differences and the threshold count; rounding is applied only for display.\n",
    "    real_corr = real_df[numeric_cols].corr()\n",
    "    synth_corr = synthetic_df[numeric_cols].corr()\n",
    "    corr_diff = (synth_corr - real_corr).abs()\n",
    "\n",
    "    # Three heatmaps side by side: real, synthetic, absolute difference\n",
    "    fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n",
    "\n",
    "    sns.heatmap(real_corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=axes[0])\n",
    "    axes[0].set_title('Correlación Real')\n",
    "\n",
    "    sns.heatmap(synth_corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=axes[1])\n",
    "    axes[1].set_title('Correlación Sintética')\n",
    "\n",
    "    # Two correlations can differ by at most 2 (from -1 to +1)\n",
    "    sns.heatmap(corr_diff, annot=True, fmt='.2f', cmap='Reds', vmin=0, vmax=2, ax=axes[2])\n",
    "    axes[2].set_title('Diferencia Absoluta')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Use each pair of columns once (upper triangle, diagonal excluded). Pairs whose\n",
    "    # correlation is undefined (NaN, e.g. a constant column) are left out so they do\n",
    "    # not turn the summary statistics into NaN.\n",
    "    pair_diffs = corr_diff.values[np.triu_indices_from(corr_diff.values, k=1)]\n",
    "    pair_diffs = pair_diffs[~np.isnan(pair_diffs)]\n",
    "\n",
    "    if pair_diffs.size > 0:\n",
    "        mean_diff = pair_diffs.mean()\n",
    "        max_diff = pair_diffs.max()\n",
    "\n",
    "        print(f\"Diferencia media de correlación: {mean_diff:.4f}\")\n",
    "        print(f\"Diferencia máxima de correlación: {max_diff:.4f}\")\n",
    "\n",
    "        # Count the pairs whose correlation changed by more than the threshold\n",
    "        threshold = 0.2\n",
    "        num_large_diffs = (pair_diffs > threshold).sum()\n",
    "        total_corrs = len(pair_diffs)\n",
    "\n",
    "        print(f\"Correlaciones con diferencia > {threshold}: {num_large_diffs} de {total_corrs} ({num_large_diffs/total_corrs:.1%})\")\n",
    "    else:\n",
    "        print(\"No se pudieron calcular diferencias de correlación válidas (valores no definidos)\")\n",
    "else:\n",
    "    print(\"No hay suficientes columnas numéricas para calcular correlaciones\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Análisis de Series Temporales (si aplica)\n",
    "\n",
    "Si los datos contienen información temporal, podemos analizar cómo se preservan los patrones temporales."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Candidate date/time columns, detected by name, that exist in both datasets\n",
    "date_cols = [\n",
    "    col for col in real_df.columns\n",
    "    if col in synthetic_df.columns\n",
    "    and ('date' in str(col).lower() or 'time' in str(col).lower())\n",
    "]\n",
    "\n",
    "if date_cols:\n",
    "    date_col = date_cols[0]  # use the first match\n",
    "\n",
    "    # Pick the value column here instead of relying on numeric_cols from an earlier\n",
    "    # cell, and never plot the date column against itself.\n",
    "    value_cols = [\n",
    "        col for col in real_df.select_dtypes(include=['number']).columns\n",
    "        if col in synthetic_df.columns and col != date_col\n",
    "    ]\n",
    "    # Compare against None explicitly: a valid column name can be falsy (e.g. 0)\n",
    "    numeric_col = value_cols[0] if value_cols else None\n",
    "\n",
    "    if numeric_col is None:\n",
    "        print(\"No hay columnas numéricas comunes para graficar series temporales\")\n",
    "    else:\n",
    "        try:\n",
    "            # Parse dates on copies so real_df / synthetic_df are not modified and\n",
    "            # re-running earlier cells still sees the original columns.\n",
    "            real_temp = real_df[[date_col, numeric_col]].copy()\n",
    "            synth_temp = synthetic_df[[date_col, numeric_col]].copy()\n",
    "            real_temp[date_col] = pd.to_datetime(real_temp[date_col])\n",
    "            synth_temp[date_col] = pd.to_datetime(synth_temp[date_col])\n",
    "        except (ValueError, TypeError, OverflowError) as e:\n",
    "            # The name looked like a date but the values could not be parsed\n",
    "            print(f\"Error al procesar series temporales: {str(e)}\")\n",
    "        else:\n",
    "            # Sort chronologically before drawing the lines\n",
    "            real_temp = real_temp.sort_values(date_col)\n",
    "            synth_temp = synth_temp.sort_values(date_col)\n",
    "\n",
    "            plt.figure(figsize=(12, 6))\n",
    "            plt.plot(real_temp[date_col], real_temp[numeric_col], label='Real', alpha=0.7)\n",
    "            plt.plot(synth_temp[date_col], synth_temp[numeric_col], label='Sintético', alpha=0.7)\n",
    "\n",
    "            plt.title(f'Comparación de Series Temporales: {numeric_col}')\n",
    "            plt.xlabel('Fecha')\n",
    "            plt.ylabel(numeric_col)\n",
    "            plt.legend()\n",
    "            plt.grid(alpha=0.3)\n",
    "            plt.xticks(rotation=45)\n",
    "            plt.tight_layout()\n",
    "            plt.show()\n",
    "else:\n",
    "    print(\"No se detectaron columnas de fecha/tiempo en los datos\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluación Integral\n",
    "\n",
    "Finalmente, ejecutaremos la evaluación integral proporcionada por el módulo de evaluación."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Run the evaluator's full pipeline on the same pair of files\n",
    "all_results = evaluator.run_evaluation(real_data_file, synthetic_data_file)\n",
    "\n",
    "# Print a summary of the sections present in the results\n",
    "print(\"\\nResumen de Evaluación:\")\n",
    "\n",
    "if 'statistical_similarity' in all_results:\n",
    "    print(\"\\n1. Similitud Estadística:\")\n",
    "    avg_diffs = []\n",
    "    for col, results in all_results['statistical_similarity'].items():\n",
    "        # Plain average of this column's percentage differences\n",
    "        pct_diffs = results['percent_diff']\n",
    "        avg_diff = sum(pct_diffs.values()) / len(pct_diffs)\n",
    "        avg_diffs.append(avg_diff)\n",
    "        print(f\"  - {col}: Diferencia media {avg_diff:.2f}%, p-valor KS: {results['ks_pvalue']:.4f}\")\n",
    "\n",
    "    # Average across columns; 0 when there were no columns\n",
    "    if avg_diffs:\n",
    "        overall_diff = sum(avg_diffs) / len(avg_diffs)\n",
    "    else:\n",
    "        overall_diff = 0\n",
    "\n",
    "    # Quality bands: below 10 % high, below 30 % medium, otherwise low\n",
    "    if overall_diff < 10:\n",
    "        quality_label = 'Alta'\n",
    "    elif overall_diff < 30:\n",
    "        quality_label = 'Media'\n",
    "    else:\n",
    "        quality_label = 'Baja'\n",
    "\n",
    "    print(f\"\\n  Diferencia estadística promedio global: {overall_diff:.2f}%\")\n",
    "    print(f\"  Calidad estadística: {quality_label}\")\n",
    "\n",
    "if 'privacy' in all_results:\n",
    "    print(\"\\n2. Evaluación de Privacidad:\")\n",
    "    for metric, score in all_results['privacy'].items():\n",
    "        print(f\"  - {metric}: {score}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusiones y Recomendaciones\n",
    "\n",
    "Basándonos en los análisis anteriores, podemos extraer las siguientes conclusiones:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Turn the evaluation results into short written conclusions.\n",
    "# Each section is reported only when its results exist, so a missing section\n",
    "# can neither raise a NameError nor reuse stale values from an earlier run.\n",
    "try:\n",
    "    print(\"### Conclusiones:\")\n",
    "\n",
    "    # --- Statistical quality ---\n",
    "    print(\"\\n#### Calidad Estadística:\")\n",
    "    avg_diffs = []\n",
    "    ks_pvalues = []\n",
    "    if 'statistical_similarity' in all_results:\n",
    "        for col, results in all_results['statistical_similarity'].items():\n",
    "            avg_diff = sum(results['percent_diff'].values()) / len(results['percent_diff'])\n",
    "            avg_diffs.append(avg_diff)\n",
    "            ks_pvalues.append(results['ks_pvalue'])\n",
    "\n",
    "    has_stats = bool(avg_diffs)\n",
    "    if has_stats:\n",
    "        overall_diff = sum(avg_diffs) / len(avg_diffs)\n",
    "        quality_level = 'Alta' if overall_diff < 10 else 'Media' if overall_diff < 30 else 'Baja'\n",
    "        print(f\"- Los datos sintéticos tienen una calidad estadística **{quality_level}** (diferencia promedio: {overall_diff:.2f}%)\")\n",
    "\n",
    "        # Judge the KS test per column. Averaging p-values and comparing the mean with\n",
    "        # 0.05 is not a valid test: one large p-value can hide several columns whose\n",
    "        # distributions clearly differ.\n",
    "        alpha = 0.05\n",
    "        num_rejected = sum(p < alpha for p in ks_pvalues)\n",
    "        if num_rejected > 0:\n",
    "            print(f\"- Las distribuciones de datos reales y sintéticos son **estadísticamente diferentes** en {num_rejected} de {len(ks_pvalues)} columnas según la prueba KS (p < {alpha})\")\n",
    "        else:\n",
    "            print(\"- Las distribuciones de datos reales y sintéticos son **estadísticamente similares** según la prueba KS\")\n",
    "    else:\n",
    "        print(\"- No hay resultados de similitud estadística disponibles\")\n",
    "\n",
    "    # --- Privacy ---\n",
    "    print(\"\\n#### Privacidad:\")\n",
    "    has_privacy = 'privacy' in all_results\n",
    "    if has_privacy:\n",
    "        privacy_risk = all_results['privacy']['privacy_risk']\n",
    "        duplicate_percentage = all_results['privacy']['duplicate_percentage']\n",
    "        print(f\"- El riesgo de privacidad es **{privacy_risk}** con {duplicate_percentage:.2f}% de duplicación\")\n",
    "    else:\n",
    "        print(\"- No hay resultados de privacidad disponibles\")\n",
    "\n",
    "    # --- Recommendations ---\n",
    "    print(\"\\n#### Recomendaciones:\")\n",
    "\n",
    "    if has_stats and overall_diff > 20:\n",
    "        print(\"- **Ajustar los parámetros del modelo**: Considerar modificar parámetros como temperature, top_k o top_p\")\n",
    "        print(\"- **Incrementar datos de entrenamiento**: Usar más datos para mejorar la calidad de generación\")\n",
    "\n",
    "    if has_privacy and duplicate_percentage > 3:\n",
    "        print(\"- **Mejorar privacidad**: Aumentar la temperatura o diversidad en la generación para reducir duplicados\")\n",
    "\n",
    "    print(\"- **Evaluación continua**: Realizar evaluaciones periódicas con diferentes configuraciones del modelo\")\n",
    "    print(\"- **Validación específica**: Realizar pruebas adicionales para los casos de uso específicos de los datos\")\n",
    "\n",
    "except (KeyError, TypeError, ValueError, ZeroDivisionError) as e:\n",
    "    # The results did not have the expected structure; fall back to manual review\n",
    "    print(f\"No se pudieron generar conclusiones automáticas: {str(e)}\")\n",
    "    print(\"Por favor, revisa manualmente los resultados para extraer conclusiones.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exportación de Resultados\n",
    "\n",
    "Podemos exportar los resultados detallados para su uso posterior."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Export the tables and a short markdown report for later use\n",
    "output_dir = os.path.join('..', config['data']['synthetic_path'], 'reports')\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "try:\n",
    "    # Tables are exported only if the cells that build them have been run\n",
    "    if 'stats_df' in globals():\n",
    "        stats_path = os.path.join(output_dir, 'statistical_comparison.csv')\n",
    "        stats_df.to_csv(stats_path, index=False)\n",
    "        print(f\"Estadísticas exportadas a {stats_path}\")\n",
    "\n",
    "    if 'diff_df' in globals():\n",
    "        diff_path = os.path.join(output_dir, 'column_differences.csv')\n",
    "        diff_df.to_csv(diff_path, index=False)\n",
    "        print(f\"Diferencias por columna exportadas a {diff_path}\")\n",
    "\n",
    "    # Assemble the whole report in memory first, so an error while formatting a\n",
    "    # value cannot leave a half-written file on disk.\n",
    "    report_lines = [\n",
    "        \"# Informe de Evaluación de Datos Sintéticos\\n\\n\",\n",
    "        f\"Fecha: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\\n\\n\",\n",
    "        \"## Resumen\\n\\n\",\n",
    "        f\"- Archivo real: {real_data_file}\\n\",\n",
    "        f\"- Archivo sintético: {synthetic_data_file}\\n\",\n",
    "        f\"- Filas reales: {len(real_df)}\\n\",\n",
    "        f\"- Filas sintéticas: {len(synthetic_df)}\\n\\n\",\n",
    "    ]\n",
    "\n",
    "    if 'overall_diff' in globals():\n",
    "        # Derive the quality level here (same 10 % / 30 % bands as the conclusions)\n",
    "        # instead of assuming the conclusions cell ran and defined it.\n",
    "        report_quality = 'Alta' if overall_diff < 10 else 'Media' if overall_diff < 30 else 'Baja'\n",
    "        report_lines += [\n",
    "            \"## Calidad Estadística\\n\\n\",\n",
    "            f\"- Diferencia estadística promedio: {overall_diff:.2f}%\\n\",\n",
    "            f\"- Nivel de calidad: {report_quality}\\n\\n\",\n",
    "        ]\n",
    "\n",
    "    # Both privacy values are needed, so check for both names\n",
    "    if 'privacy_risk' in globals() and 'duplicate_percentage' in globals():\n",
    "        report_lines += [\n",
    "            \"## Privacidad\\n\\n\",\n",
    "            f\"- Riesgo de privacidad: {privacy_risk}\\n\",\n",
    "            f\"- Porcentaje de duplicación: {duplicate_percentage:.2f}%\\n\\n\",\n",
    "        ]\n",
    "\n",
    "    report_lines += [\n",
    "        \"## Recomendaciones\\n\\n\",\n",
    "        \"1. Ajustar parámetros del modelo según sea necesario\\n\",\n",
    "        \"2. Evaluar el rendimiento en casos de uso específicos\\n\",\n",
    "        \"3. Realizar pruebas adicionales con diferentes configuraciones\\n\",\n",
    "    ]\n",
    "\n",
    "    # Write as UTF-8 explicitly: the report contains accented characters and the\n",
    "    # platform default encoding is not UTF-8 everywhere.\n",
    "    report_path = os.path.join(output_dir, 'synthetic_data_report.md')\n",
    "    with open(report_path, 'w', encoding='utf-8') as f:\n",
    "        f.writelines(report_lines)\n",
    "\n",
    "    print(f\"Informe resumido exportado a {report_path}\")\n",
    "\n",
    "except (OSError, ValueError, TypeError) as e:\n",
    "    print(f\"Error al exportar resultados: {str(e)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}