In [1]:
import os
import json
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import VarianceThreshold
from category_encoders import TargetEncoder
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch

def is_boolean_column(series):
    """Return True when *series* holds only boolean-like values.

    A column qualifies when its dtype is boolean, or when every non-null
    value equals 0 or 1 (as an integer or a float). A column without any
    non-null value also qualifies, since nothing in it breaks the rule.
    """
    if pd.api.types.is_bool_dtype(series):
        return True

    # 0.0 and 1.0 hash and compare equal to 0 and 1, so two entries suffice.
    allowed = {0, 1}
    observed = pd.unique(series.dropna().to_numpy())
    return all(value in allowed for value in observed)

def identify_columns_for_processing(df):
    """Classify the feature columns of *df* as boolean or numeric.

    The target column ``'nivel_triage'`` is skipped. Every other column is
    tested with ``is_boolean_column`` first; columns that fail that test
    but have a numeric dtype are reported as numeric. Columns that are
    neither are left out of the result.

    Returns:
        dict: may contain the keys ``'boolean_cols'`` and ``'numeric_cols'``,
        each mapping to a list of column names in the order they appear in
        *df*. A key is present only when at least one column fell into that
        group, so callers should read it with ``.get(key, [])``.
    """
    column_types = {}
    for col in df.columns:
        if col == 'nivel_triage':
            continue

        series = df[col]
        if is_boolean_column(series):
            column_types.setdefault('boolean_cols', []).append(col)
        # is_numeric_dtype also understands pandas extension dtypes
        # (category, nullable integers, strings), which np.issubdtype
        # cannot interpret and rejects with a TypeError.
        elif pd.api.types.is_numeric_dtype(series):
            column_types.setdefault('numeric_cols', []).append(col)

    return column_types

def batch_safe_transformation(data, cols):
    """Derive transformed features for the columns *cols* of *data*.

    For every column the following features are produced, in this order:

    * ``<col>_log``     -- ``log1p`` of the values, only when all values are >= 0.
    * ``<col>_sqrt``    -- square root, only when all values are > 0.
    * ``<col>_squared`` -- element-wise square, always.
    * ``<col>_zscore``  -- ``(x - mean) / std`` using the population standard
      deviation, always. A column with zero standard deviation gets an
      all-zero z-score instead of the NaN values that 0/0 would produce.

    Args:
        data: DataFrame containing at least the columns in *cols*.
        cols: list of column names to transform.

    Returns:
        pandas.DataFrame indexed like *data* holding only the new features.
    """
    transformations = {}

    # Work on one 2-D array so the sign checks run once for all columns.
    data_array = data[cols].to_numpy()
    non_negative_mask = (data_array >= 0).all(axis=0)
    positive_mask = (data_array > 0).all(axis=0)

    for i, col in enumerate(cols):
        values = data_array[:, i]

        if non_negative_mask[i]:
            transformations[f"{col}_log"] = np.log1p(values)
        if positive_mask[i]:
            transformations[f"{col}_sqrt"] = np.sqrt(values)

        # Squaring is defined for any real input.
        transformations[f"{col}_squared"] = np.square(values)

        mean = np.mean(values)
        std = np.std(values)
        if std == 0:
            # Constant column: every value equals the mean, so the centred
            # value is 0 and dividing by std would only yield NaN.
            zscore = np.zeros(values.shape, dtype=float)
        else:
            zscore = (values - mean) / std
        transformations[f"{col}_zscore"] = zscore

    return pd.DataFrame(transformations, index=data.index)

def generate_safe_interactions(df, numeric_cols, target_col='nivel_triage', batch_size=1000):
    """Build target-encoded interaction features, processing rows in batches.

    For each consecutive slice of *batch_size* rows, the ``TargetEncoder`` is
    refit on that slice alone and the encoded columns are multiplied row-wise
    by the target column. The per-batch results are concatenated and every
    column is renamed to ``<col>_x_<target_col>``.

    Args:
        df: DataFrame holding *numeric_cols* and *target_col*.
        numeric_cols: list of column names passed to the encoder.
        target_col: name of the target column.
        batch_size: number of rows per batch; must be at least 1.

    Returns:
        pandas.DataFrame with one interaction column per encoded column. For
        an empty *df* an empty frame with the expected column names is
        returned.

    Raises:
        ValueError: if *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if len(df) == 0:
        # Nothing to encode; pd.concat would fail on an empty list of batches.
        return pd.DataFrame(
            columns=[f"{col}_x_{target_col}" for col in numeric_cols],
            index=df.index,
        )

    encoder = TargetEncoder(cols=numeric_cols, smoothing=20)

    # NOTE(review): these features are derived from the target and then
    # multiplied by the target itself, so they carry the label directly into
    # the feature set -- confirm this is intended before using the output to
    # train or evaluate a model.
    encoded_dfs = []
    for start_idx in range(0, len(df), batch_size):
        batch_df = df.iloc[start_idx:start_idx + batch_size]

        encoded_batch = encoder.fit_transform(batch_df[numeric_cols], batch_df[target_col])
        interactions_batch = encoded_batch.mul(batch_df[target_col], axis=0)
        encoded_dfs.append(interactions_batch)

    interactions = pd.concat(encoded_dfs)
    interactions.columns = [f"{col}_x_{target_col}" for col in interactions.columns]
    return interactions

def get_column_summary(stage, df, column_types=None):
    """Describe the columns of *df* as a list of report lines.

    The lines cover, in order: a header naming *stage*, the column and row
    counts, one line per dtype with its column count, an optional breakdown
    of boolean/numeric columns (only when *column_types* is non-empty), and
    the mean of the per-column non-null ratios.

    Returns:
        list[str]: the report lines.
    """
    lines = [
        f"Resumen de columnas - {stage}",
        f"Total columnas: {len(df.columns)}",
        f"Muestras: {len(df)}",
    ]

    # One line per dtype found in the frame.
    lines.extend(
        f"  - {dtype}: {count}"
        for dtype, count in df.dtypes.value_counts().items()
    )

    # Boolean/numeric split, when the caller supplied the classification.
    if column_types:
        boolean_total = len(column_types.get('boolean_cols', []))
        numeric_total = len(column_types.get('numeric_cols', []))
        lines += [
            "\nDistribución de columnas:",
            f"  - Booleanas: {boolean_total}",
            f"  - Numéricas: {numeric_total}",
        ]

    # Average over columns of each column's share of non-null values.
    per_column_ratio = df.notna().mean()
    completeness = per_column_ratio.mean()
    lines.append(f"\nCompletitud promedio: {completeness:.2%}")

    return lines

def generate_pdf_report(summaries, output_path):
    """Write the column summaries to a PDF file and return its path.

    The file is named ``feature_engineering_report.pdf`` and is placed in
    *output_path*. It starts with a title and the generation timestamp,
    followed by one section per entry of *summaries*: the key is the section
    heading and the value is rendered as one paragraph per list item (or a
    single paragraph when the value is not a list).
    """
    pdf_path = os.path.join(output_path, "feature_engineering_report.pdf")
    document = SimpleDocTemplate(pdf_path, pagesize=letter)
    base_styles = getSampleStyleSheet()

    # Paragraph styles used throughout the report.
    title_style = ParagraphStyle(
        'CustomTitle', parent=base_styles['Heading1'], fontSize=24, spaceAfter=30
    )
    date_style = ParagraphStyle(
        'DateStyle', parent=base_styles['Normal'], fontSize=12, spaceAfter=30
    )
    section_style = ParagraphStyle(
        'SectionTitle', parent=base_styles['Heading2'], fontSize=16, spaceAfter=12
    )
    content_style = ParagraphStyle(
        'ContentStyle',
        parent=base_styles['Normal'],
        fontSize=12,
        leftIndent=20,
        spaceAfter=6,
    )

    # Front matter: title followed by the generation timestamp.
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    flowables = [
        Paragraph("Reporte de Feature Engineering", title_style),
        Spacer(1, 20),
        Paragraph(f"Generado el: {generated_at}", date_style),
        Spacer(1, 20),
    ]

    # One heading plus body per summary entry.
    for section_title, content in summaries.items():
        flowables.append(Paragraph(section_title, section_style))
        flowables.append(Spacer(1, 10))

        entries = content if isinstance(content, list) else [content]
        flowables.extend(Paragraph(str(entry), content_style) for entry in entries)

        flowables.append(Spacer(1, 20))

    document.build(flowables)
    return pdf_path

def main():
    """Run the feature-engineering pipeline end to end.

    Steps, in order:

    1. Read ``config.json`` from the parent of the current working directory
       and resolve the input/output folders from it.
    2. Load ``df_triage_encoded.parquet`` and classify its columns.
    3. Impute the numeric columns with ``IterativeImputer``.
    4. Derive transformed features in batches of 50 columns.
    5. Derive target-encoded interaction features.
    6. Drop features whose variance does not exceed 0.01.
    7. Write the PDF report and ``df_feateng.parquet`` to the output folder.

    Raises:
        ValueError: if the loaded data has no numeric (non-boolean) columns.
    """
    # Load configuration. JSON files are UTF-8, so do not depend on the
    # platform's default encoding when reading them.
    base_path = os.path.dirname(os.getcwd())
    config_path = os.path.join(base_path, "config.json")

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    output_path = os.path.join(base_path, config["paths"]["intermediate"]["featured"])
    os.makedirs(output_path, exist_ok=True)

    # Report sections, keyed by title; insertion order is the report order.
    summaries = {}

    # Load data
    encoded_path = os.path.join(base_path, config["paths"]["intermediate"]["encoded"], "df_triage_encoded.parquet")
    df = pd.read_parquet(encoded_path)

    # Initial summary
    summaries["Estado Inicial"] = get_column_summary("Inicial", df)
    initial_columns = len(df.columns)

    # 1. Identify the columns to process
    print("Identificando columnas...")
    column_types = identify_columns_for_processing(df)
    numeric_cols = column_types.get('numeric_cols', [])

    summaries["Clasificación de Columnas"] = get_column_summary("Post-clasificación", df, column_types)

    # Every later step needs at least one numeric column; fail with a clear
    # message instead of an error from deep inside the imputer.
    if not numeric_cols:
        raise ValueError("No se encontraron columnas numéricas para procesar.")

    # 2. Multivariate imputation on the numeric, non-boolean columns only
    print("Realizando imputación...")
    imputer = IterativeImputer(max_iter=15, random_state=42)
    df_numeric_imputed = pd.DataFrame(
        imputer.fit_transform(df[numeric_cols]),
        columns=numeric_cols,
        index=df.index
    )

    # 3. Feature generation in batches
    print("Generando características...")
    batch_size = 50  # Process 50 columns at a time
    new_features_dfs = []

    for i in range(0, len(numeric_cols), batch_size):
        batch_cols = numeric_cols[i:i + batch_size]
        batch_features = batch_safe_transformation(df_numeric_imputed, batch_cols)
        new_features_dfs.append(batch_features)

    new_features = pd.concat(new_features_dfs, axis=1)

    summaries["Generación de Features"] = [
        f"Features generadas: {len(new_features.columns)}",
        "Tipos de transformaciones:",
        "  - Logarítmicas (log1p)",
        "  - Raíz cuadrada",
        "  - Cuadráticas",
        "  - Z-score"
    ]

    # 4. Interaction features
    print("Generando interacciones...")
    safe_interactions = generate_safe_interactions(
        pd.concat([df_numeric_imputed, df[['nivel_triage']]], axis=1),
        numeric_cols
    )

    summaries["Interacciones"] = [
        f"Interacciones generadas: {len(safe_interactions.columns)}",
        "Método: Target Encoding con interacciones"
    ]

    # 5. Feature selection
    print("Seleccionando características...")
    full_set = pd.concat([
        df_numeric_imputed,
        new_features,
        safe_interactions,
        df[column_types.get('boolean_cols', [])],  # Keep boolean columns unprocessed
    ], axis=1)

    selector = VarianceThreshold(threshold=0.01)
    selected = selector.fit_transform(full_set)
    selected_cols = full_set.columns[selector.get_support()]

    # 6. Final dataset: target column followed by the selected features
    df_feateng = pd.concat([
        df[['nivel_triage']],
        pd.DataFrame(selected, columns=selected_cols, index=df.index)
    ], axis=1)

    # Final summary
    summaries["Resumen Final"] = get_column_summary("Final", df_feateng)

    summaries["Resumen Comparativo"] = [
        f"Columnas iniciales: {initial_columns}",
        f"Features generadas: {len(new_features.columns)}",
        f"Interacciones generadas: {len(safe_interactions.columns)}",
        f"Columnas finales: {len(df_feateng.columns)}"
    ]

    # Generate the PDF report
    print("Generando reporte PDF...")
    pdf_path = generate_pdf_report(summaries, output_path)

    # Save the dataset
    output_file = os.path.join(output_path, "df_feateng.parquet")
    pq.write_table(pa.Table.from_pandas(df_feateng), output_file)

    print("\n✅ Feature engineering completado.")
    print(f"Dataset guardado en: {output_file}")
    print(f"Reporte PDF generado en: {pdf_path}")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Identificando columnas...
Realizando imputación...
Generando características...
Generando interacciones...
Seleccionando características...
Generando reporte PDF...

✅ Feature engineering completado.
Dataset guardado en: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\intermediate/featured\df_feateng.parquet
Reporte PDF generado en: c:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\intermediate/featured\feature_engineering_report.pdf
