In [None]:
# Importación de bibliotecas para preprocesamiento
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, RobustScaler, 
                                 LabelEncoder, OneHotEncoder, OrdinalEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import warnings

# Silence every warning category for the rest of the session so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides library warnings raised by later cells — confirm that is intended.
warnings.simplefilter('ignore')

# Confirm that the setup cell ran to completion
for status_line in (
    "✅ Bibliotecas de preprocesamiento importadas correctamente",
    "🔧 Listo para implementar pipelines de transformación",
):
    print(status_line)


In [None]:
# Load the dataset produced by the EDA notebook; if that file is missing,
# rebuild the same derived columns from the raw retail sales file.
try:
    df = pd.read_csv('../data_processed_eda.csv')
    print("✅ Dataset del EDA cargado correctamente")
except FileNotFoundError:
    # Fallback: start from the raw dataset and re-apply the EDA transformations
    df = pd.read_csv('../retail_sales_dataset.csv')

    # Parse Date so that the .dt accessor can be used below
    df['Date'] = pd.to_datetime(df['Date'])

    # Calendar features (column order is kept: it determines feature order later on)
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Month_Name'] = df['Date'].dt.strftime('%B')
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayName'] = df['Date'].dt.strftime('%A')
    df['Quarter'] = df['Date'].dt.quarter

    def clasificador_ventas(amount):
        """Bucket a sale amount: 'Alta' (>= 1000), 'Media' (>= 300), otherwise 'Baja'."""
        if amount >= 1000:
            return 'Alta'
        if amount >= 300:
            return 'Media'
        return 'Baja'

    def clasificador_edad(age):
        """Bucket an age: 'Adulto Mayor' (>= 50), 'Adulto' (30 to 49), otherwise 'Joven'."""
        if age >= 50:
            return 'Adulto Mayor'
        if 30 <= age < 50:
            return 'Adulto'
        return 'Joven'

    # Derived categorical columns
    df['Sales_Category'] = df['Total Amount'].apply(clasificador_ventas)
    df['Age_Group'] = df['Age'].apply(clasificador_edad)

    # Min-Max normalisation of the sale amount
    min_sales = df['Total Amount'].min()
    max_sales = df['Total Amount'].max()
    df['Total_Amount_Normalized'] = (
        (df['Total Amount'] - min_sales) / (max_sales - min_sales)
    )

    print("✅ Transformaciones del EDA aplicadas al dataset original")

# Quick summary of what was loaded
n_columns = len(df.columns)
print(f"📊 Dimensiones del dataset: {df.shape}")
print(f"📋 Columnas disponibles: {n_columns}")
print("\n🔍 Primeras filas:")
df.head()


In [None]:
# Describe the machine-learning problems this dataset supports, select the one
# used in the rest of the notebook, and split the candidate features by dtype.

print("🎯 DEFINICIÓN DE PROBLEMAS DE MACHINE LEARNING")
print("="*55)

# Problem 1: multiclass classification of the sales category
# Target: Sales_Category (Alta, Media, Baja)
print("\n📊 Problema 1: Clasificación de Categorías de Venta")
print("   🎯 Target: Sales_Category (Alta, Media, Baja)")
print("   📈 Tipo: Clasificación multiclase")

# Problem 2: regression on Total Amount
print("\n💰 Problema 2: Predicción del Monto Total de Venta")
print("   🎯 Target: Total Amount (variable continua)")
print("   📈 Tipo: Regresión")

# Problem 3: binary classification of high sales
# Build the binary flag for sales >= 1000
df['High_Sales'] = (df['Total Amount'] >= 1000).astype(int)
print("\n🚀 Problema 3: Predicción de Ventas Altas (Binary)")
print("   🎯 Target: High_Sales (1: >= $1000, 0: < $1000)")
print("   📈 Tipo: Clasificación binaria")

# Main problem for this notebook: classification of Sales_Category
target_variable = 'Sales_Category'
print(f"\n✅ PROBLEMA SELECCIONADO: Clasificación de {target_variable}")

# Columns that must not be used as model inputs
exclude_columns = [
    'Transaction ID', 'Customer ID', 'Date',  # identifiers and raw dates
    'Total Amount', 'Total_Amount_Normalized',  # would leak the target
    'Sales_Category', 'High_Sales'  # target variables
]

# Every remaining column is a candidate feature (DataFrame column order is kept)
features_all = [col for col in df.columns if col not in exclude_columns]
print(f"\n📋 FEATURES DISPONIBLES ({len(features_all)}):")
for i, feature in enumerate(features_all, 1):
    print(f"   {i:2d}. {feature}")

# Split the features by dtype: numeric columns vs. object (string) columns
numeric_features = df[features_all].select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df[features_all].select_dtypes(include=['object']).columns.tolist()

print(f"\n🔢 FEATURES NUMÉRICAS ({len(numeric_features)}):")
for feature in numeric_features:
    print(f"   • {feature}")

print(f"\n🏷️ FEATURES CATEGÓRICAS ({len(categorical_features)}):")
for feature in categorical_features:
    unique_values = df[feature].nunique()
    print(f"   • {feature} ({unique_values} valores únicos)")

# Class balance of the selected target
print(f"\n🎯 DISTRIBUCIÓN DE LA VARIABLE TARGET ({target_variable}):")
target_distribution = df[target_variable].value_counts()
for category, count in target_distribution.items():
    percentage = (count / len(df)) * 100
    # Fixed: the original line ended with a stray '"' after the closing
    # parenthesis, which made the whole cell fail with a SyntaxError.
    print(f"   📊 {category}: {count} ({percentage:.1f}%)")


In [None]:
# Build the candidate preprocessing pipelines: one ColumnTransformer per
# scaler/encoder combination, all operating on the same feature lists.

print("🔧 CREACIÓN DE PIPELINES DE PREPROCESAMIENTO")
print("="*50)


def _numeric_pipeline(scaler):
    """Return a numeric pipeline: median imputation followed by `scaler`."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ])


def _categorical_pipeline(step_name, encoder):
    """Return a categorical pipeline: most-frequent imputation followed by `encoder`."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (step_name, encoder),
    ])


def _column_transformer(numeric_steps, categorical_steps):
    """Combine both pipelines; the numeric block comes first in the output."""
    return ColumnTransformer(transformers=[
        ('numeric', numeric_steps, numeric_features),
        ('categorical', categorical_steps, categorical_features),
    ])


# Pipeline 1: StandardScaler for numeric + OneHotEncoder for categorical
numeric_pipeline_1 = _numeric_pipeline(StandardScaler())

# NOTE(review): drop='first' together with handle_unknown='ignore' presumably
# encodes an unseen category as all zeros, i.e. the same as the dropped
# baseline category — confirm this is acceptable for the downstream models.
categorical_pipeline_1 = _categorical_pipeline(
    'onehot',
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
)

preprocessor_1 = _column_transformer(numeric_pipeline_1, categorical_pipeline_1)

print("✅ Pipeline 1: StandardScaler + OneHotEncoder")

# Pipeline 2: MinMaxScaler for numeric + the same one-hot categorical pipeline
numeric_pipeline_2 = _numeric_pipeline(MinMaxScaler())

preprocessor_2 = _column_transformer(numeric_pipeline_2, categorical_pipeline_1)

print("✅ Pipeline 2: MinMaxScaler + OneHotEncoder")

# Pipeline 3: RobustScaler for numeric + OrdinalEncoder for categorical
# (unseen categories are encoded as -1)
categorical_pipeline_3 = _categorical_pipeline(
    'ordinal',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

numeric_pipeline_3 = _numeric_pipeline(RobustScaler())

preprocessor_3 = _column_transformer(numeric_pipeline_3, categorical_pipeline_3)

print("✅ Pipeline 3: RobustScaler + OrdinalEncoder")

# Registry of preprocessors keyed by a descriptive name
preprocessors = dict(
    StandardScaler_OneHot=preprocessor_1,
    MinMaxScaler_OneHot=preprocessor_2,
    RobustScaler_Ordinal=preprocessor_3,
)

print(f"\n📊 Total de pipelines creados: {len(preprocessors)}")
print("🎯 Cada pipeline maneja automáticamente:")
for capability in (
    "   • Imputación de valores faltantes",
    "   • Escalado de variables numéricas",
    "   • Codificación de variables categóricas",
):
    print(capability)


In [None]:
# Assemble the feature matrix and the target vector
X = df[features_all]
y = df[target_variable]

# Map the class labels to integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("📊 PREPARACIÓN DE DATOS PARA MACHINE LEARNING")
print("="*50)
print(f"✅ Features (X): {X.shape}")
print(f"✅ Target (y): {y.shape}")
print(f"📋 Clases del target: {list(label_encoder.classes_)}")
class_codes = {class_name: code for code, class_name in enumerate(label_encoder.classes_)}
print(f"🔢 Target codificado: {class_codes}")

# Stratified 80/20 train/test split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"\n📊 DIVISIÓN DE DATOS:")
print(f"   🏋️ Entrenamiento: {X_train.shape[0]} muestras ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   🧪 Prueba: {X_test.shape[0]} muestras ({X_test.shape[0]/len(X)*100:.1f}%)")

# Per-class counts in each partition, ordered by encoded class index
train_distribution = pd.Series(y_train).value_counts().sort_index()
test_distribution = pd.Series(y_test).value_counts().sort_index()

print(f"\n🎯 DISTRIBUCIÓN DE CLASES:")
for heading, distribution, labels in (
    ("Conjunto de Entrenamiento:", train_distribution, y_train),
    ("Conjunto de Prueba:", test_distribution, y_test),
):
    print(heading)
    for class_idx, count in distribution.items():
        # Translate the integer code back to the original class label
        class_name = label_encoder.inverse_transform([class_idx])[0]
        print(f"   📊 {class_name}: {count} ({count/len(labels)*100:.1f}%)")


In [None]:
# Fit every preprocessing pipeline on the training split, transform both
# splits, collect the results, and persist the fitted objects to ../models.
processed_datasets = {}

print("🔧 APLICACIÓN DE PIPELINES DE PREPROCESAMIENTO")
print("="*55)

for name, preprocessor in preprocessors.items():
    print(f"\n🚀 Aplicando pipeline: {name}")

    # Fit on the training data only; the test data is transformed, never fitted
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Bundle everything needed to train and evaluate with this preprocessing
    processed_datasets[name] = dict(
        X_train=X_train_processed,
        X_test=X_test_processed,
        y_train=y_train,
        y_test=y_test,
        preprocessor=preprocessor,
        label_encoder=label_encoder,
    )

    print(f"   ✅ Entrenamiento: {X_train_processed.shape}")
    print(f"   ✅ Prueba: {X_test_processed.shape}")

    # Report how many columns each fitted transformer received
    for transformer_name, transformer, columns in getattr(preprocessor, 'transformers_', ()):
        if transformer_name == 'remainder':
            continue
        print(f"   🔹 {transformer_name}: {len(columns)} columnas")

print(f"\n📊 RESUMEN DE DATASETS PROCESADOS:")
for name in processed_datasets:
    print(f"   ✅ {name}: Listo para machine learning")

# Persist the fitted preprocessors for later use
import os
os.makedirs('../models', exist_ok=True)

for name, preprocessor in preprocessors.items():
    filename = f'../models/preprocessor_{name}.joblib'
    joblib.dump(preprocessor, filename)
    print(f"💾 Preprocessor guardado: {filename}")

# Persist the label encoder
joblib.dump(label_encoder, '../models/label_encoder.joblib')
print(f"💾 Label encoder guardado: ../models/label_encoder.joblib")

print(f"\n✅ PREPROCESAMIENTO COMPLETADO")
print(f"🎯 {len(processed_datasets)} datasets listos para benchmarking de modelos")


In [None]:
# Compare how each scaler reshapes the distribution of one numeric feature,
# print a statistics table, and persist the processed datasets.

print("📊 ANÁLISIS DEL IMPACTO DE LAS TRANSFORMACIONES")
print("="*50)


def _plot_distribution(ax, values, title, xlabel, color, show_std=False, guides=()):
    """Draw a histogram of `values` on `ax` with a mean marker and optional guides.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : pandas Series or 1-D numpy array to plot.
    title, xlabel : text for the subplot title and x axis.
    color : histogram bar colour.
    show_std : when True, also mark mean ± one standard deviation (ddof=0).
    guides : iterable of (x_position, legend_label) pairs drawn as green lines.
    """
    ax.hist(values, bins=20, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frecuencia')
    ax.grid(True, alpha=0.3)

    mean = values.mean()
    ax.axvline(mean, color='red', linestyle='--', label=f'Media: {mean:.2f}')
    if show_std:
        # ddof=0 everywhere so pandas and numpy inputs use the same estimator
        std = values.std(ddof=0)
        ax.axvline(mean + std, color='orange', linestyle='--', alpha=0.7,
                   label=f'±1 STD: {std:.2f}')
        ax.axvline(mean - std, color='orange', linestyle='--', alpha=0.7)
    for position, guide_label in guides:
        ax.axvline(position, color='green', linestyle='--', alpha=0.7, label=guide_label)
    ax.legend()


# Numeric feature used to illustrate the transformations
sample_feature = 'Age'
original_data = X_train[sample_feature]

# The 'numeric' transformer is listed first in every ColumnTransformer, so the
# feature's index within numeric_features is also its column in the output.
age_position = numeric_features.index(sample_feature)

standard_scaled_data = processed_datasets['StandardScaler_OneHot']['X_train'][:, age_position]
minmax_scaled_data = processed_datasets['MinMaxScaler_OneHot']['X_train'][:, age_position]
robust_scaled_data = processed_datasets['RobustScaler_Ordinal']['X_train'][:, age_position]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle(f'📊 COMPARACIÓN DE TRANSFORMACIONES - Variable: {sample_feature}', 
             fontsize=16, fontweight='bold')

# Subplot 1: original data
_plot_distribution(axes[0, 0], original_data, '📋 Datos Originales',
                   sample_feature, 'skyblue', show_std=True)

# Subplot 2: StandardScaler
_plot_distribution(axes[0, 1], standard_scaled_data, '🔧 StandardScaler',
                   f'{sample_feature} (Estandarizado)', 'lightgreen', show_std=True)

# Subplot 3: MinMaxScaler, with the [0, 1] bounds marked
_plot_distribution(axes[1, 0], minmax_scaled_data, '🔧 MinMaxScaler',
                   f'{sample_feature} (Min-Max)', 'lightcoral',
                   guides=[(0, 'Min: 0'), (1, 'Max: 1')])

# Subplot 4: RobustScaler, with the centred median marked
_plot_distribution(axes[1, 1], robust_scaled_data, '🔧 RobustScaler',
                   f'{sample_feature} (Robust)', 'lightsalmon',
                   guides=[(0, 'Mediana: 0')])

plt.tight_layout()
plt.show()

# Comparative statistics table
print(f"\n📊 ESTADÍSTICAS COMPARATIVAS - Variable: {sample_feature}")
print("="*60)
print(f"{'Transformación':<20} {'Media':<10} {'Std':<10} {'Min':<10} {'Max':<10}")
print("-" * 60)

# Fixed: the original mixed pandas' default sample std (ddof=1) for the raw
# data with numpy's default population std (ddof=0) for the scaled arrays, so
# the "Std" column was not comparable. Every row now uses ddof=0.
comparison_rows = [
    ('Original', original_data),
    ('StandardScaler', standard_scaled_data),
    ('MinMaxScaler', minmax_scaled_data),
    ('RobustScaler', robust_scaled_data),
]
for row_label, values in comparison_rows:
    print(f"{row_label:<20} {values.mean():<10.3f} {values.std(ddof=0):<10.3f} "
          f"{values.min():<10.3f} {values.max():<10.3f}")

print(f"\n💡 OBSERVACIONES:")
print("   🎯 StandardScaler: Media ≈ 0, Std ≈ 1")
print("   🎯 MinMaxScaler: Rango [0, 1]")
print("   🎯 RobustScaler: Mediana ≈ 0, resistente a outliers")

# Persist the processed datasets for the next notebook
joblib.dump(processed_datasets, '../models/processed_datasets.joblib')
print(f"\n💾 Datasets procesados guardados: ../models/processed_datasets.joblib")
print(f"✅ Listo para el benchmarking de modelos de machine learning")
