# 1. Cargar los datos

In [40]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models

import warnings

# Ignorar todos los warnings
warnings.filterwarnings("ignore")

# 1. Load the data
data_path = "house_data"
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))

# 2. Exploratory analysis and data cleaning

# Separate the target variable from the features of the training set.
# NOTE(review): every non-target column is kept as a feature, including any
# row-identifier column the CSV may carry -- confirm that is intended.
target = "SalePrice"
y = train_df[target]
X = train_df.drop(target, axis=1)

# Identify numeric and categorical columns.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Impute missing values: median for numeric columns, mode for categorical
# ones. The statistics are computed on the training features and reused for
# the test set.
# The filled column is assigned back explicitly: `df[col].fillna(...,
# inplace=True)` is chained assignment, which is deprecated and does not
# update the DataFrame once pandas Copy-on-Write is active. Because warnings
# are silenced above, that failure would go unnoticed.
for col in numeric_cols:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(median_val)

for col in categorical_cols:
    mode_val = X[col].mode()[0]
    X[col] = X[col].fillna(mode_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(mode_val)

# One-hot encode the categorical variables (first level dropped).
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Encode the test set the same way.
test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Align test to the training columns: join='left' keeps exactly the training
# columns, filling dummies that are absent from test with 0 and dropping
# columns that only exist in test.
X_encoded, test_encoded = X_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# Standardize every encoded column (recommended for deep learning).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
# The test set reuses the statistics learned on the training set.
test_scaled = scaler.transform(test_encoded)

# 3. Hold out a validation split
# NOTE: X_scaled was standardized before this split, so the validation rows
# contributed to the scaling statistics; validation metrics are therefore
# slightly optimistic.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 4. Build and train the deep-learning model with TensorFlow
input_dim = X_train.shape[1]
model = models.Sequential([
    # Declare the input shape with an explicit Input object; passing
    # `input_dim` to the first Dense layer is deprecated in Keras 3.
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Single linear output unit for regression
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Training
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100, batch_size=32, verbose=1)

# 5. Model evaluation
# Predict on the validation split and collapse the predictions to 1-D.
y_pred = model.predict(X_val).reshape(-1)

# Error metrics on the validation split (RMSE is derived from MSE).
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

# Report every metric with two decimals.
print("Evaluación del modelo en el conjunto de validación:")
for label, value in (("RMSE", rmse), ("MAE", mae), ("R2", r2)):
    print(f"{label}: {value:.2f}")


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 41068924928.0000 - val_loss: 39638372352.0000
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 38566100992.0000 - val_loss: 39453061120.0000
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 37606014976.0000 - val_loss: 38352846848.0000
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 36088012800.0000 - val_loss: 34674917376.0000
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 33327472640.0000 - val_loss: 26603204608.0000
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 23676155904.0000 - val_loss: 15666971648.0000
Epoch 7/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 13211296768.0000 - val_loss: 8144655872.0000
Epoch 8/100
[1m37/37[0m [

In [39]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

import warnings

# Ignorar todos los warnings
warnings.filterwarnings("ignore")

# 1. Load the data
data_path = "house_data"
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))

# Target variable and features
# NOTE(review): every non-target column is kept as a feature, including any
# row-identifier column the CSV may carry -- confirm that is intended.
target = "SalePrice"
y = train_df[target]
X = train_df.drop(target, axis=1)

# Identify numeric and categorical columns.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Impute missing values (median for numeric, mode for categorical) using
# statistics computed on the training features.
# The filled column is assigned back explicitly: `df[col].fillna(...,
# inplace=True)` is chained assignment, which is deprecated and does not
# update the DataFrame once pandas Copy-on-Write is active. Because warnings
# are silenced above, that failure would go unnoticed.
for col in numeric_cols:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(median_val)

for col in categorical_cols:
    mode_val = X[col].mode()[0]
    X[col] = X[col].fillna(mode_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(mode_val)

# One-hot encode the categorical variables (first level dropped).
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Align test to the training columns: join='left' keeps exactly the training
# columns, filling dummies that are absent from test with 0 and dropping
# columns that only exist in test.
X_encoded, test_encoded = X_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# Standardize every encoded column; the test set reuses the training statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
test_scaled = scaler.transform(test_encoded)

# 2. Factory for the regression network
def create_model(input_dim):
    """Build and compile a dense regression network with dropout.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss) made of
        three ReLU hidden layers (256, 128, 64 units), dropout of 0.2 after
        the first two, and a single linear output unit.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim` to the first Dense layer is deprecated in Keras 3.
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)  # Single linear output unit for regression
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# 3. K-fold cross-validation
# NOTE: X_scaled was standardized on the full training set before the folds
# are cut, and early stopping monitors the same fold that is scored, so the
# reported metrics are somewhat optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []
mae_scores = []
r2_scores = []

# enumerate() supplies the 1-based fold number instead of a manual counter.
for fold, (train_index, val_index) in enumerate(kf.split(X_scaled), start=1):
    print(f"\nFold {fold}:")
    X_train_cv, X_val_cv = X_scaled[train_index], X_scaled[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    # A new model is built on every iteration; release the Keras global
    # state left by the previous fold so memory does not grow across folds.
    tf.keras.backend.clear_session()
    model_cv = create_model(input_dim=X_train_cv.shape[1])

    # Early stopping to limit overfitting; the best weights are restored.
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model_cv.fit(X_train_cv, y_train_cv,
                           validation_data=(X_val_cv, y_val_cv),
                           epochs=200, batch_size=32, verbose=0,
                           callbacks=[early_stop])

    # Predict on the held-out fold and score the predictions.
    y_pred_cv = model_cv.predict(X_val_cv).flatten()

    mse = mean_squared_error(y_val_cv, y_pred_cv)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val_cv, y_pred_cv)
    r2 = r2_score(y_val_cv, y_pred_cv)

    print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# Average each metric over the folds.
print("\nResumen validación cruzada:")
print(f"RMSE promedio: {np.mean(rmse_scores):.2f}")
print(f"MAE promedio: {np.mean(mae_scores):.2f}")
print(f"R2 promedio: {np.mean(r2_scores):.2f}")

# 4. Final fit on the complete training set
# There is no validation split here, so early stopping watches the training loss.
final_model = create_model(input_dim=X_scaled.shape[1])
early_stop_final = callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)

final_model.fit(
    X_scaled,
    y,
    epochs=200,
    batch_size=32,
    verbose=1,
    callbacks=[early_stop_final],
)

# 5. Predict on the test set and write the submission file
test_predictions = final_model.predict(test_scaled).reshape(-1)

# The sample submission provides the file layout; only the price column is replaced.
sample_submission_path = os.path.join(data_path, "sample_submission.csv")
submission = pd.read_csv(sample_submission_path)
submission["SalePrice"] = test_predictions
submission.to_csv("submission.csv", index=False)

print("\nArchivo 'submission.csv' generado para enviar a Kaggle.")



Fold 1:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
RMSE: 35222.61, MAE: 21522.45, R2: 0.84

Fold 2:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
RMSE: 50269.75, MAE: 23925.42, R2: 0.63

Fold 3:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
RMSE: 61914.25, MAE: 33280.65, R2: 0.31

Fold 4:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
RMSE: 44825.60, MAE: 24993.84, R2: 0.68

Fold 5:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
RMSE: 35521.19, MAE: 22166.11, R2: 0.76

Resumen validación cruzada:
RMSE promedio: 45550.68
MAE promedio: 25177.69
R2 promedio: 0.64
Epoch 1/200
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 37130821632.0000
Epoch 2/200
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 39316676608.0000
Epoch 3/200
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [43]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Fix the random seeds for reproducibility.
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# Seeding only NumPy and TensorFlow leaves Python's `random` unseeded, and
# recent Keras versions draw default layer seeds (weight initializers,
# dropout) from it, so runs would still differ.
seed = 42
tf.keras.utils.set_random_seed(seed)

# 1. Load the data
data_path = "house_data"
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))

# 2. Feature engineering: derived variables, added in place to both frames.
# Each feature is only created when all of its source columns are present.
for df in [train_df, test_df]:
    # Total area: basement + first floor + second floor.
    if {'TotalBsmtSF', '1stFlrSF', '2ndFlrSF'}.issubset(df.columns):
        df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    # Years between construction and sale.
    if {'YrSold', 'YearBuilt'}.issubset(df.columns):
        df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    # Years between the recorded remodel year and sale.
    if {'YrSold', 'YearRemodAdd'}.issubset(df.columns):
        df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']

# 3. Separate the features from the target
# NOTE(review): every non-target column is kept as a feature, including any
# row-identifier column the CSV may carry -- confirm that is intended.
target = "SalePrice"
X = train_df.drop(columns=[target])
y = train_df[target]

# Group the feature columns by dtype: numeric vs. categorical (object).
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# 4. Feature preprocessing pipeline
# Numeric columns: median imputation followed by robust scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
# Categorical columns: most-frequent imputation followed by one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 5. Target transformation with PowerTransformer (Yeo-Johnson)
# The transformer expects a 2-D column, so the target is reshaped to (n, 1)
# going in and collapsed back to 1-D coming out.
target_transformer = PowerTransformer(method='yeo-johnson')
y_column = y.to_numpy().reshape(-1, 1)
y_transformed = target_transformer.fit_transform(y_column).ravel()

# 6. Factory for the deep-learning regression network
def build_model(input_dim):
    """Build and compile a dense regression network.

    The network uses batch normalization after each hidden layer and
    dropout of 0.2 after the first two.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss) with
        three ReLU hidden layers (256, 128, 64 units) and a single linear
        output unit.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim` to the first Dense layer is deprecated in Keras 3.
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# 7. K-fold cross-validation (5 folds)
# NOTE: early stopping and the learning-rate schedule monitor the same fold
# that is scored afterwards, so the reported metrics are somewhat optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
rmse_scores = []
mae_scores = []
r2_scores = []

print("Validación Cruzada:")
# enumerate() supplies the 1-based fold number instead of a manual counter.
for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}:")
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # Fit the feature preprocessing on the training fold only, then apply
    # it to the validation fold.
    X_train_proc = preprocessor.fit_transform(X_train)
    X_val_proc = preprocessor.transform(X_val)

    # Fit the target transform on the training fold only as well.
    y_train_trans = target_transformer.fit_transform(y_train.values.reshape(-1, 1)).flatten()
    y_val_trans = target_transformer.transform(y_val.values.reshape(-1, 1)).flatten()

    # A new model is built on every iteration; release the Keras global
    # state left by the previous fold so memory does not grow across folds.
    tf.keras.backend.clear_session()

    # Build and train the model.
    input_dim = X_train_proc.shape[1]
    model = build_model(input_dim)
    es = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    rlrop = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    model.fit(X_train_proc, y_train_trans,
              validation_data=(X_val_proc, y_val_trans),
              epochs=200, batch_size=32, verbose=0,
              callbacks=[es, rlrop])

    # Predict on the validation fold (still in the transformed target space).
    y_pred_val_trans = model.predict(X_val_proc).flatten()
    # Invert the target transform to get predictions on the original scale.
    y_pred_val = target_transformer.inverse_transform(y_pred_val_trans.reshape(-1, 1)).flatten()

    # Compute the metrics on the original scale.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    mae = mean_absolute_error(y_val, y_pred_val)
    r2 = r2_score(y_val, y_pred_val)

    print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")
    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# Average each metric over the folds.
print("\nResumen validación cruzada:")
print(f"RMSE promedio: {np.mean(rmse_scores):.2f}")
print(f"MAE promedio: {np.mean(mae_scores):.2f}")
print(f"R2 promedio: {np.mean(r2_scores):.4f}")

# 8. Final fit on the whole training set
print("\nEntrenando modelo final con todos los datos...")
# Refit both the feature preprocessing and the target transform on all rows.
X_full_proc = preprocessor.fit_transform(X)
y_full_column = y.to_numpy().reshape(-1, 1)
y_full_trans = target_transformer.fit_transform(y_full_column).ravel()

final_model = build_model(X_full_proc.shape[1])
# There is no validation split here, so both callbacks watch the training loss.
es_final = callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)
rlrop_final = callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=5,
)
final_model.fit(
    X_full_proc,
    y_full_trans,
    epochs=200,
    batch_size=32,
    verbose=1,
    callbacks=[es_final, rlrop_final],
)

# 9. Predict on the test set and write the submission file
X_test = test_df.copy()
X_test_proc = preprocessor.transform(X_test)
y_test_pred_trans = final_model.predict(X_test_proc).reshape(-1)
# Map the predictions back from the transformed target space to prices.
y_test_pred = target_transformer.inverse_transform(
    y_test_pred_trans.reshape(-1, 1)
).reshape(-1)

# The sample submission provides the file layout; only the price column is replaced.
sample_submission_path = os.path.join(data_path, "sample_submission.csv")
submission = pd.read_csv(sample_submission_path)
submission["SalePrice"] = y_test_pred
submission.to_csv("submission.csv", index=False)
print("\nArchivo 'submission.csv' generado para enviar a Kaggle.")


Validación Cruzada:

Fold 1:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
RMSE: 47461.31, MAE: 25468.39, R2: 0.7063

Fold 2:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
RMSE: 43216.23, MAE: 24257.14, R2: 0.7253

Fold 3:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
RMSE: 52071.81, MAE: 30849.17, R2: 0.5092

Fold 4:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
RMSE: 45755.45, MAE: 26319.89, R2: 0.6666

Fold 5:
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
RMSE: 63330.98, MAE: 25954.95, R2: 0.2327

Resumen validación cruzada:
RMSE promedio: 50367.16
MAE promedio: 26569.91
R2 promedio: 0.5680

Entrenando modelo final con todos los datos...
Epoch 1/200
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 1.9560 - learning_rate: 0.0010
Epoch 2/200
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - 

## Conclusión de la Validación y Entrenamiento Final

- **Validación Cruzada:**  
  
  - **RMSE promedio:** 50,367.16  
  
  - **MAE promedio:** 26,569.91  
  
  - **R² promedio:** 0.5680  
  
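  - Se observa una variabilidad considerable entre los folds: el R² oscila entre 0.2327 (Fold 5) y 0.7253 (Fold 2), lo cual indica que algunas particiones son más desafiantes. El MAE se mantiene relativamente estable (entre ~24,000 y ~31,000), mientras que el RMSE varía mucho más (entre ~43,000 y ~63,000); esto sugiere que las diferencias entre folds se deben a unos pocos errores muy grandes (posibles outliers) y no a un deterioro general de las predicciones. En promedio, el modelo explica algo más de la mitad de la variabilidad del target (R² ≈ 0.57).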

- **Entrenamiento Final:**  
  
  - La red neuronal, con BatchNormalization, Dropout, y ajustes dinámicos de la tasa de aprendizaje (ReduceLROnPlateau y EarlyStopping), ha logrado converger de forma estable.
  
  - La transformación del target mediante un PowerTransformer (método Yeo-Johnson) ha facilitado una optimización más robusta y permite evaluar las métricas en la escala original.

- **Submission en Kaggle:**  
  
  - El modelo final ha generado una submission que obtiene una puntuación de 0.17 en Kaggle, lo cual representa una mejora notable respecto a versiones anteriores de este notebook.

**Conclusión General:**  

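El enfoque avanzado de ingeniería de features, preprocesado robusto y optimización del modelo deep learning ha permitido obtener resultados competitivos. A pesar de la variabilidad en algunos folds, la integración de transformaciones y técnicas de regularización ha llevado a un desempeño global sólido, evidenciado por la mejora en la submission de Kaggle. Cabe señalar que, en validación cruzada, esta versión (RMSE promedio ≈ 50,367; R² promedio ≈ 0.57) no supera a la versión anterior del notebook sin transformación del target (RMSE promedio ≈ 45,551; R² promedio ≈ 0.64), por lo que la mejora se refleja en la puntuación de Kaggle y no en las métricas locales en escala original.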
