In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the four yearly training extracts and stack them into a single frame.
# The six columns below are removed from the 24 and 25 extracts before concatenating
# (presumably they do not exist in the 22/23 files -- confirm against the raw CSVs).
_cols_solo_24_25 = ["V2_01_M", "V2_02_M", "V2_03_M", "V5_01_M", "V5_02_M", "V5_03_M"]

data22 = pd.read_csv('../../data/training/EPHARG_train_22.csv')
data23 = pd.read_csv('../../data/training/EPHARG_train_23.csv')
data24 = pd.read_csv('../../data/training/EPHARG_train_24.csv').drop(columns=_cols_solo_24_25)
data25 = pd.read_csv('../../data/training/EPHARG_train_25.csv').drop(columns=_cols_solo_24_25)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat(
    [data22, data23, data24, data25],
    ignore_index=True,
)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Target: log10 of P47T, defined only for strictly positive values.
# Mask first, then take the log: np.where(cond, np.log10(x), nan) evaluates log10 on
# every row (including zeros and negatives) before masking, which emits a
# RuntimeWarning. Series.where turns the non-positive rows into NaN up front, so
# log10 only ever sees positive numbers or NaN and the resulting values are the same.
_p47t_positivo = data['P47T'].where(data['P47T'] > 0)
data['logP47T'] = np.log10(_p47t_positivo)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [3]:
# Regression subset: only the rows where INGRESO equals 1.
data_reg = data.loc[data['INGRESO'].eq(1)]

# Target is the log-transformed column.
y_reg = data_reg['logP47T']

# Columns removed from the feature matrix: the target itself, the column it is derived
# from (P47T) and the filter flag (INGRESO). The remaining entries are presumably
# identifiers / income components that would leak the target -- confirm with the codebook.
excluir_cols = {
    'logP47T', 'INGRESO', 'CODUSU', 'P47T',
    'P21', 'T_VI', 'V12_M', 'V2_M',
    'V3_M', 'V5_M', 'TOT_P12', 'PP08D1',
}
X_reg = data_reg.drop(columns=excluir_cols)

In [4]:
# Initial hyperparameter grid, one entry per model name.
# Parameter names carry the "reg__" prefix so they address the 'reg' step of a Pipeline.


def _penalized_linear_grid():
    """Return a fresh grid of regularisation strength and intercept choices."""
    return {
        "reg__alpha": [0.01, 1, 100],
        "reg__fit_intercept": [True, False],
    }


param_grids_base = {}

# Plain least squares: only the intercept choice is searched.
param_grids_base["LinearRegression"] = {"reg__fit_intercept": [True, False]}

# Ridge and Lasso search the same values; each gets its own dict instance.
param_grids_base["Ridge"] = _penalized_linear_grid()
param_grids_base["Lasso"] = _penalized_linear_grid()

# Gradient boosting: 2 values per parameter -> 32 combinations.
param_grids_base["HistGradientBoostingRegressor"] = {
    "reg__max_iter": [100, 200],
    "reg__learning_rate": [0.1, 0.001],
    "reg__max_leaf_nodes": [100, 200],
    "reg__min_samples_leaf": [10, 100],
    "reg__l2_regularization": [0, 1],
}

In [5]:
# Step 1: hold out 10% of the rows as the validation set.
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X_reg,
    y_reg,
    test_size=0.10,
    random_state=42,
)

# Step 2: split the remaining 90%. Taking 2/9 of it as test equals 20% of the
# full data, which leaves 70% of the full data for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test,
    y_train_test,
    test_size=2/9,
    random_state=42,
)

In [6]:
# Column names grouped by dtype; these lists drive the ColumnTransformer branches.
# Columns of any other dtype end up in neither list.
num_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

In [7]:
# Preprocessors


def _make_preprocessor(numeric_step):
    """Build a ColumnTransformer that applies `numeric_step` to `num_cols` and
    one-hot encodes `cat_cols`; any column in neither list is dropped.

    Categories unseen during fit are ignored at transform time, and the encoder
    returns a dense array.
    """
    return ColumnTransformer(
        transformers=[
            ('num', numeric_step, num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ],
        remainder='drop',
    )


# Numeric columns standardised.
preproc_scaled = _make_preprocessor(StandardScaler())

# Numeric columns passed through untouched.
preproc_unscaled = _make_preprocessor('passthrough')

In [8]:
# Pipelines: each model name maps to a two-step Pipeline ('preproc' -> 'reg').
# Ridge and Lasso use the scaled preprocessor; LinearRegression and the
# gradient-boosting model use the unscaled one.
_model_specs = {
    "LinearRegression": (preproc_unscaled, LinearRegression()),
    "Ridge": (preproc_scaled, Ridge()),
    "Lasso": (preproc_scaled, Lasso(max_iter=200)),
    "HistGradientBoostingRegressor": (preproc_unscaled, HistGradientBoostingRegressor(random_state=42)),
}

pipelines = {
    model_name: Pipeline(steps=[('preproc', preprocessor), ('reg', regressor)])
    for model_name, (preprocessor, regressor) in _model_specs.items()
}

In [9]:
# Goal: keep the model and configuration with the highest test-set R2 among all candidates.

best_model, best_params = None, None
best_r2 = -np.inf  # any real R2 beats this starting value

for name, pipe in pipelines.items():
    print(f"\n=== Evaluando modelo: {name} ===")

    # 5-fold grid search over this model's grid. Three metrics are recorded, but the
    # winning configuration is chosen (and refit on all of X_train) by R2.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids_base[name],
        scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
        refit='r2',
        cv=5,
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(X_train, y_train)
    best = grid.best_estimator_

    # Score the refit estimator on the held-out test split.
    y_pred = best.predict(X_test)
    test_mse, test_r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

    print("Best params:", grid.best_params_)
    print("Best CV score:", grid.best_score_)
    print("Regression metrics (test):")
    print("MSE:", test_mse)
    print("R2:", test_r2)

    # Strict comparison: on a tie the earlier model is kept.
    if test_r2 > best_r2:
        best_r2, best_model, best_params = test_r2, best, grid.best_params_

print("\n=== Mejor modelo global ===")
print("Mejor modelo:", best_model)
print("Mejores hiperpar치metros:", best_params)
print("Mejor R2 en test:", best_r2)


=== Evaluando modelo: LinearRegression ===
Best params: {'reg__fit_intercept': True}
Best CV score: 0.5023699980135874
Regression metrics (test):
MSE: 0.07024294983109733
R2: 0.5089789312647708

=== Evaluando modelo: Ridge ===


KeyboardInterrupt: 

In [None]:
# Feature importance of the best overall model.
#
# best_model is a fitted Pipeline, and a Pipeline never exposes `feature_importances_`
# itself, so the attribute has to be read from the final 'reg' step. The regressor sees
# the preprocessed matrix (one-hot columns included), so feature names must come from
# the fitted 'preproc' step rather than from X_train.columns.

if best_model is None:
    # The model-selection loop did not complete a single model (e.g. it was interrupted).
    print("No fitted model available: run the model-selection cell first.")
else:
    fitted_reg = best_model.named_steps["reg"]
    feature_names = best_model.named_steps["preproc"].get_feature_names_out()

    if hasattr(fitted_reg, "feature_importances_"):
        label, values = "Feature importances:", np.ravel(fitted_reg.feature_importances_)
    elif hasattr(fitted_reg, "coef_"):
        # Linear models: report signed coefficients. Magnitudes are only comparable
        # across features when the numeric inputs were scaled.
        label, values = "Feature coefficients:", np.ravel(fitted_reg.coef_)
    else:
        # e.g. HistGradientBoostingRegressor exposes neither attribute.
        label, values = None, None

    if values is None:
        print(
            f"{type(fitted_reg).__name__} exposes neither feature_importances_ nor coef_; "
            "consider sklearn.inspection.permutation_importance."
        )
    else:
        print(label)
        # Largest absolute value first.
        for idx in np.argsort(-np.abs(values)):
            print(f"  {feature_names[idx]}: {values[idx]:.4f}")


In [None]:
# Use the best overall model to predict on the validation split (X_val), which was not
# used during model selection, and record MSE and R2 against y_val.

y_val_pred = best_model.predict(X_val)
y_val_mse, y_val_r2 = (
    mean_squared_error(y_val, y_val_pred),
    r2_score(y_val, y_val_pred),
)

print("MSE en validaci칩n:", y_val_mse)
print("R2 en validaci칩n:", y_val_r2)