## Clase 1: Algoritmos de Regresion con GridSearchCV
#### Profesor: Diego Miranda Olavarría
#### Data Scientist

In [1]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('Boston.csv')

data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [2]:
data.shape

(506, 14)

In [3]:
X = data.drop('medv', axis=1)
y = data['medv']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=42)

## Arbol de Decision

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Definimos el modelo base
dt_reg = DecisionTreeRegressor()

# Diccionario con los hiperparámetros y sus valores o distribuciones para RandomizedSearchCV
param_dist = {
    # Función para medir la calidad de las divisiones:
    # - 'squared_error': error cuadrático medio (MSE), común para regresión
    # - 'friedman_mse': variante del MSE, útil en boosting
    # - 'absolute_error': error absoluto (MAE), más robusto a outliers
    # - 'poisson': útil en problemas de conteo
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    
    # Estrategia para dividir nodos:
    # - 'best': elige la mejor división posible (por defecto)
    # - 'random': elige aleatoriamente entre las mejores divisiones (útil en ensambles)
    'splitter': ['best', 'random'],
    
    # Profundidad máxima del árbol:
    # - None: crece hasta que no puede dividir más
    # - Valores enteros limitan la profundidad para evitar sobreajuste
    'max_depth': [None] + list(range(3, 20)),
    
    # Mínimo número de muestras para dividir un nodo interno:
    # - Valores mayores reducen el crecimiento del árbol y evitan sobreajuste
    'min_samples_split': randint(2, 11),  # distribuciones aleatorias
    
    # Mínimo número de muestras que debe tener un nodo hoja:
    # - Evita hojas con muy pocos datos
    'min_samples_leaf': randint(1, 6)
}

# Búsqueda aleatoria sobre las combinaciones de parámetros
random_search = RandomizedSearchCV(
    estimator=dt_reg,                     # Modelo base
    param_distributions=param_dist,      # Diccionario de parámetros
    n_iter=50,                            # Número de combinaciones aleatorias a probar
    scoring='neg_mean_squared_error',    # Métrica de evaluación (MSE negativo)
    cv=5,                                 # Validación cruzada con 5 particiones
    verbose=1,                            # Nivel de detalle en la salida
    random_state=42,                      # Semilla para reproducibilidad
    n_jobs=-1                             # Usa todos los núcleos del procesador
)

# Entrenamos el modelo usando RandomizedSearchCV
random_search.fit(X_train, y_train)

# Extraemos el mejor modelo encontrado
best_dt_model = random_search.best_estimator_

# Mostramos los mejores hiperparámetros encontrados
print("Mejores parámetros encontrados:", random_search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Mejores parámetros encontrados: {'criterion': 'poisson', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 6, 'splitter': 'best'}


In [6]:
y_pred_dt = best_dt_model.predict(X_test)
y_pred_dt

array([24.02424242, 33.175     , 17.63684211, 24.02424242, 15.88095238,
       21.39622642, 17.63684211, 13.85      , 21.39622642, 21.39622642,
       17.63684211, 26.5       ,  7.27      , 21.39622642, 21.39622642,
       21.39622642, 20.95      ,  8.7375    , 42.32      , 15.88095238,
       24.02424242, 24.02424242, 13.84      , 24.73333333, 15.88095238,
       15.88095238, 20.21333333, 11.125     , 17.63684211, 21.39622642,
       17.63684211, 24.02424242, 21.39622642, 21.39622642, 15.1       ,
       15.88095238, 34.89090909, 20.21333333, 22.1       , 24.02424242,
       21.56666667, 29.275     , 42.32      , 19.75      , 21.91666667,
       15.88095238, 15.88095238, 24.02424242, 15.88095238, 33.175     ,
       21.39622642, 34.89090909, 15.88095238, 29.275     , 47.09      ,
       21.39622642, 15.88095238, 26.325     , 21.91666667, 22.1       ,
       24.225     , 34.89090909, 30.35      , 19.75      , 26.325     ,
       13.84      , 15.88095238, 21.91666667, 26.325     , 14.  

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Definimos el modelo base
rf_reg = RandomForestRegressor()

# Diccionario con los hiperparámetros a explorar aleatoriamente
param_dist_rf = {
    # Número de árboles en el bosque:
    # Más árboles → más robusto, pero más lento
    'n_estimators': randint(50, 201),  # valores entre 50 y 200

    # Profundidad máxima de cada árbol:
    # None → los nodos se expanden hasta que sean puros
    'max_depth': [None] + list(range(5, 21)),

    # Muestras mínimas para dividir un nodo:
    # Controla la complejidad del modelo y previene overfitting
    'min_samples_split': randint(2, 11),

    # Muestras mínimas que debe tener una hoja:
    # Ayuda a suavizar el modelo, útil contra el sobreajuste
    'min_samples_leaf': randint(1, 6),

    # Si se hace bootstrap (muestreo con reemplazo) al crear los árboles:
    # True → típico random forest, False → todos los datos en cada árbol
    'bootstrap': [True, False]
}

# Búsqueda aleatoria sobre las combinaciones de parámetros
random_search_rf = RandomizedSearchCV(
    estimator=rf_reg,                    # Modelo base
    param_distributions=param_dist_rf,  # Diccionario de hiperparámetros
    n_iter=50,                           # Número de combinaciones aleatorias a probar
    scoring='neg_mean_squared_error',   # Métrica de evaluación (error cuadrático negativo)
    cv=5,                                # Validación cruzada (5 folds)
    verbose=1,                           # Detalle del proceso
    random_state=42,                     # Semilla para reproducibilidad
    n_jobs=-1                            # Usa todos los núcleos del procesador
)

# Ajustamos el modelo con los datos de entrenamiento
random_search_rf.fit(X_train, y_train)

# Extraemos el mejor modelo encontrado
best_rf_model = random_search_rf.best_estimator_

# Mostramos los mejores hiperparámetros encontrados
print("Mejores parámetros del Random Forest:", random_search_rf.best_params_)

# Predicciones en el set de prueba usando el mejor modelo
y_pred_rf = best_rf_model.predict(X_test)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Mejores parámetros del Random Forest: {'bootstrap': True, 'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 138}


In [8]:
y_pred_rf

array([22.84221079, 31.14240712, 17.82291465, 23.36711422, 16.94933   ,
       21.39586725, 19.62292156, 15.57835116, 21.19131801, 21.01749851,
       20.43269697, 20.29098631,  8.21002293, 21.43644668, 19.52239945,
       25.2688571 , 19.36375518,  8.60186874, 44.94262551, 15.02828991,
       24.83442134, 23.87443248, 14.54265694, 24.05636216, 14.71866099,
       14.81440045, 21.61653303, 14.0290249 , 19.75515758, 20.97372948,
       19.74636128, 23.23657531, 29.44952959, 20.29474321, 14.45567507,
       15.83063262, 34.05787727, 19.22071042, 20.75815668, 24.37746644,
       18.70671814, 29.31390758, 45.12035803, 19.22407435, 22.61471226,
       13.95341793, 15.2419039 , 24.671968  , 18.19980629, 28.20064892,
       21.19422804, 33.94689182, 16.72207787, 26.13511277, 45.12401915,
       22.00875891, 15.85945566, 32.27864915, 22.37214839, 20.32427824,
       25.44286203, 34.45826202, 29.05536491, 18.96265142, 26.82211209,
       16.83906808, 14.02560737, 23.11368167, 28.82748107, 15.05

## XGBoost

In [9]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Modelo base con configuración moderna para GPU
xgboost_reg = xgb.XGBRegressor(
    tree_method='hist',       # GPU compatible si se usa junto con device='cuda'
    device='cuda',            # NUEVA forma correcta de activar GPU
    objective='reg:squarederror',
    verbosity=1,
    random_state=42
)

# Hiperparámetros reducidos para carga ligera
param_dist_xgb = {
    'n_estimators': randint(30, 60),
    'max_depth': randint(2, 5),
    'learning_rate': uniform(0.05, 0.15),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 0.1)
}

# Randomized Search optimizado
random_search_xgb = RandomizedSearchCV(
    estimator=xgboost_reg,
    param_distributions=param_dist_xgb,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Entrenamiento
random_search_xgb.fit(X_train, y_train)

# Mejor modelo y predicción
best_xgb_model = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb_model.predict(X_test)

# Mostrar los hiperparámetros óptimos
print("✅ Mejores parámetros encontrados por XGBoost:", random_search_xgb.best_params_)



Fitting 2 folds for each of 10 candidates, totalling 20 fits
✅ Mejores parámetros encontrados por XGBoost: {'colsample_bytree': np.float64(0.9949692657420364), 'gamma': np.float64(0.046676289324798), 'learning_rate': np.float64(0.1789910610104481), 'max_depth': 4, 'n_estimators': 50, 'subsample': np.float64(0.8351497755908628)}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [10]:
y_pred_xgb

array([23.75428  , 31.404915 , 17.00668  , 25.215145 , 17.096333 ,
       22.288374 , 18.340492 , 14.624627 , 20.877277 , 21.212221 ,
       21.101797 , 19.149147 ,  7.855071 , 22.00376  , 19.355806 ,
       24.317663 , 19.680204 ,  9.977827 , 45.904884 , 15.199898 ,
       24.148703 , 25.47985  , 13.956404 , 21.289333 , 15.378546 ,
       15.5821295, 22.174925 , 12.447767 , 19.703062 , 21.338741 ,
       19.847214 , 23.701637 , 16.7394   , 20.942728 , 15.058172 ,
       15.860579 , 37.211052 , 19.294956 , 21.300198 , 24.232397 ,
       16.59843  , 29.353615 , 46.957542 , 19.17539  , 22.829472 ,
       15.198711 , 16.237856 , 24.442816 , 18.795586 , 29.1932   ,
       20.657019 , 35.00153  , 16.85     , 26.699173 , 46.776634 ,
       22.435978 , 16.724411 , 31.119606 , 22.12412  , 18.30494  ,
       25.300812 , 34.556026 , 30.827015 , 17.685106 , 25.074043 ,
       17.655554 , 13.206769 , 23.250887 , 28.663872 , 14.950134 ,
       20.825453 , 28.34049  , 11.415029 , 21.758612 , 22.7958

## Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# ✅ Definimos el modelo base
gb_reg = GradientBoostingRegressor(random_state=42)

# ✅ Espacio reducido de hiperparámetros para que sea más rápido de ejecutar
param_dist_gb = {
    # 🔢 Número de árboles: controla la cantidad de etapas del boosting
    'n_estimators': [5, 10, 15],  # Pequeños para acelerar el entrenamiento

    # 📉 Tasa de aprendizaje: cuánto contribuye cada nuevo árbol
    'learning_rate': [0.01, 0.1, 0.2],  # Valores estándar

    # 🌳 Profundidad de los árboles: controla la complejidad del modelo
    'max_depth': [3, 4, 5],  # Árboles moderadamente profundos

    # ✂️ Número mínimo de muestras para hacer un split
    'min_samples_split': [2, 4],  # Valores básicos

    # 🍃 Número mínimo de muestras por hoja
    'min_samples_leaf': [1, 2],  # Valores seguros para evitar overfitting

    # 🔍 Subconjunto de features usado en cada split
    'max_features': ['sqrt', 'log2', None]  # Métodos comunes en boosting
}

# ✅ Búsqueda aleatoria (menos intensiva que GridSearch)
random_search_gb = RandomizedSearchCV(
    estimator=gb_reg,
    param_distributions=param_dist_gb,
    n_iter=10,                          # Solo 10 combinaciones a probar
    scoring='neg_mean_squared_error',  # Métrica usada para regresión
    cv=3,                               # 3-fold cross-validation para acelerar
    verbose=1,                          # Muestra progreso
    random_state=42,
    n_jobs=-1                           # Usa todos los núcleos disponibles
)

# ✅ Entrenamiento del modelo
random_search_gb.fit(X_train, y_train)

# ✅ Mejor modelo encontrado
best_gb_model = random_search_gb.best_estimator_

# ✅ Predicción con el modelo optimizado
y_pred_gb = best_gb_model.predict(X_test)

# ✅ Mostrar hiperparámetros óptimos
print("✅ Mejores parámetros encontrados por Gradient Boosting:")
print(random_search_gb.best_params_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
✅ Mejores parámetros encontrados por Gradient Boosting:
{'n_estimators': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 4, 'learning_rate': 0.2}


In [12]:
y_pred_gb

array([23.17068962, 29.19627398, 16.39614582, 23.27463328, 16.7587947 ,
       21.6938187 , 19.79248515, 15.19639121, 20.5814995 , 21.40232359,
       20.30222996, 19.35954535,  9.85606068, 21.74571921, 21.18999739,
       25.31670922, 19.05473672, 12.03417016, 44.75946249, 14.74867598,
       24.15754667, 25.57556845, 15.25433247, 20.89460185, 13.65487845,
       17.6235109 , 22.96764295, 14.61360914, 20.19391519, 21.26403651,
       21.18914872, 23.43877294, 24.67392074, 17.76211386, 16.33282851,
       15.73618136, 33.99941646, 19.7931461 , 23.03030402, 23.27463328,
       18.33045223, 30.36505524, 44.97714729, 21.29987908, 23.1857202 ,
       13.99529728, 16.61182868, 23.27463328, 18.36217339, 27.04427144,
       21.6335099 , 32.91925475, 17.82285186, 22.22981639, 45.98337906,
       20.59634202, 15.17508701, 30.74416642, 23.06840634, 18.67713443,
       24.80857248, 32.22873276, 29.03945859, 18.93409857, 26.40186229,
       17.45238128, 13.76213918, 23.16488131, 29.05952592, 14.10

### Escalado de datos

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Perceptrón Multicapa

In [14]:
from sklearn.neural_network import MLPRegressor
# Modelo base
mlp_reg = MLPRegressor(max_iter=300, random_state=42)

# Hiperparámetros reducidos
params_mlp = {
    'hidden_layer_sizes': [(50,), (30, 30)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate_init': [0.001, 0.01],
}

# RandomizedSearchCV
random_mlp = RandomizedSearchCV(
    mlp_reg, params_mlp, n_iter=4, cv=2,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1, random_state=42
)

random_mlp.fit(X_train_scaled, y_train)
best_mlp = random_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_test_scaled)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


In [15]:
y_pred_mlp

array([28.66484674, 35.30757542, 18.01124772, 25.11827806, 17.49051415,
       19.90398447, 17.5921045 , 13.67432427, 24.0782709 , 18.00035356,
       22.3044665 , 17.51511197,  6.47271458, 18.42924972, 18.3812998 ,
       23.27480197, 21.29799258, 12.33975521, 46.47534287, 16.09167985,
       25.28050666, 26.55188603, 15.57575276, 23.96228785, 18.20976809,
       19.38972213, 20.5223929 , 11.53068739, 20.65768123, 16.30983476,
       23.69016764, 23.65485511, 20.7295924 , 25.13422241, 15.59884441,
       17.09408513, 33.39725205, 21.00627657, 21.33320447, 23.99913164,
       15.30361078, 29.01060345, 48.93953319, 18.13405891, 27.45066295,
       19.0593178 , 15.91329088, 25.66902873, 19.70226357, 28.87148426,
       18.30535248, 35.49378853, 16.76384752, 25.39766911, 44.60813242,
       22.36329418, 18.5410794 , 31.90948896, 25.35542352, 14.38079325,
       22.25659918, 34.10510239, 31.09744837, 16.00755067, 21.98626507,
       18.3127857 , 18.02403138, 23.88424753, 29.34113789, 13.68

### KNN

In [16]:
from sklearn.neighbors import KNeighborsRegressor

# Modelo base
knn_reg = KNeighborsRegressor()

# Hiperparámetros reducidos
params_knn = {
    'n_neighbors': [1, 2],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree'],
    'leaf_size': [5],
    'p': [1, 2]
}

# RandomizedSearchCV
random_knn = RandomizedSearchCV(
    knn_reg, params_knn, n_iter=4, cv=2,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1, random_state=42
)

random_knn.fit(X_train_scaled, y_train)
best_knn = random_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test_scaled)


Fitting 2 folds for each of 4 candidates, totalling 8 fits


In [17]:
y_pred_knn

array([27.03802412, 33.12738961, 17.2972845 , 22.51492387, 16.59815907,
       22.82492668, 18.36552362, 15.49519596, 23.0992809 , 19.62816074,
       23.14026031, 21.43444574,  8.64770173, 24.26520107, 20.19994707,
       22.36522433, 20.74039606, 10.01300273, 43.07544524, 14.58217939,
       22.78702855, 29.44166175, 14.00262489, 21.20858101, 15.75541905,
       13.16214632, 24.49919273, 13.6108478 , 20.8909688 , 19.25578472,
       24.34668469, 22.58420864, 15.53358359, 21.28763558, 14.62524609,
       17.31827123, 25.44859283, 18.83551731, 21.90250719, 22.50841675,
       18.78727241, 30.94099781, 43.22282116, 21.22725867, 21.88152601,
        9.64329815, 13.99934868, 22.68490229, 20.85622064, 29.71595856,
       24.51216419, 38.04378258, 15.054851  , 25.68720163, 46.78725188,
       21.65884293, 13.7026538 , 33.55898924, 24.47396885, 18.55956018,
       21.43935971, 33.89267765, 38.56566627, 20.50512626, 20.80145532,
       19.84255674, 12.29914585, 25.50984114, 31.88784434, 16.16

### Support Vector Machine

In [18]:
from sklearn.svm import SVR

# Modelo base
svr_reg = SVR()

# Hiperparámetros reducidos
params_svr = {
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale'],
    'C': [0.1, 1],
    'epsilon': [0.01, 0.1]
}

# RandomizedSearchCV
random_svr = RandomizedSearchCV(
    svr_reg, params_svr, n_iter=4, cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1, random_state=42
)

random_svr.fit(X_train_scaled, y_train)
best_svr = random_svr.best_estimator_
y_pred_svr = best_svr.predict(X_test_scaled)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [19]:
y_pred_svr

array([26.04791611, 31.73125564, 16.58271922, 24.13727337, 17.56116695,
       22.11860794, 18.42134121, 16.48843825, 19.35074524, 20.16332279,
       22.30700546, 19.8178044 , -3.36253033, 21.84575162, 18.45286872,
       23.67431805, 19.0169055 ,  6.35350504, 38.05210346, 16.46728767,
       25.93546358, 27.83311346, 13.52491658, 23.73054016, 15.71197892,
       12.98879664, 21.85853116, 13.9095376 , 20.66325172, 19.00643842,
       20.18243947, 24.14940775, 23.64404129, 14.39528941, 15.03376085,
       17.43063545, 29.81955267, 19.99277145, 22.40343283, 24.37750999,
       15.08990379, 28.56743024, 39.42257939, 18.42037119, 25.61268191,
       14.86696118, 14.86395939, 25.55328983, 18.03268828, 28.40902911,
       21.34265361, 31.77629743, 17.46230119, 24.99767826, 36.15768169,
       20.42042357, 17.2805942 , 30.03400099, 24.4991328 , 14.93031784,
       23.70303962, 30.43813118, 28.63849323, 16.13870258, 23.0429556 ,
       15.79272932, 18.69826966, 24.92824263, 28.76394328, 10.54

### ElasticNet

In [20]:
from sklearn.linear_model import ElasticNet

# Modelo base
elastic_net = ElasticNet()

# Hiperparámetros reducidos
params_en = {
    'alpha': [0.1, 1, 10],
    'l1_ratio': [0.1, 0.5],
    'max_iter': [100],
    'tol': [0.0001]
}

# RandomizedSearchCV
random_en = RandomizedSearchCV(
    elastic_net, params_en, n_iter=4, cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1, random_state=42
)

random_en.fit(X_train_scaled, y_train)
best_en = random_en.best_estimator_
y_pred_en = best_en.predict(X_test_scaled)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [21]:
y_pred_en

array([28.37991722, 34.70024967, 16.93960727, 24.82692633, 18.65260514,
       23.41423543, 18.14424809, 15.10701609, 22.06375288, 20.92818914,
       24.30936018, 19.57543648, -4.84134409, 22.25941104, 19.45561844,
       25.46846334, 19.380368  ,  5.77726936, 39.09009284, 17.5848564 ,
       26.41626705, 28.76825281, 12.41650341, 24.36203059, 17.60367911,
       15.0972923 , 22.86780308, 15.10611775, 22.43912004, 19.59935352,
       21.86825454, 25.24426277, 25.56546338, 17.43003033, 16.16168499,
       18.02179774, 31.36610292, 20.27437252, 23.77560468, 24.52045459,
       14.46782846, 30.51356293, 40.49795736, 18.11550156, 27.13416876,
       16.5976877 , 14.83369059, 25.78086469, 19.65320971, 30.26419844,
       21.93826718, 33.28156802, 16.91311382, 26.52566574, 38.37531879,
       22.11065949, 18.71635784, 31.07798649, 25.08888561, 13.95461429,
       23.23588407, 29.98126867, 30.4578272 , 16.33522267, 22.56116986,
       16.83248984, 20.15571059, 25.68184324, 29.45840475, 11.92

### Lasso y Lars

In [22]:
from sklearn.linear_model import LassoLars

# Modelo base
lasso_lars = LassoLars()

# Hiperparámetros reducidos
params_ll = {
    'alpha': [0.01, 1],
    'max_iter': [10, 50],
    'eps': [1e-4, 1e-8]
}

# RandomizedSearchCV
random_ll = RandomizedSearchCV(
    lasso_lars, params_ll, n_iter=4, cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1, random_state=42
)

random_ll.fit(X_train_scaled, y_train)
best_ll = random_ll.best_estimator_
y_pred_ll = best_ll.predict(X_test_scaled)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [23]:
y_pred_ll

array([29.00627172, 35.87992353, 15.04955899, 24.97360848, 18.75016713,
       23.27744603, 17.70129358, 14.36518914, 22.9134848 , 20.70062389,
       24.80658834, 18.71520745, -5.97940762, 21.79393739, 19.26706046,
       26.11031359, 20.36649495,  5.77178962, 40.36432071, 17.6062312 ,
       27.13357465, 29.91584776, 11.44794515, 24.17918972, 17.84159147,
       15.75883921, 22.78095027, 14.63054977, 22.41688072, 19.2680872 ,
       22.43997148, 25.22723356, 25.91824795, 17.69292722, 16.69230081,
       17.04621906, 31.2581581 , 20.13117946, 23.73281888, 24.57695997,
       14.01160262, 32.10193889, 42.42547192, 17.40453156, 27.32753476,
       16.95570709, 14.16101545, 25.84842349, 20.21447916, 30.06643845,
       21.34775865, 34.23564085, 16.14105879, 26.31023553, 39.46868514,
       22.50431565, 18.83424042, 32.52331542, 25.08344256, 12.95885924,
       22.68918285, 30.36909626, 31.43546039, 15.94868071, 20.46801265,
       16.69492957, 20.48984676, 25.93467951, 30.48369241, 11.62

### Comparacion de Metricas

In [24]:
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate

# Diccionario de modelos optimizados (sin LightGBM)
models = {
    'DecisionTree': best_dt_model,
    'RandomForest': best_rf_model,
    'XGBoost': best_xgb_model,
    'GradientBoosting': best_gb_model,
    'MLPRegressor': best_mlp,
    'SVR': best_svr,
    'KNeighbors': best_knn,
    'ElasticNet': best_en,
    'LassoLars': best_ll
}

# Modelos que necesitan escalado
scaled_models = ['MLPRegressor', 'SVR', 'KNeighbors', 'ElasticNet', 'LassoLars']

# Evaluar cada modelo con MSE y R²
results = []
for name, model in models.items():
    if name in scaled_models:
        y_pred = model.predict(X_test_scaled)
    else:
        y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append([name, round(mse, 4), round(r2, 4)])

# Mostrar tabla con resultados
print(tabulate(results, headers=['Modelo', 'MSE', 'R^2']))



Modelo                MSE     R^2
----------------  -------  ------
DecisionTree      16.3945  0.7764
RandomForest       9.2873  0.8734
XGBoost            7.5     0.8977
GradientBoosting  10.0601  0.8628
MLPRegressor      13.0898  0.8215
SVR               28.2748  0.6144
KNeighbors        16.2133  0.7789
ElasticNet        25.2036  0.6563
LassoLars         24.3326  0.6682
