In [None]:
import pandas as pd

# Load the dataset from a CSV file located in the working directory.
# NOTE(review): the file is named "boston.csv", but later cells use columns
# such as 'ocean_proximity', 'total_bedrooms' and 'median_house_value' --
# confirm that the file name matches the data it actually contains.
data = pd.read_csv("boston.csv")

# Preview the first rows of the frame.
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the frame;
# with default arguments pandas summarizes the numeric columns.
data.describe()

In [None]:
# Encode 'ocean_proximity' as integer codes using map.
ocean_proximity_codes = {
    'NEAR BAY': 0,
    '<1H OCEAN': 1,
    'INLAND': 2,
    'NEAR OCEAN': 3,
    'ISLAND': 4
}

# Only encode while the column still holds the raw text labels. Series.map
# turns every value that is not a key of the mapping into NaN, so re-running
# this cell on an already encoded (numeric) column would wipe it out.
if not pd.api.types.is_numeric_dtype(data['ocean_proximity']):
    data['ocean_proximity'] = data['ocean_proximity'].map(ocean_proximity_codes)


In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Impute missing 'total_bedrooms' values with the column median.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place
# form acts on an intermediate object, emits a warning on pandas 2.x and
# does not update `data` once Copy-on-Write is enabled.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].median())

In [None]:
# Recount the missing values per column after the imputation step.
data.isna().sum()

In [None]:
# Before modelling, the table is separated into two parts:
#   X -> independent variables (the features)
#   y -> dependent variable, i.e. the value to be predicted

# Dependent variable (target)
y = data['median_house_value']
# Independent variables: every column except the target
X = data.drop(columns='median_house_value')


In [None]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing; the fixed seed makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Árbol de Decisión

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; the fixed seed keeps the fitted tree reproducible.
modelo_arbol = DecisionTreeRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = {
    # Maximum depth of the tree (None = grow without a depth limit)
    'max_depth': [3, 5, 10, None],
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5, 10],
    # Minimum number of samples required at a leaf node
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 5-fold cross-validation. The scorer is the negated
# MSE, so maximizing it selects the combination with the lowest MSE.
grid_search = GridSearchCV(
    modelo_arbol,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best estimator found by the search, then its predictions on the test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Test-set metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test)))

# Report
print("Árbol de Decisión (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; the fixed seed keeps the fitted forest reproducible.
modelo_rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of trees in the forest
    'n_estimators': [100, 200],
    # Maximum depth of each tree (None = no depth limit)
    'max_depth': [None, 10, 20],
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5],
    # Minimum number of samples required at a leaf node
    'min_samples_leaf': [1, 2],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_rf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best estimator found by the search, then its predictions on the test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Test-set metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test)))

# Report
print("Random Forest (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# Gradient Boosting 

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; the fixed seed keeps the boosting run reproducible.
modelo_gb = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of boosting stages
    'n_estimators': [100, 200],
    # Shrinkage applied to the contribution of each tree
    'learning_rate': [0.05, 0.1, 0.2],
    # Maximum depth of the individual trees
    'max_depth': [3, 5, 7],
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5],
    # Minimum number of samples required at a leaf node
    'min_samples_leaf': [1, 3],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_gb,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best estimator found by the search, then its predictions on the test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Test-set metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test)))

# Report
print("Gradient Boosting Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# LightGBM

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; the fixed seed keeps the boosting run reproducible.
modelo_lgbm = LGBMRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of boosted trees
    'n_estimators': [100, 200],
    # Boosting learning rate
    'learning_rate': [0.05, 0.1, 0.2],
    # Maximum tree depth (-1 = no limit in LightGBM)
    'max_depth': [-1, 5, 10],
    # Maximum number of leaves per tree
    'num_leaves': [31, 50, 100],
    # Minimum number of samples required in a leaf
    'min_child_samples': [10, 20],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_lgbm,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best estimator found by the search, then its predictions on the test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Test-set metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test)))

# Report
print("LightGBM Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# CatBoost

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; verbose=0 silences the per-iteration training log and the
# fixed seed keeps the run reproducible.
modelo_cat = CatBoostRegressor(random_state=42, verbose=0)

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of boosting iterations (trees)
    'iterations': [100, 200],
    # Boosting learning rate
    'learning_rate': [0.03, 0.1],
    # Depth of the trees
    'depth': [4, 6, 8],
    # L2 regularization coefficient on the leaf values
    'l2_leaf_reg': [1, 3, 5],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_cat,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best estimator found by the search, then its predictions on the test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Test-set metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test)))

# Report
print("CatBoost Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# Escalado de datos

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import StandardScaler

# Split first and fit the scaler on the training rows only. Fitting it on the
# whole of X before splitting would let the mean/std of the test rows leak
# into the preprocessing and make the test metrics optimistic.
# Same test_size / random_state as the unscaled split, so the partition
# contains the same rows.
X_train_raw, X_test_raw, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train_raw)  # learn mean/std from training data
X_test_s = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept so that any cell relying on X_scaled keeps working: the full feature
# matrix standardized with the training-set statistics.
X_scaled = scaler.transform(X)



# XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator; verbosity=0 silences XGBoost messages and the fixed seed
# keeps the run reproducible.
modelo_xgb = XGBRegressor(verbosity=0, random_state=42)

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of boosting rounds
    'n_estimators': [100, 200],
    # Boosting learning rate
    'learning_rate': [0.05, 0.1, 0.2],
    # Maximum depth of the individual trees
    'max_depth': [3, 5, 7],
    # Fraction of training rows sampled for each tree
    'subsample': [0.8, 1.0],
    # Fraction of columns sampled for each tree
    'colsample_bytree': [0.8, 1.0],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_xgb,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training split.
grid_search.fit(X_train_s, y_train_s)

# Best estimator found by the search, then its predictions on the scaled
# test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test_s)

# Test-set metrics.
r2 = r2_score(y_test_s, y_pred)
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s)))

# Report
print("XGBoost Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator with default settings; the grid below tunes it.
modelo_knn = KNeighborsRegressor()

# Hyperparameter grid explored by the search.
param_grid = {
    # Number of neighbours used for each prediction
    'n_neighbors': [3, 5, 7, 9],
    # 'uniform': all neighbours weigh the same; 'distance': closer ones weigh more
    'weights': ['uniform', 'distance'],
    # Minkowski power: p=1 is Manhattan distance, p=2 is Euclidean distance
    'p': [1, 2],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_knn,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training split.
grid_search.fit(X_train_s, y_train_s)

# Best estimator found by the search, then its predictions on the scaled
# test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test_s)

# Test-set metrics.
r2 = r2_score(y_test_s, y_pred)
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s)))

# Report
print("KNN Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE  :", round(mae, 4))
print("MSE  :", round(mse, 4))
print("RMSE :", round(rmse, 4))
print("MAPE :", round(mape, 2), "%")
print("R²   :", round(r2, 4))


# Máquina de Soporte Vectorial

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Base estimator: support vector regression with an RBF kernel.
modelo_svr = SVR(kernel='rbf')

# Hyperparameter grid explored by the search.
# NOTE(review): C and epsilon act on the scale of the target, and only the
# features are standardized in this notebook (y_train_s is the raw
# 'median_house_value'). If the target spans a large numeric range these
# values may be far too small -- confirm, or consider scaling the target.
param_grid = {
    # Regularization strength (larger C = weaker regularization)
    'C': [0.1, 1, 10],
    # Half-width of the tube inside which errors are not penalized
    'epsilon': [0.01, 0.1, 0.2],
    # RBF kernel coefficient
    'gamma': ['scale', 'auto'],
}

# Exhaustive search with 5-fold cross-validation, ranked by negated MSE.
grid_search = GridSearchCV(
    modelo_svr,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training split.
grid_search.fit(X_train_s, y_train_s)

# Best estimator found by the search, then its predictions on the scaled
# test split.
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test_s)

# Test-set metrics.
r2 = r2_score(y_test_s, y_pred)
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
# MAPE in percent; targets equal to zero are replaced by 1e-10 in the
# denominator to avoid dividing by zero.
mape = 100 * np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s)))

# Report
print("SVR Regressor (con GridSearch)")
print("Mejores hiperparámetros:", grid_search.best_params_)
print("MAE  :", round(mae, 4))
print("MSE  :", round(mse, 4))
print("RMSE :", round(rmse, 4))
print("MAPE :", round(mape, 2), "%")
print("R²   :", round(r2, 4))
