In [1]:
import pandas as pd

data = pd.read_csv("housing.csv")

data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [2]:
data.shape

(20640, 10)

In [3]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [4]:
# Codificar 'ocean_proximity' usando map
data['ocean_proximity'] = data['ocean_proximity'].map({
    'NEAR BAY': 0,
    '<1H OCEAN': 1,
    'INLAND': 2,
    'NEAR OCEAN': 3,
    'ISLAND': 4
})


In [5]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
data['total_bedrooms'].fillna(data['total_bedrooms'].mean(), inplace=True)

In [7]:
data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [8]:
# Para clasificar primero debemos separar en 2 variables independiente y dependiente
# X = variables independientes 
# y = variable dependiente, que es la que quiero predecir o clasificar


# creacion de variables independientes
X = data.drop('median_house_value', axis=1) 
# creacion de la variable dependiente
y = data['median_house_value']


In [9]:
# seperar los datos en conjunto de entrenamiento y prueba 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Arbol de Decision

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# División del dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenamiento del modelo
modelo_arbol = DecisionTreeRegressor(random_state=42)
modelo_arbol.fit(X_train, y_train)

# Predicciones
y_pred = modelo_arbol.predict(X_test)

# Cálculo de métricas
mae = mean_absolute_error(y_test, y_pred) # Error Absoluto Medio
mse = mean_squared_error(y_test, y_pred) # Error Cuadratico Medio
rmse = np.sqrt(mse) # Raiz del Error Cuadratico Medio
r2 = r2_score(y_test, y_pred) # Coeficiente de Determinacion

# MAPE 
mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

# Resultados
print("Árbol de Decisión")
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


Árbol de Decisión
MAE : 44582.7946
MSE : 4820833294.1318
RMSE: 69432.2209
MAPE: 24.19 %
R²  : 0.6321


# Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Entrenamiento del modelo
modelo_rf = RandomForestRegressor(n_estimators=100, random_state=42)
modelo_rf.fit(X_train, y_train)

# Predicciones
y_pred = modelo_rf.predict(X_test)

# Cálculo de métricas
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

# Resultados
print("Random Forest")
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))

Random Forest
MAE : 32428.6737
MSE : 2557794454.6855
RMSE: 50574.6424
MAPE: 18.07 %
R²  : 0.8048


# Escalado de datos

In [12]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_scaled, y, test_size=0.2, random_state=42)



# XGBoost

In [14]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Entrenamiento del modelo con datos escalados
modelo_xgb = XGBRegressor(random_state=42)
modelo_xgb.fit(X_train_s, y_train_s)

# Predicciones
y_pred = modelo_xgb.predict(X_test_s)

# Cálculo de métricas
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_s, y_pred)
mape = np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s))) * 100

# Resultados
print("XGBoost")
print("MAE :", round(mae, 4))
print("MSE :", round(mse, 4))
print("RMSE:", round(rmse, 4))
print("MAPE:", round(mape, 2), "%")
print("R²  :", round(r2, 4))


XGBoost
MAE : 32076.3255
MSE : 2337325508.8438
RMSE: 48345.8944
MAPE: 18.22 %
R²  : 0.8216


# KNN

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Entrenamiento del modelo con datos escalados
modelo_knn = KNeighborsRegressor(n_neighbors=5)
modelo_knn.fit(X_train_s, y_train_s)

# Predicciones
y_pred = modelo_knn.predict(X_test_s)

# Cálculo de métricas
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_s, y_pred)
mape = np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s))) * 100

# Resultados
print("KNN Regressor")
print("MAE  :", round(mae, 4))
print("MSE  :", round(mse, 4))
print("RMSE :", round(rmse, 4))
print("MAPE :", round(mape, 2), "%")
print("R²   :", round(r2, 4))


KNN Regressor
MAE  : 41025.4201
MSE  : 3822879504.771
RMSE : 61829.4388
MAPE : 22.21 %
R²   : 0.7083


# Maquina de Soporte Vectorial

In [16]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Entrenamiento del modelo con datos escalados
modelo_svr = SVR(kernel='rbf')
modelo_svr.fit(X_train_s, y_train_s)

# Predicciones
y_pred = modelo_svr.predict(X_test_s)

# Cálculo de métricas
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_s, y_pred)
mape = np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s))) * 100

# Resultados
print("SVM Regressor")
print("MAE  :", round(mae, 4))
print("MSE  :", round(mse, 4))
print("RMSE :", round(rmse, 4))
print("MAPE :", round(mape, 2), "%")
print("R²   :", round(r2, 4))



SVM Regressor
MAE  : 87081.0609
MSE  : 13681275294.7933
RMSE : 116966.9838
MAPE : 52.84 %
R²   : -0.044


# Perceptron Multicapa

In [17]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Entrenamiento del modelo con datos escalados
modelo_mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=500, random_state=42)
modelo_mlp.fit(X_train_s, y_train_s)

# Predicciones
y_pred = modelo_mlp.predict(X_test_s)

# Cálculo de métricas
mae = mean_absolute_error(y_test_s, y_pred)
mse = mean_squared_error(y_test_s, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_s, y_pred)
mape = np.mean(np.abs((y_test_s - y_pred) / np.where(y_test_s == 0, 1e-10, y_test_s))) * 100

# Resultados
print("MLP Regressor")
print("MAE  :", round(mae, 4))
print("MSE  :", round(mse, 4))
print("RMSE :", round(rmse, 4))
print("MAPE :", round(mape, 2), "%")
print("R²   :", round(r2, 4))


MLP Regressor
MAE  : 82985.5974
MSE  : 13097090227.9719
RMSE : 114442.5193
MAPE : 41.86 %
R²   : 0.0005


