In [117]:
import pandas as pd
df = pd.read_csv("housingData.csv")

In [118]:
#Dummies / One-Hot Encoding
cleaned_data = df.dropna()
dummies_ocean = pd.get_dummies(cleaned_data["ocean_proximity"], dtype=int)
cleaned_data = cleaned_data.join(dummies_ocean)
cleaned_data.drop("ocean_proximity", axis=1, inplace=True)

In [119]:
x = cleaned_data.drop("median_house_value", axis=1)
y = cleaned_data["median_house_value"]

# Prepare the data for the model

In [120]:
# divide data into training and testing
from sklearn.model_selection import train_test_split
X_entrena, X_prueba, Y_entrena, Y_prueba = train_test_split(x, y, test_size=.2)

# 1. Linear Regression (Regresión Lineal)

In [121]:
# Create model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [122]:
# Train model
model.fit(X_entrena, Y_entrena)

In [None]:
# Predict
prediccion = model.predict(X_prueba)
compartaion = pd.DataFrame({"Real": Y_prueba, "Prediccion": prediccion})
compartaion

In [123]:
# Score
print("Score: ", model.score(X_prueba, Y_prueba))
print("Score: ", model.score(X_entrena, Y_entrena))

Score:  0.6183491023344356
Score:  0.6526481758638547


In [None]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y_prueba, prediccion)
print("MSE: ", mse)

In [None]:
# Raiz cuadrada del error medio RMSE
import numpy as np
rmse = np.sqrt(mse)
print("RMSE: ", rmse)

In [None]:
# Evaluate model using cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, x, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
print("Puntajes de MSE en cada fold: ", rmse_scores)
print("Promedio de MSE: ", np.mean(rmse_scores))
print("Desviación estandar de MSE: ", np.std(rmse_scores))

In [97]:
def show_scores(score):
    print("Puntajes de MSE en cada fold: ", score)
    print("Promedio de MSE: ", np.mean(rmse_scores))
    print("Desviación estandar de MSE: ", np.std(rmse_scores))

# 2. Decision Tree (Arboles de desición)

In [None]:
# Desicion tree
from sklearn.tree import DecisionTreeRegressor
modelDecisionTree = DecisionTreeRegressor(random_state=42)
modelDecisionTree.fit(X_entrena, Y_entrena)

In [127]:
# Score
print("Score: ", modelDecisionTree.score(X_prueba, Y_prueba))
print("Score: ", modelDecisionTree.score(X_entrena, Y_entrena))

Score:  0.9319532804377815
Score:  0.9317934200047988


In [None]:
scores = cross_val_score(modelDecisionTree, x, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
show_scores(rmse_scores)

# 3. Random Forest (Bosques Aleatorios)

In [None]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_entrena, Y_entrena)

In [128]:
# Score
print("Score: ", random_forest.score(X_prueba, Y_prueba))
print("Score: ", random_forest.score(X_entrena, Y_entrena))

Score:  0.9446979160049913
Score:  0.9453089499809503


In [None]:
scores = cross_val_score(random_forest, x, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
show_scores(rmse_scores)

# Resumen

In [None]:
# Datos de evaluación
list_model = ['Linear Regression', 'DecisionTree', 'Random Forest']
list_mean = [72091.68777781783, 91700.83194344032, 68684.42981815965]
list_sd = [13759.321238818358, 21105.004412133527, 17476.07087805363]

# Agregamos una gráfica para comparar visualmente los resultados
import matplotlib.pyplot as plt

plt.figure(figsize=(4, 5))
for i in range(3):
    plt.errorbar(i, list_mean[i], xerr=0, yerr=list_sd[i], fmt='o', linewidth=3)

plt.xticks([0, 1, 2], list_model)
plt.title("Evaluación de los modelos")
plt.ylim(0, 120000)
plt.grid(visible=True, which='major', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(visible=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
plt.show()

# Models version 2 with standard scaler

In [131]:
from sklearn.preprocessing import StandardScaler
 
scaler = StandardScaler()
 
X_entrena_escalado = scaler.fit_transform(X_entrena)
X_prueba_escalado = scaler.fit_transform(X_prueba)
 
pd.DataFrame(X_entrena_escalado)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.120272,0.533885,1.858168,-0.358179,-0.239153,-0.593012,-0.286031,-1.171423,-0.894883,1.473607,-0.015645,-0.352652,-0.385093
1,0.632022,-0.706391,1.540797,-0.542632,-0.575910,-0.667773,-0.556414,-0.338913,1.117465,-0.678607,-0.015645,-0.352652,-0.385093
2,-1.256186,1.053397,-0.363430,1.589736,1.947398,0.295112,2.066040,-0.504640,-0.894883,-0.678607,-0.015645,2.835653,-0.385093
3,-0.832709,1.086159,-0.680802,-1.080050,-1.090532,-1.211816,-1.181183,-1.442522,-0.894883,1.473607,-0.015645,-0.352652,-0.385093
4,0.632022,-0.785956,1.461454,-0.390515,-0.473934,-0.377736,-0.446161,-0.701932,1.117465,-0.678607,-0.015645,-0.352652,-0.385093
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16341,0.726682,-0.804677,0.668026,-0.839578,-0.860494,-0.614629,-0.829423,-0.146961,1.117465,-0.678607,-0.015645,-0.352652,-0.385093
16342,0.487542,-0.664268,0.509340,-0.599562,-0.734803,-0.600218,-0.742795,0.184389,1.117465,-0.678607,-0.015645,-0.352652,-0.385093
16343,0.876144,-0.795316,-0.998173,1.448094,1.062817,0.795920,1.089511,0.392988,1.117465,-0.678607,-0.015645,-0.352652,-0.385093
16344,0.402846,-0.673629,-1.950286,6.405552,5.137107,4.291669,4.483739,1.429803,-0.894883,-0.678607,-0.015645,-0.352652,2.596777


# 1. Linear Regression (Regresión Lineal)

In [135]:
# Create model
from sklearn.linear_model import LinearRegression
linear_regression_model_scaler = LinearRegression()
linear_regression_model_scaler.fit(X_entrena_escalado, Y_entrena)
scores = cross_val_score(linear_regression_model_scaler, X_prueba_escalado, Y_prueba, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
show_scores(rmse_scores)

Puntajes de MSE en cada fold:  [69822.19319702 67749.70192796 67501.51578803 71559.1789408
 75899.23692625 76373.03315293 69901.78701548 69747.35194967
 74755.61804511 67002.23081913]
Promedio de MSE:  71031.18477623729
Desviación estandar de MSE:  3320.060938182026


In [139]:
print("Score: ", linear_regression_model_scaler.score(X_prueba_escalado, Y_prueba))
print("Score: ", linear_regression_model_scaler.score(X_entrena_escalado, Y_entrena))

Score:  0.6200261803458318
Score:  0.6526481758638547


# 2. Decision Tree (Arboles de desición)

In [140]:
# Desicion tree
from sklearn.tree import DecisionTreeRegressor
decision_tree_model_scaler = DecisionTreeRegressor(random_state=42)
decision_tree_model_scaler.fit(X_entrena_escalado, Y_entrena)
scores = cross_val_score(decision_tree_model_scaler, X_prueba_escalado, Y_prueba, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
show_scores(rmse_scores)

Puntajes de MSE en cada fold:  [76840.65531087 76576.61411216 77177.72627707 79951.3186605
 68057.78319087 87604.27516647 74956.39345209 75277.40625984
 79758.72818761 80401.73647068]
Promedio de MSE:  77660.26370881546
Desviación estandar de MSE:  4725.748359430206


In [141]:
print("Score: ", decision_tree_model_scaler.score(X_prueba_escalado, Y_prueba))
print("Score: ", decision_tree_model_scaler.score(X_entrena_escalado, Y_entrena))

Score:  0.4488515419124698
Score:  1.0


# 3. Random Forest (Bosques Aleatorios)

In [142]:
from sklearn.ensemble import RandomForestRegressor

random_forest_model_scaler = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model_scaler.fit(X_entrena_escalado, Y_entrena)
scores = cross_val_score(random_forest_model_scaler, X_prueba_escalado, Y_prueba, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
show_scores(rmse_scores)

Puntajes de MSE en cada fold:  [56176.67518938 57679.39663583 51989.45539225 54765.26963517
 50878.94113354 63379.9536622  54151.50558002 55517.74016071
 58474.83138835 56184.8485568 ]
Promedio de MSE:  55919.861733425816
Desviación estandar de MSE:  3330.0723479052


In [143]:
print("Score: ", random_forest_model_scaler.score(X_prueba_escalado, Y_prueba))
print("Score: ", random_forest_model_scaler.score(X_entrena_escalado, Y_entrena))

Score:  0.675005535930886
Score:  0.9755443385326762


# Resumen with standard scaler

In [None]:
# Datos de evaluación
list_model = ['Linear Regression', 'DecisionTree', 'Random Forest', 'Linear Regression Scaled', 'DecisionTree Scaled', 'Random Forest Scaled']
list_mean = [72091.68777781783, 91700.83194344032, 68684.42981815965, 68330.75685971466, 76378.01607656936, 53872.52762946496]
list_sd = [13759.321238818358, 21105.004412133527, 17476.07087805363, 6745.096427995872, 3708.6811813546096, 3549.473803525287]

# Agregamos una gráfica para comparar visualmente los resultados
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
for i in range(6):
    plt.errorbar(i, list_mean[i], xerr=0, yerr=list_sd[i], fmt='o', linewidth=3)

plt.xticks([0, 1, 2, 3, 4, 5], list_model)
plt.title("Evaluación de los modelos")
plt.ylim(0, 120000)
plt.grid(visible=True, which='major', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(visible=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
plt.show()

# Fine-tuning the model

In [148]:
# RandomForest
from sklearn.model_selection import GridSearchCV
  
# Definimos los posibles parámetros
n_estimators = [3, 10, 30, 50, 80, 100]
min_samples_split = [2, 4, 6, 8]
max_depth = [2, 4, 6, 8, 10]
 
params = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    'max_depth': max_depth
}
 
#Aplicamos el método de ajuste
grid_search = GridSearchCV(estimator=random_forest, param_grid=params, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_entrena, Y_entrena)

In [149]:
print("Mejores parámetros:", grid_search.best_params_)

Mejores parámetros: {'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 100}
