In [40]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from ipywidgets import interact, FloatSlider
from numpy.random import seed, randn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.tree import DecisionTreeRegressor, export_graphviz, plot_tree
from sklearn.ensemble import VotingRegressor, BaggingRegressor, RandomForestRegressor

In [3]:
%config InlineBackend.figure_format = "retina"
plt.rcParams["figure.figsize"] = (10, 5)
plt.rcParams["font.size"] = 13

In [4]:
housing = pd.read_csv("train.csv")

In [5]:
train, test = train_test_split(housing, test_size=0.2, random_state=2718)
xtrain, ytrain = train.drop("SalePrice", axis=1), train["SalePrice"]

In [6]:
#Transformamos las variables categóricas, llenamos datos vacíos para las variables float y obtenemos el logaritmo natural 
#de LotArea y ajustamos un Lasso

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("lasso", Lasso(alpha=0.1, tol = 0.5))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.49
mean mae: 39,958.02


In [7]:
#Añadimos una estandarización de la variable LotArea

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("lasso", Lasso(alpha=0.1, tol = 0.5))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.49
mean mae: 39,959.92


In [13]:
#Agregamos un polinomio de grado 10

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"]),
     ('poly', PolynomialFeatures(degree=10, include_bias=False), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("lasso", Lasso(alpha=0.1, tol = 0.5))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.49
mean mae: 39,908.83


In [17]:
#Ajustamos un Ridge

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("ridge", Ridge(alpha=0.1))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.49
mean mae: 39,982.16


In [28]:
#Ajustamos una regresión logística

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("tree", LogisticRegression(C=1))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

mean r2: -0.16
mean mae: 56,903.05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [27]:
#Ajustamos un árbol de decisión

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("tree", DecisionTreeRegressor(max_depth=5, random_state=314))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.48
mean mae: 37,971.79


In [34]:
#Ajustamos un ensemble voting

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

tree = DecisionTreeRegressor(max_depth=3)
las = Lasso(alpha=0.1, tol = 0.5)
rid = Ridge(alpha=0.1)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("vote", VotingRegressor([("tree", tree), ("ridge", rid), ("lasso", las)]))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.54
mean mae: 36,992.30


In [35]:
#Ajustamos un ensemble bagging con decision tree

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("bag", BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, max_samples=100, n_jobs=-1))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.59
mean mae: 33,471.47


In [39]:
#Ajustamos un ensemble bagging con ridge

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("bag", BaggingRegressor(Ridge(), n_estimators=500, bootstrap=True, max_samples=100, n_jobs=-1))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.49
mean mae: 39,795.50


In [41]:
#Ajustamos un ensemble bagging con ridge

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("r_forest", RandomForestRegressor(n_estimators=400, random_state=314, n_jobs=-1, oob_score=True))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"mean r2: {r2.mean():0.2f}")
print(f"mean mae: {mae.mean():,.2f}")

mean r2: 0.58
mean mae: 33,805.93


In [42]:
#El mejor resultado lo obtenemos con un ensemble bagging con decision tree

column_selector = ColumnTransformer(
    [("label_bin", OneHotEncoder(sparse=False), ["MSZoning", "SaleCondition"]),
     ("numeric", SimpleImputer(), make_column_selector(dtype_include="float")),
     ('log', FunctionTransformer(np.log1p, validate=False), ["LotArea"]),
     ('scale', StandardScaler(), ["LotArea"])]
)

pipe = Pipeline([
    ("select_cols", column_selector),
    ("bag", BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, max_samples=100, n_jobs=-1))
])

cv = cross_validate(pipe, xtrain, ytrain, cv=5, scoring=["r2", "neg_mean_absolute_error"])
r2 = cv["test_r2"]
mae = -cv["test_neg_mean_absolute_error"]

print(f"Mejor resultado (bagging con decision tree) mean r2: {r2.mean():0.2f}")
print(f"Mejor resultado (bagging con decision tree) mean mae: {mae.mean():,.2f}")

Mejor resultado (bagging con decision tree) mean r2: 0.59
Mejor resultado (bagging con decision tree) mean mae: 33,672.67
