In [1]:
from sklearn.base import BaseEstimator, clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random generator so that the random row subsampling done
# in SequentialRegressor.fit (np.random.choice) is repeatable across runs.
np.random.seed(357823)

In [3]:
def checkCondition(condition, message=None):
    """Raise a generic ``Exception`` unless ``condition`` is truthy.

    Parameters
    ----------
    condition : object
        Value evaluated for truthiness; nothing happens when it is truthy.
    message : object, optional
        Passed as the single argument of the raised ``Exception``.

    Raises
    ------
    Exception
        When ``condition`` is falsy.
    """
    if condition:
        return
    raise Exception(message)

In [4]:
class SequentialRegressor(BaseEstimator):
    """Sequential ensemble regressor that fits each model on the current residuals.

    Starting from an all-zero prediction, every round clones the base
    estimator, fits it on a random subsample (without replacement) of the
    residuals ``objective - prediction`` and adds ``lr`` times its output to
    the running prediction.

    Parameters
    ----------
    estimator : BaseEstimator
        Unfitted scikit-learn estimator used as the template for every round.
    n_estimators : int
        Number of sequential rounds (>= 1).
    sample_size : float
        Fraction of the training rows, in [0, 1], drawn for each round.
    lr : float
        Factor applied to each model's prediction before it is accumulated.
    hyperparameters : dict or None
        Parameters applied to every clone through ``set_params``.
        ``None`` is treated as "no overrides".
    """

    def __init__(self, estimator, n_estimators,
                 sample_size, lr, hyperparameters):
        checkCondition(isinstance(estimator, BaseEstimator),
                       "Estimator must be an instance of Scikit-learn class")
        checkCondition(n_estimators >= 1, "Number of estimators must be bigger or equals to 1")
        checkCondition(0. <= sample_size <= 1., "The sample size must be between 0 and 1")
        # Parameters are stored untouched so that sklearn's clone()/get_params()
        # (used by cross_validate in evaluate) can rebuild the estimator.
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.hyperparameters = hyperparameters

    def fit(self, attributes, objective):
        """Fit ``n_estimators`` models sequentially on the running residuals.

        Parameters
        ----------
        attributes : array-like
            Training features; converted with ``np.asarray``.
        objective : array-like
            Training targets; converted with ``np.asarray`` and flattened.

        Returns
        -------
        list
            The fitted models, also stored in ``self.models``. The list (not
            ``self``) is returned to keep the existing interface.
        """
        attributes = np.asarray(attributes)
        objective = np.asarray(objective).ravel()

        rows = attributes.shape[0]
        # int() truncates, so a small fraction or a tiny dataset could ask for
        # zero rows and make the base estimator fail on an empty sample; always
        # draw at least one row. The value does not change between rounds.
        n_samples = max(1, int(self.sample_size * rows))
        # Accept hyperparameters=None as "no overrides".
        hyperparameters = self.hyperparameters or {}

        prediction = np.zeros(rows)
        self.models = []

        for _ in range(self.n_estimators):
            # Each round targets what the ensemble still gets wrong.
            residual = objective - prediction
            # Uses NumPy's global RNG: seed it with np.random.seed for repeatability.
            selected = np.random.choice(rows, size=n_samples, replace=False)
            model = clone(self.estimator)
            model.set_params(**hyperparameters)
            model.fit(attributes[selected], residual[selected])
            self.models.append(model)
            # The update uses predictions on ALL rows, not only the sampled ones.
            prediction = prediction + self.lr * model.predict(attributes)
        return self.models

    def predict(self, attributes):
        """Return the sum of ``lr``-scaled predictions of every fitted model.

        Parameters
        ----------
        attributes : array-like
            Features to predict for; converted with ``np.asarray``.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``attributes``.
        """
        attributes = np.asarray(attributes)

        predictions = np.zeros(attributes.shape[0])
        for model in self.models:
            predictions += self.lr * model.predict(attributes)
        return predictions

    def evaluate(self, attributes, objective, k):
        """Cross-validate this estimator with ``k`` folds.

        Parameters
        ----------
        attributes : array-like
            Features of the whole dataset.
        objective : array-like
            Targets of the whole dataset.
        k : int
            Number of folds; must be at least 2 because cross-validation needs
            at least one train/test split.

        Returns
        -------
        dict
            Mean test scores under the keys ``'r2'`` and
            ``'neg_root_mean_squared_error'``.

        Raises
        ------
        Exception
            If ``k`` is smaller than 2.
        """
        checkCondition(k >= 2, "The number of partitions must be bigger or equals to 2")
        metrics = ['r2', 'neg_root_mean_squared_error']
        results = cross_validate(self, attributes, objective, scoring=metrics, cv=k)
        return {metric: results['test_' + metric].mean() for metric in metrics}

**Lectura y transformación de datos**

In [5]:
# Load the house-price table from a CSV file given by a relative path.
# NOTE(review): the preview below shows an "Unnamed: 0" column holding 0, 1, 2, ...
# which looks like a saved row index; confirm whether it should be read with
# index_col=0 instead of being kept as a feature.
houses = pd.read_csv("house_prices.csv")
# Preview the first rows.
houses.head()

Unnamed: 0,GarageCars,Condition2,YearBuilt,GarageYrBlt,LandContour,LowQualFinSF,HouseStyle,GarageType,MSSubClass,WoodDeckSF,...,MiscVal,BsmtExposure,OpenPorchSF,ExterCond,Fireplaces,FullBath,BsmtQual,MiscFeature,PoolQC,SalePrice
0,2,Norm,1962,1977.0,Lvl,0,1Story,Detchd,20,0,...,0,none,0,TA,0,1,TA,none,none,132000
1,0,Norm,1914,0.0,Lvl,0,2.5Unf,none,75,0,...,0,none,291,TA,1,2,TA,none,none,128000
2,2,Norm,1999,1999.0,Lvl,0,1Story,Attchd,20,0,...,0,Av,35,TA,0,2,Gd,none,none,192000
3,1,Norm,1948,1948.0,Bnk,0,2Story,Attchd,20,103,...,0,none,0,Gd,0,3,TA,none,none,225000
4,2,Norm,1950,1950.0,Lvl,0,1Story,Detchd,20,0,...,0,none,29,TA,0,1,none,none,none,109900


In [6]:
# Features are every column except the last one; the target is the last column.
# .copy() makes `atributos` an independent frame, so the column assignment below
# does not write into a slice of `houses` (avoids pandas' SettingWithCopyWarning).
atributos = houses.iloc[:, :-1].copy()
objetivo = houses.iloc[:, -1:]

# Replace text-valued columns with ordinal codes so the estimators receive
# numeric input. `encoder` stays fitted for later reuse.
atributos_discretos = atributos.select_dtypes(include=['object', 'string']).columns
encoder = OrdinalEncoder()
atributos[atributos_discretos] = encoder.fit_transform(atributos[atributos_discretos])

**Pruebas de métodos**

In [7]:
# Settings applied to every base tree through set_params.
hyperparameters = dict(max_depth=10)

# Sequential ensemble of 4 trees, each fitted on 75 % of the rows, with a 0.5 learning rate.
decisionTree = SequentialRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=4,
    sample_size=0.75,
    lr=0.5,
    hyperparameters=hyperparameters,
)

In [8]:
# 10-fold cross-validation on the full dataset; prints the mean R² and the mean
# negative RMSE across folds, as returned by SequentialRegressor.evaluate.
print(decisionTree.evaluate(atributos, objetivo, 10))

{'r2': 0.5935500945607923, 'neg_root_mean_squared_error': -48247.45397833455}


In [9]:
# Hold out 20 % of the rows as a test set.
atributos_entrenamiento, atributos_prueba, objetivo_entrenamiento, objetivo_prueba = train_test_split(
    atributos,
    objetivo,
    test_size=0.2,
)

In [10]:
# Build a fresh (unfitted) ensemble, train it on the training split and report
# R² on the held-out split.
decisionTree = SequentialRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=4,
    sample_size=0.75,
    lr=0.5,
    hyperparameters=hyperparameters,
)
decisionTree.fit(atributos_entrenamiento, objetivo_entrenamiento)
predictions = decisionTree.predict(atributos_prueba)
print(r2_score(y_true=objetivo_prueba, y_pred=predictions))

0.6929053693008591


In [11]:
# Baseline: a single decision tree with the same depth limit, trained and
# scored on the same split for comparison with the sequential ensemble.
comparar_tree = DecisionTreeRegressor(max_depth=10).fit(
    atributos_entrenamiento,
    objetivo_entrenamiento,
)
comparar_predictions = comparar_tree.predict(atributos_prueba)
score2 = r2_score(y_true=objetivo_prueba, y_pred=comparar_predictions)
print(score2)

0.6735289601003858
