In [189]:
import pandas as pd
import numpy as np

with open('../datasets/housing/housing.csv') as f:
    data  = pd.read_csv(f)

In [190]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

### Estratificación según ingreso medio

In [191]:
def stratified_data_split(data):
    """ hace un split que refleja la realidad dando iguales proporciones en la muestra sobre un atributo """
    data['income_category'] = np.ceil(data['median_income']/1.5)
    data['income_category'].where(data['income_category'] < 5, 5.0, inplace=True)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(data, data['income_category']))
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]
    strat_test_set.drop(['income_category'], axis=1, inplace=True)
    strat_train_set.drop(['income_category'], axis=1, inplace=True)
    return strat_train_set, strat_test_set


strat_train_set, strat_test_set = stratified_data_split(data)
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop('median_house_value',axis=1)
# Hago una copia del set de datos para no alterarloo mientras exploro

### reemplazando valores nulos

In [192]:
# relleno los valores nulos con la mediana, pero también hay otras posibildades

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

# tiro todas las columnas no numericas porque no puedo calcular la mediana en ellas
housing_num = housing.drop('ocean_proximity',axis = 1)
imputer.fit(housing_num)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [193]:
#imputer.statistics_

X = imputer.transform(housing_num)
# X es un array numpy, lo vuelvo a poner en un data frame
housing_tr = pd.DataFrame(X, columns=housing_num.columns)


### encodeando categorias a atributos numericos

In [194]:
# convert text columns to numbers so the algorithms can handle them. Este puede provocar que los algoritmos
#supongan que la categoria 0 sea similar a la 1
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)


In [195]:
# para que no haya similitudes falsas, uso un vector binario
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
unspecified = -1
housing_cat_1hot= encoder.fit_transform(housing_cat_encoded.reshape(unspecified,1)) # -1 means unspecified


In [196]:
#To combine the two steps in one we have a LabelBinarizer
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output=True)
housing_cat = housing['ocean_proximity']
housing_cat_encoded_1hot = encoder.fit_transform(housing_cat)


### derivando atributos con mejor correlación

In [197]:
# puedo hacer un Transformer que me agregue atributos, y lo puedo poner luego en un pipeline
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

# Base Estimator me da un get_params y set_params
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
            self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix]/X[:, household_ix]
        population_per_household = X[:, population_ix]/X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(False)
housing_extra_attr = attr_adder.transform(housing.values)

### pipeline

In [198]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """ handles pandas dataframes and returns columns as numpy arrays. There is also a sklearn-pandas egg, and it
    may be that something is added to sklearn in the future as ColumnTransfrmer"""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [199]:
# combinando todo en pipelines combinadas
from sklearn.pipeline import FeatureUnion


# empiezo de nuevo haciendo un split
strat_train_set, strat_test_set = stratified_data_split(data)
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop('median_house_value',axis=1)


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                         ('imputer', Imputer(strategy="median")),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaleer', StandardScaler())])
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                         ('label_binarizer', LabelBinarizer())])

full_pipeline = FeatureUnion(transformer_list = [('num_pipeline', num_pipeline),
                                                ('cat_pipeline', cat_pipeline)])

housing_prepared = full_pipeline.fit_transform(housing)

### Training a Linear Regression Model

In [202]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(housing_prepared, housing_labels)


# empizo de nuevo haciendo un split
strat_train_set, strat_test_set = stratified_data_split(data)
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop('median_house_value',axis=1)

housing_prepared = full_pipeline.fit_transform(housing)

# entreno en unos pocos ejemplos
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)
print("predictions", linear_reg.predict(some_data_prepared))
print("labels", list(some_labels))

predictions [ 210644.60459286  317768.80697211  210956.43331178   59218.98886849
  189747.55849879]
labels [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


#### Mido el RMSE en TODO el training set

In [203]:
from sklearn.metrics import mean_squared_error

housing_predictions = linear_reg.predict(housing_prepared)
linear_mse = mean_squared_error(housing_labels, housing_predictions)
linear_mse = np.sqrt(linear_mse)
print(linear_mse)

68628.1981985


#### como el error es muy significativo, intento con un modelo mas complejo
### DecisionTrees

In [207]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

0.0


Hice overfitting, intento dividir el training set en kfolds para hacer crossvalidation

In [210]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
print(rms_scores)

NameError: name 'rms_scores' is not defined