In [103]:
import sys
# Put a local directory at the front of the module search path, presumably so
# that the `eaLearn` imports further down resolve to that checkout.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not import eaLearn on another machine unless this is adjusted — consider a
# relative path or installing the package instead.
sys.path.insert(0, '/Users/emilioalberini/Desktop/eaLearn')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from eaLearn import LinearRegression, Lasso , Ridge, ElasticNet
from eaLearn.utils.data_manipulation import train_test_split

# 1. Preprocessing

In [78]:
# Location of the raw housing data, relative to the notebook's working directory.
HOUSING_CSV_PATH = 'data/housing.csv'

# Read the CSV into the DataFrame that the rest of the notebook works from.
housing = pd.read_csv(HOUSING_CSV_PATH)

In [79]:
# Preview the first five rows to check the columns that were loaded.
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [84]:
# Predictor columns. 'ocean_proximity' is last in this list; the column
# positions hard-coded for the numeric transformer later in the notebook
# rely on the numeric columns keeping this order.
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity'
]

# Kept as a one-element list so that indexing with it yields a
# single-column DataFrame rather than a Series.
target = ['median_house_value']

# Select from `housing`, the frame loaded from data/housing.csv. No variable
# named `df` is defined anywhere in this notebook, so using it here would
# raise NameError on a fresh kernel.
X = housing[features]
y = housing[target]

In [94]:
# Split features and target into train and test parts. The eaLearn helper is
# handed plain NumPy arrays rather than DataFrames; test_size=0.2 presumably
# reserves 20% of the rows for testing.
# NOTE(review): no seed is passed, so the split may differ between runs —
# confirm whether eaLearn's train_test_split accepts a seed argument.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.2,
)

In [95]:
# train_test_split was given plain arrays (X.values / y.values), so wrap the
# pieces back into labelled DataFrames.
#
# X mixes numeric columns with the string column 'ocean_proximity', which
# makes X.values an object-dtype array; a frame built from it would hold
# every column as object. infer_objects() gives the numeric columns a numeric
# dtype again and leaves the string column (and already-typed data) untouched.
X_train = pd.DataFrame(X_train, columns=features).infer_objects()
X_test = pd.DataFrame(X_test, columns=features).infer_objects()

# The target is a single numeric column, so no dtype repair is needed here.
y_test = pd.DataFrame(y_test, columns=target)
y_train = pd.DataFrame(y_train, columns=target)

In [100]:
# Bare expression so the notebook displays the rebuilt training feature frame.
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.39,34.1,12.0,7184.0,1516.0,4862.0,1235.0,2.4492,INLAND
1,-117.02,34.9,37.0,1199.0,351.0,782.0,296.0,1.6515,INLAND
2,-121.33,38.0,14.0,3731.0,772.0,1679.0,750.0,3.1369,INLAND
3,-122.11,37.11,46.0,1993.0,404.0,850.0,327.0,5.208,NEAR OCEAN
4,-121.86,37.41,16.0,1489.0,262.0,945.0,263.0,7.3861,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
16342,-119.23,34.42,16.0,630.0,117.0,343.0,100.0,5.75,<1H OCEAN
16343,-122.64,41.95,18.0,1867.0,424.0,802.0,314.0,1.8242,INLAND
16344,-122.71,38.42,23.0,1569.0,414.0,1031.0,368.0,1.6267,<1H OCEAN
16345,-121.2,37.78,4.0,58.0,29.0,79.0,29.0,3.375,INLAND


In [101]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform when it slices
# the 2-D array it is given (X[:, ix]).
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    The columns to combine are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms / rooms column is appended in addition to the
        two per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` must support two-axis slicing (``X[:, i]``). The appended
        columns are, in order: rooms per household, population per household
        and, if ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        derived = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room

        # Indexing np.c_ with a tuple is the same as writing the items inline.
        return np.c_[tuple([X] + derived)]

In [104]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain, applied in this order:
#   1. fill missing values with the column median,
#   2. append the ratio columns from CombinedAttributesAdder,
#   3. standardise every column with StandardScaler.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [105]:
from sklearn.compose import ColumnTransformer

# The text column is one-hot encoded; every other column of X_train is
# routed through the numeric pipeline.
cat_attribs = ["ocean_proximity"]

X_train_num = X_train.drop(columns=cat_attribs)
num_attribs = X_train_num.columns.tolist()

# Each entry is (name, transformer, columns the transformer receives).
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [106]:
# Fit the combined column transformer on the training features and transform
# them in the same call.
housing_prepared = full_pipeline.fit_transform(X_train)
# Bare expression so the notebook displays the transformed result.
housing_prepared

array([[ 1.08651488, -0.71791802, -1.32541758, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.27105293, -0.34399193,  0.65824564, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87856599,  1.10497166, -1.16672453, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.56684304,  1.30128286, -0.45260577, ...,  0.        ,
         0.        ,  0.        ],
       [-0.81372829,  1.00214199, -1.96018982, ...,  0.        ,
         0.        ,  0.        ],
       [-0.72395303,  0.941379  , -0.69064535, ...,  0.        ,
         0.        ,  0.        ]])