In [7]:
import numpy as np
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import os
import src.chapter2 as ch2

# Reload imported modules automatically before each cell runs, so edits to
# src.chapter2 are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

ImportError: cannot import name 'TransfomerMixin'

In [None]:
# Remote base URL, local target directory and full archive URL of the housing dataset.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Helper from src.chapter2 (not visible here); presumably downloads the archive and
# unpacks it into HOUSING_PATH -- TODO confirm against the module.
ch2.fetch_housing_data(HOUSING_URL, HOUSING_PATH)

In [None]:
# Load the dataset through the src.chapter2 helper (presumably a pandas DataFrame,
# given how it is used below) and display its (rows, columns) shape.
housing_data = ch2.load_housing_data(HOUSING_PATH)
housing_data.shape

In [None]:
# Preview the first five rows (the default for head()).
housing_data.head()

Let's see some basic statistics.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing_data.describe()

Let's check the number of missing values.

In [None]:
# Number of missing values in each column.
housing_data.isnull().sum()

Similar information can be obtained using the `info` method.

In [None]:
# Column dtypes and non-null counts in a single overview.
housing_data.info()

Only one column is non-numeric.

In [None]:
# Absolute counts first, then relative frequencies, of the only non-numeric column.
print(
    housing_data['ocean_proximity'].value_counts(),
    housing_data['ocean_proximity'].value_counts(normalize=True),
    sep='\n',
)

Note, if you don't include the `plt.show()` command, `hist` will also show some text data about the plots. Here we are using the default number of bins, but it is useful to experiment with different bin sizes. From here, for example, it's not immediately obvious if and where capping has been applied.

In [None]:
# One histogram per numeric column, using the default number of bins.
housing_data.hist(figsize=(20, 15))
# plt.show() renders the figure and keeps the textual repr of the axes out of the output.
plt.show()

In the graph below, the capping of `housing_median_age` and `median_house_value` is clearly visible.

In [None]:
# Same histograms with 50 bins, fine enough to expose the spikes caused by capping.
housing_data.hist(bins=50, figsize=(20, 15))
plt.show()

**TO DO** experiment with binning the values of these two predictors.

**TO DO** try using xgboost and see if it performs significantly better than random forests.

We could check whether the prices are very different for different locations.

In [None]:
# Distribution of the target, drawn separately for each ocean-proximity category.
housing_data['median_house_value'].hist(
    by=housing_data['ocean_proximity'],
    bins=50,
    figsize=(20, 15),
)
plt.show()

Yes, it seems that inland houses are cheaper, and the peak for the high-valued houses is not visible in that category.

In the `chapter2` module there is a `split_train_test_by_id` function that splits a dataset by crc32 hash. One example of its application is shown here.

In [None]:
# reset_index() turns the row index into an 'index' column, used below as the row id.
housing_with_id = housing_data.reset_index()
# 0.2 is presumably the test-set ratio and 'index' the id column -- confirm against
# ch2.split_train_test_by_id, which is defined outside this notebook.
train_set, test_set = ch2.split_train_test_by_id(housing_with_id, 0.2, 'index')
# Display the shapes of the two resulting sets.
train_set.shape, test_set.shape

Let's do it the "canonical" way.

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state makes it reproducible across runs.
train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)

## Stratification


In [None]:
# Bucket the median income into categories: divide by 1.5 and round up.
housing_data['income_cat'] = np.ceil(housing_data['median_income'] / 1.5)
# Category sizes, ordered by category value.
housing_data['income_cat'].value_counts().sort_index()

We set the values of `income_cat` that are larger than 5.0 to 5.0. Note the peculiar way `pd.Series.where` works. When the condition is `True`, the value is retained. When it's `False`, the second argument is used instead. The opposite behaviour is obtained by `pd.Series.mask`.

In [None]:
# Cap the income category at 5.0: `where` keeps values for which the condition holds
# and substitutes 5.0 everywhere else.
# The condition must be built from `housing_data` itself (`housing` does not exist yet
# at this point in the notebook), and the result is assigned back explicitly because an
# in-place `where` on a column selection is chained assignment and is not guaranteed to
# modify the parent frame.
housing_data['income_cat'] = housing_data['income_cat'].where(
    housing_data['income_cat'] < 5.0, 5.0
)

The same result can be obtained by using `pd.cut`.

In [None]:
# pd.cut uses right-closed intervals by default, so the bins are (0, 1.5], (1.5, 3.0],
# ..., (6.0, inf) and map to the labels 1..5 -- the same buckets as ceil + cap above,
# but stored as a categorical column.
housing_data['income_cat'] = pd.cut(housing_data['median_income'], 
                                    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
                                    labels=[1, 2, 3, 4, 5])
# Category sizes (ordered by frequency, not by label).
housing_data['income_cat'].value_counts()

In the book, the class `StratifiedShuffleSplit` is used. Its `split` method yields the indices of the training and test sets. It turns out that the same result can be obtained more succinctly by using `train_test_split` with the option `stratify`.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split that preserves the income_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# `split.split` yields *positional* row indices, so the rows are selected with `.iloc`.
# `.loc` gives the same rows only while `housing_data` carries a default 0..n-1 index.
# With n_splits=1 there is exactly one (train, test) pair, hence `next` instead of a loop.
train_idx, test_idx = next(split.split(housing_data, housing_data['income_cat']))
strat_train_set = housing_data.iloc[train_idx]
strat_test_set = housing_data.iloc[test_idx]

In [None]:
# The same stratified 80/20 split in one call: `stratify` keeps the income_cat
# proportions in both parts, and the seed matches the StratifiedShuffleSplit above.
strat_train_set2, strat_test_set2 = train_test_split(housing_data, test_size=0.2,
                                                     random_state=42, 
                                                     stratify=housing_data['income_cat'])

We obtain exactly the same entries in the stratified test set.

In [None]:
# Both approaches must select the same test rows, in the same order.
print((strat_test_set.index == strat_test_set2.index).all())
# The duplicate split was only needed for this comparison.
del strat_train_set2
del strat_test_set2

The stratification seems to work fine.

In [None]:
# Income-category proportions in the training set, then in the test set.
print(
    *(subset['income_cat'].value_counts(normalize=True)
      for subset in (strat_train_set, strat_test_set)),
    sep='\n',
)

In [None]:
# income_cat was only needed for stratification, so remove it from both sets.
# Rebinding the names (instead of an in-place drop inside a loop) avoids mutating
# frames that were derived from `housing_data` and keeps the data lineage explicit.
strat_train_set = strat_train_set.drop(columns='income_cat')
strat_test_set = strat_test_set.drop(columns='income_cat')

Let's create a copy of the training set we can play with without risks.

In [None]:
# Work on a copy so that exploratory feature engineering leaves strat_train_set untouched.
housing = strat_train_set.copy()

In [None]:
# Geographic scatter plot: marker size encodes the district population (scaled down by
# 100) and marker colour encodes the median house value.
housing.plot.scatter(
    x='longitude',
    y='latitude',
    s=housing['population']/100,
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    alpha=0.1,
    label='population',
    figsize=(10, 8),
)
plt.legend()

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The string column `ocean_proximity` is excluded explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently and raises instead.
# `select_dtypes` works on old and new pandas alike.
corr_mat = housing.select_dtypes(include=[np.number]).corr()
# Correlation of every numeric attribute with the target.
corr_mat['median_house_value']

In [None]:
from pandas.plotting import scatter_matrix
# Restrict the pairwise plots to the target and three selected attributes.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
# hist_kwds is forwarded to the histograms drawn on the diagonal.
scatter_matrix(housing[attributes], alpha=0.1, figsize=(12, 10), hist_kwds={'bins': 50})
plt.show()

## Adding features

We add a couple of additional features to those shown in the book.

In [None]:
# Each new feature is the ratio of two existing columns: name -> (numerator, denominator).
for feature, (numerator, denominator) in {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
    'rooms_per_population': ('total_rooms', 'population'),
    'bedrooms_per_population': ('total_bedrooms', 'population'),
}.items():
    housing[feature] = housing[numerator] / housing[denominator]

In [None]:
# Correlation of every numeric attribute (including the new ratios) with the target.
# Non-numeric columns are excluded explicitly because `DataFrame.corr()` raises on the
# string column `ocean_proximity` with pandas >= 2.0.
housing.select_dtypes(include=[np.number]).corr()['median_house_value']

The `rooms_per_population` feature seems to have a good positive correlation with the median price. If, however, we plot these two variables, the picture is quite different.

In [None]:
# Scatter plot of the new ratio against the target; the low alpha makes dense regions
# stand out.
housing.plot(x='rooms_per_population', y='median_house_value', kind='scatter',
             alpha=0.1)
plt.show()

Turns out that taking the log of the house value doesn't make things better.

## Preparing the dataset for ML

In [None]:
from sklearn.base import TransformerMixin

# Separate the predictors from the target. `drop` returns a new frame, so
# strat_train_set itself is left untouched.
housing = strat_train_set.drop('median_house_value', axis=1)
# The labels must be read from strat_train_set: `housing` no longer contains the
# target column, so indexing it there raises a KeyError.
housing_labels = strat_train_set['median_house_value'].copy()

### Missing values

We use the `SimpleImputer` class, but it would be interesting to play with a KNN classifier.

In [None]:
from sklearn.impute import SimpleImputer

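We have missing values in `total_bedrooms`. The variables built on it were only added to the exploratory copy, so they are not part of the `housing` frame rebuilt from `strat_train_set` above.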

In [None]:
# Count the missing values per column before imputation.
housing.isnull().sum()

In [None]:
# The median is only defined for numbers, so the text column is left out before fitting.
housing_num = housing.drop(columns='ocean_proximity')
imputer = SimpleImputer(strategy='median')
# Learn the per-column medians.
imputer.fit(housing_num)

In [None]:
# `transform` returns a bare NumPy array, so both the column names and the original row
# labels have to be restored. Without `index=` the frame would get a fresh 0..n-1 index
# and no longer align with `housing`, `housing_cat` or `housing_labels`.
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

Note that we need to use the double bracket to produce a DataFrame, otherwise `OrdinalEncoder` will complain.

In [None]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series (1-D).
housing_cat = housing[['ocean_proximity']]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Learn the categories and replace each one with an integer code.
oenc = OrdinalEncoder()
housing_cat_encoded = oenc.fit_transform(housing_cat)

In [None]:
# Inspect the encoded category codes.
housing_cat_encoded

### One-hot encoding

Scikit-Learn's `OneHotEncoder` can now handle strings as inputs.

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One binary column per category.
ohe = OneHotEncoder()
# By default the result is a SciPy sparse matrix; call .toarray() for a dense array.
housing_cat_1hot = ohe.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# Category labels learned during fit, in the order of the one-hot columns.
ohe.categories_