In [None]:
from get_data import fetch_housing_data
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix

## Downloading and loading the data

In [None]:
# Where the archive lives upstream, and the local directory it is fetched into.
housing_archive_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
housing_data_dir = "/home/jupyter/hands-on-ml/data"

# fetch_housing_data is the helper imported from the local get_data module.
# NOTE(review): housing_data_dir is a machine-specific absolute path.
fetch_housing_data(housing_url=housing_archive_url, housing_path=housing_data_dir)

In [None]:
# Load the housing CSV into a DataFrame. The file sits under the directory
# that was passed to fetch_housing_data as housing_path.
housing = pd.read_csv("/home/jupyter/hands-on-ml/data/housing.csv")

## Quick overview

In [None]:
# Show the first rows (head() defaults to five) to get a feel for the columns.
housing.head()

In [None]:
# Print each column's dtype and non-null count; a quick way to spot missing values.
housing.info()

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
housing.describe()

In [None]:
# One 50-bin histogram per numeric column. The trailing semicolon keeps the
# returned array of Axes objects from being echoed above the figure.
housing.hist(bins=50, figsize=(20,15));

## Create a Test Set

In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Rows are selected by position (``iloc``), so the
        frame's index labels do not matter.
    test_ratio : float
        Fraction of rows placed in the test set. The test set holds
        ``int(len(data) * test_ratio)`` rows, i.e. the size is truncated.
    seed : int, optional
        Seed for the private random generator. The default of 42 yields the
        same permutation as ``np.random.seed(42)`` followed by
        ``np.random.permutation``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, in that order.
    """
    # A private RandomState keeps the split reproducible without reseeding
    # NumPy's global generator, which would silently change the output of
    # every later np.random call in the notebook.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set using the hand-written splitter.
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# Report the (rows, columns) shape of each split, training set first.
for subset in (train_set, test_set):
    print(subset.shape)

Scikit-Learn provides a few functions to split datasets into multiple subsets in various ways. Here we use `train_test_split`:

In [None]:
# Same 80/20 split done by scikit-learn; random_state pins the shuffle so
# reruns give identical sets. This rebinds train_set / test_set.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Summary statistics of the training portion, for comparison with the full data.
train_set.describe()

## Stratified sampling

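We are going to stratify the dataset on income: `median_income` is first binned into five categories, and the split is then made so that each category keeps the same proportion in the training and test sets.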

In [None]:
# Bucket median_income into five categories, labelled 1 to 5, for stratifying.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Distribution of rows across the income categories. The trailing semicolon
# hides the Axes repr that would otherwise be printed above the plot.
housing.income_cat.hist();

In [None]:
# Single 80/20 split that preserves the proportion of each income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields arrays of positional row numbers, so rows are picked with
# .iloc. Using .loc would treat them as index labels, which is only correct
# while housing keeps a default 0..n-1 index.
# n_splits=1, so next() takes the only pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_data = housing.iloc[train_index]
strat_test_data = housing.iloc[test_index]

In [None]:
# Share of the stratified test set that falls into each income category.
strat_test_data["income_cat"].value_counts()/len(strat_test_data)

In [None]:
# income_cat was only needed to drive the stratified split, so remove it from
# both sets. Rebinding instead of inplace=True, together with
# errors="ignore", makes the cell safe to re-run: a second run is a no-op
# rather than a KeyError.
strat_train_data = strat_train_data.drop(columns="income_cat", errors="ignore")
strat_test_data = strat_test_data.drop(columns="income_cat", errors="ignore")

## Looking for correlations

Let's focus on a few promising attributes that seem most correlated with the median housing value.

In [None]:
# Pairwise scatter plots of the target against three candidate predictors.
attributes = ['median_house_value','median_income', 'total_rooms', 'housing_median_age']
# The trailing semicolon suppresses the array-of-Axes repr returned by scatter_matrix.
scatter_matrix(housing[attributes], figsize=(12, 8));

## Experimenting with Attribute Combinations

The total number of rooms in a district is not very useful if we don't know how many households there are. What we really want is the number of rooms per household. Similarly, the total number of bedrooms by itself is not very useful: we want to compare it to the number of rooms. The population per household also seems like an interesting attribute combination to look at. Let's create these new attributes.

In [None]:
# Derived ratio features: new column -> (numerator column, denominator column).
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for new_col, (numerator, denominator) in ratio_features.items():
    housing[new_col] = housing[numerator] / housing[denominator]

In [None]:
# Correlate numeric columns only. housing also holds non-numeric columns
# (ocean_proximity, and the categorical income_cat created by pd.cut);
# pandas >= 2.0 raises on those instead of silently skipping them, so they
# are filtered out explicitly. select_dtypes works on old and new pandas.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
# Rank every attribute by its correlation with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)