In [24]:
import os
import tarfile
from urllib.request import urlretrieve
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook


In [25]:
# Base URL of the repository that hosts the data set archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Path of the housing data relative to DOWNLOAD_ROOT.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed archive (DOWNLOAD_ROOT already ends with "/").
HOUSING_URL = "".join((DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"))

In [26]:
# retrieving the data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded archive and its
        extracted contents. It is created if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network, so refuse members that
            # would escape housing_path (absolute paths, "..", bad links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # NOTE(review): this interpreter has no extraction filters, so
            # members are extracted unchecked; only use trusted archives.
            housing_tgz.extractall(path=housing_path)

# Returning a Pandas dataframe
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [27]:
# Download the archive and extract it into the default local directory.
fetch_housing_data()

In [28]:
# Read the extracted CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [29]:
# Column dtypes and non-null counts; a lower count flags missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [30]:
# Number of rows in each ocean_proximity category.
housing.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [31]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [32]:
# One 50-bin histogram per numeric column; the trailing ';' hides the return value's repr.
housing.hist(bins=50);

<IPython.core.display.Javascript object>

# Separating the Data
Once we have the data, it is best to split it into a training set and a test set before exploring it any further.

In [42]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np

In [124]:
# A first attempt: purely random 80/20 split (fixed seed for reproducibility)
train_set1, test_set1 = train_test_split(housing, test_size=0.2, random_state=42)

The histogram of the median income is right-skewed. Because the data set is not very large, a purely random split may not preserve this distribution, i.e. the split may be biased.

In [38]:
# Distribution of median_income in the randomly split training set.
train_set1.median_income.hist(bins=50);

<IPython.core.display.Javascript object>

In [125]:
# A second attempt: bucket median_income into categories for stratification.
# Dividing by 1.5 and rounding up yields the raw category; every category of
# 5 or above is merged into 5.0.
income_cat = np.ceil(housing["median_income"] / 1.5)
# Assign the capped result back explicitly instead of calling
# `.where(..., inplace=True)` on `housing["income_cat"]`: that chained
# in-place update does not modify `housing` under pandas Copy-on-Write.
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

In [86]:
# Initializing the Stratified Shuffle Split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator's first item is the only (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional indices, so select rows by position (.iloc),
# not by label (.loc); the two only coincide for a default RangeIndex.
train_set2 = housing.iloc[train_index]
test_set2 = housing.iloc[test_index]

In [89]:
# Distribution of the income categories in the stratified training set.
train_set2.income_cat.hist();

<IPython.core.display.Javascript object>

In [103]:
def segregation(df, column):
    """Return the share of rows that fall into each distinct value of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the column whose values are counted.

    Returns
    -------
    dict
        Maps each distinct value of ``df[column]`` to its count divided by
        the total of all counts.
    """
    counts = df[column].value_counts()
    total = counts.sum()
    shares = counts / total
    return shares.to_dict()

In [117]:
# Share of each income category in the full data set and in both training sets.
# The rows of the random split are looked up in `housing` by index label, so
# the comparison also works when `train_set1` was created before the
# "income_cat" column was added (as happens on a top-to-bottom run).
segregations = pd.DataFrame({
    "overall": segregation(housing, "income_cat"),
    "Random": segregation(housing.loc[train_set1.index], "income_cat"),
    "Stratified": segregation(train_set2, "income_cat")
})

In [130]:
segregations

Unnamed: 0,Random,Stratified,overall
1.0,0.039729,0.03985,0.039826
2.0,0.317466,0.318859,0.318847
3.0,0.348595,0.350594,0.350581
4.0,0.178537,0.176296,0.176308
5.0,0.115673,0.114402,0.114438
