
# Get the Data

## Download the Data

In [None]:
import pathlib
import requests
import tarfile

import pandas as pd


def download_data(url, data_dir):
    """Download the archive at ``url`` and save it as ``data_dir / "housing.tgz"``.

    Args:
        url: Address of the archive to fetch.
        data_dir: Directory (a ``pathlib.Path`` or anything supporting ``/``)
            that receives the file. It is not created here.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: If the request fails or times out.
    """
    # Fetch first: opening the target file before the request would leave an
    # empty archive behind whenever the download fails.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an HTML error page as the archive.
    response.raise_for_status()
    with open(data_dir / "housing.tgz", 'wb') as f:
        f.write(response.content)


def extract_data(data_dir):
    """Unpack ``data_dir / "housing.tgz"`` into ``data_dir``.

    Args:
        data_dir: Directory (a ``pathlib.Path`` or anything supporting ``/``)
            that holds the archive and receives its contents.

    Raises:
        tarfile.TarError: If the archive cannot be read, or if a member is
            rejected by the extraction filter.
    """
    with tarfile.open(data_dir / "housing.tgz") as tgz:
        # The archive comes from the network, so refuse members that would
        # escape ``data_dir`` (absolute paths, "..", unsafe links). The "data"
        # filter only exists on interpreters that ship extraction filters.
        if hasattr(tarfile, "data_filter"):
            tgz.extractall(path=data_dir, filter="data")
        else:
            tgz.extractall(path=data_dir)



# Remote archive with the housing dataset, and the local folder it is unpacked into.
url = "https://github.com/ageron/data/raw/main/housing.tgz"
data_dir = pathlib.Path("./sample_data")
data_dir.mkdir(exist_ok=True, parents=True)

# Fetch the tarball into data_dir, then unpack it in the same place.
download_data(url=url, data_dir=data_dir)
extract_data(data_dir=data_dir)

# The archive is expected to contain housing/housing.csv.
housing_df = pd.read_csv(data_dir.joinpath("housing", "housing.csv"))

## Take a Quick Look at the Data Structure

In [None]:
# Preview the first few rows to see which columns the dataset has.
housing_df.head()

In [None]:
# Print column dtypes and non-null counts; gaps in the counts reveal missing values.
housing_df.info()

In [None]:
# Count how many rows fall under each ocean_proximity value.
housing_df["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing_df.describe()

In [None]:
import matplotlib.pyplot as plt


# Draw a 50-bin histogram for each numeric column. The return value is bound
# to `_` so the notebook does not echo the array of axes.
_ = housing_df.hist(bins=50, figsize=(12, 8))

## Create a Test Set

### Naive Sampling

In [None]:
from sklearn import model_selection


# Purely random split: 20% of the rows are held out as the test set.
# The fixed random_state makes the shuffle reproducible across runs.
train_df, test_df = model_selection.train_test_split(
    housing_df,
    test_size=0.2,
    random_state=42
)

In [None]:
# Check the size and columns of the training split.
train_df.info()

In [None]:
# Check the size and columns of the test split.
test_df.info()

### Stratified Sampling

In [None]:
import numpy as np


# Bucket median_income into five ordered categories labelled 0-4, so that the
# split further down can be stratified on them. The last bin is open-ended.
housing_df["income_cat"] = pd.cut(
    housing_df.loc[:, "median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=list(range(5)),
)

In [None]:
# Bar chart of how many rows fall into each income category, in category order.
_ = (
    housing_df["income_cat"]
    .value_counts()
    .sort_index()
    .plot(kind="bar", rot=0, grid=True)
)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
plt.show()

In [None]:
# Redo the 80/20 split, this time stratified on income_cat so that both parts
# keep roughly the same share of each income category.
train_df, test_df = model_selection.train_test_split(
    housing_df,
    stratify=housing_df["income_cat"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Share of each income category in the training split.
train_df["income_cat"].value_counts(normalize=True)

In [None]:
# Share of each income category in the test split, for comparison with the training split.
test_df["income_cat"].value_counts(normalize=True)

## Save Train and Test Sets

In [None]:
# The helper column has done its job for the stratified split; remove it from
# both splits in place before saving.
train_df.drop(columns="income_cat", inplace=True)
test_df.drop(columns="income_cat", inplace=True)

In [None]:
# Write both splits next to the extracted data. With to_csv's defaults the row
# index is written as the first column.
train_df.to_csv(data_dir.joinpath("housing", "train.csv"))
test_df.to_csv(data_dir.joinpath("housing", "test.csv"))