# Exploratory Data Analysis

## Download the Data

In [None]:
import pathlib
import requests
import tarfile

import numpy as np
import pandas as pd
from sklearn import model_selection


def download_data(url, data_dir, timeout=60):
    """Download the archive at *url* and save it as ``data_dir / "housing.tgz"``.

    The request is completed and its status checked *before* the destination
    file is opened, so a failed download never leaves an empty or truncated
    archive behind (nor an HTML error page saved under the ``.tgz`` name).

    Args:
        url: HTTP(S) location of the archive.
        data_dir: Existing directory (a ``pathlib.Path``) to save into.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(data_dir / "housing.tgz", "wb") as f:
        f.write(response.content)


def extract_data(data_dir):
    """Unpack ``data_dir / "housing.tgz"`` into *data_dir*.

    The archive comes from the network, so it is extracted with tarfile's
    ``"data"`` filter whenever the running Python provides it. That filter
    refuses members that would be written outside *data_dir* (absolute paths,
    ``..`` components, links pointing elsewhere) and special files such as
    devices.

    Args:
        data_dir: Directory (a ``pathlib.Path``) that holds ``housing.tgz``
            and receives the extracted files.

    Raises:
        tarfile.TarError: If the archive is unreadable or, with the filter
            active, contains a member the filter rejects.
        FileNotFoundError: If ``housing.tgz`` is missing.
    """
    # ``tarfile.data_filter`` is the documented feature probe for extraction
    # filters; interpreters without it do not accept the ``filter`` argument.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(data_dir / "housing.tgz") as tgz:
        tgz.extractall(path=data_dir, **extract_kwargs)


# load the data
url = "https://github.com/ageron/data/raw/main/housing.tgz"
data_dir = pathlib.Path("./sample_data")
data_dir.mkdir(parents=True, exist_ok=True)
csv_path = data_dir / "housing" / "housing.csv"

# Only hit the network when the extracted CSV is not already on disk, so
# re-running this cell is fast and keeps working offline.
if not csv_path.is_file():
    download_data(url, data_dir)
    extract_data(data_dir)
housing_df = pd.read_csv(csv_path)

# stratified sampling to match the income distribution
# Bucket median income into five categories (labels 0-4); the last bin is
# open-ended so every income above 6.0 falls into category 4.
housing_df["income_cat"] = pd.cut(
    housing_df["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[0, 1, 2, 3, 4]
)

# 80/20 split that keeps the income-category proportions in both parts;
# the fixed random_state makes the split reproducible.
train_df, test_df = model_selection.train_test_split(
    housing_df,
    test_size=0.2,
    stratify=housing_df.loc[:, "income_cat"],
    random_state=42
)

# "income_cat" was only needed for stratification. Reassign the result of
# drop() rather than mutating in place: both frames are derived from
# housing_df, and in-place edits on such frames can raise
# SettingWithCopyWarning on some pandas versions.
train_df = train_df.drop(columns="income_cat")
test_df = test_df.drop(columns="income_cat")

In [None]:
# Print a summary of the training split: columns, non-null counts and dtypes.
train_df.info()

In [None]:
# Print the same summary for the test split, for comparison with the training split.
test_df.info()

## Visualizing Geographical Data

In [None]:
import matplotlib.pyplot as plt


# One point per training row, positioned by its longitude/latitude.
train_df.plot.scatter(x="longitude", y="latitude", grid=True)
plt.show()

In [None]:
# Same map with translucent markers so that dense areas show up darker.
train_df.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.2,
    grid=True,
)
plt.show()

In [None]:
# Geographic scatter where marker size encodes population (scaled down by
# 100) and marker colour encodes the median house value.
train_df.plot.scatter(
    x="longitude",
    y="latitude",
    s=train_df["population"].div(100),
    c="median_house_value",
    label="population",
    legend=True,
    colorbar=True,
    grid=True,
    sharex=False,
    figsize=(10, 7),
)
plt.show()

## Looking for Correlations

In [None]:
# Correlation of every numeric column with the median house value,
# listed from the most positive to the most negative.
train_df.corr(numeric_only=True)["median_house_value"].sort_values(
    ascending=False
)

In [None]:
# Columns to compare pairwise in the scatter matrix below.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
# The returned grid of axes is not needed; bind it to `_` so the notebook
# does not echo it.
_ = pd.plotting.scatter_matrix(train_df[attributes], figsize=(12, 8))
plt.show()

In [None]:
# Median income against median house value; the low alpha keeps overlapping
# points readable.
train_df.plot.scatter(
    x="median_income", y="median_house_value", alpha=0.1, grid=True
)
plt.show()

## Experimenting with Attribute Combinations

In [None]:
# Derived ratio features, each built as numerator column / denominator column.
new_features = {
    feature_name: train_df[numerator] / train_df[denominator]
    for feature_name, numerator, denominator in (
        ("rooms_per_house", "total_rooms", "households"),
        ("bedrooms_ratio", "total_bedrooms", "total_rooms"),
        ("people_per_house", "population", "households"),
    )
}

In [None]:
# Re-rank the correlations with the median house value after adding the
# derived ratio columns (train_df itself is left unchanged by assign()).
train_df.assign(**new_features).corr(numeric_only=True)[
    "median_house_value"
].sort_values(ascending=False)