# A01 — EDA Warm-Up (California Housing)

**What to do (commit after each ✔):**
1. ✔ Run all cells once. Verify files appear in `A01-eda/data/` and `A01-eda/figs/`.
2. ✔ Change **one thing** (e.g., the histogram `bins` value or the numeric column being plotted), then re-run the affected cell(s).
3. ✔ Commit & push. Add a 2–3 sentence note in `A01-eda/README.md` about what changed and what you observed.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Find the repo root: the nearest folder, starting at the current working
# directory and walking up through its parents, that contains `.git`.
# If no such folder exists, fall back to the current working directory
# (not the filesystem root), so outputs are never written outside the project.
_cwd = Path.cwd()
REPO = next(
    (folder for folder in (_cwd, *_cwd.parents) if (folder / ".git").exists()),
    _cwd,
)

# A01 root and output folders (created on first run; safe to re-run).
A01 = REPO / "A01-eda"
DATA = A01 / "data"
FIGS = A01 / "figs"
DATA.mkdir(parents=True, exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong root is obvious immediately.
print("Repo root:", REPO)
print("A01 dir  :", A01)
print("Data dir :", DATA)
print("Figs dir :", FIGS)

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset as a pandas-backed bunch.
cal = fetch_california_housing(as_frame=True)

# Rename the target column to a name that states its unit, then add a
# second column holding the same values multiplied by 100,000.
df = cal.frame.rename(columns={"MedHouseVal": "MedHomeVal_100k"})
df["MedHomeVal_$"] = 100_000 * df["MedHomeVal_100k"]

print("Shape:", df.shape)
df.head()

In [None]:
# Descriptive statistics for every column, one row per column, saved to CSV.
summary = df.describe(include="all").transpose()
summary.to_csv(DATA.joinpath("summary.csv"))

# Missing-value count per column, largest first, saved to CSV.
nulls = df.isnull().sum().sort_values(ascending=False)
nulls.to_csv(DATA.joinpath("null_counts.csv"))

# Preview the first ten rows of the summary table.
summary.head(10)

In [None]:
# Pick the column to plot and the bin count; edit either value and re-run
# this cell to regenerate the figure.
col = "MedInc"  # any other numeric column from df.columns also works
bins = 30       # e.g. 50

# Stop early with a readable message if the chosen column cannot be plotted.
assert col in df.columns, f"{col} not in dataframe columns"
assert pd.api.types.is_numeric_dtype(df[col]), f"{col} is not numeric"

# The file name encodes both choices, so each (col, bins) pair gets its own PNG.
out = FIGS / f"hist_{col}_{bins}.png"

plt.figure()
axes = df[col].hist(bins=bins)
axes.set_title(f"Histogram of {col} (bins={bins})")
axes.set_xlabel(col)
axes.set_ylabel("count")
plt.tight_layout()
plt.savefig(out, dpi=150)
out

In [None]:
# Optional: scatter of income vs. home value. Runs only when both columns
# exist in df; otherwise prints a notice and skips the plot.
if {"MedInc", "MedHomeVal_$"}.issubset(df.columns):
    plt.figure()
    # Small, semi-transparent markers (s=5, alpha=0.5).
    plt.scatter(df["MedInc"], df["MedHomeVal_$"], s=5, alpha=0.5)
    plt.xlabel("Median Income (10k units in original)")
    plt.ylabel("Median Home Value ($)")
    plt.title("Income vs Home Value")
    plt.tight_layout()
    out2 = FIGS / "scatter_income_value.png"
    plt.savefig(out2, dpi=150)
    # Report the path explicitly: a bare `out2` inside this `if` body would
    # not be echoed, because notebooks only display the last top-level
    # expression of a cell.
    print("Saved:", out2)
else:
    print("Columns for scatter not found; skipping.")

## Next steps
- Change `bins` or pick a different numeric `col` above, re-run the cell, and ensure a new PNG appears in `A01-eda/figs/`.
- Commit & push your changes with a meaningful message.
- Open an Issue titled **“A01 ready”** and paste your latest commit hash (`git log --oneline -1`).