In [21]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it only if needed.

    The CSV is read from ``datasets/housing/housing.csv``. If that file is
    missing, it is extracted from ``datasets/housing.tgz``; if the tarball is
    missing too, it is downloaded first.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the download is needed and fails.
        tarfile.TarError: if the tarball cannot be read or extracted.
    """
    datasets_dir = Path("datasets")
    tarball_path = Path("datasets/housing.tgz")  # Path to the compressed dataset
    csv_path = Path("datasets/housing/housing.csv")  # Path to the extracted CSV

    # Key the cache check on the CSV, not the tarball: a tarball that was
    # downloaded but never extracted must still be extracted.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)

            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            # Download to a temporary name and rename on success, so an
            # interrupted download is never mistaken for a complete tarball.
            partial_path = tarball_path.with_name(tarball_path.name + ".part")
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(tarball_path)

        with tarfile.open(tarball_path) as housing_tarball:
            if hasattr(tarfile, "data_filter"):
                # Reject absolute paths, "..", and special files in the archive.
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                # Older Python without extraction filters: unfiltered extract.
                housing_tarball.extractall(path="datasets")

    return pd.read_csv(csv_path)

# Load the full dataset; the next cell adds a column to this frame and then
# rebinds `housing` to a copy of the training split.
housing = load_housing_data()

In [22]:
from sklearn.model_selection import train_test_split

# Bucket median income into five bands labelled 1-5 (edges at 1.5, 3.0, 4.5
# and 6.0) so the split below can be stratified on income.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Hold out 20% as a test set, stratified on the income band; the fixed seed
# keeps the split reproducible across runs.
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42
)

# From here on `housing` is a copy of the training split, so later changes
# do not touch `strat_train_set` itself.
housing = strat_train_set.copy()

---

### Cleaning the Data

**Three Options**:
1. **Get rid of the corresponding district**
```python
housing.dropna(subset=["total_bedrooms"], inplace=True) # option 1
```
2. **Get rid of the whole attribute**
```python
housing.drop("total_bedrooms", axis=1) # option 2
```
3. **Set the missing value to some value (zero, the mean, the median, etc.). This is called *Imputation***
```python
median = housing["total_bedrooms"].median()  # option 3
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)
```

### Option 3 SimpleImputer Scikit-Learn Implementation

In [23]:
from sklearn.impute import SimpleImputer

# A median only exists for numeric columns, so restrict the frame first.
housing_num = housing.select_dtypes(include=[np.number])

# Median strategy: fit() computes one median per column of housing_num.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

* The imputer has simply computed the median of each attribute and stored the result in its `statistics_` instance variable

In [24]:
imputer.statistics_  # medians the imputer computed in fit(), one per column of housing_num

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1250e+03,  4.3400e+02,
        1.1670e+03,  4.0800e+02,  3.5385e+00,  1.7920e+05])

In [25]:
housing_num.median().values  # cross-check: pandas' own per-column medians should equal imputer.statistics_

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1250e+03,  4.3400e+02,
        1.1670e+03,  4.0800e+02,  3.5385e+00,  1.7920e+05])

* Now we use this "trained" Imputer to transform the training set by replacing missing values with the learned median

In [26]:
# Fill every missing entry with the median the imputer learned for its column.
X = imputer.transform(housing_num)

# Wrap the transformed values in a DataFrame that carries the original
# column names and row index.
housing_tr = pd.DataFrame(
    X,
    columns=housing_num.columns,
    index=housing_num.index,
)

## Imputing Missing Values with SimpleImputer

1. **Initialize** the `SimpleImputer` with `strategy="median"`
2. **Fit** on numerical data to compute medians
3. **Transform** the data to fill missing values
4. **Convert** the result back to a DataFrame

This preserves the dataset structure while handling missing values systematically.