In [1]:
# extra code – fetches the California housing dataset (the train/validation/test
#              split is done in a later cell)

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the dataset object; later cells read housing.data, housing.target and
# housing.feature_names from it.
housing = fetch_california_housing()


In [6]:
import tensorflow as tf

# NOTE(review): the return value of this call is not bound to any name, so the
# cell only displays the loaded arrays as its output. None of the other visible
# cells reuse it (the splits are built from scikit-learn's `housing` object).
# Judging by the recorded output, the result is a pair of (features, targets)
# tuples, presumably ((x_train, y_train), (x_test, y_test)) – confirm against
# the Keras documentation. Consider unpacking it and displaying shapes instead
# of dumping whole arrays.
tf.keras.datasets.california_housing.load_data(
    version='large',
    path='california_housing.npz',
    test_split=0.2,
    seed=113
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/california_housing.npz
[1m743530/743530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


((array([[-118.27  ,   34.09  ,   52.    , ..., 1048.    ,  491.    ,
             3.7847],
         [-118.36  ,   33.96  ,   21.    , ..., 1286.    ,  557.    ,
             2.7284],
         [-122.39  ,   37.76  ,   52.    , ...,  712.    ,  398.    ,
             3.9722],
         ...,
         [-122.34  ,   37.57  ,   52.    , ...,  876.    ,  359.    ,
             8.2598],
         [-122.18  ,   37.89  ,   18.    , ..., 1634.    ,  734.    ,
             8.1489],
         [-118.43  ,   34.2   ,   29.    , ..., 1942.    ,  679.    ,
             3.1118]], dtype=float32),
  array([252300., 146900., 290900., ..., 500001., 499000., 238100.],
        dtype=float32)),
 (array([[-118.36  ,   34.08  ,   45.    , ..., 1265.    ,  455.    ,
             3.3864],
         [-120.2   ,   34.63  ,   14.    , ..., 1487.    ,  488.    ,
             4.4519],
         [-121.21  ,   37.81  ,    8.    , ...,  999.    ,  301.    ,
             5.193 ],
         ...,
         [-121.29  ,   37.97  ,  

In [3]:
# First split: hold out a test set from the full dataset. The target is
# reshaped into a single column (-1, 1); it is later stacked column-wise next
# to the features with np.c_ when the CSV data is assembled.
# No test_size is passed, so scikit-learn's default split proportion applies.
# random_state=42 pins the shuffling so the split is the same on every run.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
# Second split: carve a validation set out of the remaining training data,
# again with the default proportion and the same seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)


In [4]:
import numpy as np
from pathlib import Path

def save_to_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split the rows of `data` across `n_parts` CSV files and return their paths.

    Files are written to ``datasets/housing`` relative to the current working
    directory (created if missing) and are named ``my_{name_prefix}_{NN}.csv``,
    where ``NN`` is the zero-padded part index. Rows keep their original order
    and are assigned to parts in contiguous, near-equal chunks
    (``np.array_split`` semantics), so some parts may be empty when there are
    fewer rows than parts.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D NumPy array); each row is
            iterated to produce the fields of one CSV line.
        name_prefix: Label inserted into every file name (e.g. ``"train"``).
        header: Optional header line written at the top of every part, given
            without its trailing newline. ``None`` writes no header.
        n_parts: Number of files to create.

    Returns:
        list[str]: Paths of the files written, in part order.
    """
    housing_dir = Path() / "datasets" / "housing"
    housing_dir.mkdir(parents=True, exist_ok=True)
    filename_format = "my_{}_{:02d}.csv"

    def format_field(value):
        # Since NumPy 2.0, repr() of a NumPy scalar includes its type, e.g.
        # "np.float64(3.0)", which would corrupt the CSV. str() yields the
        # plain shortest round-trip text on both NumPy 1.x and 2.x.
        # Non-NumPy values keep the original repr() formatting.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    filepaths = []
    m = len(data)
    # Contiguous index chunks, one per output file.
    chunks = np.array_split(np.arange(m), n_parts)
    for file_idx, row_indices in enumerate(chunks):
        part_csv = housing_dir / filename_format.format(name_prefix, file_idx)
        filepaths.append(str(part_csv))
        with open(part_csv, "w") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(format_field(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

# Header row: every feature name followed by the target column's name.
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

# Append the (single-column) target to the right of each split's features.
train_data = np.concatenate((X_train, y_train), axis=1)
valid_data = np.concatenate((X_valid, y_valid), axis=1)
test_data = np.concatenate((X_test, y_test), axis=1)

# Write each split as a set of CSV parts: 20 for training, 10 each for
# validation and test.
train_filepaths = save_to_csv_files(
    train_data, "train", header=header, n_parts=20)
valid_filepaths = save_to_csv_files(
    valid_data, "valid", header=header, n_parts=10)
test_filepaths = save_to_csv_files(
    test_data, "test", header=header, n_parts=10)


In [5]:
# Directory holding the generated CSV parts, relative to the working directory.
housing_dir = Path("datasets", "housing")
housing_dir

PosixPath('datasets/housing')