### Creating the Training and Test Sets

In [1]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Fetch the housing dataset if necessary and load it as a DataFrame.

    Every step is skipped when its result is already on disk:

    * if ``datasets/housing/housing.csv`` exists it is read directly;
    * otherwise ``datasets/housing.tgz`` is downloaded only when it is
      missing, and then extracted into ``datasets``.

    Checking for the CSV (rather than only for the tarball) means a tarball
    that was downloaded but never extracted no longer leaves the function
    permanently failing in ``read_csv``.

    Returns:
        pandas.DataFrame: the contents of ``datasets/housing/housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = Path("datasets/housing.tgz")  # Path to the compressed dataset
    csv_path = Path("datasets/housing/housing.csv")  # File produced by extracting the tarball

    if not csv_path.is_file():
        if not tarball_path.is_file():  # Download only when the archive is not cached locally
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)

        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: use the safe "data" extraction
            # filter when this Python version provides it (blocks absolute paths,
            # path traversal and special files); fall back to the old call otherwise.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")

    return pd.read_csv(csv_path)

# Load the housing data into a DataFrame (the archive is downloaded and extracted on first use)
housing = load_housing_data()


### Naive Random Split

In [3]:
import numpy as np

def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: DataFrame to split (must support ``len`` and ``.iloc``).
        test_ratio: Fraction of rows to put in the test set, between 0 and 1
            inclusive. The test set holds ``int(len(data) * test_ratio)`` rows
            (rounded down); the remaining rows form the training set.
        random_state: Optional seed for the shuffle. When given, the split is
            reproducible and NumPy's global random state is left untouched.
            When ``None`` (the default) the global NumPy RNG is used, so the
            split changes between runs unless ``np.random.seed`` was called.

    Returns:
        tuple: ``(train_set, test_set)`` selected from ``data`` by position.

    Raises:
        ValueError: If ``test_ratio`` is not within ``[0, 1]``. Without this
            check a negative ratio would slice from the end of the shuffled
            indices and silently return a wrongly sized split.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)  # legacy global RNG (original behaviour)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]   # first chunk of the shuffle -> test set
    train_indices = shuffled_indices[test_set_size:]  # everything else -> training set
    return data.iloc[train_indices], data.iloc[test_indices]

# Seed NumPy's global RNG so this random split selects the same rows on every
# Restart & Run All; without a seed the test set changes from run to run.
np.random.seed(42)
train_set, test_set = shuffle_and_split_data(housing, 0.2)  # 20% of the rows go to the test set

In [6]:
# Number of rows left in the training set by the random split
len(train_set)

16512

In [5]:
# Number of rows placed in the test set by the random split: int(len(housing) * 0.2)
len(test_set)

4128

### Stable or Deterministic Split via Hashing

In [9]:
from zlib import crc32  # Import the crc32 hash function from the zlib module (used for stable hashing)
import numpy as np      # Needed for converting to int64

def is_id_in_test_set(identifier, test_ratio):
    """Decide, deterministically, whether ``identifier`` belongs to the test set.

    The identifier is converted to a 64-bit integer and hashed with CRC-32.
    The row is assigned to the test set when its hash falls in the lowest
    ``test_ratio`` fraction of the 32-bit hash range.

    Note that the ``np.int64`` conversion drops any fractional part of a
    float identifier before hashing.

    Args:
        identifier: Numeric id of the row.
        test_ratio: Fraction of the hash range assigned to the test set.

    Returns:
        bool: True if the id hashes into the test set, False otherwise.
    """
    id_as_int64 = np.int64(identifier)
    checksum = crc32(id_as_int64)   # unsigned 32-bit value, identical on every run
    cutoff = test_ratio * 2**32     # upper bound of the test-set slice of the hash range
    return checksum < cutoff

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into stable train/test sets based on a hash of each row's id.

    Each value in ``data[id_column]`` is passed to ``is_id_in_test_set``; rows
    for which it returns True form the test set and all other rows form the
    training set. The assignment of a row depends only on its id and
    ``test_ratio``, not on row order or on which other rows are present.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of the hash range assigned to the test set.
        id_column: Name of the column holding the row identifiers.

    Returns:
        tuple: ``(train_set, test_set)``.
    """
    def _belongs_to_test(row_id):
        return is_id_in_test_set(row_id, test_ratio)

    # Boolean Series aligned with ``data``: True marks a test-set row.
    test_mask = data[id_column].apply(_belongs_to_test)

    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [12]:
# The dataset has no built-in unique identifier, so derive an 'id' for each row from its
# location: longitude * 1000 + latitude. Location is a stable feature, so existing rows keep
# the same id (and therefore the same split) when new rows are added to the dataset.
#
# `housing_with_id` is built here as a new DataFrame (it was not defined anywhere before this
# cell), which leaves `housing` unmodified and makes the cell safe to re-run.
#
# NOTE: is_id_in_test_set converts the id to int64 before hashing, so the fractional part of
# the id is dropped; rows whose ids share the same integer part always land in the same split.
housing_with_id = housing.assign(id=housing["longitude"] * 1000 + housing["latitude"])

# Deterministic split: a row goes to the test set when the hash of its 'id' falls below the
# 20% threshold. The test set is therefore only approximately 20% of the rows, but the
# assignment of each row stays the same across runs and dataset updates.
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "id")

In [13]:
# Inspect the generated location-based ids (longitude * 1000 + latitude)
housing_with_id["id"]

0       -122192.12
1       -122182.14
2       -122202.15
3       -122212.15
4       -122212.15
           ...    
20635   -121050.52
20636   -121170.51
20637   -121180.57
20638   -121280.57
20639   -121200.63
Name: id, Length: 20640, dtype: float64

In [14]:
# Number of rows in the training set after the hash-based split
len(train_set)

16322

In [15]:
# Number of rows in the test set after the hash-based split (only approximately 20% of the data)
len(test_set)

4318