# Content Hashing Utilities

> Shared cryptographic hashing primitives for content integrity verification

In [None]:
#| default_exp utils.hashing

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import hashlib
from pathlib import Path
from typing import Union

## hash_bytes

Computes a cryptographic hash of byte content, returning a self-describing string in `"algo:hexdigest"` format. This format embeds the algorithm name, making hashes forward-compatible if the algorithm changes.

In [None]:
#| export
def hash_bytes(
    content: bytes,  # Byte content to hash
    algo: str = "sha256"  # Hash algorithm name (e.g., "sha256", "sha3_256")
) -> str:  # Hash string in "algo:hexdigest" format
    """Hash `content` with `algo` and return an `"algo:hexdigest"` string.

    `algo` is passed to `hashlib.new` and is also used verbatim as the
    prefix of the returned string.
    """
    hasher = hashlib.new(algo, content)
    hex_digest = hasher.hexdigest()
    return f"{algo}:{hex_digest}"

In [None]:
# Hash a small known payload and show the self-describing result
payload = b"hello world"
result = hash_bytes(payload)
print(f"hash_bytes result: {result}")

# The prefix names the algorithm; a SHA-256 digest is 64 hex characters
algo, digest = result.split(":", 1)
assert (algo, len(digest)) == ("sha256", 64)

# Hashing the same bytes again reproduces the same string
assert hash_bytes(payload) == result

# Changing a single character changes the hash
assert hash_bytes(b"hello World") != result

print("hash_bytes tests passed")

hash_bytes result: sha256:b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9
hash_bytes tests passed


In [None]:
# Select a non-default algorithm through the `algo` keyword
sha512_result = hash_bytes(b"test", algo="sha512")
print(f"SHA-512 result: {sha512_result[:30]}...")

# The prefix reflects the requested algorithm; a SHA-512 digest is 128 hex characters
prefix, hex_part = sha512_result.split(":", 1)
assert prefix == "sha512"
assert len(hex_part) == 128

print("Custom algorithm test passed")

SHA-512 result: sha512:ee26b0dd4af7e749aa1a8ee...
Custom algorithm test passed


## hash_file

Stream-hashes a file without loading it entirely into memory. Uses chunked reads suitable for large files (audio, video, etc.).

In [None]:
#| export
def hash_file(
    path: Union[str, Path],  # Path to file to hash
    algo: str = "sha256",  # Hash algorithm name
    chunk_size: int = 8192  # Read chunk size in bytes
) -> str:  # Hash string in "algo:hexdigest" format
    """Stream-hash a file without loading it entirely into memory.

    The file is opened in binary mode and fed to the hasher `chunk_size`
    bytes at a time; the result is returned as `"algo:hexdigest"`.

    Raises `ValueError` if `chunk_size` is 0, in addition to whatever
    `hashlib.new` and `open` raise for a bad algorithm name or path.
    """
    # A zero-byte read returns b"" straight away, which looks exactly like
    # end-of-file: the loop would stop before reading anything and the
    # function would silently return the hash of empty input.
    if chunk_size == 0:
        raise ValueError("chunk_size must be a positive number of bytes, got 0")
    h = hashlib.new(algo)
    with open(path, 'rb') as f:
        # Call f.read(chunk_size) repeatedly until it returns b"" (end of file)
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return f"{algo}:{h.hexdigest()}"

In [None]:
import tempfile
import os

# Create a temp file with known content; delete=False keeps it on disk after
# the `with` block closes it, so hash_file can reopen it by name
with tempfile.NamedTemporaryFile(delete=False, mode='wb') as tmp:
    tmp.write(b"hello world")
    tmp_path = tmp.name

try:
    # Hash the file
    file_hash = hash_file(tmp_path)
    print(f"hash_file result: {file_hash}")

    # Should match hash_bytes of the same content
    assert file_hash == hash_bytes(b"hello world")

    # Test with Path object
    assert hash_file(Path(tmp_path)) == file_hash
finally:
    # Cleanup runs even when an assertion above fails, so the file never leaks
    os.unlink(tmp_path)

print("hash_file tests passed")

hash_file result: sha256:b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9
hash_file tests passed


## verify_hash

Verifies byte content against an expected hash string. Automatically extracts the algorithm from the `"algo:hexdigest"` format.

In [None]:
#| export
def verify_hash(
    content: bytes,  # Content to verify
    expected: str  # Expected hash in "algo:hexdigest" format
) -> bool:  # True if content matches expected hash
    """Verify content against an expected hash string.

    The algorithm name is the text before the first ":" in `expected`;
    `content` is re-hashed with it via `hash_bytes`. The hex digest is
    compared case-insensitively, so an uppercase digest still verifies.

    Raises `ValueError` if `expected` has no ":" separator.
    """
    algo, sep, digest = expected.partition(":")
    if not sep:
        # Fail with a clear message instead of an opaque tuple-unpacking error
        raise ValueError(
            f"expected hash must be in 'algo:hexdigest' format, got {expected!r}"
        )
    # hexdigest() output is lowercase, so normalise the expected digest to match;
    # the algorithm prefix is compared exactly as given
    return hash_bytes(content, algo) == f"{algo}:{digest.lower()}"

In [None]:
original = b"hello world"
h = hash_bytes(original)
h_sha512 = hash_bytes(original, algo="sha512")

# Each row: (content to check, hash to check against, expected verdict)
cases = [
    (original, h, True),               # matching content
    (b"hello World", h, False),        # modified content
    (original, h_sha512, True),        # works with different algorithms
    (b"tampered", h_sha512, False),
]
for candidate, expected_hash, should_match in cases:
    assert verify_hash(candidate, expected_hash) == should_match

print("verify_hash tests passed")

verify_hash tests passed


In [None]:
#| hide
# Run nbdev's export step for this notebook project
# (presumably writes the `#| export` cells to the `default_exp` module — confirm against nbdev docs)
import nbdev; nbdev.nbdev_export()