In [18]:
import gzip
import scipy.io
import scipy.sparse
import numpy as np
from pybloom_live import BloomFilter
import random
import pickle  # To serialize Bloom filters
import time

In [19]:
def load_matrix(file_prefix):
    with gzip.open(file_prefix + ".mtx.gz", "rt") as f:
        matrix = scipy.io.mmread(f).tocsc()  # fast column access
    return matrix

In [20]:
def binarize(matrix):
    """Convert counts to presence/absence (0 or 1)"""
    bin_matrix = matrix.copy()
    bin_matrix.data = np.ones_like(bin_matrix.data)
    return bin_matrix

def encode_gene_info(matrix):
    n_genes, n_cells = matrix.shape
    ref_idx = random.randint(0, n_cells - 1)
    reference = matrix[ref_idx ,:].toarray().flatten()

    bloom = BloomFilter(capacity=n_genes * 3, error_rate=0.01)

    for gene_id in range(n_genes):
        vec = matrix[gene_id, :].toarray().flatten()
        delta = reference - vec

        l2 = np.linalg.norm(delta, ord=2)
        linf = np.max(np.abs(delta))
        argmax = np.argmax(np.abs(delta))

        # Encode all 3 into Bloom filter as strings
        bloom.add(f"{l2}:{linf}:{argmax}")
    return bloom, reference

def save_bloom(bloom, reference_vector, out_file):
    encoded = {"reference": reference_vector, "filter": bloom}
    out_file += "_delta.pkl"
    with open(out_file, "wb") as f:
        pickle.dump(encoded, f)

In [21]:
# === RUNNING THE PIPELINE ===
files = ["human"]
for f in files:
    start = time.time()
    matrix = load_matrix(f)
    binary_matrix = binarize(matrix)
    bloom, reference = encode_gene_info(binary_matrix)
    save_bloom(bloom, reference, f)
    end = time.time()
    print(end - start)

352.69325709342957
