In [2]:
import numpy as np
from scipy import signal
import scipy.stats

def get_gdf_kernel(sigma, n_sigma=3):
    """
    Returns normalized Gaussian Derivative to be used as the GDF kernel.
    :param sigma: width of the Gaussian in time samples
    :param n_sigma: number of sigmas to include in the kernel
    :return: GDF kernel as a np.array
    """
    x = np.arange(-sigma * n_sigma, sigma * n_sigma + 1)
    gaus = scipy.stats.norm(scale=sigma).pdf(x)
    kernel = np.diff(gaus)
    step_function = -np.heaviside(x[:-1], 1)
    return kernel / np.dot(kernel, step_function)

def threshold_trigger_1d(raw_record, kernel, trigger_threshold=100, deactivation_threshold_coefficient=1, trigger_holdoff=0):
    """
    Fast threshold trigger with convolution for a single trace.
    Assumes a simple threshold crossing (with optional holdoff).
    """
    filtered = signal.convolve(raw_record, kernel, mode='valid')
    deactivate_threshold = trigger_threshold * deactivation_threshold_coefficient

    triggered = False
    hits = []
    i = 0
    while i < len(filtered):
        if not triggered and filtered[i] > trigger_threshold:
            hits.append(i)
            triggered = True
            i += trigger_holdoff  # fast skip
        elif triggered and filtered[i] < deactivate_threshold:
            triggered = False
        i += 1

    return filtered, hits

def threshold_trigger_2d(records, sigma, trigger_threshold=100, deactivation_threshold_coefficient=1, trigger_holdoff=0, n_sigma=3):
    """
    Optimized batch threshold trigger for multiple records with configurable sigma.
    Generates the GDF kernel internally using sigma.

    :param records: 2D array of traces (n_records, n_samples)
    :param sigma: Gaussian sigma used for GDF kernel
    :param trigger_threshold: Threshold for triggering
    :param deactivation_threshold_coefficient: Coefficient for deactivation threshold
    :param trigger_holdoff: Number of samples to hold off after trigger
    :param n_sigma: Number of sigmas to include in the kernel
    :return: (filtered_records, all_hits, total_hits)
    """
    kernel = get_gdf_kernel(sigma, n_sigma)
    num_records = len(records)
    filtered_records = np.empty(num_records, dtype=object)
    all_hits = []

    kernel_len = len(kernel)
    conv_len = records.shape[1] - kernel_len + 1

    for i in range(num_records):
        raw = records[i]
        filtered = signal.convolve(raw, kernel, mode='valid')
        filtered_records[i] = filtered

        deactivate_threshold = trigger_threshold * deactivation_threshold_coefficient
        hits = []
        triggered = False
        j = 0
        while j < conv_len:
            if not triggered and filtered[j] > trigger_threshold:
                hits.append(j)
                triggered = True
                j += trigger_holdoff
            elif triggered and filtered[j] < deactivate_threshold:
                triggered = False
            j += 1

        all_hits.append(hits)

    total_hits = sum(len(h) for h in all_hits)
    return filtered_records, all_hits, total_hits


In [2]:
import yaml
from TraceSimulator import TraceSimulator
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal, stats

def read_yaml_to_dict(file_path):
    with open(file_path, 'r') as file:
        config_dict = yaml.safe_load(file)
    return config_dict

config = read_yaml_to_dict('config.yaml')
ts = TraceSimulator(config)




  self.template = np.concatenate([(np.exp((xs - self.trigger_time) / self.tau_rise))[xs <= self.trigger_time], (np.exp(-(xs - self.trigger_time) / self.tau_decay))[xs > self.trigger_time]])


In [None]:
trace, (x, y, z) = ts.generate(200, type_recoil='NR', no_noise=False) # generate a 20 keV ER from a random position in the volume


In [5]:
trace[0].shape

(54, 32768)

In [30]:
sigma = 500  # Width of Gaussian kernel
filtered, hits, total_hits = threshold_trigger_2d(
    trace[0],
    sigma=sigma,
    trigger_threshold=5,                   # Default value
    deactivation_threshold_coefficient=1,    # Default value
    trigger_holdoff=0                       # Example holdoff
)

# Print summary
print(f"Total hits: {total_hits}")
print(f"Hits in first 5 records: {hits[:5]}")

Total hits: 2
Hits in first 5 records: [[], [], [], [], []]


In [42]:
import numpy as np
import zarr
from tqdm import tqdm

# Parameters
energies = np.arange(10, 201)  # from 10 to 200 eV
n_sets = 10                    # 10 simulations per energy

# Zarr storage setup
root = zarr.open('traces_dataset.zarr', mode='w')

# Dataset structure:
# For each energy, we create a group with 10 traces

for energy in tqdm(energies, desc="Simulating energies"):
    energy_group = root.create_group(str(energy))
    for i in range(n_sets):
        trace, _ = ts.generate(energy, type_recoil='NR', no_noise=False)
        # Ensure trace is a NumPy array
        trace = np.asarray(trace)
        # Store trace with shape and dtype specified
        energy_group.create_dataset(
            name=f"set_{i}",
            shape=trace.shape,
            dtype=trace.dtype,
            data=trace
        )

print("All simulations completed and saved to 'traces_dataset.zarr'")

  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir / N_ir
  energy_group.create_dataset(
  E_ir_avg = E_ir

All simulations completed and saved to 'traces_dataset.zarr'





In [3]:
import numpy as np
import lz4.frame
import zstandard as zstd
import time
import os
from tqdm import tqdm

# Parameters
energies = np.arange(10, 201)  # from 10 to 200 eV
n_sets = 10                    # 10 simulations per energy

# Directory to save compressed data
os.makedirs("compressed_traces", exist_ok=True)

compressors = {
    'lz4': lambda data: lz4.frame.compress(data),
    'zstd': lambda data: zstd.ZstdCompressor(level=5).compress(data)
}

for codec, compress_fn in compressors.items():
    print(f"\n=== Running simulation with {codec.upper()} compression ===")
    output_path = f"compressed_traces/traces_{codec}.bin"

    if os.path.exists(output_path):
        print(f"Removing existing {output_path}")
        os.remove(output_path)

    start_time = time.time()
    all_data = bytearray()

    for energy in tqdm(energies, desc=f"Simulating energies with {codec}"):
        for i in range(n_sets):
            trace, _ = ts.generate(energy, type_recoil='NR', no_noise=False)
            trace = np.asarray(trace, dtype=np.float32)
            all_data.extend(trace.tobytes())

    compressed_data = compress_fn(bytes(all_data))

    with open(output_path, 'wb') as f:
        f.write(compressed_data)

    end_time = time.time()
    total_time = end_time - start_time
    size_mb = os.path.getsize(output_path) / 1024 / 1024

    print(f"\n{codec.upper()} compression completed.")
    print(f"Time taken: {total_time:.2f} seconds")
    print(f"Final compressed size: {size_mb:.2f} MB")


=== Running simulation with LZ4 compression ===


  E_ir_avg = E_ir / N_ir
Simulating energies with lz4: 100%|██████████| 191/191 [08:24<00:00,  2.64s/it]



LZ4 compression completed.
Time taken: 556.86 seconds
Final compressed size: 5684.82 MB

=== Running simulation with ZSTD compression ===


Simulating energies with zstd: 100%|██████████| 191/191 [08:20<00:00,  2.62s/it]



ZSTD compression completed.
Time taken: 697.89 seconds
Final compressed size: 2861.81 MB


In [4]:
import numpy as np
import zstandard as zstd
import time
import os
from tqdm import tqdm

# Parameters
energies = np.arange(10, 201)  # from 10 to 200 eV
n_sets = 10                    # 10 simulations per energy

# Directory to save compressed data
os.makedirs("compressed_traces", exist_ok=True)

# Shuffle bytes for better compression
def shuffle_bytes(arr: np.ndarray) -> bytes:
    return arr.view(np.uint8).reshape(-1, arr.itemsize).T.tobytes()

# Zstd compressor with higher compression level
compressor = zstd.ZstdCompressor(level=15)
output_path = "compressed_traces/traces_zstd.bin"

if os.path.exists(output_path):
    print(f"Removing existing {output_path}")
    os.remove(output_path)

print("\n=== Running simulation with ZSTD compression ===")
start_time = time.time()
all_data = bytearray()

for energy in tqdm(energies, desc="Simulating energies with ZSTD"):
    for i in range(n_sets):
        trace, _ = ts.generate(energy, type_recoil='NR', no_noise=False)
        trace = np.asarray(trace, dtype=np.float16)  # Downcast to float16
        shuffled = shuffle_bytes(trace)              # Byte shuffle
        all_data.extend(shuffled)

compressed_data = compressor.compress(bytes(all_data))

with open(output_path, 'wb') as f:
    f.write(compressed_data)

end_time = time.time()
total_time = end_time - start_time
size_mb = os.path.getsize(output_path) / 1024 / 1024

print("\nZSTD compression completed.")
print(f"Time taken: {total_time:.2f} seconds")
print(f"Final compressed size: {size_mb:.2f} MB")



=== Running simulation with ZSTD compression ===


  E_ir_avg = E_ir / N_ir
Simulating energies with ZSTD: 100%|██████████| 191/191 [09:23<00:00,  2.95s/it]



ZSTD compression completed.
Time taken: 1510.70 seconds
Final compressed size: 2031.30 MB


In [None]:
import zstandard as zstd
trace_shape = (54, 32768)
trace_dtype = np.float16

def unshuffle_bytes(data: bytes, dtype=np.float16, shape=(54, 32768)) -> np.ndarray:
    itemsize = np.dtype(dtype).itemsize
    unshuffled = np.frombuffer(data, dtype=np.uint8).reshape(itemsize, -1).T.reshape(-1)
    return unshuffled.view(dtype).reshape(shape)
# === Decompression and Reconstruction ===
print("\nReading back and decompressing traces...")
decompressor = zstd.ZstdDecompressor()
with open(output_path, 'rb') as f:
    compressed_content = f.read()
    decompressed = decompressor.decompress(compressed_content)

n_total = len(energies) * n_sets
trace_size_bytes = np.prod(trace_shape) * np.dtype(trace_dtype).itemsize

traces = []
for i in range(n_total):
    start = i * trace_size_bytes
    end = (i + 1) * trace_size_bytes
    trace_bytes = decompressed[start:end]
    trace = unshuffle_bytes(trace_bytes, dtype=trace_dtype, shape=trace_shape)
    traces.append(trace)

print(f"Recovered {len(traces)} traces with shape {traces[0].shape}")



Reading back and decompressing traces...
Recovered 1910 traces with shape (54, 32768)


In [10]:
len(traces)

1910

In [7]:
import numpy as np
import zstandard as zstd
import time
import os
from tqdm import tqdm

# Parameters
energies = np.arange(0, 200)  # from 10 to 200 eV
n_sets = 10                    # 10 simulations per energy
trace_shape = (54, 32768)      # Known shape of each trace
trace_dtype = np.float16

# Directory to save compressed data
os.makedirs("compressed_traces", exist_ok=True)

# Shuffle bytes for better compression
def shuffle_bytes(arr: np.ndarray) -> bytes:
    return arr.view(np.uint8).reshape(-1, arr.itemsize).T.tobytes()

def unshuffle_bytes(data: bytes, dtype=np.float16, shape=(54, 32768)) -> np.ndarray:
    itemsize = np.dtype(dtype).itemsize
    unshuffled = np.frombuffer(data, dtype=np.uint8).reshape(itemsize, -1).T.reshape(-1)
    return unshuffled.view(dtype).reshape(shape)

# === Generate traces only once for fair benchmarking ===
print("\nGenerating traces...")
trace_gen_start = time.time()
all_traces = []
for energy in tqdm(energies, desc="Generating traces"):
    for i in range(n_sets):
        trace, _ = ts.generate(energy, type_recoil='NR', no_noise=False)
        trace = np.asarray(trace, dtype=trace_dtype)
        all_traces.append(trace)
trace_gen_end = time.time()
print(f"Trace generation time: {trace_gen_end - trace_gen_start:.2f} seconds")

# === ZSTD Compression ===
print("\nRunning ZSTD compression...")
zstd_start = time.time()
all_data = bytearray()
for trace in all_traces:
    shuffled = shuffle_bytes(trace)
    all_data.extend(shuffled)

compressor = zstd.ZstdCompressor(level=15)
compressed_data = compressor.compress(bytes(all_data))
output_path = "compressed_traces/traces_zstd.zst"

with open(output_path, 'wb') as f:
    f.write(compressed_data)
zstd_end = time.time()
size_mb = os.path.getsize(output_path) / 1024 / 1024
print(f"ZSTD compression time: {zstd_end - zstd_start:.2f} seconds")
print(f"ZSTD compressed size: {size_mb:.2f} MB")

# === ZSTD Decompression ===
print("\nZSTD decompression...")
zstd_decomp_start = time.time()
decompressor = zstd.ZstdDecompressor()
with open(output_path, 'rb') as f:
    compressed_content = f.read()
    decompressed = decompressor.decompress(compressed_content)
zstd_decomp_end = time.time()

n_total = len(all_traces)
trace_size_bytes = np.prod(trace_shape) * np.dtype(trace_dtype).itemsize

zstd_traces = []
for i in range(n_total):
    start = i * trace_size_bytes
    end = (i + 1) * trace_size_bytes
    trace_bytes = decompressed[start:end]
    trace = unshuffle_bytes(trace_bytes, dtype=trace_dtype, shape=trace_shape)
    zstd_traces.append(trace)
print(f"ZSTD decompression time: {zstd_decomp_end - zstd_decomp_start:.2f} seconds")

# === NPZ Save ===
print("\nNPZ compression...")
npz_path = "compressed_traces/traces_npz.npz"
npz_save_start = time.time()
np.savez_compressed(npz_path, *all_traces)
npz_save_end = time.time()
npz_size_mb = os.path.getsize(npz_path) / 1024 / 1024
print(f"NPZ save time: {npz_save_end - npz_save_start:.2f} seconds")
print(f"NPZ file size: {npz_size_mb:.2f} MB")

# === NPZ Load ===
print("\nNPZ decompression...")
npz_load_start = time.time()
with np.load(npz_path) as data:
    npz_traces = [data[key] for key in data.files]
npz_load_end = time.time()
print(f"NPZ load (decompression) time: {npz_load_end - npz_load_start:.2f} seconds")



Generating traces...


Generating traces: 100%|██████████| 200/200 [08:27<00:00,  2.54s/it]


Trace generation time: 507.28 seconds

Running ZSTD compression...
ZSTD compression time: 1057.16 seconds
ZSTD compressed size: 2126.72 MB

ZSTD decompression...
ZSTD decompression time: 14.03 seconds

NPZ compression...
NPZ save time: 1194.03 seconds
NPZ file size: 2455.75 MB

NPZ decompression...
NPZ load (decompression) time: 37.21 seconds
