1. Preprocessor:    
    - Filtering
    - Augmentation in time domain
(still in dataframe)

2. Compression
    - Store the data in less bits (currently in 16bit)

In [11]:
# given
import struct
from pathlib import Path
# import RaggedArray type


BASE_DIR = Path().resolve()
data_path = BASE_DIR / "data"  # Directory to store data files
train_data_path = BASE_DIR / "data" / "X_train.bin"
test_data_path = BASE_DIR / "data" / "X_test.bin"
train_label_path = BASE_DIR / "data" / "y_train.csv"


def read_binary(path):
    ragged_array = []
    with open(path, "rb") as r:
        read_binary_from(ragged_array, r)
    return ragged_array


def read_binary_from(ragged_array, r):
    while True:
        size_bytes = r.read(4)
        if not size_bytes:
            break
        sub_array_size = struct.unpack("i", size_bytes)[0]
        sub_array = list(
            struct.unpack(f"{sub_array_size}h", r.read(sub_array_size * 2))
        )
        sub_array = [int(x) for x in sub_array]
        ragged_array.append(sub_array)

In [12]:
# find a lossless compression type for the data instead of 16bit (e.g., PDICT, PDELTA, PFOR).  --> then NS (null suppression)
import dask # we will have a lot of for loops for each data point, so we will use dask to parallelize the process
import numpy as np
import bitstruct

# before patched version
# Delta Encoding
def delta_encode(data):
    deltas = [data[0]]  # Store the first value
    for i in range(1, len(data)):
        deltas.append(data[i] - data[i - 1])
    return deltas

# Delta Decoding
def delta_decode(deltas):
    original = [deltas[0]]  # Start with the first value
    for i in range(1, len(deltas)):
        original.append(original[-1] + deltas[i])
    return original


def pack_offsets(offsets, bits_needed=12):
    fmt= f"u{bits_needed}"
    overall_fmt = fmt * len(offsets)
    packed = bitstruct.pack(overall_fmt, *offsets)
    return packed


def unpack_offsets(packed, bits_needed=12, num_values=None):
    fmt = f"u{bits_needed}"
    if num_values is None:
        raise ValueError("num_values must be provided")
    overall_fmt = fmt * num_values
    unpacked = bitstruct.unpack(overall_fmt, packed)
    return list(unpacked)


def for_encode(data: list) -> tuple:
    data_copy = data.copy()
    min_val = min(data)
    for i in range(len(data_copy)):
        data_copy[i] -= min_val
        if data_copy[i] < 0:
            print(f"data[i] is negative: {data_copy[i]} min_val: {min_val}")
    return min_val, data_copy

def for_decode(data: list, min_val: int) -> list:
    return [val + min_val for val in data]


def pfor_encode(data: list, patch_size: int = 128) -> tuple:
    patches = []
    min_vals = []
    max_all = 0
    bits_needed = 0
    for i in range(0, len(data), patch_size):
        patch = data[i:i + patch_size]
        min_val, encoded_patch = for_encode(patch)

        # find least amount of bits to represent the data
        max_val = max(encoded_patch)
        if max_val > max_all:
            bits_needed = max_val.bit_length()
            max_all = max_val
        patches.append(encoded_patch)
        min_vals.append(min_val)

    return patches, min_vals, bits_needed


def encoded_pfor_to_binary(patches, min_vals, bits_needed=12) -> bytes:
    # first 32 bits for the number of patches
    # second 32 bits for the bits needed
    # third 32 bits for the patch size
    # fourth 32 bits for the last patch length
    last_patch_len = len(patches[-1])
    header = struct.pack(
        "IIII", len(patches), bits_needed, len(patches[0]), last_patch_len
    )

    packed_patches = bytearray(header)
    for min_val, patch in zip(min_vals, patches):
        packed_patches.extend(struct.pack("h", min_val))
        packed_offsets = pack_offsets(patch, bits_needed)
        packed_patches.extend(packed_offsets)

    return bytes(packed_patches)


def pfor_decode(patches, patch_size=128) -> list:
    decoded_data = []
    for min_val, patch in patches:
        decoded_patch = for_decode(patch, min_val)
        decoded_data.extend(decoded_patch)
    return decoded_data


def binary_to_decoded_pfor(binary_data: bytes) -> list:
    header_size = 16  # 4 integers of 4 bytes each
    num_patches, bits_needed, patch_size, last_patch_len = struct.unpack(
        "IIII", binary_data[:header_size]
    )

    patches = []
    offset = header_size

    for i in range(num_patches):
        min_val = struct.unpack("h", binary_data[offset : offset + 2])[0]
        offset += 2

        if i == num_patches - 1:
            num_values = last_patch_len
        else:
            num_values = patch_size

        patch_length = (num_values * bits_needed + 7) // 8
        packed_offsets = binary_data[offset : offset + patch_length]
        offsets = unpack_offsets(packed_offsets, bits_needed, num_values)
        offset += patch_length

        patches.append((min_val, offsets))

    return pfor_decode(patches)

In [13]:
import pandas as pd
signals = read_binary(train_data_path) # ragged array of signals
labels = pd.read_csv(train_label_path, header=None)


signal_samples = signals[15:20] 
bits = []
min_bits_all = 16
for patch_size in [1024*(i+1) for i in range(10)]:
    max_bits_patch = 0
    for signal in signal_samples:
        print(f"First signal length: {len(signal)}")
        print(f"First signal: {signal[:10]}...")  # Print first 10 values for brevity
        # Encode the signal using PFOR
        encoded_signal, bits_all = pfor_encode(signal, patch_size=128)
        print(f"Encoded signal length: {len(encoded_signal)}")
        print(f"Bits used: {bits_all}")
        # Decode the signal using PFOR
        decoded_signal = pfor_decode(encoded_signal, patch_size=128)
        assert signal == decoded_signal, "Decoded signal does not match original"
        print(f"Decoded signal length: {len(decoded_signal)}")
        print(f"Decoded signal: {decoded_signal[:10]}...")  # Print first 10 values for brevity
        if bits_all > max_bits_patch:
            max_bits_patch = bits_all
    print(f"Max bits used for patch size {patch_size}: {max_bits_patch}")
    bits.append((patch_size, max_bits_patch))
    if max_bits_patch < min_bits_all:
        min_bits_all = max_bits_patch

print(f"Min bits used for all patch sizes: {min_bits_all}")
for (size, bit) in bits:
    print(f"size:{size}, bits: {bit}")

First signal length: 3178
First signal: [107, 128, 155, 166, 173, 179, 183, 187, 190, 192]...


ValueError: too many values to unpack (expected 2)

In [16]:
# test binary conversion
signal = signals[0]
print(f"signal length: {len(signal)}")
patch_size = 512

encoded_patches, min_vals, _ = pfor_encode(signal, patch_size=patch_size)
encoded_signal = encoded_pfor_to_binary(encoded_patches, min_vals, 10)
print(f"Encoded signal length for patch size {patch_size}: {len(encoded_signal)}")
decoded_signal = binary_to_decoded_pfor(encoded_signal)
assert signal == decoded_signal, "Decoded signal does not match original"

signal length: 9000
Encoded signal length for patch size 512: 11302
