In [None]:
from kvikio.nvcomp_codec import NvCompBatchCodec
from kvikio.zarr import NVCompCompressor
import numpy as np
import io
import cupy as cp
import numpy
import zstandard as zstd
import sys

# (SLOW) Read compressed buffer from file to GPU -> decompress buffer on GPU 

The problem: compress data with zstd on the CPU and save it to a file, then read that file, hand the compressed buffer to the GPU, and decompress it there. The GPU result is validated against CPU decompression of the same file.

## Generate and compress some data with zstd on CPU and save

In [None]:
# Create a sample NumPy array from a seeded generator so that every
# "Restart Kernel -> Run All" produces the same data and the same file on disk.
dtype = np.float32
shape = (100000, 10000)  # 1e9 float32 values, i.e. about 4 GB of raw data
data_CPU = np.random.default_rng(0).random(shape, dtype=dtype)

# Compress the raw array bytes with zstd on the CPU
compressor = zstd.ZstdCompressor()
compressed_data = compressor.compress(data_CPU.tobytes())

data_compressed_CPU_path = "/home/fstrug/uscmshome/nobackup/GPU/kvikio_playground/compressed_data_zstd_CPU.zst"
# Save the compressed data to a file
with open(data_compressed_CPU_path, 'wb') as f:
    f.write(compressed_data)


### Generate and compress some data with zstd on GPU and save (Example)

In [None]:
# Where the GPU-encoded example buffer is stored
data_compressed_GPU_path = "/home/fstrug/uscmshome/nobackup/GPU/kvikio_playground/compressed_data_zstd_GPU.zst"

# Encode a CuPy array with the nvCOMP zstd codec
# (author's note: must be a small cp array, otherwise an error is raised due to large chunk size)
codec = NvCompBatchCodec("zstd")
data_GPU = cp.random.rand(1000, 1000, dtype=np.float32)
data_compressed_GPU = codec.encode(data_GPU)

# Write the encoder output to the file
with open(data_compressed_GPU_path, mode="wb") as f:
    f.write(data_compressed_GPU)


## Read zstd compressed data to CPU and decompress on CPU

In [None]:
# Load the CPU-compressed file back into host memory
with open(data_compressed_CPU_path, mode="rb") as f:
    compressed_data = f.read()

# Decompress the loaded buffer with zstd on the CPU
decompressor = zstd.ZstdDecompressor()
data_decompressed_CPU = decompressor.decompress(compressed_data)

# Reinterpret the decompressed bytes with the original dtype and shape
nparray_decompressed_CPU = np.reshape(
    np.frombuffer(data_decompressed_CPU, dtype=dtype),
    shape,
)

## Read zstd compressed data to CPU and decompress on GPU

In [None]:
# Read compressed data to CPU (host) memory
with open(data_compressed_CPU_path, "rb") as f:
    read_comp_data = f.read()

# Allocate the device output buffer directly from the known shape and dtype.
# cp.empty_like is documented for cupy.ndarray prototypes, so passing the NumPy
# array data_CPU relied on undocumented behaviour.
outGPU = cp.empty(shape, dtype=dtype)
# Decode compressed data on GPU
codec = NvCompBatchCodec("zstd")
decompressed_data_byGPU = codec.decode(read_comp_data, out=outGPU).view(dtype).reshape(shape)  # will return to CPU npArray if not given cpArray for output

## Check that CPU and GPU decompression give same result

In [None]:
# Check array correctly decompressed. np.array_equal also requires identical
# shapes, so a broadcastable shape mismatch cannot pass silently.
assert np.array_equal(nparray_decompressed_CPU, data_CPU), \
    "CPU zstd round trip does not reproduce the original array"

In [None]:
# Check CPU and GPU decompression give same result. The GPU array is copied to
# the host with .get(); np.array_equal compares shapes as well as values.
assert np.array_equal(nparray_decompressed_CPU, decompressed_data_byGPU.get()), \
    "GPU decompression result differs from CPU decompression result"

# (FAST) Chunked compressed buffer to GPU -> decompress buffer on GPU 

In [None]:
# Generate chunks of data
shape_chunk = (1000, 1000)
N_chunks = 1000
# One seeded generator shared by all chunks: reproducible across kernel restarts,
# and avoids constructing (and OS-seeding) a new Generator for every chunk.
chunk_rng = np.random.default_rng(0)
chunks = [chunk_rng.random(shape_chunk, dtype=np.float32) for _ in range(N_chunks)]
chunks_combined = np.concatenate(chunks)


## CPU Compression

In [None]:
# Compress the concatenated data with zstd in a single call
compressor = zstd.ZstdCompressor()
compressed_data = compressor.compress(chunks_combined.tobytes())

# Compress every chunk separately and keep the results in a plain Python list.
# Do NOT wrap the list in np.array(): a list of bytes becomes a fixed-width 'S'
# array, which pads every entry to the longest one and drops trailing NUL bytes
# when an element is read back, so a compressed frame ending in b"\x00" would
# be silently truncated.
compressed_data_chunks = [compressor.compress(c.tobytes()) for c in chunks]

## CPU Decompression

In [None]:
%%time
# Decompress the data using zstd
decompressor = zstd.ZstdDecompressor()
data_decompressed_CPU = decompressor.decompress(compressed_data)
nparray_decompressed_CPU = np.frombuffer(data_decompressed_CPU, dtype=np.float32).reshape((shape_chunk[0]*N_chunks, shape_chunk[1]))

In [None]:
# The CPU round trip must reproduce the concatenated input exactly
# (np.array_equal also rejects a shape mismatch instead of broadcasting).
assert np.array_equal(chunks_combined, nparray_decompressed_CPU), \
    "CPU decompression of the combined buffer does not match the input chunks"

## GPU Decompression 

In [None]:
%%time
codec = NvCompBatchCodec("zstd")
out_buf = [cp.empty_like(c) for c in chunks]
decompressed_data_byGPU = codec.decode_batch(compressed_data_chunks, out = out_buf) # will return to CPU npArray if not given cpArray for output
cparray_decompressed_GPU = cp.concatenate(decompressed_data_byGPU).reshape((shape_chunk[0]*N_chunks, shape_chunk[1]))

In [None]:
# Copy the GPU result to the host once and reuse it for both comparisons
# (the original called .get() twice, transferring the full array twice).
cparray_decompressed_host = cparray_decompressed_GPU.get()
assert np.array_equal(cparray_decompressed_host, nparray_decompressed_CPU), \
    "GPU batch decompression differs from CPU decompression"
assert np.array_equal(cparray_decompressed_host, chunks_combined), \
    "GPU batch decompression differs from the original chunks"
# Release the host copy so it does not linger in the kernel's memory
del cparray_decompressed_host

# Chunk data, compress to single file, open and decompress

## Helper Functions

In [None]:
def compress_chunks(chunks):
    """Compress each chunk separately with zstd on the CPU.

    Parameters
    ----------
    chunks : iterable
        Objects exposing ``tobytes()``; the raw bytes of each one are compressed.

    Returns
    -------
    tuple
        ``(compressed_chunks, compressed_chunk_sizes)``: the compressed buffers
        in input order, and the length of each buffer as ``np.uint64``.
    """
    zstd_compressor = zstd.ZstdCompressor()
    frames = [zstd_compressor.compress(chunk.tobytes()) for chunk in chunks]
    frame_sizes = [np.uint64(len(frame)) for frame in frames]
    return frames, frame_sizes

def get_chunk_sizes(chunks):
    """Return the uncompressed size in bytes of every chunk.

    Parameters
    ----------
    chunks : iterable of numpy.ndarray
        Arrays whose raw byte size is wanted.

    Returns
    -------
    list of np.uint64
        One entry per chunk, in input order.
    """
    # ndarray.nbytes equals len(arr.tobytes()) but does not materialise a full
    # copy of every chunk just to measure it.
    return [np.uint64(c.nbytes) for c in chunks]

def compress_chunks_to_file(chunks, path):
    """Compress ``chunks`` with zstd and write them to a single indexed file.

    File layout (integers are written with ``tobytes()`` on NumPy scalars):

    * ``uint32``  number of chunks
    * per chunk, three ``uint64`` values: absolute file offset of the
      compressed chunk, its compressed size, and its uncompressed size
    * 4 zero bytes of padding
    * the compressed chunks, back to back, in input order

    Parameters
    ----------
    chunks : sequence
        Chunks accepted by ``compress_chunks`` and ``get_chunk_sizes``.
    path : str or path-like
        Destination file; opened in binary write mode.
    """
    # Get chunk metadata
    N_chunks = np.uint32(len(chunks))
    chunk_sizes = get_chunk_sizes(chunks)
    compressed_chunks, compressed_chunk_sizes = compress_chunks(chunks)

    # Size of everything that precedes the first compressed chunk. Computed with
    # Python ints so the multiplication cannot wrap in uint32 arithmetic.
    header_size = 4 + len(chunks) * (8 + 8 + 8) + 4

    # Save the compressed data to a file
    with open(path, 'wb') as f:
        ###
        # Write header
        f.write(N_chunks.tobytes())
        # Running offset instead of re-summing all previous sizes for every chunk
        # (the original np.sum(sizes[0:i]) made the header write O(n^2)).
        offset = np.uint64(header_size)
        for compressed_size, chunk_size in zip(compressed_chunk_sizes, chunk_sizes):
            compressed_size = np.uint64(compressed_size)
            f.write(offset.tobytes())
            f.write(compressed_size.tobytes())
            f.write(np.uint64(chunk_size).tobytes())
            offset = offset + compressed_size
        f.write(b'\x00' * 4)

        ###
        # Write data
        for compressed_chunk in compressed_chunks:
            f.write(compressed_chunk)

def get_chunks_from_file(path):
    """Read the still-compressed chunks from a file written by ``compress_chunks_to_file``.

    The header is a ``uint32`` chunk count followed by one record of three
    ``uint64`` values per chunk (file offset, compressed size, uncompressed
    size). Each chunk is fetched by seeking to its offset and reading its
    compressed size. No decompression happens here.

    Parameters
    ----------
    path : str or path-like
        File to read; opened in binary read mode.

    Returns
    -------
    list of bytes
        The compressed chunks in file order.

    Raises
    ------
    ValueError
        If the file ends before the header or a chunk is complete.
    """
    record_size = 8 + 8 + 8  # offset, compressed size, uncompressed size

    with open(path, 'rb') as f:
        # Get N_chunks
        count_bytes = f.read(4)
        if len(count_bytes) != 4:
            raise ValueError("file is too short to contain the chunk count")
        N_chunks = int(np.frombuffer(count_bytes, np.uint32)[0])
        if N_chunks == 0:
            return []

        # The records sit contiguously right after the count, so read them in one go
        table_bytes = f.read(N_chunks * record_size)
        if len(table_bytes) != N_chunks * record_size:
            raise ValueError("file is too short to contain the chunk table")
        table = np.frombuffer(table_bytes, np.uint64).reshape(N_chunks, 3)

        # Read chunk byte ranges; .tolist() yields plain Python ints for seek/read
        compressed_chunks = []
        for offset, compressed_size, _uncompressed_size in table.tolist():
            f.seek(offset)
            compressed_chunk = f.read(compressed_size)
            if len(compressed_chunk) != compressed_size:
                raise ValueError("file ends before a compressed chunk is complete")
            compressed_chunks.append(compressed_chunk)

        return compressed_chunks

def CPU_decompression(chunks):
    """Decompress every zstd-compressed buffer in ``chunks`` on the CPU.

    Parameters
    ----------
    chunks : iterable
        Compressed buffers, each passed to ``ZstdDecompressor.decompress``.

    Returns
    -------
    list
        The decompressed results, in input order.
    """
    zstd_decompressor = zstd.ZstdDecompressor()
    return [zstd_decompressor.decompress(frame) for frame in chunks]

## Generate, write, and load File

In [None]:
# Generate chunks of data and save
shape_chunk = (100000,)  # trailing comma: "(100000)" is just the int 100000, not a tuple
# One seeded generator shared by all chunks: reproducible across kernel restarts,
# and avoids constructing a new Generator for every chunk.
chunk_rng = np.random.default_rng(0)
chunks = [chunk_rng.random(shape_chunk, dtype=np.float32) for _ in range(1000)]

data_compressed_CPU_path_chunked = "/home/fstrug/uscmshome/nobackup/GPU/kvikio_playground/compressed_data_zstd_CPU_chunked.zst"
compress_chunks_to_file(chunks, data_compressed_CPU_path_chunked)

In [None]:
# Load the compressed chunks stored in the chunked file back into host memory
compressed_chunks_loaded_bytes = get_chunks_from_file(path=data_compressed_CPU_path_chunked)

## CPU Decompression

In [None]:
%%time
# Load compressed chunks to CPU and decompress on CPU
decompressed_chunks_bytes = CPU_decompression(compressed_chunks_loaded_bytes)
chunks_loaded_bytes_joined = b''.join(decompressed_chunks_bytes)
chunk_loaded_CPU = np.frombuffer(chunks_loaded_bytes_joined, np.float32)

In [None]:
# Check CPU decompression worked. np.array_equal also requires identical shapes,
# so a broadcastable shape mismatch cannot pass silently.
assert np.array_equal(np.concatenate(chunks), chunk_loaded_CPU), \
    "CPU decompression of the chunked file does not match the original chunks"

## GPU Decompression

In [None]:
%%time
out_buf = [cp.empty_like(c) for c in chunks] #this is a bit of a cheat (not reading chunk shapes from file)
codec = NvCompBatchCodec("zstd")
decompressed_data_byGPU = codec.decode_batch(compressed_chunks_loaded_bytes, out = out_buf) # will return to CPU npArray if not given cpArray for output
chunk_loaded_GPU = cp.concatenate(decompressed_data_byGPU).view(np.float32)

In [None]:
# Check GPU decompression worked. The GPU array is copied to the host with
# .get(); np.array_equal compares shapes as well as values.
assert np.array_equal(chunk_loaded_CPU, chunk_loaded_GPU.get()), \
    "GPU decompression of the chunked file differs from CPU decompression"