In [None]:
import smll
import time
import zlib
import bz2
import lzma
import base64

def _size_ratio(original_size: int, compressed_size: int) -> float:
    """Return ``original_size / compressed_size``, or ``inf`` for an empty output."""
    if compressed_size == 0:
        return float("inf")
    return original_size / compressed_size


def compress_and_compare(
    input: str,
) -> None:
    """Compress ``input`` with the LLM compressor and print a size comparison.

    The text is compressed with an ``smll.Compressor`` loaded through
    ``from_pretrained``; the resulting size is printed next to the sizes
    produced by ``zlib``, ``bz2`` and ``lzma`` at their highest settings.
    The payload is then decompressed again and checked against the original.

    Args:
        input: Text to compress. (The name shadows the ``input`` builtin
            inside this function; it is kept so keyword callers still work.)

    Raises:
        AssertionError: If the decompressed value differs from ``input``.
    """
    print("Compressing the following input:\n")
    print(input)

    with smll.Compressor.from_pretrained(
        repo_id="QuantFactory/SmolLM2-360M-GGUF",
        filename="*Q4_0.gguf",
    ) as compressor:

        # perf_counter() is monotonic and high resolution; time.time() can
        # jump if the wall clock is adjusted while we are measuring.
        start = time.perf_counter()
        compressed = compressor.compress(input)
        end = time.perf_counter()
        print(f"\nCompressed in {end - start:.2f} seconds.\n")

        # All ratios are computed against the UTF-8 encoded size of the text.
        original_bytes = input.encode("utf-8")
        original_size = len(original_bytes)

        # LLM compression ratio (guarded so an empty payload cannot raise
        # ZeroDivisionError).
        compressed_size = len(compressed)
        compression_ratio = _size_ratio(original_size, compressed_size)

        # Baseline general-purpose compressors.
        # NOTE: zlib.compress emits the zlib container, not a gzip file; the
        # figure is reported under the "GZIP" label below.
        gzip_size = len(zlib.compress(original_bytes, level=9))
        bz2_size = len(bz2.compress(original_bytes, compresslevel=9))
        lzma_size = len(lzma.compress(original_bytes, preset=9))
        # zstd_size = len(zstd.compress(original_bytes, 22))

        print(f"Encoded: {base64.b64encode(compressed).decode('ascii')}")
        print("\nCompression Results:")
        print(f"  Original:        {original_size:>6} bytes")
        print(f"  LLM compressed:  {compressed_size:>6} bytes ({compression_ratio:.2f}x)")
        # print(
        #     f"  ZSTD (level 22):  {zstd_size:>6} bytes ({original_size / zstd_size:.2f}x)"
        # )
        print(f"  GZIP (level 9):  {gzip_size:>6} bytes ({_size_ratio(original_size, gzip_size):.2f}x)")
        print(f"  BZ2 (level 9):   {bz2_size:>6} bytes ({_size_ratio(original_size, bz2_size):.2f}x)")
        print(f"  LZMA (level 9):  {lzma_size:>6} bytes ({_size_ratio(original_size, lzma_size):.2f}x)")

        print("Decompressing...")

        start = time.perf_counter()
        decompressed = compressor.decompress(compressed)
        end = time.perf_counter()
        print(f"\nDecompressed in {end - start:.2f} seconds.\n")

        # Raise explicitly instead of using `assert`, so the round-trip check
        # still runs when Python is started with -O (which strips asserts).
        if decompressed != input:
            raise AssertionError("Decompression failed!")
        print("\nDecompression successful!")

In [None]:
# Compress a sample passage and compare the LLM compressor against the
# general-purpose algorithms.
# The passage previously began with a stray, unbalanced double quote and
# contained a mis-decoded en dash ("â€“") in "space–time"; both are fixed so
# the benchmark input is clean text.
input_text = """In information theory, data compression, source coding,[1] or bit-rate reduction is the process of encoding information using fewer bits than the original representation.[2] Any particular compression is either lossy or lossless. Lossless compression reduces bits by identifying and eliminating statistical redundancy. No information is lost in lossless compression. Lossy compression reduces bits by removing unnecessary or less important information.[3] Typically, a device that performs data compression is referred to as an encoder, and one that performs the reversal of the process (decompression) as a decoder.
        The process of reducing the size of a data file is often referred to as data compression. In the context of data transmission, it is called source coding: encoding is done at the source of the data before it is stored or transmitted.[4] Source coding should not be confused with channel coding, for error detection and correction or line coding, the means for mapping data onto a signal.
        Data compression algorithms present a space–time complexity trade-off between the bytes needed to store or transmit information, and the computational resources needed to perform the encoding and decoding. The design of data compression schemes involves balancing the degree of compression, the amount of distortion introduced (when using lossy data compression), and the computational resources or time required to compress and decompress the data.[5] """

compress_and_compare(input_text)