In [1]:
!pip install torch
!pip install transformers
!pip install tqdm



In [3]:
import os
import json
import time
import torch
from IPython.display import display, HTML

from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

def bin_usc(usc_value):
    """Floor a unique-symbol count to the lower edge of its 50-wide bin.

    Args:
        usc_value: Number of distinct symbols counted in a file.

    Returns:
        int: The largest multiple of 50 that does not exceed ``usc_value``
        (0-49 -> 0, 50-99 -> 50, ...).
    """
    width = 50
    completed_bins = usc_value // width
    return int(completed_bins * width)

def get_file_size(file_path):
    """Return the size of the file at ``file_path`` in bytes.

    Args:
        file_path: Path of the file to measure.

    Returns:
        int: File size in bytes as reported by the file system.

    Raises:
        OSError: If the file does not exist or cannot be accessed.
    """
    return os.stat(file_path).st_size

def count_unique_symbols(file_path):
    """Count the distinct characters contained in a text file.

    The file is decoded as UTF-8; byte sequences that are not valid UTF-8 are
    dropped (``errors='ignore'``) instead of raising.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        int: Number of different characters in the decoded text.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    # A set keeps one entry per distinct character.
    return len(set(contents))
    
def round_to_class(file_size_bytes):
    """Map a byte count onto the coarse size label used in the model prompt.

    The class boundaries are 1050, 10500, 105000, 1050000 and 10500000 bytes;
    a size equal to a boundary already belongs to the next larger class.

    Args:
        file_size_bytes: File size in bytes.

    Returns:
        str: One of "1kb", "10kb", "100kb", "1mb", "10mb" or "100mb".
    """
    upper_bounds = (
        (1050, "1kb"),
        (10500, "10kb"),
        (105000, "100kb"),
        (1050000, "1mb"),
        (10500000, "10mb"),
    )
    for limit, label in upper_bounds:
        if file_size_bytes < limit:
            return label
    # Anything at or beyond the last boundary falls into the largest class.
    return "100mb"

def detect_algorithm(model, tokenizer, filename, mode):
    """Ask the language model which compression algorithm suits a file.

    The prompt is built from the file's binned unique-symbol count, its size
    class and the requested compression mode, then passed to ``model.generate``.

    Args:
        model: Causal language model exposing ``generate``, ``eval`` and
            ``device`` (e.g. a Hugging Face ``PreTrainedModel``).
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            decode the generated token ids.
        filename: Path of the local file to describe in the prompt.
        mode: Compression goal inserted verbatim into the prompt.

    Returns:
        tuple: ``(eval_prompt, result, model_eval_time, file_size_bytes)`` where
        ``result`` is the decoded generation (special tokens removed),
        ``model_eval_time`` is the elapsed generation + decoding time in
        seconds, and ``file_size_bytes`` is the raw file size.

    Raises:
        OSError: If ``filename`` cannot be read.
    """
    # Place the inputs on the device the model already lives on. Re-deriving
    # the device here ("cuda" else "mps") breaks on CPU-only hosts and can
    # disagree with where the caller actually loaded the model.
    device = model.device

    file_size_bytes = get_file_size(filename)
    file_size = round_to_class(file_size_bytes)
    usc = bin_usc(count_unique_symbols(filename))
    eval_prompt = f"### Instruction: We need to find algorithm from given input params: (usc:{usc}, file_size:{file_size}, compression_type: {mode})."
    model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)

    # perf_counter is monotonic and high resolution, so the measured interval
    # cannot be distorted by system clock adjustments.
    start_eval_time = time.perf_counter()
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(**model_input, max_new_tokens=150, repetition_penalty=1.15)[0]
        result = tokenizer.decode(generated_ids, skip_special_tokens=True)
    model_eval_time = time.perf_counter() - start_eval_time

    return eval_prompt, result, model_eval_time, file_size_bytes

def print_formatted(data):
    """Print each entry of ``data`` as an aligned ``key: value`` line.

    Keys are padded to a 20-character column, followed by ``": "`` and the
    value's string form. Entries are printed in the mapping's iteration order.

    Args:
        data: Mapping of field names to the values to display.
    """
    for field, value in data.items():
        print(f"{field:20}: {value}")


In [4]:
# Public copy of the test files, kept for provenance. The helpers read file
# sizes and contents from disk, so a local copy of this folder is required.
data_url = "https://raw.githubusercontent.com/emirozturk/XCompress/main/RevisionFiles/LLM-Test-Data/"
# Local directory holding the test files. Set XCOMPRESS_TEST_DATA_DIR to run the
# notebook on another machine; the default is the original author's path.
file_path = os.environ.get(
    "XCOMPRESS_TEST_DATA_DIR",
    "/Users/emirozturk/Desktop/ProjectX/XCompress/RevisionFiles/LLM-Test-Data/",
)
files =  ["nattest.txt","progtest.txt","sensortest.txt","sqltest.txt"]
modes = ['fast-compression', 'fast-decompression', 'best-compression']
# Number of repeated generations per (file, mode) pair.
runs_per_mode = 3

# Load model and tokenizer once
# Pick the best available backend and fall back to the CPU instead of assuming
# that Apple's MPS backend exists whenever CUDA does not.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
base_model_name = "emirozturk/CSM"
model = AutoModelForCausalLM.from_pretrained(base_model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, padding_side='right')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Every record is printed as before and also kept for later inspection.
results = []
for filename in tqdm(files):
    for mode in modes:
        for i in range(1, runs_per_mode + 1):
            eval_prompt, result, model_eval_time, file_size_bytes = detect_algorithm(
                model, tokenizer, os.path.join(file_path, filename), mode
            )
            result_entry = {
                'filename': filename,
                'mode': mode,
                'run': i,
                'file_size_bytes': file_size_bytes,
                'eval_prompt': eval_prompt,
                'result': result,
                'model_eval_time': model_eval_time
            }
            results.append(result_entry)
            print_formatted(result_entry)

Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  5.53it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.76s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

filename            : nattest.txt
mode                : fast-compression
run                 : 1
file_size_bytes     : 1028
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: fast-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: fast-compression). ### Output: The algorithm is: Snappy-Snzip
model_eval_time     : 4.765666961669922
filename            : nattest.txt
mode                : fast-compression
run                 : 2
file_size_bytes     : 1028
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: fast-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: fast-compression). ### Output: The algorithm is: Snappy-Snzip
model_eval_time     : 

 25%|██▌       | 1/4 [00:15<00:46, 15.45s/it]

filename            : nattest.txt
mode                : best-compression
run                 : 3
file_size_bytes     : 1028
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: best-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:50, file_size:1kb, compression_type: best-compression). ### Output: The algorithm is: brotli
model_eval_time     : 1.2350211143493652
filename            : progtest.txt
mode                : fast-compression
run                 : 1
file_size_bytes     : 1051
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:10kb, compression_type: fast-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:10kb, compression_type: fast-compression). ### Output: The algorithm is: zlib
model_eval_time     : 1.1464500427

 50%|█████     | 2/4 [00:26<00:25, 12.67s/it]

filename            : progtest.txt
mode                : best-compression
run                 : 3
file_size_bytes     : 1051
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:10kb, compression_type: best-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:10kb, compression_type: best-compression). ### Output: The algorithm is: brotli
model_eval_time     : 1.2378358840942383
filename            : sensortest.txt
mode                : fast-compression
run                 : 1
file_size_bytes     : 1049
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: fast-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: fast-compression). ### Output: The algorithm is: Snappy-Snzip
model_eval_time     : 1.5

 75%|███████▌  | 3/4 [00:38<00:12, 12.34s/it]

filename            : sensortest.txt
mode                : best-compression
run                 : 3
file_size_bytes     : 1049
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: best-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: best-compression). ### Output: The algorithm is: brotli
model_eval_time     : 1.2452349662780762
filename            : sqltest.txt
mode                : fast-compression
run                 : 1
file_size_bytes     : 1029
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: fast-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: fast-compression). ### Output: The algorithm is: Snappy-Snzip
model_eval_time     : 1.5230

100%|██████████| 4/4 [00:49<00:00, 12.49s/it]

filename            : sqltest.txt
mode                : best-compression
run                 : 3
file_size_bytes     : 1029
eval_prompt         : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: best-compression).
result              : ### Instruction: We need to find algorithm from given input params: (usc:0, file_size:1kb, compression_type: best-compression). ### Output: The algorithm is: brotli
model_eval_time     : 1.2357890605926514



