In [None]:
import os
import os.path as osp
import sys
import time

# Hugging Face settings, exported in the first cell so they are in place
# before the vllm import further down.
os.environ["HF_HOME"] = "./ephemeral/HF"
os.environ["HF_HUB_DISABLE_XET"] = "1"
# Only fall back to an empty token when none is configured: assigning ""
# unconditionally would wipe a real token already exported in the environment.
# Never paste a real token into the notebook; export HF_TOKEN outside it.
os.environ.setdefault("HF_TOKEN", "")

In [None]:
# Repository id of the baseline checkpoint that every benchmarked variant
# is derived from.
qwen_25_default_name = "Qwen/Qwen2.5-3B-Instruct"

# Suffixes appended to the baseline id to build each variant's id;
# the empty suffix selects the baseline itself.
qwen_25_quantizations = [
    "",
    "-AWQ",
    "-GPTQ-Int4",
    "-GPTQ-Int8",
]

In [None]:
from vllm import LLM, SamplingParams

In [None]:
import gc


class Model:
    """Context manager that builds a vLLM ``LLM`` on entry and drops it on exit.

    Typical use::

        with Model("some/repo") as model:
            outputs = model(prompt, sampling_params)

    Args:
        model_name: Model identifier passed as the first argument to ``LLM``.
        gpu_mem_util: Value forwarded as ``gpu_memory_utilization``.
        **kwargs: Extra keyword arguments forwarded verbatim to ``LLM``.
            ``max_model_len`` may be supplied here to override the default.
    """

    # Context length used unless the caller passes ``max_model_len`` explicitly.
    DEFAULT_MAX_MODEL_LEN = 5000

    def __init__(self, model_name, gpu_mem_util=0.5, **kwargs):
        self.model_name = model_name
        self.model = None  # populated by __enter__, reset by __exit__
        self.gpu_mem_util = gpu_mem_util
        self.kwargs = kwargs

    def __call__(self, *args, **kwargs):
        """Forward all arguments to ``LLM.generate`` and return its result.

        Raises:
            RuntimeError: If called outside of the ``with`` block, i.e. when
                no engine is currently loaded.
        """
        if self.model is None:
            raise RuntimeError(
                f"Model {self.model_name!r} is not loaded; "
                "use it inside a 'with Model(...) as model:' block"
            )
        return self.model.generate(*args, **kwargs)

    def __enter__(self, *args, **kwargs):
        """Instantiate the engine and return this wrapper."""
        # Merge instead of passing max_model_len as a fixed keyword: a caller
        # supplied max_model_len would otherwise raise a duplicate-keyword
        # TypeError.
        engine_kwargs = {"max_model_len": self.DEFAULT_MAX_MODEL_LEN, **self.kwargs}
        self.model = LLM(
            self.model_name,
            gpu_memory_utilization=self.gpu_mem_util,
            **engine_kwargs,
        )
        return self

    def __exit__(self, *args, **kwargs):
        """Drop the engine reference and reclaim memory.

        Returns ``None``, so an exception raised inside the ``with`` block
        propagates to the caller.
        """
        # Reset rather than ``del``: the attribute keeps existing, so a second
        # __exit__ or a late __call__ fails cleanly instead of with an
        # AttributeError on a missing attribute.
        self.model = None
        gc.collect()
        # Ask PyTorch to release cached CUDA blocks. torch is looked up in
        # sys.modules so this adds no hard dependency to the notebook.
        # NOTE(review): whether this fully frees the memory held by vLLM
        # depends on the vLLM version / worker setup - confirm on target.
        torch = sys.modules.get("torch")
        if torch is not None:
            torch.cuda.empty_cache()

In [None]:
from datetime import datetime


class TimeManagement:
    """Context manager that stores the duration of its block in a dict.

    On exit, ``output_dict[name]`` is set to the elapsed time in seconds
    (a float). The entry is written even when the block raises, and the
    exception is not suppressed.

    Args:
        name: Key under which the duration is stored.
        output_dict: Mutable mapping that receives the measurement.
    """

    def __init__(self, name, output_dict):
        self.output_dict = output_dict
        self.name = name

    def __enter__(self, *args, **kwargs):
        # Wall-clock start, kept as an attribute for anyone inspecting it.
        self.begin = datetime.now()
        # Durations come from perf_counter: the wall clock can be adjusted
        # while the block runs, which would distort the benchmark.
        self._start = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.output_dict[self.name] = time.perf_counter() - self._start
        

In [None]:
# Benchmark accumulators, both keyed by run name:
#   time_dict          -> seconds spent in the timed generation call
#   token_length_dict  -> number of token ids in the first completion
time_dict = dict()
token_length_dict = dict()

def function_check(model_name):
    """Benchmark one model and record the outcome in the module-level dicts.

    Loads ``model_name`` through ``Model``, issues one untimed request, then
    one request wrapped in ``TimeManagement``. The timed duration lands in
    ``time_dict[model_name]`` and the number of token ids in the first
    completion lands in ``token_length_dict[model_name]``.

    Args:
        model_name: Model identifier handed to ``Model``.
    """
    print(f"!!!BEGIN CHECKING {model_name}!!!")
    with Model(model_name) as engine:
        # First request runs outside the timer, so its cost is not measured.
        warmup_params = SamplingParams(temperature=0.7, max_tokens=1000)
        engine("Привет, медвед!", warmup_params)

        with TimeManagement(model_name, time_dict):
            # Sampling params are built inside the timed block, exactly as the
            # request itself is.
            outputs = engine(
                "Напиши длинную историю России",
                SamplingParams(temperature=0.7, max_tokens=8000),
            )

        first_completion = outputs[0].outputs[0]
        token_length_dict[model_name] = len(first_completion.token_ids)


# Benchmark the baseline and each quantized variant in turn; the full model
# id is the baseline name followed by the variant suffix.
for suffix in qwen_25_quantizations:
    function_check(f"{qwen_25_default_name}{suffix}")

In [None]:
from tabulate import tabulate


# Summary table: one row per timed run, with throughput derived as
# tokens / seconds and formatted to two decimals.
headers = ["Name", "TPS", "Tokens", "Seconds"]
results = []

for name, seconds in time_dict.items():
    tokens = token_length_dict[name]
    results.append([name, f"{tokens/seconds:.2f}", tokens, seconds])

print(tabulate(results, headers=headers, tablefmt="grid"))

In [None]:
# Speculative decoding sweep: the full-precision model is paired with its
# GPTQ-Int4 variant as the draft model, for several numbers of speculative
# tokens. Each run is timed and recorded in the same dicts as the earlier runs.
for k in [4, 8, 16, 24]:
    name = f"Speculative_decoding {k}"
    draft_config = {
        "model": "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4",
        "num_speculative_tokens": k,
    }

    with Model("Qwen/Qwen2.5-3B-Instruct", speculative_config=draft_config) as model_1:
        with TimeManagement(name, time_dict):
            result = model_1(
                "Напиши длинную историю России",
                SamplingParams(temperature=0, max_tokens=8000),
            )

        completion = result[0].outputs[0]
        print(f"!!!!! + {name} !!!!!")
        print(completion.text)
        token_length_dict[name] = len(completion.token_ids)