In [3]:
import os

from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer, Timer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler

In [2]:
# Cache size in tokens; passed as max_seq_len to both the draft-model cache
# and the main-model cache below.
total_cache_tokens = 16 * 1024

In [5]:
# Load the small draft model (Qwen3-1.7B, EXL2 format) used for speculative decoding.
#
# The directory holding the model folders can be overridden with the
# EXL2_MODELS_DIR environment variable so the notebook is runnable on other
# machines; without the variable the original location is used unchanged.
models_root = os.environ.get("EXL2_MODELS_DIR", "/home/cwinkler/oreilly/models")
draft_model_dir = os.path.join(models_root, "Qwen3-1.7B-exl2")

draft_config = ExLlamaV2Config(draft_model_dir)
draft_config.arch_compat_overrides()
draft_model = ExLlamaV2(draft_config)

# The cache is created with lazy = True and handed to load_autosplit, which
# loads the weights (progress display enabled).
draft_cache = ExLlamaV2Cache(draft_model, max_seq_len = total_cache_tokens, lazy = True)
draft_model.load_autosplit(draft_cache, progress = True)

Output()

In [6]:
# Load the main (target) model (Qwen3-8B, EXL2 format).
#
# The directory holding the model folders can be overridden with the
# EXL2_MODELS_DIR environment variable so the notebook is runnable on other
# machines; without the variable the original location is used unchanged.
models_root = os.environ.get("EXL2_MODELS_DIR", "/home/cwinkler/oreilly/models")
model_dir = os.path.join(models_root, "Qwen3-8B-exl2")

config = ExLlamaV2Config(model_dir)
config.arch_compat_overrides()
model = ExLlamaV2(config)

# The cache is created with lazy = True and handed to load_autosplit, which
# loads the weights (progress display enabled).
cache = ExLlamaV2Cache(model, max_seq_len = total_cache_tokens, lazy = True)
model.load_autosplit(cache, progress = True)

Output()

In [7]:
# The tokenizer is built from the main model's config; the same tokenizer
# object is later passed to both generators.
print("Loading tokenizer...")
tokenizer = ExLlamaV2Tokenizer(config)

Loading tokenizer...


In [8]:
# Hand-written chat prompt: a system turn, a user turn, and an opened assistant
# turn that already contains an empty <think></think> block.
# NOTE(review): the empty think block presumably makes the model skip its
# reasoning phase -- confirm against the Qwen3 chat template.
# Each line break is spelled out explicitly so the exact token layout is visible.
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nTell me about O'Reilly online learning!<|im_end|>\n"
    "<|im_start|>assistant\n"
    "<think>\n"
    "\n"
    "</think>\n"
    "\n"
)

In [9]:
# Sampler settings from the library's greedy() preset, shared by both timed runs.
gen_settings = ExLlamaV2Sampler.Settings.greedy()

# Token budget requested from each generate() call; also the numerator of the
# tokens/second figures printed at the end.
max_new_tokens = 500

In [10]:
# Baseline generator: main model and its cache only, no draft model attached.
generator = ExLlamaV2DynamicGenerator(model=model, cache=cache, tokenizer=tokenizer)

# Warm up before the timed run.
generator.warmup()

In [11]:
# Timed baseline run (no draft model). Only the generate() call sits inside the
# Timer; the print happens afterwards.
with Timer() as t_no_draft:
    output = generator.generate(
        prompt=prompt,
        gen_settings=gen_settings,
        max_new_tokens=max_new_tokens,
        encode_special_tokens=True,
    )

print(output)

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nTell me about O'Reilly online learning!<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nO'Reilly Online Learning is a leading platform for learning and mastering the skills needed in today's technology-driven world. It offers a vast library of books, videos, and courses on a wide range of topics, including programming, data science, cybersecurity, cloud computing, and more. Here's a breakdown of what O'Reilly Online Learning offers:\n\n### 1. **Comprehensive Learning Resources**\n- **Books**: O'Reilly is known for its authoritative books on technology and programming. These books are written by industry experts and provide in-depth knowledge on various subjects.\n- **Video Courses**: The platform offers video courses that are interactive and easy to follow. These courses are designed to help learners understand complex concepts through visual learning.\n- **Live Events**: O'Reilly hosts live events,

In [12]:
# Second generator: same main model, cache and tokenizer as before, now with
# the draft model and its cache attached. Rebinds `generator`, so every
# generate() call from here on goes through this draft-enabled instance.
generator = ExLlamaV2DynamicGenerator(
    model=model,
    cache=cache,
    tokenizer=tokenizer,
    draft_model=draft_model,
    draft_cache=draft_cache,
    num_draft_tokens=4,
)

# Warm up before the timed run.
generator.warmup()

In [13]:
# Timed run through the draft-enabled generator, using the same prompt, token
# budget and sampler settings as the no-draft run so the timings are comparable.
with Timer() as t_draft:
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = max_new_tokens,
        encode_special_tokens = True,
        gen_settings = gen_settings
    )

# Show the generated text, as the no-draft cell does, so the two completions
# can be compared by eye. Kept outside the Timer so printing does not affect
# the measured interval.
print(output)

In [14]:
# Throughput = requested token budget divided by the wall-clock interval of
# each timed generate() call.
# NOTE(review): this assumes each run produced exactly max_new_tokens tokens,
# and the interval covers the whole generate() call -- confirm.
speed_no_draft = max_new_tokens / t_no_draft.interval
speed_draft = max_new_tokens / t_draft.interval

print(f"speed, -SD: {speed_no_draft:.2f} tokens/second")
print(f"speed, +SD: {speed_draft:.2f} tokens/second")

speed, -SD: 152.51 tokens/second
speed, +SD: 121.79 tokens/second
