In [None]:
import sys, os

In [None]:
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler

In [None]:
# Directory the model is loaded from. Set the EXLLAMA_MODEL_DIR environment
# variable to point the notebook at a different location; when it is unset the
# original hard-coded path is used, so existing setups keep working.
model_directory = os.environ.get(
    "EXLLAMA_MODEL_DIR",
    "/home/cwinkler/oreilly/models/Qwen3-8B-exl2",
)

In [None]:
# Build the model configuration: point it at the model directory, then let
# prepare() finish setting it up before the model object is created.
config = ExLlamaV2Config()
config.model_dir = model_directory
config.prepare()

model = ExLlamaV2(config)
print("Loading model: " + model_directory)

# The cache is created with lazy = True and then handed to load_autosplit().
# NOTE(review): presumably load_autosplit() loads the weights and distributes
# them across the available GPUs, allocating the cache as it goes - confirm
# against the exllamav2 documentation.
cache = ExLlamaV2Cache(model, lazy = True)
model.load_autosplit(cache)

In [None]:
# The tokenizer is built from the same config object as the model.
tokenizer = ExLlamaV2Tokenizer(config)

# Initialize the streaming generator from the model, its cache and the tokenizer

generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

In [None]:
# Sampler settings passed to the generator when streaming begins.
settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.1
settings.top_k = 50
settings.top_p = 0.8
settings.token_repetition_penalty = 1.05
# The EOS token is disallowed here, so the only remaining limit on output
# length is max_new_tokens in the streaming loop.
# NOTE(review): looks intentional, to get a fixed-length run for the
# tokens/s measurement - confirm.
settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])

# Upper bound on the number of stream() iterations in the generation loop.
max_new_tokens = 600

In [None]:
# Chat-formatted prompt, written one physical line per list entry so every
# newline in the final string is explicit: a system turn, a user turn, and
# the opening of the assistant turn.
# NOTE(review): the assistant turn is pre-filled with an empty
# <think></think> block, presumably so the model skips its reasoning phase
# and answers directly - confirm against the model's chat template.
prompt_lines = [
    "<|im_start|>system",
    "Du bist ein hilfreicher Assistent.<|im_end|>",
    "<|im_start|>user",
    "Erkläre den Heise Verlag!<|im_end|>",
    "<|im_start|>assistant",
    "<think>",
    "",
    "</think>",
    "",
    "",  # final empty entry yields the trailing newline of the original literal
]
prompt = "\n".join(prompt_lines)

In [None]:
# Tokenize the prompt and record the size of the result's last dimension
# (presumably the prompt length in tokens - confirm).
# NOTE(review): prompt_tokens is not used by any of the cells visible here.
input_ids = tokenizer.encode(prompt)
prompt_tokens = input_ids.shape[-1]

In [None]:
# Run the generator's warmup before streaming is started and timed.
# NOTE(review): presumably this triggers one-time GPU initialization so that
# it does not skew the throughput measurement - confirm.
generator.warmup()

In [None]:
# An empty list of stop conditions is passed, so no stop strings or tokens
# are registered with the generator.
generator.set_stop_conditions([])
# Start a stream from the encoded prompt using the sampler settings.
# NOTE(review): this call happens before the timer is started in the loop
# cell, so whatever work it performs (presumably prompt processing) is not
# included in the reported tokens/s - confirm that this is intended.
generator.begin_stream(input_ids, settings)

In [None]:
import time

# Stream the response chunk by chunk, echoing each chunk as soon as it is
# available, then report elapsed time and throughput.
#
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring intervals; the wall clock can
# jump (e.g. NTP adjustments) and distort the measurement.
start = time.perf_counter()

generated_tokens = 0
while True:
    chunk, eos, _ = generator.stream()
    # One increment per stream() call.
    # NOTE(review): assumes each call produces exactly one token - confirm.
    generated_tokens += 1
    # flush=True pushes the partial text out immediately so the output
    # appears incrementally instead of being buffered.
    print(chunk, end = "", flush = True)
    # ">=" rather than "==" so the loop still terminates if max_new_tokens
    # is set below 1 (with "==" the budget check would never fire).
    if eos or generated_tokens >= max_new_tokens: break

used = time.perf_counter() - start
tps = generated_tokens / used
print(f"\n\n{used} seconds, {tps} tokens/s")

In [None]:
# IPython shell escape: run the nvidia-smi command-line tool and show its
# output (GPU status) in the notebook.
!nvidia-smi