In [21]:
from llama_cpp import Llama
import time
import psutil
import os

In [22]:
MODEL_PATH = os.path.join(
    "..", "models", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
)

MODEL_PATH

'..\\models\\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf'

In [23]:
process = psutil.Process()
mem_before = process.memory_info().rss / 1e6  # MB

In [None]:
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=8192,
    n_threads=8,
    verbose=False
)

print("Model loaded")

llama_context: n_ctx_per_seq (8192) > n_ctx_train (2048) -- possible training context overflow


Model loaded


In [25]:
mem_after = process.memory_info().rss / 1e6  # MB

In [26]:
print(f"Memory delta (model footprint) (MB): {mem_after - mem_before:.1f}")

Memory delta (model footprint) (MB): -156.9


In [None]:
import numpy as np
import time
import psutil

MAX_TOKENS = 128

EOS_ID = llm.token_eos()
print("EOS token id:", EOS_ID)

def ban_eos_logits_processor(input_ids, logits):
    """
    llama-cpp-python logits_processor hook:
    - input_ids: array of token ids so far
    - logits: array of logits for next token
    We set EOS logit to -inf so generation won't end early.
    """
    logits[EOS_ID] = -1e10
    return logits

EOS token id: 2


In [28]:
prompt = (
    "Write a continuous technical explanation of energy efficiency in AI inference. "
    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)

In [None]:
def run_once():
    process = psutil.Process()
    mem_before = process.memory_info().rss / 1e6

    t0 = time.time()
    out = llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.0,
        top_p=1.0,
        top_k=0, 
        repeat_penalty=1.0,
        logits_processor=[ban_eos_logits_processor],
    )
    t1 = time.time()

    mem_after = process.memory_info().rss / 1e6

    text = out["choices"][0]["text"]
    n = out["usage"]["completion_tokens"]
    elapsed = t1 - t0
    tps = n / elapsed if elapsed > 0 else float("inf")

    return {
        "tokens": n,
        "seconds": elapsed,
        "toks_per_sec": tps,
        "mem_delta_mb": mem_after - mem_before,
        "text_tail": text[-120:],
    }


In [30]:
results = []
for i in range(3):
    r = run_once()
    results.append(r)
    print(f"Run {i+1}: tokens={r['tokens']}  time={r['seconds']:.2f}s  tok/s={r['toks_per_sec']:.2f}")

print("\nToken counts:", [r["tokens"] for r in results])


Run 1: tokens=128  time=2.96s  tok/s=43.24
Run 2: tokens=128  time=2.69s  tok/s=47.64
Run 3: tokens=128  time=2.44s  tok/s=52.42

Token counts: [128, 128, 128]
