In [4]:
from llama_cpp import Llama
import time
import psutil
import os

In [5]:
MODEL_PATH = os.path.join(
    "..", "models", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
)

MODEL_PATH

'..\\models\\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf'

In [6]:
process = psutil.Process()
mem_before = process.memory_info().rss / 1e6  # MB

In [None]:
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=8192, # context length
    n_threads=8,
    verbose=False
)

print("Model loaded")

llama_context: n_ctx_per_seq (8192) > n_ctx_train (2048) -- possible training context overflow


Model loaded


In [8]:
MAX_TOKENS = 128

In [None]:
def count_prompt_tokens(prompt: str) -> int:
    return len(llm.tokenize(prompt.encode("utf-8"), add_bos=True))

EOS_ID = llm.token_eos()
def ban_eos_logits_processor(input_ids, logits):
    logits[EOS_ID] = -1e10
    return logits

def run_ttft_and_decode(prompt: str, max_tokens: int = 128, min_chars_for_ttft: int = 8):
    process = psutil.Process()
    mem_before = process.memory_info().rss / 1e6

    t_start = time.time()
    first_token_time = None
    collected = ""

    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        top_p=1.0,
        top_k=0,
        repeat_penalty=1.0,
        logits_processor=[ban_eos_logits_processor],
        stream=True,
    )

    for chunk in stream:
        txt = chunk["choices"][0]["text"]
        if txt:
            collected += txt
            if first_token_time is None and len(collected.strip()) >= min_chars_for_ttft:
                first_token_time = time.time()

    t_end = time.time()
    mem_after = process.memory_info().rss / 1e6

    ttft = (first_token_time - t_start) if first_token_time else None
    decode_time = (t_end - first_token_time) if first_token_time else None
    decode_tok_s = (max_tokens / decode_time) if decode_time and decode_time > 0 else None

    return {
        "prompt_tokens": count_prompt_tokens(prompt),
        "max_tokens": max_tokens,
        "ttft_s": ttft,
        "decode_s": decode_time,
        "decode_tok_s": decode_tok_s,
        "mem_delta_mb": mem_after - mem_before,
        "total_s": t_end - t_start,
        "ttft_trigger_chars": min_chars_for_ttft,
        "output_head": collected[:120],
    }


In [10]:
mem_after = process.memory_info().rss / 1e6  # MB

In [11]:
print(f"Memory delta (model footprint) (MB): {mem_after - mem_before:.1f}")

Memory delta (model footprint) (MB): 1153.0


In [58]:
prompt_64 = (
    "You are a systems engineer evaluating whether small open-source language models "
    "(1B–3B parameters) can run reliably on consumer hardware for useful purposes. Please analyze practical limits. \n\n"
    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)

prompt_256 = (
    "You are an expert systems researcher evaluating the feasibility of deploying small"
    "open-source language models (≤3B parameters) for analytical tasks on "
    "consumer hardware that can be impactful in the real world.\n\n"
    
    "Given constraints such as limited memory, CPU-only inference, and strict latency "
    "requirements, analyze whether such models can meaningfully support tasks like:\n"
    "- Long-document summarization\n"
    "- Retrieval-augmented question answering\n"
    "- Structured and analytical reasoning\n\n"
    
    "Your analysis must:\n"
    "1. Identify the primary technical bottlenecks that are important to note (e.g., context length limits, attention cost, "
    "token throughput, KV cache growth).\n"
    "2. Discuss the many tradeoffs between model size, quantization level, latency, and output quality.\n"
    "3. Propose at least two concrete optimization strategies at the architectural or systems level.\n"
    "4. Assess which tasks are realistically viable today versus which remain speculative.\n\n"
    
    "Focus on practical deployment considerations rather than abstract theory!\n\n"
    "Do not conclude! Keep expanding with details, examples, and tradeoffs.\n\n"
)

prompt_512 = (
    "ROLE\n"
    "You are a performance engineer evaluating CPU-only local inference for small open-source LLMs (≈1B–3B params). "
    "You care about measurable behavior: time-to-first-token (TTFT), steady-state decode tokens/sec, tail latency, peak RSS, "
    "KV-cache growth, and reproducibility. You avoid hype and you write with operational precision.\n\n"

    "CONTEXT\n"
    "Do assume a consumer smaller laptop/desktop running a local runtime (e.g., llama.cpp). GPU may be absent. RAM is finite. The model is "
    "quantized (e.g., Q4/Q5). The workload is interactive: users notice stalls and p95 latency.\n\n"

    "TASK\n"
    "Write a systems analysis that helps an engineer decide whether a given configuration is shippable. "
    "Focus on the relationship between (A) prompt/prefill phase and (B) decode phase.\n\n"

    "REQUIRED TOPICS (COVER ALL)\n"
    "1) TTFT vs decode throughput\n"
    "- Explain what dominates TTFT for long prompts (prefill), and what dominates decode cost token-by-token.\n"
    "- Describe the expected shape when context length increases: TTFT rising faster than decode.\n\n"
    
    "2) KV-cache growth and memory behavior\n"
    "- Explain KV cache in practical terms: it scales with context length, layers, and hidden size; it can dominate runtime memory.\n"
    "- Describe multi-turn chat growth and why summarization/truncation policies are required for long sessions.\n\n"

    "3) Memory bandwidth limits\n"
    "- Discuss why CPU-only inference often becomes memory-bandwidth bound (weights + KV reads/writes).\n"
    "- Explain how quantization helps (smaller weights) but may shift bottlenecks (bandwidth vs compute) differently across phases.\n\n"

    "4) Thread scaling and contention\n"
    "- Discuss why scaling saturates (shared caches, memory bandwidth, synchronization overhead).\n"
    "- Explain failure modes: too many threads can worsen performance due to cache misses/NUMA-like effects and contention.\n"
    "- Provide a practical method for finding a good n_threads value.\n\n"

    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)

prompt_1024 = (
    "ROLE\n"
    "You are an ML systems engineer writing an internal engineering memo on on-device inference for small open-source LLMs "
    "(≈1B–3B parameters) running locally on consumer hardware. You are responsible for reliability and performance. "
    "You think in terms of bottlenecks, variance, tail latency, memory ceilings, and reproducible measurement.\n\n"

    "SETTING\n"
    "The org wants a local assistant for real work (summarization, RAG-style Q&A, structured outputs). Constraints:\n"
    "- CPU-only inference is common; GPU may be absent or weak.\n"
    "- RAM is finite; memory bandwidth matters.\n"
    "- Latency is user-visible; p95 matters more than peak tokens/sec.\n"
    "- The model is quantized (Q4/Q5/Q8) and served through a local runtime (llama.cpp or equivalent).\n\n"

    "MEMO GOAL\n"
    "Provide a decision-ready analysis: what is viable, what fails, and what controls are required to ship. "
    "Emphasize measurement and operational risk control, not abstract capability.\n\n"

    "STRUCTURE REQUIREMENTS\n"
    "Write with clear headers and short subsections. Use bullet lists. Include:\n"
    "- A latency decomposition explanation (prefill/TTFT vs decode).\n"
    "- A memory model explanation (weights vs KV cache vs overhead).\n"
    "- A reproducibility/variance plan.\n"
    "- At least 8 actionable recommendations, each with: (a) tradeoff, (b) metric to track, (c) how to test.\n"
    "- A short 'what can go wrong' section with failure modes + detection.\n\n"

    "SECTION 1 — FOOTPRINT VS RUNTIME MEMORY\n"
    "Explain, concretely:\n"
    "- On-disk model size vs in-memory weights.\n"
    "- Runtime memory components: KV cache, allocator overhead, temporary buffers, mmap/page cache behavior.\n"
    "- Why multi-turn chat can silently grow memory unless you implement truncation/summarization.\n\n"

    "SECTION 2 — LATENCY DECOMPOSITION (TTFT VS DECODE)\n"
    "Explain why TTFT (prefill) and decode are different regimes:\n"
    "- Prefill: processes the entire prompt/context; sensitive to context length.\n"
    "- Decode: generates one token at a time; sensitive to sampling settings and per-token compute.\n"
    "Discuss why users experience systems as 'slow' when TTFT is high even if decode tok/s is fine.\n"
    "Include guidance on reporting p50 and p95 for TTFT and end-to-end time.\n\n"

    "SECTION 3 — TAIL LATENCY AND VARIANCE\n"
    "Discuss tail latency sources on consumer machines:\n"
    "- OS scheduling noise, background tasks, antivirus scans, browser tabs.\n"
    "- Thermal throttling across repeated runs.\n"
    "- Memory pressure leading to paging/swapping.\n"
    "Explain how to measure variance correctly (multiple runs, warm-up policy, logging system state).\n\n"

    "SECTION 4 — REPRODUCIBILITY CONTROLS\n"
    "Provide a reproducibility checklist:\n"
    "- Log model file hash, quantization type, runtime version, context length, n_threads, batch size, decoding params.\n"
    "- Fix seeds where supported; otherwise quantify output variance.\n"
    "- Define warm-up iterations and keep them consistent.\n"
    "- Capture CPU governor/power settings when possible; at minimum record whether on battery vs plugged in.\n\n"

    "SECTION 5 — ACTIONABLE RECOMMENDATIONS (AT LEAST 8)\n"
    "Give at least eight recommendations. Each must include tradeoffs + a metric + a test. Examples of the style you must follow:\n"
    "Recommendation template:\n"
    "- Recommendation: <what to do>\n"
    "  Tradeoff: <what you lose / risk>\n"
    "  Metric: <what you track>\n"
    "  Test: <how you validate on laptop>\n\n"
    "Your recommendations must cover these themes (you can add more):\n"
    "1) Context-length policy (set a max, define truncation/summarization behavior)\n"
    "2) Quantization choice policy (when Q4 is acceptable vs when to move to Q5/Q8)\n"

    "STYLE\n"
    "Write an internal memo to engineers: specific, measurable, skeptical. Avoid buzzwords. Clear and concise. Avoid concluding language.\n\n"

    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)

prompt_2048 = (
    "ROLE\n"
    "You are a senior machine learning systems researcher. You think like a performance engineer: you care about runtime, "
    "memory, throughput, latency distributions, failure modes, reproducibility, and cost/energy tradeoffs. You write with "
    "technical precision and you avoid hype.\n\n"

    "SETTING\n"
    "A team wants to deploy a small open-source language model (roughly 1B–3B parameters) locally on consumer hardware. "
    "Assume constraints like: CPU-only inference is common; GPU may be absent or limited; RAM is finite; disk bandwidth may "
    "be slow; and workloads are latency-sensitive. The model may be quantized (e.g., 4-bit or 5-bit) and used via a local "
    "runtime (e.g., llama.cpp or an equivalent). The goal is not to win leaderboards but to reliably support real work.\n\n"

    "YOUR TASK\n"
    "Produce a structured feasibility analysis with actionable real recommendations. You must be explicit about what breaks, "
    "what degrades, and what remains viable. You must distinguish:\n"
    "- theoretical capability vs operational reliability\n"
    "- best-case vs typical-case performance\n"
    "- quality vs speed vs memory tradeoffs\n\n"

    "OUTPUT FORMAT REQUIREMENTS\n"
    "Write in sections with clear headers. Use bullet lists where helpful. Include small concrete examples (mini-scenarios) "
    "to illustrate failure modes. Where you make an assumption, state it. Avoid concluding language.\n\n"

    "SECTION 1 — DEPLOYMENT CONSTRAINTS (SYSTEMS VIEW)\n"
    "Explain the major bottlenecks for local inference:\n"
    "1) Model footprint: on-disk size vs in-memory size, and what changes with quantization.\n"
    "2) Runtime memory: KV cache growth with context length, and how it interacts with batch size, number of layers, hidden "
    "dimensions, and precision.\n"
    "3) Compute: CPU vectorization, thread scaling limits, and memory bandwidth saturation.\n"
    "4) Latency vs throughput: why tokens/sec isn’t enough; discuss time-to-first-token vs steady-state decoding.\n"
    "5) Context length: attention scaling, paging/eviction behaviors, and practical max context on consumer machines.\n"
    "6) Stability and determinism: why repeated runs differ (thread scheduling, thermal throttling, OS noise).\n\n"

    "SECTION 2 — TASK FEASIBILITY (WHAT WORKS, WHAT FAILS)\n"
    "For each task category below, provide:\n"
    "- what is realistically achievable TODAY with small models\n"
    "- common failure modes (precision, hallucination, instruction drift, truncation, lost references)\n"
    "- mitigation strategies that do NOT assume massive compute\n"
    "- what you would measure to validate success\n\n"

    "Task A: Long-document summarization (10k+ tokens)\n"
    "Address: chunking strategies, map-reduce summarization, sliding windows, and why naive 'stuff the whole doc' fails. "
    "Discuss salience errors (missing key facts), compression artifacts, and contradiction risks across chunks. "
    "Include an explicit example of a 30-page report where key numbers appear in the appendix and get dropped unless you do "
    "entity-first extraction or section-aware chunking.\n\n"

    "Task B: Retrieval-augmented QA (RAG)\n"
    "Address: vector retrieval quality, embedding model choice, chunk size/overlap, citation grounding, and prompt injection "
    "risks. Explain why RAG can let small models perform above their weight, and also why it can fail (bad retrieval, wrong "
    "ranking, context stuffing, model overconfidence). Include a concrete example of a question that has multiple plausible "
    "answers unless the retrieved passage contains a date, version number, or jurisdiction.\n\n"

    "Task C: Multi-step analytical reasoning\n"
    "Address: depth limits, brittleness, and error compounding. Discuss strategies like decomposition, self-checking, "
    "tool-assisted verification, and structured intermediate representations. Explain what to do when the model 'sounds right' "
    "but is wrong. Include a worked mini-scenario: interpreting a performance log with conflicting signals (throughput improved "
    "but p95 latency worsened) and show how the model can be guided to isolate causes.\n\n"

    "Task D: Structured output (JSON, schemas, code, tables)\n"
    "Address: constrained decoding, validation loops, repair strategies, and the typical ways models break schemas. Provide a "
    "mini example: enforcing a JSON schema with fields (task, assumptions, steps, risks, metrics) and discuss how to recover "
    "when the model violates it. Include guidance on strict stop sequences, schema-first prompting, and post-hoc validation.\n\n"

    "SECTION 3 — OPTIMIZATION STRATEGIES (AT LEAST 6)\n"
    "Propose at least six concrete optimization strategies and analyze each with:\n"
    "- mechanism (why it improves performance or quality)\n"
    "- cost (complexity, added latency, quality regression, engineering burden)\n"
    "- when to use it vs not\n\n"
    "You must include discussion of:\n"
    "1) Quantization choice and calibration (Q4 vs Q5 vs Q8; when Q4 harms reasoning)\n"
    "2) Context management (chunking, summarization caches, sliding windows)\n"
    "3) KV cache controls (reuse, eviction policies, context truncation strategies)\n"
    "4) Prompt design for small models (tight instructions, fewer competing constraints)\n"
    "5) Hybrid pipelines (LLM + classical methods like regex, BM25, rules, or programmatic checks)\n"
    "6) Decoding strategies (temperature, top-p, repetition penalties; how these impact latency and reliability)\n\n"
    "Do add two more strategies of your choice, such as speculative decoding, batching, streaming UX design, model routing "
    ", or distillation of task-specific adapters.\n\n"

    "SECTION 4 — EVALUATION METHODOLOGY (LAPTOP-SCALE BENCHMARK)\n"
    "Design a benchmarking harness that could realistically run on a laptop. Specify:\n"
    "- metrics: tokens/sec, time-to-first-token, p50/p95 latency, peak RSS memory, KV cache memory, failure rate\n"
    "- quality proxies: groundedness for RAG, factual consistency for summarization, schema-validity for JSON\n"
    "- workload design: multiple prompt lengths (e.g., 256, 1k, 2k, 4k, 8k), multiple output lengths, repeated runs\n"
    "- controls: fixed seed where possible, pinned threads, warm-up iterations, thermal considerations\n"
    "- reporting: tables and plots, plus a reproducible config file capturing model, quantization, runtime parameters\n\n"

    "SECTION 5 — PRACTICAL RECOMMENDATIONS\n"
    "Give a deployment plan with stages:\n"
    "Stage 0: choose model + quantization + runtime defaults\n"
    "Stage 1: build minimal harness and measure baseline latency/memory\n"
    "Stage 2: implement retrieval and evaluate grounding improvements\n"
    "Stage 3: add schema enforcement and validation loops for reliability\n"
    "Stage 4: add safeguards (prompt injection filters, refusal rules, logging)\n"
    "Stage 5: define readiness gates (must-pass tests) before any real users\n\n"
    "Concrete: list 'must-pass' checks, example thresholds, and what would cause you to reject a configuration.\n\n"

    "SECTION 6 — SECURITY AND FAILURE-CONTAINMENT (LOCAL DEPLOYMENT)\n"
    "Add a dedicated section that treats the model like a component in a larger system. Discuss:\n"
    "- prompt injection via retrieved text or user-provided documents\n"
    "- data exfiltration risks in logs and caches\n"
    "- how to implement allowlists/denylists for tools and file access\n"
    "- how to sandbox retrieval sources and strip active instructions from retrieved passages\n"
    "- how to design a 'fail closed' mode where the system refuses rather than guessing\n"
    "At least three concrete examples of malicious or confusing retrieved snippets and explain detection/mitigation.\n\n"

    "SECTION 7 — ABLATION PLAN (WHAT TO TURN ON/OFF)\n"
    "Provide an ablation experiment plan that isolates where gains come from. For example, vary one factor at a time:\n"
    "- quantization level (Q4 vs Q5 vs Q8)\n"
    "Explain what you expect each ablation to change and what metrics.\n\n"

    "STYLE CONSTRAINTS\n"
    "Technical and specific. Avoid buzzwords. Prefer 'here is what breaks' over 'it depends.' When you say 'it depends,' "
    "specify what it depends on and it's measurability. Finally, thank you so much for your help with this. I am happy with your hard work!!\n\n"

    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)

prompt_4096 = (
    "ROLE\n"
    "You are a principal researcher in ML systems and applied AI infrastructure. You write like an internal engineering decision "
    "memo: rigorous, skeptical, measurable. You focus on deployment reality: memory ceilings, tail latency, error modes, "
    "reproducibility, safety, and cost.\n\n"

    "MISSION\n"
    "A team is considering deploying small open-source language models (approximately 1B–3B parameters) locally on consumer "
    "hardware. They want to know whether small models can reliably support real analytical workflows under constraints of:\n"
    "- CPU-only inference is common; GPUs may be absent or limited.\n"
    "- RAM is finite; memory bandwidth and cache behavior matter.\n"
    "- Latency is user-visible; p95 matters more than peak tokens/sec.\n"
    "- Privacy/offline requirements push inference to the edge.\n"
    "- Long documents exist; retrieval and chunking are necessary.\n\n"

    "DEFINITION OF SUCCESS\n"
    "Success is NOT 'occasionally gives a good answer.' Success means:\n"
    "- predictable latency (reasonable p50/p95)\n"
    "- bounded memory usage that does not crash or thrash\n"
    "- stable behavior across runs and under normal background load\n"
    "- measurable quality with low catastrophic failure rates\n"
    "- clear mitigation paths for known failure modes\n\n"

    "YOUR DELIVERABLE\n"
    "Write a feasibility analysis with systems-first viewpoint. Your answer must be structured into the sections below, "
    "with explicit subheadings. You must include concrete examples, mini-case studies, and failure-mode narratives. You must "
    "propose design patterns and test plans that can be executed on a laptop.\n\n"

    "NON-NEGOTIABLE OUTPUT REQUIREMENTS\n"
    "1) Use clear headers for each section and sub-section.\n"
    "2) When you make an assumption, label it as ASSUMPTION and state why it is reasonable.\n"
    "3) For each major claim, state how you would validate it (a metric, experiment, or test).\n"
    "4) Include at least 16 failure modes across the document, each with detection + mitigation.\n"
    "5) Include at least 12 optimization strategies with explicit tradeoffs.\n"
    "6) Include at least 2 checklists that could be used by an engineer shipping this system.\n"
    "7) Avoid concluding language.\n\n"

    "SECTION 1 — WHY SMALL MODELS, WHY NOW\n"
    "Explain motivations with practical framing:\n"
    "- privacy/offline constraints\n"
    "- cost sensitivity (no recurring API spend)\n"
    "- energy/thermal constraints\n"
    "- institutional access and reproducibility\n"
    "- edge deployments and on-device UX\n"
    "Then explain the counterpoint: why small models can fail (reasoning depth, long-context brittleness, hallucinations), and "
    "why you need systems discipline to make them useful.\n\n"

    "SECTION 2 — HARDWARE + RUNTIME REALITY (FIRST PRINCIPLES)\n"
    "Discuss the full stack:\n"
    "2.1 CPU compute limits\n"
    "- vectorization (SIMD), instruction throughput, and why speedups saturate\n"
    "- thread scaling limits and contention\n"
    "- memory bandwidth vs compute-bound phases\n"
    "- why 'more threads' can reduce performance due to cache misses and contention\n\n"
    "2.2 Memory and cache behavior\n"
    "- RSS vs virtual memory\n"
    "- paging, swapping, and why 'it runs' can still be unusable\n"
    "- cache locality and the cost of large KV caches\n"
    "- fragmentation and allocator behavior under long-running sessions\n\n"
    "2.3 Latency decomposition\n"
    "- time-to-first-token (prefill) vs decode loop\n"
    "- why prefill cost explodes with long context\n"
    "- tail latency sources: OS noise, thermal throttling, background tasks\n"
    "- interactive UX: streaming tokens vs blocking responses\n\n"
    "2.4 Model footprint vs runtime footprint\n"
    "- on-disk weights vs in-memory weights\n"
    "- KV cache growth with context length\n"
    "- how batch size multiplies KV cache memory\n"
    "- why multi-turn chat grows memory unless you summarize or truncate\n\n"
    "2.5 Quantization reality\n"
    "- what quantization changes (size, bandwidth, sometimes speed)\n"
    "- what it can harm (numeric stability, rare-token behavior, reasoning)\n"
    "- how to test quantization regressions and identify sensitive tasks\n\n"

    "SECTION 3 — CAPABILITY VS RELIABILITY (TASK-BY-TASK)\n"
    "For each task below, include:\n"
    "- a realistic 'best-case' scenario\n"
    "- a realistic 'typical-case' scenario\n"
    "- failure modes (at least 3 per task)\n"
    "- mitigations that do not require large compute\n"
    "- what you would measure and how you would set pass/fail thresholds\n\n"

    "3.1 Long-document summarization (10k–100k tokens)\n"
    "Explain why naive approaches fail (context limits, attention costs, truncation). Provide three summarization patterns:\n"
    "A) map-reduce summarization with hierarchical condensation\n"
    "B) rolling summary memory (running notes + periodic consolidation)\n"
    "C) citation-first extraction (extract facts + quotes + entities, then summarize)\n"
    "For each pattern, explain tradeoffs: speed, factuality, coherence, loss of nuance.\n"
    "Include mini-case study: summarizing a 40-page policy report with headings, tables, and appendices. Show how chunking "
    "choices affect omissions and contradictions. Include a second mini-case: summarizing a technical spec where requirements "
    "are scattered and cross-referenced.\n\n"

    "3.2 Retrieval-augmented question answering (RAG)\n"
    "Explain the pipeline stages: chunking -> embedding -> indexing -> retrieval -> reranking -> context assembly -> generation.\n"
    "Provide failure modes:\n"
    "- bad chunk boundaries causing missing evidence\n"
    "- embedding mismatch leading to irrelevant retrieval\n"
    "- prompt injection in retrieved text\n"
    "- the model 'blending' multiple sources into a confident wrong answer\n"
    "- citation drift (citations not actually supporting the claim)\n"
    "- exposure bias: model prefers earlier context chunks and ignores late evidence\n"
    "Provide mitigations:\n"
    "- chunk size/overlap heuristics\n"
    "- hybrid retrieval (BM25 + vectors)\n"
    "- reranking strategies\n"
    "- evidence-first prompting\n"
    "- answer-then-verify loops and contradiction checks\n"
    "Include mini-case study: user asks for a specific numerical fact and the retrieved docs contain multiple similar numbers "
    "from different years. Show how to prevent the model from choosing the wrong one. Include a second case: the same entity "
    "has multiple aliases; retrieval returns mixed results; show entity normalization.\n\n"

    "3.3 Analytical reasoning and multi-step tasks\n"
    "Discuss brittleness and error compounding. Provide a decomposition template:\n"
    "- restate task\n"
    "- list assumptions\n"
    "- generate plan\n"
    "- execute steps with intermediate checks\n"
    "- verify against constraints\n"
    "Explain why small models may fail at each stage and how to reduce risk using structure, tools, and validation.\n"
    "Include three mini-scenarios:\n"
    "Scenario A: multi-constraint scheduling with conflicting requirements\n"
    "Scenario B: diagnosing a performance regression from logs\n"
    "Scenario C: reconciling two conflicting policy statements where one is outdated\n\n"

    "3.4 Structured output (schemas, JSON, code, tables)\n"
    "Explain schema fragility, partial outputs, and formatting drift. Provide a robust approach:\n"
    "- constrained decoding (if available)\n"
    "- otherwise: generate -> validate -> repair loop\n"
    "- include a strict JSON schema with required fields\n"
    "Explain how to detect and repair:\n"
    "- missing commas/quotes\n"
    "- extra commentary\n"
    "- wrong field names\n"
    "- type mismatches\n"
    "- duplicated keys\n"
    "- truncated JSON due to max_tokens limits\n"
    "Include a mini-case study: generating a machine-readable experiment plan with steps, volumes, timing, and safety notes. "
    "Then add a second mini-case study\n\n"

    "3.5 Agent-like behavior (planning, iteration, tool use)\n"
    "Discuss why agentic loops are risky for small models: drift, compounding hallucinations, and tool misuse. Provide an "
    "agent design pattern that is safe and measurable:\n"
    "- constrained tool set\n"
    "- explicit state representation\n"
    "- step budget\n"
    "- verification gates\n"
    "- logging and replay\n"
    "Include failure modes: infinite loops, self-reinforcing errors, prompt injection through tool outputs.\n\n"

    "SECTION 4 — OPTIMIZATION STRATEGIES (AT LEAST 12, WITH TRADEOFFS)\n"
    "For each strategy: describe mechanism, benefit, cost, and when to use.\n"
    "You must include at least these:\n"
    "1) Quantization selection and per-layer sensitivity testing\n"
    "2) Quantization-aware prompt simplification (shorter instructions, fewer constraints)\n"
    "3) Context-window management: chunking, sliding windows, and memory summaries\n"
    "4) KV cache management: reuse, truncation policies, and multi-turn strategies\n"
    "5) Retrieval improvements: hybrid retrieval, reranking, and citation enforcement\n"
    "6) Prompt minimization: reducing instruction conflicts; using compact templates\n"
    "7) Decoding controls: temperature/top-p, repetition penalties, max tokens, stop sequences\n"
    "8) Validation loops: programmatic checks, schema validators, factual consistency checks\n"
    "9) Model routing: small model first, fallback to larger local model or remote call when needed\n"
    "10) Caching: cache embeddings, retrieved chunks, and stable intermediate summaries\n"
    "11) UX strategies: stream partial results but label confidence and evidence coverage\n"
    "12) Dataset-driven prompt iteration: build a small eval set, tune prompts to reduce failures\n"
    "Add at least two additional strategies such as speculative decoding, micro-batching, distillation, task-specific adapters, "
    "or early-exit/stop policies when the system detects low evidence.\n\n"

    "SECTION 5 — FAILURE MODES CATALOG (DETECTION + MITIGATION)\n"
    "Provide a catalog of at least 16 failure modes, each with:\n"
    "- symptom\n"
    "- likely cause\n"
    "- detection method (metric or test)\n"
    "- mitigation\n\n"
    "Include failure modes spanning:\n"
    "- memory blowups / OOM\n"
    "- latency spikes / tail latency\n"
    "- truncation and lost references\n"
    "- hallucinated citations\n"
    "- prompt injection via retrieved docs\n"
    "- schema violations\n"
    "- instruction drift across turns\n"
    "- numeric inconsistency across chunks\n"
    "- regression after quantization\n"
    "- nondeterministic output changes across runs\n"
    "- retrieval misses due to chunking mistakes\n"
    "- retrieval poisoning by near-duplicate misleading chunks\n"
    "- contradictory answers across repeated runs\n"
    "- overconfident answers when evidence coverage is low\n"
    "- refusal failures (model answers when it should abstain)\n"
    "- tool misuse (wrong parameters, wrong file, wrong unit)\n\n"

    "SECTION 6 — SECURITY, SAFETY, AND 'FAIL-CLOSED' DESIGN\n"
    "Treat the model as an untrusted component. Discuss:\n"
    "- prompt injection patterns in retrieved text (e.g., 'ignore previous instructions')\n"
    "- jailbreaking through tool outputs or HTML/markdown blocks\n"
    "- data leakage through logs, caches, or debug dumps\n"
    "- minimizing exposure of sensitive context via selective retrieval\n"
    "- designing abstention: how the system decides to say 'insufficient evidence'\n"
    "Provide at least 6 concrete adversarial examples that could appear in retrieved content. For each, specify a filter or "
    "detector and a mitigation behavior. Include at least one example involving tables, one involving code blocks, and one "
    "involving subtle instruction-like language embedded in a policy document.\n\n"

    "SECTION 7 — LAPTOP-SCALE BENCHMARK SUITE DESIGN\n"
    "Design a benchmark suite that can run locally. Specify:\n"
    "- a matrix of context lengths (e.g., 256, 1k, 2k, 4k, 8k)\n"
    "- output lengths (short/medium/long)\n"
    "- multiple runs per condition for variance\n"
    "- metrics: tokens/sec, TTFT, p50/p95 latency, peak RSS, KV cache memory estimate, error rates\n"
    "- quality tests:\n"
    "  * summarization: factual consistency checks (entity retention, date consistency, numeric consistency)\n"
    "  * RAG: evidence alignment scoring (answer must cite retrieved spans)\n"
    "  * structured output: schema-validity rate and repair count\n"
    "  * reasoning: constraint satisfaction rate on synthetic tasks\n"
    "- logging: capture runtime parameters, CPU info, thread count, quantization level, context length\n"
    "- reporting: tables plus a narrative that explains tradeoffs and recommendations\n\n"

    "SECTION 8 — ABLATION + SENSITIVITY ANALYSIS (WHAT MATTERS MOST)\n"
    "Provide an ablation plan that isolates contributions. Include experiments such as:\n"
    "- quantization levels: Q4, Q5, Q8\n"
    "- threads: 1, 2, 4, 8, 16 (if available)\n"
    "- context length: 2k, 4k, 8k\n"
    "- retrieval: off/on; hybrid retrieval off/on; reranker off/on\n"
    "- decoding: temperature 0.0 vs 0.2 vs 0.7; max_tokens 128 vs 512 vs 1024\n"
    "- validation: schema validation off/on; citation enforcement off/on\n"
    "For each factor, state what you expect to change and what metric would confirm it. Include guidance on interpreting "
    "interactions (e.g., Q4 might be fine for retrieval-grounded answers but fail for multi-step reasoning).\n\n"

    "SECTION 9 — IMPLEMENTATION ROADMAP (STAGED DEPLOYMENT WITH GATES)\n"
    "Provide a staged plan with gates:\n"
    "Stage 0: choose candidate models + quantization; define success metrics\n"
    "Stage 1: baseline harness, warm-up strategy, reproducible configs\n"
    "Stage 2: retrieval integration and evidence-first prompting\n"
    "Stage 3: reliability features (schema validation, repair loops, grounding checks)\n"
    "Stage 4: safety controls (prompt injection filters, allowlists, refusal rules)\n"
    "Stage 5: performance tuning (threads, batch size, context length policies)\n"
    "Stage 6: release criteria and monitoring plan (what you log, what triggers rollback)\n\n"
    "Throughout, be concrete: list thresholds and example acceptance criteria (e.g., schema validity ≥ 99%, p95 latency under "
    "a target, OOM rate = 0 across the test matrix). Explain how you would choose those targets.\n\n"

    "SECTION 10 — APPENDIX: MINI EVAL PROMPT LIBRARY (FOR REPRODUCIBLE TESTING)\n"
    "Please include a library of at least 24 mini test-cases the harness can run. Each test-case should be 2–5 lines long and specify:\n"
    "- input type (summarization / RAG / JSON schema / reasoning)\n"
    "- the expected property (e.g., must cite, must output valid JSON, must abstain, must retain named entities)\n"
    "Examples should cover: conflicting dates, near-duplicate chunks, ambiguous entity names, numeric tables, and prompt injection.\n\n"

    "SECTION 11 — OPERATIONAL CHECKLISTS (ENGINEER-READY)\n"
    "Do provide exactly two concise checklists.\n\n"
    "Checklist A: Pre-Run Reproducibility\n"
    "- Log model file hash, quantization type, runtime version, and CPU info.\n"
    "- Pin threads/affinity if possible; otherwise record scheduling variability.\n"
    "- Use a consistent warm-up policy and explicitly state it as it is important.\n"
    "- Pin decoding parameters and stop sequences; record them in the run log.\n"
    "- Normalize inputs (whitespace/Unicode) and record the normalization rules.\n\n"
    "Checklist B: Reliability + Safety\n"
    "- Sanitize retrieved text to remove instruction-like strings before context assembly.\n"
    "- Enforce citations for factual claims; abstain when evidence is missing.\n"
    "- Validate schema outputs; if invalid, repair or abstain—never silently accept.\n"
    "- Redact sensitive data in logs/caches by default.\n"
    "- Enforce strict timeouts and loop budgets for any tool-like steps.\n\n"

    "SECTION 12 — DIAGNOSTIC EXPERIMENTS (ROOT-CAUSE ISOLATION)\n"
    "Do add targeted diagnostics with expected signatures:\n"
    "1) KV-cache pressure sweep: increase context until TTFT or RSS inflects; identify the knee.\n"
    "2) Near-duplicate retrieval trap: one chunk contains a subtly wrong number; test if reranking + citations prevent selection.\n"
    "3) Injection canary set: attack phrase embedded in prose, a table, and a code block; verify sanitization blocks it.\n"
    "4) Truncation robustness as well\n\n"

    "SECTION 13 — EVIDENCE COVERAGE SCORING (MEASURABLE GROUNDEDNESS)\n"
    "Please define an evidence coverage score for RAG answers as it's helpful: fraction of answer sentences that cite at least one retrieved span; "
    "add a contradiction detector when two cited spans disagree; define a must-abstain policy when coverage is too low or "
    "contradictions are present.\n\n"

    "WRITING CONSTRAINTS\n"
    "Please avoid hype. Prefer operationally testable claims. Provide examples that show messy edge cases: conflicting evidence, "
    "multiple dates, near-duplicate chunks, partial outputs, ambiguous queries. Make tradeoffs explicit.\n\n"

    "Do not conclude. Keep expanding with details, examples, and tradeoffs.\n\n"
)


In [63]:
print("Tokens in 64 prompt:", count_prompt_tokens(prompt_64))

Tokens in 64 prompt: 64


In [32]:
r1 = run_ttft_and_decode(prompt_2048)
r2 = run_ttft_and_decode(prompt_2048)
r3 = run_ttft_and_decode(prompt_2048)
print(r1); print(r2); print(r3)

{'prompt_tokens': 2048, 'max_tokens': 128, 'ttft_s': 28.16962695121765, 'decode_s': 3.740201950073242, 'decode_tok_s': 34.22275099276216, 'mem_delta_mb': 387.44064000000003, 'total_s': 31.909828901290894, 'ttft_trigger_chars': 8, 'output_head': 'Thank you for your time and effort. I am grateful for your help.\n\nBest regards,\n\n[Your Name]\n[Your Title]\n[Your Company]'}
{'prompt_tokens': 2048, 'max_tokens': 128, 'ttft_s': 0.060651302337646484, 'decode_s': 3.767648696899414, 'decode_tok_s': 33.973443464975276, 'mem_delta_mb': 0.024576000000024578, 'total_s': 3.8282999992370605, 'ttft_trigger_chars': 8, 'output_head': 'Thank you for your time and effort. I am grateful for your help.\n\nBest regards,\n\n[Your Name]\n[Your Title]\n[Your Company]'}
{'prompt_tokens': 2048, 'max_tokens': 128, 'ttft_s': 0.06644225120544434, 'decode_s': 3.9684793949127197, 'decode_tok_s': 32.254167720786455, 'mem_delta_mb': 0.004096000000117783, 'total_s': 4.034921646118164, 'ttft_trigger_chars': 8, 'output_h

In [11]:
r = run_ttft_and_decode(prompt_2048, max_tokens=128)
print(r)

{'prompt_tokens': 2048, 'max_tokens': 128, 'ttft_s': 80.80011439323425, 'decode_s': 9.618767499923706, 'decode_tok_s': 13.307318219409636, 'mem_delta_mb': 387.54304, 'total_s': 90.41888189315796, 'ttft_trigger_chars': 8, 'output_head': 'Thank you for your time and effort. I am grateful for your help.\n\nBest regards,\n\n[Your Name]\n[Your Title]\n[Your Company]'}


In [12]:
print(prompt_2048[:300])
print("...tail...", prompt_2048[-200:])


ROLE
You are a senior machine learning systems researcher. You think like a performance engineer: you care about runtime, memory, throughput, latency distributions, failure modes, reproducibility, and cost/energy tradeoffs. You write with technical precision and you avoid hype.

SETTING
A team wants
...tail... hat it depends on and it's measurability. Finally, thank you so much for your help with this. I am happy with your hard work!!

Do not conclude. Keep expanding with details, examples, and tradeoffs.




In [None]:
def run_once():
    process = psutil.Process()
    mem_before = process.memory_info().rss / 1e6

    t0 = time.time()
    out = llm(
        prompt_2048,
        max_tokens=MAX_TOKENS,
        temperature=0.0,
        top_p=1.0,
        top_k=0,
        repeat_penalty=1.0,
        logits_processor=[ban_eos_logits_processor],
    )
    t1 = time.time()

    mem_after = process.memory_info().rss / 1e6

    text = out["choices"][0]["text"]
    n = out["usage"]["completion_tokens"]
    elapsed = t1 - t0
    tps = n / elapsed if elapsed > 0 else float("inf")

    return {
        "tokens": n,
        "seconds": elapsed,
        "toks_per_sec": tps,
        "mem_delta_mb": mem_after - mem_before,
        "text_tail": text[-120:],  # last bit
    }


In [13]:
results = []
for i in range(3):
    r = run_once()
    results.append(r)
    print(f"Run {i+1}: tokens={r['tokens']}  time={r['seconds']:.2f}s  tok/s={r['toks_per_sec']:.2f}")

print("\nToken counts:", [r["tokens"] for r in results])


Run 1: tokens=128  time=10.02s  tok/s=12.77
Run 2: tokens=128  time=9.76s  tok/s=13.12
Run 3: tokens=128  time=9.74s  tok/s=13.14

Token counts: [128, 128, 128]
