In [8]:
from llama_cpp import Llama
import time
import psutil
import os

In [9]:
# Relative path to the quantized TinyLlama GGUF weights: up one directory from
# the current working directory, then into "models".
MODEL_PATH = os.path.join(os.pardir, "models", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")

# Echo the resolved path as the cell output for a quick visual check.
MODEL_PATH

'..\\models\\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf'

In [10]:
# Load the GGUF model into a llama.cpp-backed Llama instance.
# A fixed seed is passed because the benchmark cell samples with a non-zero
# temperature; without it every Restart & Run All produces different text,
# which makes runs hard to compare.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,        # context length (keep modest for now)
    n_threads=8,       # CPU threads (we'll tune later)
    seed=42,           # fixed RNG seed so sampled output is repeatable
    verbose=False
)

print("Model loaded")


Model loaded


In [11]:
# Benchmark prompt passed to the model by the timing cell.
prompt = "Explain in 5 bullet points what 'energy efficiency in AI inference' means."

# Echo the prompt as the cell output.
prompt

"Explain in 5 bullet points what 'energy efficiency in AI inference' means."

In [12]:
# Benchmark a single completion: elapsed time for the call, completion tokens
# per second, and the change in this process's resident set size (RSS).

# RSS before the call, converted from bytes to MB (decimal, 1 MB = 1e6 bytes).
process = psutil.Process()
mem_before = process.memory_info().rss / 1e6  # MB

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the system clock, which can be adjusted
# mid-run and has coarse resolution on some platforms.
start_time = time.perf_counter()

output = llm(
    prompt,
    max_tokens=128,
    temperature=0.7,
    stop=["</s>"]
)

end_time = time.perf_counter()

# RSS after the call, same units as mem_before.
mem_after = process.memory_info().rss / 1e6  # MB

generated_text = output["choices"][0]["text"]
num_tokens = output["usage"]["completion_tokens"]

# The interval covers the entire llm(...) call, so Tokens/sec below is
# completion tokens divided by total call time.
elapsed = end_time - start_time
# Avoid ZeroDivisionError if the measured interval is zero; throughput is
# undefined in that case, so report NaN instead of crashing the cell.
tokens_per_sec = num_tokens / elapsed if elapsed > 0 else float("nan")

print(generated_text)
print("\n--- Metrics ---")
print(f"Tokens generated: {num_tokens}")
print(f"Time elapsed (s): {elapsed:.2f}")
print(f"Tokens/sec: {tokens_per_sec:.2f}")
print(f"Memory delta (MB): {mem_after - mem_before:.1f}")


 Answer according to: Energy efficiency in AI inference is the process of optimizing the performance of AI models in terms of energy consumption, while maintaining a minimum level of performance. Energy efficiency in AI inference is closely related to energy-efficient computing, which is the process of reducing energy consumption in computing systems. Energy efficiency in AI inference refers to the optimization of AI inference hardware and software, including algorithms, data management, and inference algorithms. Energy efficiency in AI inference can be measured in different ways, depending on the requirements of the system. In some cases, energy efficiency can be measured by comparing the energy consumption of

--- Metrics ---
Tokens generated: 128
Time elapsed (s): 10.06
Tokens/sec: 12.73
Memory delta (MB): 160.5
