# Inference with Furiosa-LLM 

In [None]:
from furiosa_llm import LLM, SamplingParams

# Load the Llama 3.1 8B Instruct model
llm = LLM.load_artifact("furiosa-ai/Llama-3.1-8B-Instruct-FP8", devices="npu:0")

# You can specify various parameters for text generation
sampling_params = SamplingParams(max_tokens=100, top_p=0.3, top_k=100)


Fetching 170 files: 100%|██████████| 170/170 [00:00<00:00, 5983.82it/s]
INFO:2025-06-04 02:17:38+0000 Prefill buckets with output size: [BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=256, kv_cache_size=0),
                            output_logits_size=1),
 BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=320, kv_cache_size=0),
                            output_logits_size=1),
 BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=384, kv_cache_size=0),
                            output_logits_size=1),
 BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=512, kv_cache_size=0),
                            output_logits_size=1),
 BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=640, kv_cache_size=0),
                            output_logits_size=1),
 BucketWithOutputLogitsSize(bucket=Bucket(batch_size=1, attention_size=768, kv_cache_size=0),
                            output_logits_siz

[2m2025-06-04T02:17:38.996881029Z[0m [32m INFO[0m [2mfuriosa_generator::backend::furiosa_rt[0m[2m:[0m Trying to open:
DeviceRow([Device::npu_fused(0, 0..=7)])
[2m2025-06-04T02:17:39.405244807Z[0m [32m INFO[0m [2mfuriosa_generator::backend[0m[2m:[0m KV caches on for each layer (I8, total 8.4 MB * num_blocks over 32 layers)  will be allocated
[2m2025-06-04T02:17:39.426210269Z[0m [32m INFO[0m [2mfuriosa_generator::backend::furiosa_rt[0m[2m:[0m Loading 18955 parameters from storages has started ...
[2m2025-06-04T02:17:39.426733339Z[0m [32m INFO[0m [2mfuriosa_sprinter::buffer::alloc[0m[2m:[0m Support for huge page size of 2 MiB has been detected.
[2m2025-06-04T02:17:55.30925103Z[0m [32m INFO[0m [2mfuriosa_generator::backend::furiosa_rt[0m[2m:[0m 18955 parameters (12.8 GiB) has been successfully loaded (15 secs).
[2m2025-06-04T02:17:55.709550566Z[0m [32m INFO[0m [2mfuriosa_generator::backend[0m[2m:[0m Determine the maximized available num_block

### 1. Single batch inference

In [4]:
# Prompt for the model
message = [{"role": "user", "content": "What is the capital of France?"}]
prompt = llm.tokenizer.apply_chat_template(message, tokenize=False)

# Generate text
response = llm.generate([prompt], sampling_params)

# Print the output of the model
print(response[0].outputs[0].text)


[2m2025-06-04T02:18:47.904529276Z[0m [32m INFO[0m [2mfuriosa_generator::scheduler::hf_compat[0m[2m:[0m num samples received: 1
assistant

The capital of France is Paris.


### 2. Multi batch inference 

In [29]:
messages = [[{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "What is the capital of Germany?"}]]

prompts = [llm.tokenizer.apply_chat_template(message, tokenize=False) for message in messages]

# Generate text
responses = llm.generate(prompts, sampling_params)

# Print the output of the model
outputs = [responses[i].outputs[0].text.split("assistant\n\n")[-1] for i in range(len(responses))]
for i, output in enumerate(outputs):
    print(f"Batch {i+1}")
    print(f"Question {i + 1}: {messages[i][0]['content']}")
    print(f"Response {i + 1}: {output}")
    print("====================================================")

[2m2025-06-04T02:33:22.691431827Z[0m [32m INFO[0m [2mfuriosa_generator::scheduler::hf_compat[0m[2m:[0m num samples received: 2
Batch 1
Question 1: What is the capital of France?
Response 1: The capital of France is Paris.
Batch 2
Question 2: What is the capital of Germany?
Response 2: The capital of Germany is Berlin.


### 3. Async single batch inference

In [31]:
import asyncio
 
async def async_single_batch_inference():
    # Prompt for the model
    message = [{"role": "user", "content": "What is the capital of France?"}]
    prompt = llm.tokenizer.apply_chat_template(message, tokenize=False)

    # Generate text and print each token at a time
    async for output_txt in llm.stream_generate(prompt, sampling_params):
        print(output_txt, end="", flush=True)

await async_single_batch_inference()

assistant

The capital of France is Paris.