In [1]:
!pip install vllm
from vllm import LLM, SamplingParams

Collecting vllm
  Downloading vllm-0.9.0.1-cp38-abi3-manylinux1_x86_64.whl.metadata (15 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting huggingface-hub>=0.32.0 (from huggingface-hub[hf_xet]>=0.32.0->vllm)
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.11 (from vllm)
  Downloading llguidance-0.7.26-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting outlines==0.1.11 (from vllm)
  Download

INFO 06-01 08:40:26 [__init__.py:243] Automatically detected platform cuda.


In [2]:
import csv
import os
import random
from getpass import getpass

import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from huggingface_hub import login
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
# Authenticate with the Hugging Face Hub without hardcoding the token in the notebook:
# use the HF_TOKEN environment variable when it is set, otherwise ask for it with a
# hidden prompt so the secret is never saved in a cell or its output.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [3]:
def main():
    """Benchmark single-prompt generation throughput of a vLLM-served model.

    Steps:
      1. Seed ``torch`` and ``random`` and build the vLLM engine.
      2. Run a few warm-up generations that are not timed.
      3. Time repeated generations of one prompt with CUDA events.
      4. Compute per-run throughput as *tokens actually generated* / elapsed
         seconds, drop the lowest and highest ``trim`` runs, and average the rest.
      5. Print the last response and the measurements, and write the rounded
         mean throughput to ``result.csv`` (header ``Id,value``, one data row).

    Returns:
        None. Results are printed and written to ``result.csv`` in the
        current working directory.
    """
    ############## Set Up ##############
    torch.manual_seed(0)
    random.seed(0)

    max_new_tokens = 256    # Upper bound on new tokens generated per request
    warmup_runs = 5         # Untimed generations before measuring
    test_runs = 10          # Timed generations
    trim = 2                # Runs discarded at each end of the sorted throughputs

    # Use every visible GPU for tensor parallelism, but never less than 1.
    tensor_parallel_size = max(1, torch.cuda.device_count())

    model_name = "meta-llama/Llama-3.2-3B-Instruct"
    llm = LLM(
        model=model_name,
        dtype="float16",
        tensor_parallel_size=tensor_parallel_size,
        max_model_len=4096,                 # <-- added to avoid KV cache overflow
        gpu_memory_utilization=0.95,
        max_num_seqs=2*4,
        max_num_batched_tokens=8192*4,      # <-- added to better use GPU memory
    )
    #####################################

    warmup_prompt = "Explain what AI is."
    sampling_params = SamplingParams(
        max_tokens=max_new_tokens,
        temperature=0.0  # deterministic for consistent speed
    )

    # === Warm-up phase ===
    print("Starting warm-up...")
    for _ in tqdm(range(warmup_runs), desc="Warm Up..."):
        llm.generate([warmup_prompt], sampling_params)

    # === Test phase ===
    prompt = "How to learn a new language?"
    tputs = []
    time_record = []
    for _ in tqdm(range(test_runs), desc="Test Inference"):
        # Drain pending GPU work so it is not attributed to this run.
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()

        outputs = llm.generate([prompt], sampling_params)

        end_event.record()
        torch.cuda.synchronize()
        # Event.elapsed_time() reports milliseconds.
        elapsed_sec = start_event.elapsed_time(end_event) / 1000

        # Count the tokens that were really produced: generation may stop at EOS
        # before reaching max_new_tokens, and dividing the cap by the elapsed time
        # would then overstate the throughput.
        generated_tokens = len(outputs[0].outputs[0].token_ids)

        time_record.append(elapsed_sec)
        tputs.append(generated_tokens / elapsed_sec)

    # Get response (first output) of the last timed run
    response = outputs[0].outputs[0].text

    # Trimmed mean: discard the `trim` lowest and `trim` highest throughputs.
    sorted_tputs = np.sort(tputs)[trim:len(tputs) - trim]
    org_tput = np.mean(sorted_tputs)

    print(f'Prompt: {prompt}\nResponse: {response}\n')
    print(f'Time Record: {time_record}')
    print(f'Throughput Record: {tputs} toks/s\n')
    print(f'Throughput: {org_tput} toks/s')

    # Save results to CSV
    rounded_tput = round(org_tput, 1)
    with open("result.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Id", "value"])
        writer.writerow([1, rounded_tput])  # Only throughput here

# Entry point: run the benchmark when this code is executed directly
# (as it is when the notebook cell runs), not when it is imported.
if __name__ == "__main__":
    main()

INFO 06-01 07:47:30 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 06-01 07:47:30 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-01 07:47:30 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

INFO 06-01 07:48:02 [config.py:793] This model supports multiple tasks: {'classify', 'embed', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 06-01 07:48:02 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0.1) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_trace

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

INFO 06-01 07:48:05 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 06-01 07:48:05 [cuda.py:289] Using XFormers backend.
INFO 06-01 07:48:06 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-01 07:48:06 [model_runner.py:1170] Starting to load model meta-llama/Llama-3.2-3B-Instruct...
INFO 06-01 07:48:07 [weight_utils.py:291] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

INFO 06-01 07:50:47 [weight_utils.py:307] Time spent downloading weights for meta-llama/Llama-3.2-3B-Instruct: 159.894809 seconds


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 06-01 07:51:15 [default_loader.py:280] Loading weights took 27.75 seconds
INFO 06-01 07:51:15 [model_runner.py:1202] Model loading took 6.0160 GiB and 188.563183 seconds
INFO 06-01 07:51:38 [worker.py:291] Memory profiling takes 22.47 seconds
INFO 06-01 07:51:38 [worker.py:291] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.95) = 14.00GiB
INFO 06-01 07:51:38 [worker.py:291] model weights take 6.02GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 4.13GiB; the rest of the memory reserved for KV Cache is 3.81GiB.
INFO 06-01 07:51:38 [executor_base.py:112] # cuda blocks: 2227, # CPU blocks: 2340
INFO 06-01 07:51:38 [executor_base.py:117] Maximum concurrency for 4096 tokens per request: 8.70x
INFO 06-01 07:51:43 [model_runner.py:1512] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in

Capturing CUDA graph shapes:   0%|          | 0/4 [00:00<?, ?it/s]

INFO 06-01 07:51:47 [model_runner.py:1670] Graph capturing finished in 4 secs, took 0.04 GiB
INFO 06-01 07:51:47 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 32.40 seconds
Starting warm-up...


Warm Up...:   0%|          | 0/5 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Test Inference:   0%|          | 0/10 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Prompt: How to learn a new language?
Response:  Here are some effective ways to learn a new language:
1.  Immersion: Surround yourself with the language you want to learn. Listen to music, watch TV shows and movies, read books and newspapers, and speak with native speakers.
2.  Language learning apps: There are many language learning apps available that can help you learn a new language, such as Duolingo, Babbel, and Rosetta Stone.
3.  Language exchange programs: Find a language exchange partner, either in person or online, to practice speaking and listening with a native speaker.
4.  Language courses: Enroll in a language course at a local college or language school, or take online courses.
5.  Flashcards: Create flashcards to help you memorize new vocabulary and grammar rules.
6.  Practice speaking and listening: Practice speaking and listening to improve your pronunciation and comprehension.
7.  Focus on grammar: Learn the grammar rules of the language you want to learn, including v