In [None]:
import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
import transformers
import pandas as pd
import numpy as np

In [None]:
# Yes/no probe questions; every question is asked about every example.
questions = [
    'Is the input related to food preparation?',
    'Does the input mention laughter?',
    'Is there an expression of surprise?',
    'Is there a depiction of a routine or habit?',
    'Is there stuttering or uncertainty in the input?',
    # 'Is there a first-person pronoun in the input?',
]

# Short input texts the questions are asked about.
examples = [
    'i sliced some cucumbers and then moved on to what was next',
    'the kids were giggling about the silly things they did',
    'and i was like whoa that was unexpected',
    'walked down the path like i always did',
    'um no um then it was all clear',
    # 'i was walking to school and then i saw a cat',
]

# Chat markup shared by every prompt: system turn plus the start of the user turn.
prompt_prefix = '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a concise, helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nInput text: '
# Per-prompt tail: the example, the question, and the opening of the assistant turn.
prompt_template = '{example}\nQuestion: {question}\nAnswer with yes or no, then give an explanation.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

# One prompt per (example, question) pair, example-major: all questions for
# examples[0] come first, then all questions for examples[1], and so on.
prompts = [
    f'{prompt_prefix}{prompt_template.format(example=ex, question=q)}'
    for ex in examples
    for q in questions
]

In [None]:
# Checkpoint used for both the tokenizer and the model weights.
# Alternative: model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
model_id = 'meta-llama/Llama-3.1-8B-Instruct'

# Reuse EOS as the pad token so batched tokenization with `padding=True`
# has a pad token to work with.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 and place them on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)

In [None]:
# Batched generation that reuses a precomputed KV cache for a shared chat prefix.

# Shared prefix: system turn plus the opening of the user turn.
# INITIAL_PROMPT = "You are a helpful assistant. "
INITIAL_PROMPT = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a concise, helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"

# User queries that all follow the same prefix. They live under their own name
# so this cell does not overwrite the example/question `prompts` list that the
# later pipeline cells slice.
user_queries = ["Help me write a blog post about traveling.",
                "What is the capital of France?",
                "Can you summarize this article?",
                "Give me a recipe for chocolate cake."]

# Closes the user turn and opens the assistant turn.
SUFFIX = '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

# Size the cache to the batch. Increase max_cache_len as needed for your use case.
prompt_cache = StaticCache(config=model.config, max_batch_size=len(user_queries),
                           max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

# One identical prefix row per query. INITIAL_PROMPT already spells out
# <|begin_of_text|>, so the tokenizer must not prepend a second BOS token.
batch_inputs_initial = tokenizer(
    [INITIAL_PROMPT] * len(user_queries), return_tensors="pt",
    padding=True, add_special_tokens=False).to("cuda")

# Precompute the prefix past_key_values for the entire batch.
with torch.no_grad():
    prompt_cache = model(**batch_inputs_initial,
                         past_key_values=prompt_cache).past_key_values

# Full prompt text per query (used for display only).
prompts_full = [INITIAL_PROMPT + q + SUFFIX
                for q in user_queries]

# Tokenize only the part that comes AFTER the cached prefix and left-pad it.
# Padding whole prompts would either shift the prefix away from the cache slots
# it was computed in (left padding) or make the model continue after pad tokens
# (right padding). With the pads sitting between prefix and query, the prefix
# stays at the cached positions and every row ends on its last real token.
# NOTE(review): this relies on prefix and tail tokenizing the same separately as
# jointly, which should hold because the prefix ends on a "\n\n" boundary — confirm.
tokenizer.padding_side = "left"
tail_inputs = tokenizer(
    [q + SUFFIX for q in user_queries], return_tensors="pt",
    padding=True, add_special_tokens=False).to("cuda")

# generate() expects the complete sequence (prefix included) plus a mask that
# zeroes out the pad columns; it skips the tokens already present in the cache.
batch_inputs = {
    "input_ids": torch.cat(
        [batch_inputs_initial["input_ids"], tail_inputs["input_ids"]], dim=1),
    "attention_mask": torch.cat(
        [batch_inputs_initial["attention_mask"], tail_inputs["attention_mask"]], dim=1),
}

# Generate for the whole batch at once. generate() writes into the cache it is
# given, so hand it a copy and keep `prompt_cache` as the pristine prefix state.
outputs = model.generate(
    **batch_inputs, past_key_values=copy.deepcopy(prompt_cache), max_new_tokens=5)

# Decode only the newly generated tokens: every row of `outputs` starts with the
# (padded) prompt, which has the same length for all rows.
new_tokens = outputs[:, batch_inputs["input_ids"].shape[1]:]
responses = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

for i, response in enumerate(responses):
    print(f"Prompt: {repr(prompts_full[i])}")
    print(f"Response: {repr(response)}\n")

In [None]:
# Padded prompt length in tokens for the batch passed to `generate`.
# It was previously only assigned inside commented-out code, so evaluating the
# bare name raised NameError on a fresh kernel.
prompt_len = batch_inputs["input_ids"].shape[1]
prompt_len

# Pipeline

In [None]:
# Text-generation pipeline for the instruct checkpoint, loaded in bfloat16 on cuda:0.
checkpoint = 'meta-llama/Llama-3.1-8B-Instruct'

pipeline_ = transformers.pipeline(
    task="text-generation",
    model=checkpoint,
    # Alternative dtype: dict(torch_dtype=torch.float16)
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device_map="cuda:0",
)

# Batched generation setup: pad on the left and reuse EOS as the pad token.
pipeline_.tokenizer.padding_side = 'left'
pipeline_.tokenizer.pad_token_id = pipeline_.tokenizer.eos_token_id

In [None]:
# Small working batch: the first four entries of `prompts`.
ps = prompts[0:4]

In [None]:
# Shared chat prefix, ending right after the "Input text:" label.
INITIAL_PROMPT = '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a concise, helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nInput text:'

# One identical prefix row for each prompt in `ps`.
batch_inputs_initial = pipeline_.tokenizer(
    [INITIAL_PROMPT for _ in ps],
    return_tensors="pt",
    padding=True,
).to("cuda:0")

# Static KV cache with room for a batch of 4 and up to 1024 positions.
prompt_cache = StaticCache(
    config=pipeline_.model.config,
    max_batch_size=4,
    max_cache_len=1024,
    device="cuda",
    dtype=torch.bfloat16,
)

# Run the prefix through the model once (no gradients) and keep the cache
# object the forward pass returns.
with torch.no_grad():
    prompt_cache = pipeline_.model(
        **batch_inputs_initial,
        past_key_values=prompt_cache,
    ).past_key_values

In [None]:
# Inputs and settings for the pipeline call.
prompt = ps
max_new_tokens = 10
batch_size = 4

# Greedy decoding over the batch, passing the precomputed prefix cache through.
outputs = pipeline_(
    prompt,
    do_sample=False,
    max_new_tokens=max_new_tokens,
    batch_size=batch_size,
    past_key_values=prompt_cache,
    pad_token_id=pipeline_.tokenizer.pad_token_id,
)

# Keep only what follows the prompt: drop the first len(prompt) characters of
# each `generated_text` (presumably the echoed prompt — confirm).
if not isinstance(prompt, str):
    texts = [
        candidates[0]['generated_text'][len(prompt[i]):]
        for i, candidates in enumerate(outputs)
    ]
else:
    texts = outputs[0]["generated_text"][len(prompt):]

In [None]:
# Display the raw pipeline results for inspection.
outputs

In [None]:
# One boolean per generated text: True when the reply contains "yes" (case-insensitive).
answers = ['yes' in text.lower() for text in texts]

# The grid below needs exactly one answer per (example, question) pair. Fail
# with an explicit message instead of an opaque reshape error when generation
# was only run on a subset of the prompts.
n_expected = len(examples) * len(questions)
if len(answers) != n_expected:
    raise ValueError(
        f"Got {len(answers)} answers but expected {n_expected} "
        f"({len(examples)} examples x {len(questions)} questions); "
        "generate a response for every prompt before building the answer grid."
    )

# Rows are examples, columns are questions (prompts are built example-major).
answers = np.array(answers).reshape(len(examples), len(questions))
embeddings = answers.astype(float)

In [None]:
# Answer table as integers; each column is labelled with the last
# whitespace-separated word of its question (trailing '?' included).
df = pd.DataFrame(
    data=embeddings.astype(int),
    columns=[question.rsplit(maxsplit=1)[-1] for question in questions],
)

In [None]:
# Display the answer table built from `embeddings`.
df

# vLLM

In [None]:
from vllm import LLM, SamplingParams

# For generative models (task=generate) only.
# `model` is the name or path of the checkpoint; max_model_len sets the model
# length limit to 300.
llm = LLM(model='meta-llama/Llama-3.1-8B-Instruct', task="generate",
          max_model_len=300)

# Build the sampling settings BEFORE generating and pass them in; previously
# they were created after the call and never used, so generation ran with the
# library defaults instead of the one-token limit.
sampling_params = SamplingParams(max_tokens=1)
output = llm.generate("Hello, my name is", sampling_params=sampling_params)
print(output)

INFO 12-14 08:41:05 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=300, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_cachin

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-14 08:41:11 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 12-14 08:41:12 worker.py:232] Memory profiling results: total_gpu_memory=44.55GiB initial_memory_usage=33.22GiB peak_torch_memory=16.18GiB memory_usage_post_profile=33.25GiB non_torch_memory=18.25GiB kv_cache_size=5.67GiB gpu_memory_utilization=0.90
INFO 12-14 08:41:12 gpu_executor.py:113] # GPU blocks: 2902, # CPU blocks: 2048
INFO 12-14 08:41:12 gpu_executor.py:117] Maximum concurrency for 300 tokens per request: 154.77x
INFO 12-14 08:41:15 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-14 08:41:15 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
IN

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.12it/s, est. speed input: 12.76 toks/s, output: 34.02 toks/s]

[RequestOutput(request_id=0, prompt='Hello, my name is', prompt_token_ids=[128000, 9906, 11, 856, 836, 374], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' Helen and I started this quote-of-the-day blog to share inspiring quotes that might', token_ids=(43881, 323, 358, 3940, 420, 12929, 8838, 10826, 11477, 5117, 311, 4430, 34147, 17637, 430, 2643), cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1734194491.7371595, last_token_time=1734194491.7371595, first_scheduled_time=1734194491.7511826, first_token_time=1734194491.7839239, time_in_queue=0.014023065567016602, finished_time=1734194492.1947834, scheduler_time=0.001187936868518591, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0)]



