In [None]:
import time
from typing import Iterator, List, Optional

import vllm
from vllm import LLM, SamplingParams

# Decoding settings applied to every request submitted via initiate_request().
sampling_params = SamplingParams(
    max_tokens=1024,
    temperature=0.9,
    top_k=50,
    top_p=0.8,
    # repetition_penalty=1.15
)

# Instruction text for the chatbot persona.
# Adjacent string literals are joined with no separator, so every fragment
# must carry its own trailing space; otherwise sentences run together
# ("Mars.For a user query ...").
prefix = (
    "You are a helpful customer care chatbot for a financial company called as Mars. "
    "For a user query, please respond appropriately by asking questions "
    "and gathering more information before responding.\n"
)

# Bookkeeping used by get_request_id():
#   requests_q     maps each issued request ID to a (initially empty) dict
#   total_requests counts the IDs issued so far and is the next ID to hand out
requests_q = dict()
total_requests = int(0)

In [None]:
def initialize_engine(
    model_id: str = 'TheBloke/Llama-2-7b-Chat-AWQ',
    gpu_memory_utilization: float = 0.7,
    max_model_len: int = 102,
) -> "vllm.LLMEngine":
    """
    Builds a vLLM `LLM` for the given model and returns its underlying engine

    Args:
        model_id (str): Model identifier passed to `LLM(model=...)`
        gpu_memory_utilization (float): Forwarded unchanged to
            `LLM(gpu_memory_utilization=...)`
        max_model_len (int): Forwarded unchanged to `LLM(max_model_len=...)`

    Returns:
        vllm.LLMEngine: The `llm_engine` attribute of the constructed `LLM`
    """
    # NOTE(review): the default max_model_len of 102 is kept from the original
    # code, but it is far below the max_tokens=1024 used in sampling_params and
    # looks like it may be a typo for 1024 -- confirm before relying on it.
    llm = LLM(
        model=model_id,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
    )

    return llm.llm_engine

def get_request_id() -> str:
    """
    Issues the next sequential request ID and registers it

    The ID is the current value of the module-level `total_requests` counter
    rendered as a string. The counter is advanced by one and an empty dict is
    stored under the new ID in the module-level `requests_q`.

    Returns:
        str: Newly issued request ID
    """
    global total_requests

    issued_id = f'{total_requests}'
    total_requests = total_requests + 1

    # Only the dict's contents change, so no `global` declaration is needed.
    requests_q[issued_id] = {}

    return issued_id

def initiate_request(
    llm: "vllm.LLMEngine",
    request_id: str,
    message: str,
    prefix_pos: Optional[int] = None,
) -> bool:
    """
    Initiates a request to the LLM engine

    The request is queued with the module-level `sampling_params`; nothing is
    generated until the engine is stepped.

    Args:
        llm (vllm.LLMEngine): vLLM engine
        request_id (str): Request ID
        message (str): Prompt text for the request
        prefix_pos (Optional[int]): Forwarded unchanged to
            `llm.add_request(prefix_pos=...)`; `None` when not supplied

    Returns:
        bool: True once the engine has accepted the request. Errors raised by
            `add_request` are not caught and propagate to the caller.
    """

    llm.add_request(
        request_id=request_id,
        prompt=message,
        sampling_params=sampling_params,
        prefix_pos=prefix_pos
    )

    # Previously the function fell off the end and returned None, which is
    # falsy and contradicted the documented bool result.
    return True
        

def next_message(
    llm: "vllm.LLMEngine",
    step_delay: float = 2.0,
) -> "Iterator[List[vllm.RequestOutput]]":
    """
    Steps the engine until no unfinished request remains

    This is a generator: each iteration runs one `llm.step()` and yields
    whatever that call returned (one list per step, covering the requests
    handled in that step), not a single message string.

    Args:
        llm (vllm.LLMEngine): vLLM engine
        step_delay (float): Seconds to sleep before every step. Defaults to
            the 2 seconds that were previously hard-coded; pass 0 to step
            without pausing.

    Yields:
        List[vllm.RequestOutput]: Result of one engine step
    """
    while llm.has_unfinished_requests():
        # Skip the call entirely for non-positive delays; time.sleep rejects
        # negative values.
        if step_delay > 0:
            time.sleep(step_delay)
        print('stepping')
        yield llm.step()

In [None]:
%%time
# Build the engine once with the default model; the cells below submit
# requests to `model` and step it.
model = initialize_engine()

In [None]:
# Base prompt text reused by the request cells below.
prompt = "generate 50 random words"

In [None]:
# Queue the first request, then add one more request after every engine step
# so new work keeps arriving while earlier requests are still running.
initiate_request(model, '1', prompt, prefix_pos=None)

start_time = time.time()
i = 2

for elem in next_message(model):
    # Number of outputs in this step, the outputs themselves, then a divider.
    print(len(elem), elem, '----', sep='\n')

    initiate_request(model, str(i), prompt, prefix_pos=None)
    i += 1

    # Stop once request IDs 2..9 have been queued; the engine still holds
    # unfinished requests at this point.
    if i >= 10:
        break

In [None]:
# Keep stepping until the engine reports no unfinished requests, showing the
# size and content of every step's output list.
for elem in next_message(model):
    print(len(elem), elem, '----', sep='\n')

In [None]:
# Text of the first completion of every request output in the most recent step.
[request_output.outputs[0].text for request_output in elem]

In [None]:
# initiate_request(model, str(1), prompt, prefix_pos=None)
# NOTE(review): with the request above commented out, this loop only iterates
# if an earlier cell left unfinished requests in the engine.

start_time = time.time()

# next_message yields one *list* of request outputs per engine step, so the
# list has to be unpacked before reading request_id / prompt / outputs
# (reading them directly on the list raises AttributeError).
for elem in next_message(model):
    for request_output in elem:
        first_completion = request_output.outputs[0]
        print(
            request_output.request_id + '\n',
            request_output.prompt + '\n',
            first_completion.text + '\n'
        )

        # Tokens generated so far for this request per second since the cell started.
        completed_time = time.time() - start_time
        speed = len(first_completion.token_ids) / completed_time
        print(f'Speed {round(speed, 2)}, time: {round(completed_time, 2)}')

In [None]:
# `elem` is the list yielded by the last engine step, not a single request
# output, so pick the first request in that list before reading its text.
op1 = elem[0].outputs[0].text

In [None]:
# Follow-up prompt: the original prompt, the model's previous answer, then a
# new instruction.
modified_prompt = ''.join([prompt, op1, ' \nTell me 1 more joke about engineers.'])

In [None]:
initiate_request(model, str(2), modified_prompt, prefix_pos=None)
# initiate_request(model, str(2), modified_prompt, prefix_pos=len(prompt) + len(op1) - 1)
# NOTE(review): the commented-out variant computes prefix_pos from character
# counts; presumably the engine expects a token position -- confirm before enabling.

start_time = time.time()

# next_message yields one *list* of request outputs per engine step, so the
# list has to be unpacked before reading request_id / prompt / outputs
# (reading them directly on the list raises AttributeError).
for elem in next_message(model):
    for request_output in elem:
        first_completion = request_output.outputs[0]
        print(
            request_output.request_id + '\n',
            request_output.prompt + '\n',
            first_completion.text + '\n'
        )

        completed_time = time.time() - start_time
        speed = len(first_completion.token_ids) / completed_time
        print(f'Speed {round(speed, 2)}, time: {round(completed_time, 2)}')

    # Only the first engine step is inspected here.
    break