In [None]:
from typing import Iterator, List, Optional

import vllm
from vllm import LLM, SamplingParams

# Decoding configuration shared by every request issued from this notebook.
# NOTE(review): with temperature=0 decoding is presumably greedy, in which case
# top_p / top_k would have no effect — confirm against the installed vLLM version.
sampling_params = SamplingParams(
    temperature=0,
    top_p=0.95,
    max_tokens=1024,
    top_k=10,
    # repetition_penalty=1.15
)

# Preamble that is prepended to every user message before it is sent.
# Adjacent string literals are concatenated verbatim, so each fragment has to
# carry its own separating whitespace (the first one previously did not, which
# produced "Mars.For a user query").
prefix = (
    "You are a helpful customer care chatbot for a financial company called as Mars. "
    "For a user query, please respond appropriately by asking questions "
    "and gathering more information before responding.\n"
)

# Registry of issued request IDs: request id -> per-request dict (starts empty).
requests_q = {}
# Counter whose current value is used as the next request ID.
total_requests = 0

In [None]:
def initialize_engine(
    model_id: str = 'TheBloke/Llama-2-7b-Chat-AWQ',
    gpu_memory_utilization: float = 0.7,
) -> vllm.LLMEngine:
    """
    Loads a model with vLLM and returns its underlying engine

    Right after loading, one generation is run over the shared `prefix` with the
    notebook-wide `sampling_params` and the raw result is printed (presumably as
    a warm-up / smoke test).

    Args:
        model_id (str): Model identifier handed to `LLM`
        gpu_memory_utilization (float): Value forwarded to `LLM`; defaults to
            the 0.7 that was previously hard-coded

    Returns:
        vllm.LLMEngine: The `llm_engine` attribute of the created `LLM`
    """
    llm = LLM(model=model_id, gpu_memory_utilization=gpu_memory_utilization)

    output = llm.generate(prefix, sampling_params)
    print(output)

    return llm.llm_engine

def get_request_id() -> str:
    """
    Allocates a fresh request ID

    The ID is the current value of the module-level `total_requests` counter
    rendered as a string. The counter is then advanced by one and an empty
    dict is registered under the new ID in `requests_q`.

    Returns:
        str: The newly allocated request ID
    """
    # Only the counter is rebound; `requests_q` is mutated in place.
    global total_requests

    new_id = str(total_requests)
    total_requests = total_requests + 1
    requests_q[new_id] = {}

    return new_id

def initiate_request(
    llm: vllm.LLMEngine,
    request_id: str,
    message: str,
    prefix_pos: Optional[int] = None,
) -> bool:
    """
    Initiates a request to the LLM engine

    The prompt submitted to the engine is the shared `prefix` followed by
    `message`, decoded with the notebook-wide `sampling_params`.

    Args:
        llm (vllm.LLMEngine): vLLM engine
        request_id (str): Request ID
        message (str): Message description
        prefix_pos (Optional[int]): Forwarded unchanged to `add_request` as
            `prefix_pos`; `None` (the default) passes no prefix position.
            NOTE(review): vLLM appears to interpret this as a position in the
            prompt's token IDs, not a character offset — confirm against the
            installed version.

    Returns:
        bool: True once the request has been handed to the engine. Errors
            raised by `add_request` are not caught and propagate to the caller.
    """
    llm.add_request(
        request_id=request_id,
        prompt=prefix + message,
        sampling_params=sampling_params,
        prefix_pos=prefix_pos,
    )

    # Honour the declared `bool` return type (the function used to fall off
    # the end and return None).
    return True
        

def next_message(llm: vllm.LLMEngine) -> Iterator[vllm.RequestOutput]:
    """
    Steps the engine and yields every request output that has finished

    This is a generator: the engine is stepped repeatedly for as long as it
    reports unfinished requests, and after each step the outputs whose
    `finished` flag is set are yielded one by one. Outputs of requests that
    are still in progress are skipped.

    Args:
        llm (vllm.LLMEngine): vLLM engine

    Yields:
        vllm.RequestOutput: Output of a request that has finished
    """
    while llm.has_unfinished_requests():
        request_outputs: List[vllm.RequestOutput] = llm.step()
        for request_output in request_outputs:
            if request_output.finished:
                yield request_output

In [None]:
%%time
# Build the engine with the default model and keep it as `model` for the cells
# below; the %%time cell magic above reports how long this cell takes.
model = initialize_engine()

In [None]:
import time

In [None]:
prompt = 'I want to invest in mutual funds. Kindly give me 4 advice'
# NOTE(review): prefix_pos is computed here from character counts; vLLM appears
# to expect a position in the tokenized prompt — confirm and adjust if needed.
initiate_request(model, str(1), prompt, prefix_pos=len(prefix) + len(prompt) -1)
# initiate_request(model, str(3), 'How can you help me?')

start_time = time.time()

for elem in next_message(model):
    print(
        elem.request_id + '\n',
        elem.prompt + '\n',
        elem.outputs[0].text + '\n'
    )

    # Take a single measurement so the reported elapsed time and the
    # tokens-per-second figure are based on the same interval.
    elapsed = time.time() - start_time
    speed = len(elem.outputs[0].token_ids) / elapsed

    print(f'Elapsed time: {elapsed}s')
    print(f'Speed {speed}')
    print('-'*4)

    # Restart the clock for the next finished request.
    start_time = time.time()

In [None]:
# Build follow-up text from the text of the last output yielded in the previous
# cell (`elem` is the leftover loop variable) plus a new user question.
# NOTE(review): 'Cam' in the literal below looks like a typo for 'Can'; it is
# runtime text and is left unchanged here.
prompt3 = elem.outputs[0].text + '\n Cam you tell me the URL for them?'

In [None]:
# Display the concatenation of the three prompt strings as the cell output.
# NOTE(review): `prompt2` is not assigned in any cell visible here, so this
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
prompt + prompt2 + prompt3

In [None]:
# NOTE(review): `prompt2` is not assigned in any cell visible here, so this call
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
# NOTE(review): prefix_pos is derived from character counts of earlier prompts,
# while the prompt actually submitted is only `prefix` plus the message below;
# vLLM appears to expect a position in the tokenized prompt — confirm.
initiate_request(model, str(1), 'How should I go to these websites', prefix_pos=len(prefix) + len(prompt) + len(prompt2) + len(prompt3) -1)
# initiate_request(model, str(3), 'How can you help me?')

start_time = time.time()

for elem in next_message(model):
    print(
        elem.request_id + '\n',
        elem.prompt + '\n',
        elem.outputs[0].text + '\n'
    )

    # Take a single measurement so the reported elapsed time and the
    # tokens-per-second figure are based on the same interval.
    elapsed = time.time() - start_time
    speed = len(elem.outputs[0].token_ids) / elapsed

    print(f'Elapsed time: {elapsed}s')
    print(f'Speed {speed}')
    print('-'*4)

    # Restart the clock for the next finished request.
    start_time = time.time()