Install litellm

https://docs.litellm.ai/docs/

In [None]:
!pip install litellm

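You need a Replicate API token. Create one in your Replicate account and make it available to LiteLLM through the `REPLICATE_API_KEY` environment variable.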

In [None]:
import getpass
import os

import litellm

## set ENV variables
# Keep a key that is already exported; otherwise prompt for it without echo so
# the real token is never written into (or saved with) the notebook.
if not os.environ.get("REPLICATE_API_KEY"):
    os.environ["REPLICATE_API_KEY"] = getpass.getpass("Replicate API key: ")

# define messages
messages = [{"role": "user", "content": "Hello, how are you?"}]

# replicate call (non-streaming: returns the complete response object)
response = litellm.completion(
    model="replicate/anthropic/claude-3.5-haiku",
    messages=messages,
)

Inspect response

In [None]:
# Show the object returned by the litellm.completion call above.
response

In [None]:
# Walk the top-level fields of the response and print one "name: value" line each.
for field_name in response.model_dump():
    print(f"{field_name}: {response.get(field_name)}")

In [None]:
# Same field-by-field dump, this time for the first entry of response.choices.
for field_name in response.choices[0].model_dump():
    print(f"{field_name}: {response.choices[0].get(field_name)}")

In [None]:
# Message object attached to the first choice of the response.
response.choices[0].message

In [None]:
# The `content` attribute of the first choice's message.
response.choices[0].message.content

#### [Streaming](https://docs.litellm.ai/docs/completion/stream)

LiteLLM can stream the model response back chunk by chunk: pass `stream=True` to the `completion` function and iterate over the returned object.

In [None]:
messages = [{"role": "user", "content": "Hey, how's it going? Write me a poem please"}]

# With stream=True the call returns an iterable of chunks instead of one response.
response = litellm.completion(
    model="replicate/anthropic/claude-3.5-haiku",
    messages=messages,
    stream=True,
)

# Each chunk holds only a fragment of the reply. Print the fragments back to
# back (no newline per chunk) so the text reads continuously as it arrives;
# `or ""` covers chunks whose delta content is None.
for part in response:
    print(part.choices[0].delta.content or "", end="", flush=True)
print()  # terminate the streamed text with a single newline

#### Streaming helper function
LiteLLM also exposes a helper function to rebuild the complete streaming response from the list of chunks.

In [None]:
messages = [{"role": "user", "content": "Hey, how's it going? Write me a poem please"}]

response = litellm.completion(
    stream=True,
    messages=messages,
    model="replicate/anthropic/claude-3.5-haiku",
)

# Drain the stream into a list, then hand all chunks to the helper at once.
chunks = list(response)

rebuilt_response = litellm.stream_chunk_builder(chunks, messages=messages)
print(rebuilt_response)

### Gradio app

A simple Gradio chat app that streams replies from LiteLLM. See the [Tutorial](https://www.gradio.app/guides/creating-a-chatbot-fast) and the [Documentation](https://www.gradio.app/docs/gradio/chatinterface).

In [None]:
!pip install gradio

In [None]:
import gradio as gr

In [None]:
def _content_text(content):
    """Return the plain-text part of one history entry, or "" if there is none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes structured content is a list of blocks shaped
        # like {"type": "text", "text": ...}; confirm against the Gradio version in use.
        return "".join(
            block.get("text") or ""
            for block in content
            if isinstance(block, dict) and block.get("type", "text") == "text"
        )
    # None (no reply yet), file tuples, components, ... carry no text to forward.
    return ""


def inference(message, history):
    """Stream a chat reply for a Gradio ``ChatInterface``.

    The previous turns are forwarded as separate role-tagged messages rather
    than being glued into a single user message.

    Args:
        message: The newest user message.
        history: Earlier turns of the conversation. Two layouts are accepted:
            a list of ``{"role": ..., "content": ...}`` dicts, or a list of
            ``[user_text, assistant_text]`` pairs. Entries without text
            (e.g. ``None`` for a missing reply) are skipped.

    Yields:
        str: The reply accumulated so far, once per received chunk. If
        anything raises, a single ``"An error occurred: ..."`` string is
        yielded instead of propagating the exception.
    """
    try:
        messages_litellm = []
        for turn in history or []:
            if isinstance(turn, dict):
                # Messages-style entry: keep its role, forward only its text.
                role_and_content = [(turn.get("role"), turn.get("content"))]
            else:
                # Pair-style entry: position decides the role.
                role_and_content = zip(("user", "assistant"), turn)

            for role, content in role_and_content:
                text = _content_text(content)
                if role and text:
                    messages_litellm.append({"role": role, "content": text})

        messages_litellm.append({"role": "user", "content": message})

        partial_message = ""

        for chunk in litellm.completion(
            model="replicate/anthropic/claude-3.5-haiku",
            messages=messages_litellm,
            max_new_tokens=512,
            temperature=0.7,
            stream=True
        ):
            # Delta content may be absent or None on some chunks; treat as empty.
            content = chunk['choices'][0]['delta'].get('content', '') or ''
            partial_message += str(content)
            yield partial_message

    except Exception as e:
        yield f"An error occurred: {str(e)}"


In [None]:
# Wrap the `inference` generator in a chat UI, then start serving it.
# NOTE(review): share=True is passed to launch(); confirm a shareable link is intended.
demo = gr.ChatInterface(
    fn=inference,
    title="LiteLLM Chatbot",
    description="A simple chatbot using LiteLLM and Gradio",
)
demo.launch(share=True)