In [None]:
!pip install litellm # version 0.1.724 or higher 

## Currently running LLMs in three vscode Tasks on the host, due to conflicts
.vscode/tasks.json
```json
{
    "version": "2.0.0",
    "tasks": [
        {
            "label": "Start Llama2 on Port 50000",
            "type": "shell",
            "command": "export PYENV_VERSION=litellm && litellm -m llama2 -c ${workspaceFolder}/litellm_config.yaml --host localhost --port 50000",
            "isBackground": true,
            "problemMatcher": []
        },
        {
            "label": "Start Mistral on Port 50001",
            "type": "shell",
            "command": "export PYENV_VERSION=litellm && litellm -m mistral -c ${workspaceFolder}/litellm_config.yaml --host localhost --port 50001",
            "isBackground": true,
            "problemMatcher": []
        },
        {
            "label": "Start Orca on Port 50002",
            "type": "shell",
            "command": "export PYENV_VERSION=litellm && litellm -m orca2 -c ${workspaceFolder}/litellm_config.yaml --host localhost --port 50002",
            "isBackground": true,
            "problemMatcher": []
        }
        // Add more as needed
    ]
}
```

## Call Ollama - llama2 with Streaming

In [5]:
from litellm import completion

response = completion(
            model="ollama/llama2", 
            messages = [{ "content": "Hello, how are you, what is your name?","role": "user"}], 
            api_base="http://localhost:50000",
            stream=True,
)
print(response)
# for chunk in response:
#     print(chunk['choices'][0]['delta'])


<generator object ollama_completion_stream at 0x1129c6440>


## Call Ollama - Llama2 with Acompletion + Streaming

In [13]:
# litellm uses async_generator for ollama async streaming, ensure it's installed
!pip install async_generator

Collecting async_generator
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Installing collected packages: async_generator
Successfully installed async_generator-1.10


In [4]:
import litellm

async def async_ollama():
    response = await litellm.acompletion(
        model="ollama/llama2", 
        messages=[{ "content": "what's the weather" ,"role": "user"}], 
        api_base="http://localhost:11434", 
        stream=True
    )
    async for chunk in response:
        print(chunk)

result = await async_ollama()
print(result)

try:
    async for chunk in result:
        print(chunk)
except TypeError: # the last chunk is None from Ollama, this raises an error with async streaming
    pass

ModelResponse(id='chatcmpl-d296371f-bed1-4b46-b084-e4bf1cc03ee7', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='I', role='assistant'))], created=1707556579, model='llama2', object='chat.completion.chunk', system_fingerprint=None, usage=Usage())
ModelResponse(id='chatcmpl-d296371f-bed1-4b46-b084-e4bf1cc03ee7', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content="'"))], created=1707556579, model='llama2', object='chat.completion.chunk', system_fingerprint=None, usage=Usage())
ModelResponse(id='chatcmpl-d296371f-bed1-4b46-b084-e4bf1cc03ee7', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='m'))], created=1707556579, model='llama2', object='chat.completion.chunk', system_fingerprint=None, usage=Usage())
ModelResponse(id='chatcmpl-d296371f-bed1-4b46-b084-e4bf1cc03ee7', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' just'))], created=1707556579, model='llama2', object='chat.completio

## Completion Call

In [15]:
from litellm import completion

response = completion(
    model="ollama/llama2", 
    messages=[{ "content": "respond in 20 words. who are you?","role": "user"}], 
    api_base="http://localhost:11434"
)
print(response)


ModelResponse(id='chatcmpl-1efd99d2-2d4a-4390-ba03-bbeb04a57ea9', choices=[Choices(finish_reason='stop', index=0, message=Message(content='I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner.', role='assistant'))], created=1707543741, model='ollama/llama2', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=16, completion_tokens=30, total_tokens=46))
