In [2]:
!pip install openai
!pip install gradio
!pip install huggingface_hub
!pip install langchain
!pip install langchain_community
!pip install transformers



# Inference with Multiple LLMs
This notebook builds a system that lets users interact with multiple Large Language Models (LLMs) through a single interface. Users can pick one LLM from a dropdown to get its response or, when no model is selected, see the responses from all available LLMs.

In [3]:
# Step 1: Load required packages
import os
import openai
from langchain import HuggingFacePipeline
from gradio import ChatInterface
from transformers import pipeline

# Load API keys
# The OpenAI key is read from the OA_API environment variable; the value is
# None when that variable is not set.
openai.api_key = os.environ.get('OA_API')

In [4]:
# Step 2: Instantiate different LLMs
# Model registry keyed by model name; both OpenAI chat models map to the same
# `openai.ChatCompletion` object.
llms = dict.fromkeys(('gpt-3.5-turbo', 'gpt-4'), openai.ChatCompletion)
# Disabled local model, kept for reference:
# llms['zephyr-7b'] = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")

def get_llm_response(llm_key, prompt):
    """Return the text reply of one registered LLM for a single-turn prompt.

    Args:
        llm_key: Key of the model in the module-level ``llms`` registry. For
            the OpenAI models the key is also sent as the ``model`` name.
        prompt: User text; it is sent as a single ``user`` message (or passed
            directly to the callable registered under ``'zephyr-7b'``).

    Returns:
        The reply text produced by the selected model.

    Raises:
        KeyError: If ``llm_key`` is not registered in ``llms``.
    """
    if llm_key not in llms:
        raise KeyError(f"Unknown LLM {llm_key!r}; available: {sorted(llms)}")

    if llm_key == 'zephyr-7b':
        generated = llms[llm_key](prompt)
        # A transformers text-generation pipeline answers with
        # [{"generated_text": "..."}]; unwrap it so every backend yields text.
        if isinstance(generated, list) and generated and isinstance(generated[0], dict):
            return generated[0].get("generated_text", generated)
        return generated

    messages = [{"role": "user", "content": prompt}]
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed `openai.ChatCompletion`; the module-level client
        # is used instead because it still honours the `openai.api_key` global.
        completion = openai.chat.completions.create(model=llm_key, messages=messages)
        return completion.choices[0].message.content

    # Legacy openai<1.0 SDK: the registry value is `openai.ChatCompletion`.
    completion = llms[llm_key].create(model=llm_key, messages=messages)
    return completion.choices[0].message['content']

In [11]:
# Step 3 & 4: Generate response and add memory
# Running transcript of the session: one {'role', 'content'} dict per turn.
conversation_memory = []

def generate_response(user_input, selected_llm=None):
    """Query one LLM (or every registered LLM) and record the exchange.

    Args:
        user_input: Text typed by the user.
        selected_llm: Key in ``llms`` of the model to query. When falsy
            (e.g. nothing chosen in the dropdown) every model in ``llms``
            is queried.

    Returns:
        Dict mapping each queried model key to its reply. An empty dict is
        returned for blank input, in which case no model is called and the
        memory is left untouched.

    Note:
        Only ``user_input`` is passed to ``get_llm_response``; the stored
        history is recorded here but is not sent to the models.
    """
    # Blank submissions would otherwise trigger API calls and clutter the memory.
    if not user_input or not user_input.strip():
        return {}

    targets = [selected_llm] if selected_llm else list(llms)
    responses = {llm_key: get_llm_response(llm_key, user_input) for llm_key in targets}

    # Record the turn only after every call succeeded, so a failing request
    # cannot leave a user message without its replies in the memory.
    conversation_memory.append({'role': 'user', 'content': user_input})
    conversation_memory.extend(
        {'role': llm_key, 'content': reply} for llm_key, reply in responses.items()
    )

    return responses

In [12]:
import gradio as gr

def reset_memory():
    """Forget the stored conversation.

    Rebinds the module-level ``conversation_memory`` name to a brand-new empty
    list; the previous list object itself is left untouched.
    """
    global conversation_memory
    conversation_memory = list()

def _clear_chat():
    """Reset the conversation memory and blank the visible widgets.

    Returns:
        ``(None, "")`` - the new values for the JSON response panel and the
        user input box, in that order.
    """
    reset_memory()
    return None, ""


# Build the UI. Components are laid out in the order they are created, so the
# creation order below is significant; event wiring can happen at any point
# after the components it references exist.
with gr.Blocks() as interface:
    gr.Markdown("# Inference with Multiple LLMs")

    # Text input and LLM selection dropdown
    with gr.Row():
        text_input = gr.Textbox(label="User Input")
        llm_selector = gr.Dropdown(choices=list(llms.keys()), label="Select LLM (optional)")

    # Generate response button
    submit_button = gr.Button("Submit")

    # Display responses
    output_display = gr.JSON(label="LLM Responses")

    # Clear chat history button. Clearing also blanks the response panel and
    # the input box so stale replies do not stay on screen after the memory
    # has been reset.
    clear_button = gr.Button("Clear Chat")
    clear_button.click(fn=_clear_chat, inputs=None, outputs=[output_display, text_input])

    # Optional audio input (without 'source' argument)
    # NOTE(review): this component is not connected to any event handler in
    # this cell, so recorded/uploaded audio is currently ignored.
    audio_input = gr.Audio(type="filepath", label="Audio Input (optional)")

    # Define the button actions: the dropdown value is passed as `selected_llm`
    # and the returned dict is rendered in the JSON panel.
    submit_button.click(
        fn=generate_response,
        inputs=[text_input, llm_selector],
        outputs=output_display
    )

interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c2ed1df215101e4824.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


