### Installs

In [None]:
#!sudo apt install portaudio19-dev python3-pyaudio ffmpeg -y
!pip install whisper_mic SpeechRecognition langchain

### Imports

In [None]:
import os
import requests
import speech_recognition as sr
import torch

from langchain.llms.base import LLM, Optional, List, Mapping, Any
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain.callbacks import StdOutCallbackHandler
from pydantic import Field
from whisper_mic.whisper_mic import WhisperMic
from IPython.display import Audio, display

### Options

In [None]:
# A transcription is only sent to the LLM when it contains this word
# (compared case-insensitively in the listen loop).
TRIGGER_WORD = "hey"
# Base URL of the text-generation API; "/api/v1/generate" is appended per request.
API_URL = "http://192.168.4.5:5000"
# Passed straight to torch.device(): use "cuda" for an NVIDIA GPU or "cpu".
# ("gpu" is not a device string torch accepts.)
TORCH_DEVICE = "cpu"

# NOTE(review): VOICE_PROVIDER is not read anywhere in the visible cells - confirm
# whether it is still needed.
VOICE_PROVIDER = "silero"
# Default `speaker` argument of text2audio().
SILERO_VOICE = "random"

# Whisper model size handed to WhisperMic (tiny / base / small / medium / large).
WHISPER_MODEL = "small"

### List available microphones

In [None]:
# Print every input device as "<index>, <name>", one per line.
microphone_names = sr.Microphone.list_microphone_names()
for index, name in enumerate(microphone_names):
    print(f"{index}, {name}")

### List available GPUs (if any)

In [None]:
# Shell escape: runs the `nvidia-smi` command-line tool and shows its output in the cell.
# NOTE(review): presumably reports an error on hosts without NVIDIA tooling - that is fine for CPU-only use.
!nvidia-smi

### Call locally hosted LLM

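This currently assumes you are self-hosting a model that uses the Vicuna 1.1 prompt format, served through an "Ooba" (text-generation-webui style) HTTP API that is reachable at `API_URL` and exposes `POST /api/v1/generate`.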

In [None]:
class OobaApiLLM(LLM):
    """LangChain ``LLM`` that forwards prompts to a self-hosted HTTP generation API.

    Every call POSTs the prompt to ``{endpoint}/api/v1/generate`` and returns
    the ``text`` of the first entry of the ``results`` list in the JSON reply.
    """

    # Base URL of the API server, without the "/api/v1/generate" path.
    endpoint: str = Field(...)

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Full prompt text sent to the server.
            stop: Optional stop sequences. They are forwarded to the server
                and also enforced locally: the reply is cut at the earliest
                occurrence of any non-empty sequence.

        Returns:
            The generated text, stripped of surrounding whitespace and of
            anything from the first stop sequence onwards.

        Raises:
            requests.RequestException: On connection failure or a non-2xx
                status (via ``raise_for_status``).
            ValueError: If the JSON reply does not contain
                ``results[0]["text"]``.
        """
        data = {
            "prompt": prompt,
            "max_new_tokens": 1800,
            "preset": "simple-1",
        }

        if stop is not None:
            # NOTE(review): confirm the server reads "stop_sequence"; the
            # local truncation below covers the case where it ignores it.
            data["stop_sequence"] = stop

        # Bound only the connect phase: an unreachable host fails after 10 s
        # instead of hanging, while generation itself may take arbitrarily long.
        response = requests.post(
            f"{self.endpoint}/api/v1/generate",
            json=data,
            timeout=(10, None),
        )
        response.raise_for_status()

        json_response = response.json()
        try:
            text = json_response["results"][0]["text"].strip()
        except (KeyError, IndexError, TypeError, AttributeError) as err:
            raise ValueError("Unexpected response format from Ooba API") from err

        if stop:
            # Skip empty sequences: "" matches everywhere and would erase the
            # whole reply. Cut at the earliest real match so text the model
            # produced after running past a stop sequence is dropped too.
            cut_points = [
                position
                for position in (text.find(sequence) for sequence in stop if sequence)
                if position != -1
            ]
            if cut_points:
                text = text[: min(cut_points)].rstrip()

        return text

    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call the API directly with ``prompt``; a thin alias for ``_call``."""
        return self._call(prompt, stop)

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that distinguish this instance (only the endpoint)."""
        return {"endpoint": self.endpoint}


# Shared LLM instance pointed at the server configured in the options cell.
llm = OobaApiLLM(endpoint=API_URL)

### Setup Langchain Memory & Tools

In [None]:
# Not attached to the chain below (callbacks=None); kept so it can be passed
# as callbacks=[handler] when callback output is wanted.
handler = StdOutCallbackHandler()

# Vicuna 1.1 style prompt: a system line, the prior turns, then the new USER
# turn and an open ASSISTANT: for the model to complete.
template = """A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input.

{chat_history}
USER: {human_input}
ASSISTANT:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"], template=template
)

# Label stored turns with the same role names the template uses; the defaults
# ("Human"/"AI") would mix two role vocabularies inside one prompt.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    human_prefix="USER",
    ai_prefix="ASSISTANT",
)

llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,  # Enable for debugging
    memory=memory,
    callbacks=None,
)

### Function to generate audio response w/ Silero

In [None]:
# Load the English Silero TTS model (v3_en) through torch.hub; the call
# returns the model together with a sample sentence.
model, example_text = torch.hub.load(
    "snakers4/silero-models",
    model="silero_tts",
    language="en",
    speaker="v3_en",
)

# Move the model onto the device selected by TORCH_DEVICE.
device = torch.device(TORCH_DEVICE)
model.to(device)

def text2audio(text, speaker=SILERO_VOICE, sample_rate=48000, put_accent=True, put_yo=True):
    """Synthesise ``text`` with the module-level Silero ``model``.

    Every argument is forwarded unchanged to ``model.apply_tts`` and that
    call's result is returned as-is.
    """
    tts_options = {
        "text": text,
        "speaker": speaker,
        "sample_rate": sample_rate,
        "put_accent": put_accent,
        "put_yo": put_yo,
    }
    return model.apply_tts(**tts_options)

### Listen & transcribe via Whisper

Model sizes:
- tiny
- base
- small
- medium
- large


In [None]:
mic = WhisperMic(
    model=WHISPER_MODEL,
    english=True,
    dynamic_energy=True,
)

# One sample rate for both synthesis and playback, so the two cannot drift apart.
PLAYBACK_SAMPLE_RATE = 48000

# Listen forever: transcribe speech, and when the trigger word is heard send
# the transcription to the LLM chain and speak the reply. Interrupt the kernel
# to stop.
while True:
    captured_audio = mic.listen()
    print(captured_audio)

    # Ignore empty results and anything without the trigger word
    # (case-insensitive match).
    if not captured_audio or TRIGGER_WORD.upper() not in captured_audio.upper():
        continue

    print()
    print(f"👂 HEARD: {captured_audio}")
    print()

    try:
        llm_response = llm_chain.predict(human_input=captured_audio)
    except Exception as err:
        print(f"⛔ LLM ERROR: {err}")
        # No reply for this utterance: skip TTS rather than hit an undefined
        # name or re-speak the previous iteration's response.
        continue

    print(f"📝 RESPONSE: {llm_response}")
    print()

    try:
        audio_response = text2audio(text=llm_response, sample_rate=PLAYBACK_SAMPLE_RATE)
        display(Audio(audio_response, rate=PLAYBACK_SAMPLE_RATE, autoplay=True))
    except Exception as err:
        print(f"⛔ TTS ERROR: {err}")