### Installs

In [16]:
!sudo apt install portaudio19-dev python3-pyaudio ffmpeg -y
!pip install -qq -U whisper_mic SpeechRecognition langchain ffmpeg omegaconf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
portaudio19-dev is already the newest version (19.6.0-1.1).
python3-pyaudio is already the newest version (0.2.11-1.3ubuntu1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


### Imports

In [2]:
import os
import requests
import speech_recognition as sr
import torch

from langchain.llms.base import LLM, Optional, List, Mapping, Any
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain.callbacks import StdOutCallbackHandler
from pydantic import Field
from whisper_mic.whisper_mic import WhisperMic
from IPython.display import Audio, display

[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m


### Options

In [3]:
# Word that must appear in a transcription before it is sent to the LLM
# (compared case-insensitively as a substring in the listen loop).
TRIGGER_WORD = "hey"
# Base URL of the self-hosted LLM API; OobaApiLLM appends "/api/v1/generate".
API_URL = "http://192.168.4.5:5000"
# Device the Silero TTS model is moved to.
TORCH_DEVICE = "cuda" # cpu or cuda

# NOTE(review): VOICE_PROVIDER is not referenced by any other visible cell.
VOICE_PROVIDER = "silero"
# Default `speaker` argument of text2audio().
SILERO_VOICE = "random"

# Whisper model name handed to WhisperMic (see the size list near the listen cell).
WHISPER_MODEL = "small"

### List available microphones

In [4]:
# Print each input device reported by SpeechRecognition as "<index>, <name>";
# the index identifies the device in the returned list.
microphone_names = sr.Microphone.list_microphone_names()
for device_index, device_name in enumerate(microphone_names):
    print(f"{device_index}, {device_name}")

ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM sysdefault
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No

### List available GPUs (if any)

In [8]:
# Shell out to nvidia-smi to show the driver version, GPUs and their memory usage.
!nvidia-smi

Mon Jul 17 22:16:12 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:07:00.0 Off |                  N/A |
|  0%   36C    P8              21W / 350W |  17453MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Call locally hosted LLM

This currently assumes you are self-hosting a model that uses the Vicuna 1.1 prompt format and serving it through the text-generation-webui ("Ooba") API at `API_URL`.

In [12]:
class OobaApiLLM(LLM):
    """LangChain LLM wrapper around a text-generation-webui ("Ooba") HTTP API.

    The prompt is POSTed to ``{endpoint}/api/v1/generate`` and the first
    generated text in the response is returned.
    """

    # Base URL of the API server, without the "/api/v1/generate" path.
    endpoint: str = Field(...)
    # Seconds to wait for the HTTP response; None (the default) waits
    # indefinitely, which matches the behaviour before this field existed.
    request_timeout: Optional[float] = None

    @property
    def _llm_type(self) -> str:
        """Return the type label reported for this LLM."""
        return "custom"

    @staticmethod
    def _truncate_at_stop(text: str, stop: Optional[List[str]]) -> str:
        """Cut ``text`` at the earliest occurrence of any stop sequence.

        Empty stop strings are ignored: they match everywhere, and the
        previous ``text[:-len(sequence)]`` slice turned the whole response
        into an empty string for them.
        """
        if not stop:
            return text

        cut = len(text)
        for sequence in stop:
            if not sequence:
                continue
            position = text.find(sequence)
            if position != -1:
                cut = min(cut, position)

        if cut == len(text):
            return text
        return text[:cut].rstrip()

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Full prompt text sent to the server.
            stop: Optional strings at which generation should stop. They are
                forwarded to the server and also enforced locally.
            run_manager: Accepted for compatibility with the LangChain base
                signature; not used here.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The stripped generated text, truncated at the first stop sequence.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the response JSON lacks ``results[0]["text"]``.
        """
        data = {
            "prompt": prompt,
            "max_new_tokens": 1800,
            "preset": "simple-1"
        }

        if stop is not None:
            # The /api/v1/generate endpoint reads stop strings from
            # "stopping_strings"; the previous "stop_sequence" key was ignored.
            data["stopping_strings"] = stop

        response = requests.post(
            f"{self.endpoint}/api/v1/generate",
            json=data,
            timeout=self.request_timeout,
        )
        response.raise_for_status()

        json_response = response.json()
        if "results" in json_response and len(json_response["results"]) > 0 and "text" in json_response["results"][0]:
            text = json_response["results"][0]["text"].strip()
            return self._truncate_at_stop(text, stop)
        else:
            raise ValueError("Unexpected response format from Ooba API")

    def __call__(self, prompt: str, stop: Optional[List[str]]=None) -> str:
        """Call the API directly and return the generated text.

        NOTE(review): this replaces the base-class ``__call__`` and goes
        straight to ``_call``; confirm nothing relies on the base behaviour.
        """
        return self._call(prompt, stop)

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Return the parameters that identify this LLM instance."""
        return {"endpoint": self.endpoint}


# LLM client pointed at the server configured in API_URL; used by the chain below.
llm = OobaApiLLM(endpoint=API_URL)

### Set up LangChain prompt, memory & chain

In [13]:
# NOTE(review): `handler` is created but not passed to the chain (callbacks=None below).
handler = StdOutCallbackHandler()
# Prompt layout: system line, the running transcript, then the new user turn
# with an open "ASSISTANT:" for the model to complete.
template = """A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input.

{chat_history}
USER: {human_input}
ASSISTANT:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"], template=template
)
# Label remembered turns with the same role names the template uses; the
# memory's default "Human:"/"AI:" prefixes would mix two formats in one prompt.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    human_prefix="USER",
    ai_prefix="ASSISTANT",
)

llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True, # Enable for debugging
    memory=memory,
    callbacks=None
)

### Function to generate audio response w/ Silero

In [5]:
# Load the English Silero TTS model ('v3_en') through torch.hub.
# `example_text` is returned alongside the model and is not used by the
# other visible cells.
model, example_text = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language='en',
    speaker='v3_en'
)

# Move the TTS model to the device selected by TORCH_DEVICE.
device = torch.device(TORCH_DEVICE)
model.to(device)  

def text2audio(text, speaker=SILERO_VOICE, sample_rate=48000, put_accent=True, put_yo=True):
    """Synthesize speech for ``text`` with the loaded Silero model.

    Every argument is forwarded unchanged to ``model.apply_tts`` and its
    return value is handed back as-is.
    """
    tts_options = {
        "text": text,
        "speaker": speaker,
        "sample_rate": sample_rate,
        "put_accent": put_accent,
        "put_yo": put_yo,
    }
    return model.apply_tts(**tts_options)

Using cache found in /home/jovyan/.cache/torch/hub/snakers4_silero-models_master


### Listen & transcribe via Whisper

Model sizes:
- tiny
- base
- small
- medium
- large


In [None]:
# Microphone listener configured with the Whisper model named in WHISPER_MODEL.
# NOTE(review): the exact meaning of `english` and `dynamic_energy` is defined
# by whisper_mic — confirm against its documentation.
mic = WhisperMic(
    model=WHISPER_MODEL,
    english=True,
    dynamic_energy=True
)

# Main loop: transcribe microphone input and, whenever the trigger word is
# heard, send the transcription to the LLM and play the spoken answer.
# Runs until the kernel is interrupted.
while True:
    captured_audio = mic.listen()
    print(captured_audio)

    # Skip empty transcriptions instead of crashing on `.upper()`.
    # NOTE(review): confirm whether mic.listen() can actually return None.
    if not captured_audio:
        continue
    # Case-insensitive substring match; anything without the trigger is ignored.
    if TRIGGER_WORD.upper() not in captured_audio.upper():
        continue

    print()
    print(f"👂 HEARD: {captured_audio}")
    print()
    try:
        llm_response = llm_chain.predict(human_input=captured_audio)
    except Exception as err:
        print(f"⛔ LLM ERROR: {err}")
        # No fresh answer exists: skip TTS so we neither hit an undefined
        # `llm_response` nor re-speak the answer from a previous iteration.
        continue
    print(f"📝 RESPONSE: {llm_response}")
    print()

    try:
        audio_response = text2audio(text=llm_response)
        # 48000 matches the default sample_rate of text2audio().
        display(Audio(audio_response, rate=48000, autoplay=True))
    except Exception as err:
        print(f"⛔ TTS ERROR: {err}")