# **INTERACTIVE DEMO**

Welcome to the tool! Please run all of the cells. The actual interactive part comes at the bottom where you get to interact with the program! (Please connect to a GPU for this notebook).

# **Installations**
Here we install all of the libraries necessary for our program. Please note that we install our own fork of the Whisper repo because we changed some of its functionality. PLEASE note that this cell may take up to 10 minutes to run.

In [None]:
#Please make sure you are connected to a GPU run time
#Running this cell may take up to 5-10 mins
%%capture
def installations():
  !pip install git+https://github.com/evaprakash/whisper.git;
  !pip install jiwer;
  !pip install torch torchvision torchaudio torchdata torchtext --index-url https://download.pytorch.org/whl/cu118;
  !pip install pydub
  !pip install openai
  !pip install gtts
installations()

# **Record Audio**

The following cell defines the function that uses JavaScript and pydub's AudioSegment to record the user's audio and convert it to a 16000 Hz sample rate, which is the sample rate Whisper expects.

In [None]:
def getAudio():
  """Record microphone audio in the Colab front end and save it as ``audio.wav``.

  A "Stop Recording" button is injected into the cell output; recording runs
  until the user clicks it. The browser recording (MP4 on Safari, WebM
  elsewhere) is decoded with pydub, downmixed to mono, converted to 16 kHz and
  exported as WAV.

  Returns:
    The path of the exported file, always ``"audio.wav"``.
  """
  from google.colab import output
  from IPython.display import display, Javascript
  import base64
  from pydub import AudioSegment
  import io

  def record_audio(rate=16000):
    """Inject the ``recordAudio`` JavaScript helper into the notebook page.

    ``rate`` is accepted for symmetry with ``get_audio``; the JavaScript side
    does not use it (resampling happens in Python).
    """
    js = Javascript('''
      async function recordAudio(rate) {
          // Safari cannot record WebM, so fall back to MP4 there.
          function detectBrowser() {
              const ua = navigator.userAgent;
              if (ua.includes("Safari") && !ua.includes("Chrome")) {
                  return "Safari";
              }
              return "Other";
          }

          const mimeType = detectBrowser() === "Safari" ? 'audio/mp4' : 'audio/webm';

          const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
          const mediaRecorder = new MediaRecorder(stream, { mimeType: mimeType });
          let audioChunks = [];

          mediaRecorder.addEventListener("dataavailable", event => {
              audioChunks.push(event.data);
          });

          // Create the stop button
          const stopButton = document.createElement("button");
          stopButton.textContent = "Stop Recording";
          document.body.appendChild(stopButton);

          // Stop the recorder, release the microphone and remove the button.
          function stopRecording() {
              mediaRecorder.stop();
              stream.getTracks().forEach(track => track.stop());
              stopButton.remove();
          }

          stopButton.addEventListener("click", stopRecording);

          mediaRecorder.start();

          // Wait until the recorder has flushed its last chunk.
          await new Promise(resolve => mediaRecorder.onstop = resolve);
          const audioBlob = new Blob(audioChunks, { type: mimeType });

          // Hand the recording back to Python as a base64 data URL.
          const reader = new FileReader();
          const encoded = new Promise(resolve => {
              reader.onloadend = () => resolve(reader.result);
          });
          reader.readAsDataURL(audioBlob);
          return await encoded;
      }
      ''')
    display(js)

  def get_audio(rate=16000):
    """Record until the user stops, and return a mono pydub segment at ``rate`` Hz."""
    record_audio(rate)
    print("Recording... Click the stop button to end the recording.")
    data = output.eval_js('recordAudio(%d)' % rate)
    # The data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
    binary = base64.b64decode(data.split(',')[1])

    # Mirror the browser check done in JavaScript to pick the container format.
    browser_info = output.eval_js("navigator.userAgent")
    if "Safari" in browser_info and "Chrome" not in browser_info:
      container = 'mp4'
    else:
      container = 'webm'

    # Use Pydub to handle the audio
    audio = AudioSegment.from_file(io.BytesIO(binary), format=container)
    # Downmix to one channel so downstream code that flattens the waveform
    # does not concatenate left/right channels of a stereo microphone.
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(rate)
    return audio

  audio = get_audio()
  audio.export("audio.wav", format="wav")
  return "audio.wav"

# **Creating GPT Query (Whisper + confidence score analysis)**



This cell loads our version of the whisper model and creates a dataloader so we can load the data. Then it does a lot of data processing to understand whether a given token is unclear given our predetermined confidence score threshold. Then it returns the transcription of what the model heard and the query that is being passed into GPT.

In [None]:

def getGPTQ(audioPath):
  """Transcribe an audio file and build the GPT query with unclear words marked.

  The audio is decoded with the ``base.en`` Whisper model. A sliding window of
  three consecutive tokens is scored by the mean of their ``token_scores``
  (provided by the Whisper build installed for this notebook -- presumably
  per-token confidences in [0, 1]; confirm against that repo). Every token in
  a window whose mean is at or below 0.5 is rewritten as ``(<word>)[unclear]``
  in the query.

  Args:
    audioPath: Path of the audio file to transcribe.

  Returns:
    A ``(transcription, query)`` tuple of strings: the plain transcription and
    the annotated query.
  """
  from IPython.utils import io
  from transformers import WhisperProcessor
  import torch
  import torchaudio
  import whisper

  WHISPER_SAMPLE_RATE = 16000
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  phraseToInsert = '[unclear]'
  preDeterminedLowestScore = 0.5

  processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")
  model = whisper.load_model("base.en")
  options = whisper.DecodingOptions(language="en", without_timestamps=True)

  class VoiceCommand(torch.utils.data.Dataset):
    """Single-item dataset that yields the log-mel spectrogram of ``audioPath``."""

    def __init__(self, device=DEVICE):
      self.dataset = [torchaudio.load(audioPath)]
      self.device = device

    def __len__(self):
      return len(self.dataset)

    def __getitem__(self, item):
      waveform, sampleRate = self.dataset[item]
      # torchaudio returns (channels, samples); average the channels instead
      # of flattening so stereo input is not played back-to-back.
      waveform = waveform.mean(dim=0)
      if sampleRate != WHISPER_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            waveform, sampleRate, WHISPER_SAMPLE_RATE)
      audio = whisper.pad_or_trim(waveform).to(self.device)
      return whisper.log_mel_spectrogram(audio)

  loader = torch.utils.data.DataLoader(VoiceCommand(), batch_size=1)

  ts = []
  tokens = []
  for mels in loader:
    # Swallow anything the decoder prints so the cell output stays clean.
    with io.capture_output():
      results = model.decode(mels, options)
    for result in results:
      ts.append(result.token_scores)
      tokens.append(result.tokens)

  transcription = processor.batch_decode(tokens, skip_special_tokens=False)
  print(transcription)
  nonListTokens = tokens[0]
  nonListTS = ts[0]

  # Flags are tracked per position (not per token id) so that one unclear
  # occurrence of a token does not mark every other occurrence of it.
  unclear = [False] * len(nonListTokens)
  for i in range(len(nonListTokens) - 2):
    meanTokScore = (nonListTS[i] + nonListTS[i + 1] + nonListTS[i + 2]) / 3
    if meanTokScore <= preDeterminedLowestScore:
      unclear[i] = unclear[i + 1] = unclear[i + 2] = True

  # Every token is emitted, including the last one (e.g. a trailing "?").
  pieces = []
  for token, isUnclear in zip(nonListTokens, unclear):
    word = processor.batch_decode([token], skip_special_tokens=False)[0]
    pieces.append("(" + word + ")" + phraseToInsert if isUnclear else word)
  query = "".join(pieces)
  return transcription[0], query

# **Get Answer (GPT + Play Audio)**
This cell passes the query into GPT with a couple of few shot examples and asks it to generate a response. Then given this response it plays the audio out loud. We used the following resource to play the audio out loud: https://stackoverflow.com/questions/57563060/how-to-do-text-to-speech-conversion-in-google-colab

In [None]:
def runGPT(query, transcription):
  """Send the annotated query to GPT, record the exchange, and speak the answer.

  The system prompt combines the assistant instructions, two few-shot
  examples and the conversation history held in the global ``gptHistory``
  list. After the call, the new exchange is appended to ``gptHistory`` and the
  list is trimmed in place to the six most recent entries (three exchanges).
  The answer is converted to speech with gTTS and auto-played in the notebook.

  The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable;
  if it is not set, the user is prompted for it once and it is cached in the
  environment for later calls. Secrets must not be committed in the notebook.

  Args:
    query: The user query, with unclear words marked as ``(<word>)[unclear]``.
    transcription: The plain transcription. Not used by this function; kept so
      the existing call signature stays valid.

  Returns:
    None.
  """
  import getpass
  import os
  import openai
  from gtts import gTTS #Import Google Text to Speech
  from IPython.display import Audio, display

  apiKey = os.environ.get("OPENAI_API_KEY")
  if not apiKey:
    apiKey = getpass.getpass("OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = apiKey
  openai.api_key = apiKey

  phraseStarter = 'You are a voice assistant; words marked with `(<word>)[unclear]` are not heard clearly. If the word is essential in answering the question, please ask clarification questions to make sure you understand before answering. You can answer the users query only if it does not change the sentences meaning. If the question is unclear, make sure to ask any necessary clarification questions about the words labeled unclear. Here are a couple examples of what we would like you to do: '
  fewshot = "User query: (with)[unclear] me up (at)[unclear] 5am this week. GPT response: I do not understand. What would you like my to do for you at 5am this week? User query: Tell me the last time the Portland Wizards won (the)[unclear] MLB championship. GPT response: the Portland Wizards are not an MLB team. Please repeat the question, clarifying which team you are asking for."

  historyStarter = 'Here is the conversation history: '
  sep = ','
  gptHistoryStr = sep.join(gptHistory)
  questionStarter = " Please answer the following question like you are interacting with the user: "

  completion = openai.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": phraseStarter + fewshot + historyStarter + gptHistoryStr},
          {"role": "user", "content": questionStarter + query}
      ],
  )

  answer = completion.choices[0].message.content

  # Record the exchange, then drop everything but the six newest entries.
  # The list is mutated in place because it is shared with the caller.
  gptHistory.append("User query: " + query)
  gptHistory.append("GPT response: " + answer)
  del gptHistory[:-6]

  tts = gTTS(answer) #Provide the string to convert to speech
  # gTTS writes MP3 data, so the file is named accordingly.
  sound_file = '1.mp3'
  tts.save(sound_file)
  display(Audio(sound_file, autoplay=True))

# **Putting it all together**

PLEASE NOTE: The first time you run this cell, it might run into an error (sometimes the downloading of the necessary libraries messes with the program). If you run into an error, PLEASE RERUN THE CELL!

In [None]:
# Conversation history shared with runGPT, which appends to and trims it.
gptHistory = []
cont = True
while cont:
  audioPath = getAudio()
  transcription, query = getGPTQ(audioPath)
  print("This is what Whisper heard: " + transcription)
  print("This is the query going to GPT: " + query)
  # runGPT is declared as runGPT(query, transcription); pass by keyword so the
  # annotated query (with the [unclear] markers) is what actually reaches GPT.
  runGPT(query=query, transcription=transcription)
  reply = input("Would you like to continue? (Yes or No): ")
  cont = reply.strip().lower() in ('yes', 'y')
print("Thank you for interacting with our tool.")

<IPython.core.display.Javascript object>

Recording... Click the stop button to end the recording.
[' As a college student, what is the best card to get?']
This is what Whisper heard:  As a college student, what is the best card to get?
This is the query going to GPT:  As a college student, what is the best card to get


Would you like to continue? (Yes or No): no
Thank you for interacting with our tool.
