# Voice to Text ChatGPT Demo using Python

February 2023

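A short demo that transcribes spoken questions to text with Whisper and then has them answered by an OpenAI GPT model (the `text-davinci-003` completion model, referred to as "ChatGPT" throughout this notebook), using the Python libraries openai, gradio and whisper.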

openai -> https://github.com/openai/openai-python

gradio -> https://github.com/gradio-app/gradio

whisper -> https://github.com/openai/whisper.git

In [1]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q openai

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.2/14.2 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

### Import the Libraries

In [30]:
import getpass
import os
import time
import warnings

import gradio as grd
import openai as oai
import whisper

### Set your API Key
Set `oai.api_key` (the `openai` module is imported as `oai` above) to your own OpenAI API key. See https://beta.openai.com/docs/quickstart/add-your-api-key for how to create one.

In [3]:
# Read the key from the OPENAI_API_KEY environment variable so it is never saved in the
# notebook; when the variable is unset or empty, prompt for it without echoing the input.
# Create a key at: https://beta.openai.com/docs/quickstart/add-your-api-key
oai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

### Load Whisper

In [4]:
# Ignore every Python warning raised from here on (presumably to keep the cell output uncluttered).
warnings.filterwarnings("ignore")

In [31]:
# Load the "base" Whisper model once; transcribe() below reuses this global for every call.
model = whisper.load_model("base")

In [32]:
# Display the device the model is on; transcribe() moves each spectrogram to this device.
model.device

device(type='cpu')

### Define a Chat Function

In [7]:
def openai_chat(prompt):
    """Ask the ``text-davinci-003`` completion engine to answer ``prompt``.

    Args:
        prompt: The text sent to the completion endpoint.

    Returns:
        The text of the first (and only requested) completion, with leading
        and trailing whitespace removed.
    """
    # Request settings: a single completion, capped at 1024 tokens.
    request = dict(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=1024,
        n=1,
        temperature=0.5,
    )
    response = oai.Completion.create(**request)

    first_choice = response.choices[0]
    return first_choice.text.strip()

### Define a Voice to Text Transcribe Function

In [46]:
def transcribe(audio):
    """Transcribe a recorded question with Whisper and answer it via ``openai_chat``.

    Args:
        audio: Path to the recorded audio file. May be ``None`` or empty when
            there is no recording to process.

    Returns:
        A two-item list ``[transcript, answer]``. Both items are empty strings
        when no audio was given; the answer is an empty string when the
        transcript is blank.
    """
    # A live interface can invoke the callback without a recording
    # (presumably when the microphone input is cleared); nothing to do then.
    if not audio:
        return ["", ""]

    # Load the waveform and pad/trim it to fit 30 seconds.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Detect the spoken language (printed for information only).
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio with fp16 disabled.
    options = whisper.DecodingOptions(fp16=False)
    result_text = whisper.decode(model, mel, options).text

    # A blank transcript would only send an empty prompt, so skip the API call.
    if not result_text.strip():
        return [result_text, ""]

    # Pass the transcribed text to the chat model and return both strings.
    return [result_text, openai_chat(result_text)]

### Launch UI using Gradio

In [None]:
# Output widgets: the Whisper transcript and the model's reply.
output_1 = grd.Textbox(label="Speech to Text")
output_2 = grd.Textbox(label="ChatGPT Output")

# Build the interface first and launch it separately so each step can be read on its own.
# grd.Audio replaces the deprecated grd.inputs.Audio namespace and matches the
# top-level grd.Textbox components used for the outputs.
demo = grd.Interface(
    title="Audio Question to Text conversion and ChatGPT response",
    fn=transcribe,
    inputs=[
        # Record from the microphone and hand the callback a file path.
        grd.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        output_1,
        output_2,
    ],
    # live=True re-runs the callback automatically when the input changes.
    live=True,
)

# debug=True keeps the cell running so errors and logs appear in the cell output.
demo.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Detected language: en


![Output 2](./images/Voice2Text_ChatGPT.png)