### Get audio from microphone

In [None]:
from ipywebrtc import AudioRecorder, CameraStream
from IPython.display import Audio

In [None]:
# Audio-only media stream: ask for the microphone and leave video off.
camera = CameraStream(constraints=dict(audio=True, video=False))

# Recorder widget bound to that stream; as the last expression of the cell
# it is shown as the cell output.
recorder = AudioRecorder(stream=camera)
recorder

### Setup model

In [None]:
import whisper

In [None]:
# Name of the Whisper checkpoint to load; kept in one place so it is easy to swap.
WHISPER_MODEL_NAME = "large"
model = whisper.load_model(WHISPER_MODEL_NAME)

### Detect, transcribe and translate audio

In [None]:
# Write the recorded clip to disk so it can be loaded from a file path below.
# Record something with the widget above before running this cell.
# NOTE(review): ipywebrtc's save() is expected to raise if nothing was
# recorded -- confirm against the installed version.
rec_file_name = "recording.webm"
recorder.save(rec_file_name)

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio(rec_file_name)
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# transcribe the spoken audio (corrective feedback for user who does not speak English)
# Kept under its own name so the transcription is not lost when the
# translation is computed below.
transcribe_options = whisper.DecodingOptions(fp16 = False, task="transcribe")
transcription = whisper.decode(model, mel, transcribe_options)

# print the recognized text
print(transcription.text)

# translate the spoken audio (the translated text is the prompt for image generation)
# `options` and `result` keep their original names because later cells read `result.text`.
options = whisper.DecodingOptions(fp16 = False, task="translate")
result = whisper.decode(model, mel, options)

# print the translated text
print(result.text)

### Generate images using DeepAI's `stable-diffusion`

In [None]:
import io
import os
import urllib.request  # import the submodule explicitly; a bare `import urllib` does not guarantee it is loaded
from PIL import Image
import requests

# Prefer a key supplied through the environment; fall back to the quickstart
# key this notebook originally shipped with so it still runs out of the box.
api_key = os.environ.get("DEEPAI_API_KEY", "quickstart-QUdJIGlzIGNvbWluZy4uLi4K")

r = requests.post(
    "https://api.deepai.org/api/stable-diffusion",
    data={
        'text': result.text,
    },
    headers={'api-key': api_key}
)
# Surface HTTP errors (bad key, rate limit, ...) here instead of as a
# confusing KeyError on the JSON payload further down.
r.raise_for_status()
payload = r.json()
if 'output_url' not in payload:
    raise RuntimeError(f"DeepAI response has no 'output_url': {payload!r}")

# Download the image fully into memory and close the connection before decoding.
with urllib.request.urlopen(payload['output_url']) as response:
    im = Image.open(io.BytesIO(response.read()))
display(im)

### Choose an image to save

In [None]:
import re

# Quadrant of the generated image to keep (the image is treated as a 2x2 grid):
# 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
choice = 3

# Fail early with a clear message; previously an out-of-range value left
# `im1` undefined and the cell died later with a NameError.
if choice not in (0, 1, 2, 3):
    raise ValueError(f"choice must be 0, 1, 2 or 3, got {choice!r}")

w, h = im.size
half_w, half_h = w // 2, h // 2

# row/col of the chosen quadrant: choice = 2 * row + col
row, col = divmod(choice, 2)
left = col * half_w
upper = row * half_h
right = w if col else half_w
lower = h if row else half_h
im1 = im.crop((left, upper, right, lower))

display(im1)

# Build a filesystem-safe file name from the prompt: every run of characters
# other than letters, digits, "_" and "-" becomes a single "_", the stem is
# capped at 100 characters, and "image" is used if nothing usable remains.
file_stem = re.sub(r"[^\w-]+", "_", result.text).strip("_")[:100] or "image"
im1.save(file_stem + ".png")