### Settings

In [None]:
# --- Hardware ---
# Compute device used by the models; one of: "cpu", "cuda", "mps"
device = "cpu"

# --- whisper ---
# Checkpoint size; one of: "tiny", "base", "small", "medium", "large"
model_size = "large"

# --- Local stable-diffusion ---
# Value passed as `num_inference_steps` for the final image generation (default: 51)
num_inference_steps = 51

# --- Output ---
# Folder that receives the saved images
out_path = "./gallery/"

### Setup

In [None]:
# Standard library
import os

# Browser-side audio capture widgets and notebook display helpers
from ipywebrtc import AudioRecorder, CameraStream
from IPython.display import Audio

# Speech-to-text
import whisper

# Text-to-image
from diffusers import StableDiffusionPipeline
import torch

# Load the whisper checkpoint selected by `model_size` onto the configured `device`
model = whisper.load_model(model_size, device=device)

# Create the output folder (including parents); an existing folder is not an error.
# Pure-Python replacement for `!mkdir -p`, so no POSIX shell is required and the
# path is never interpolated into a shell command line.
os.makedirs(out_path, exist_ok=True)

### Get audio from microphone

In [None]:
# Request an audio-only media stream (video explicitly disabled)
stream_constraints = {'audio': True, 'video': False}
camera = CameraStream(constraints=stream_constraints)

# Attach a recorder to that stream; the bare name on the last line displays the widget
recorder = AudioRecorder(stream=camera)
recorder

### Detect, transcribe and translate audio using OpenAI's `whisper` speech-to-text model

In [None]:
# `babel` is needed for the language-name lookup below and `os` for temp-file cleanup;
# imported here so this cell does not fail with a NameError
import os

import babel

# save audio to temporary file
rec_file_name = "tmp.webm" # must use .webm
recorder.save(rec_file_name)

try:
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(rec_file_name)
    audio = whisper.pad_or_trim(audio)
finally:
    # remove the temporary audio file, even when loading it failed
    os.remove(rec_file_name)

# make log-Mel spectrogram and move to the same device as the model;
# the number of mel bins is taken from the loaded checkpoint rather than the library default
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language (most probable language code wins)
_, probs = model.detect_language(mel)
lang = max(probs, key=probs.get)

# whisper uses a deprecated language code for Hebrew, see github.com/openai/whisper PR #401;
# normalise it first so the language name below is printed for Hebrew as well
if lang == "iw":
    lang = "he"

# print detected language
# (babel.UnknownLocaleError propagates unchanged for any code babel does not know)
print(babel.Locale(lang).get_language_name())

# transcribe the spoken audio (corrective feedback for user who does not speak English)
options = whisper.DecodingOptions(fp16 = False, task="transcribe")
result = whisper.decode(model, mel, options)
# keep the original-language transcription; `result` may be replaced by the translation below
result_orig = result

# print the recognized text
print(result.text)

if lang != "en":
    # translate the spoken audio (for use with text-to-image)
    options = whisper.DecodingOptions(fp16 = False, task="translate")
    result = whisper.decode(model, mel, options)

    # print the translated text
    print(result.text)

### Generate images using Stability AI's `stable-diffusion`, locally

In [None]:
# Local checkpoint directory of the stable-diffusion weights
sd_checkpoint = "../stable-diffusion-v1-5"

# Off-CPU the pipeline is loaded in float16 from the "fp16" revision;
# on CPU no extra loading options are passed
if device != "cpu":
    load_options = {"torch_dtype": torch.float16, "revision": "fp16"}
else:
    load_options = {}
pipe = StableDiffusionPipeline.from_pretrained(sd_checkpoint, **load_options).to(device)

# The whisper output text is the text-to-image prompt
prompt = result.text

# First-time "warmup" pass: a single-step run whose output is discarded
_ = pipe(prompt, num_inference_steps=1)

# Result generation: keep the first returned image and show it
generation = pipe(prompt, num_inference_steps=num_inference_steps)
im = generation.images[0]
display(im)

### Generate images using Stability AI's `stable-diffusion`, using DeepAI's API

In [None]:
import io
import os
import urllib.request  # `import urllib` alone does not load the `request` submodule

from PIL import Image
import requests

# Fixed demo prompt; use `result.text` here instead to send the spoken prompt
deepai_prompt = "Astronauts eating thai food in space"

# Prefer a personal key from the DEEPAI_API_KEY environment variable;
# fall back to the key this notebook has always used
deepai_api_key = os.environ.get("DEEPAI_API_KEY", "quickstart-QUdJIGlzIGNvbWluZy4uLi4K")

# Seconds to wait on each network call before giving up instead of hanging the kernel
deepai_timeout = 120

r = requests.post(
    "https://api.deepai.org/api/stable-diffusion",
    data={'text': deepai_prompt},
    headers={'api-key': deepai_api_key},
    timeout=deepai_timeout,
)
# Surface HTTP errors directly rather than as a KeyError on a missing 'output_url'
r.raise_for_status()

# Download the whole image before decoding so the connection is closed deterministically
with urllib.request.urlopen(r.json()['output_url'], timeout=deepai_timeout) as response:
    im = Image.open(io.BytesIO(response.read()))
display(im)

### Save image

In [None]:
import os
import re


def _safe_file_stem(text, max_len=60, fallback="untitled"):
    """Turn free-form text into a file-name stem.

    Whitespace runs become single underscores; every character that is not a
    word character or a hyphen is dropped (this removes '.', ',' as before, and
    also path separators and other punctuation). The result is cut to
    `max_len` characters, and `fallback` is returned when nothing is left.
    """
    stem = re.sub(r"\s+", "_", text.strip())
    stem = re.sub(r"[^\w-]", "", stem)
    stem = stem[:max_len].strip("_")
    return stem or fallback


# Make sure the target folder exists, then save the image under a name derived from
# the original-language transcription; os.path.join works with or without a trailing
# slash on `out_path`
os.makedirs(out_path, exist_ok=True)
im.save(os.path.join(out_path, _safe_file_stem(result_orig.text) + ".png"))