# Speech-to-Image Using Google Cloud

- Speech is recorded directly in the notebook
- Recording is uploaded to Google Cloud Storage (GCS)
- Google Speech-to-Text API to transcribe the recording in the specified language & locale (`lang` & `loc`) combination
- Google Translate API to translate into English
- DeepAI's _stable-diffusion_ to generate images
- Save the desired image


- D. Mohaddes
- October 2022

In [None]:
# Language and region codes of the speech being recorded; they are combined
# into the language code used for recognition further down.
# For German as spoken in Germany use "de" and "DE" instead.
lang, loc = "en", "US"

## Get audio from microphone

In [None]:
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio

In [None]:
# Media stream with audio enabled and video disabled, wrapped in a recorder.
camera = CameraStream(
    constraints=dict(audio=True, video=False),
)
recorder = AudioRecorder(stream=camera)
# Last expression of the cell, so the recorder widget is displayed.
recorder

In [None]:
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!ffmpeg -i recording.webm -ac 1 -f flac voice_file.flac -y -hide_banner -loglevel panic
# sig, sr = torchaudio.load("voice_file.flac")
# Audio(data=sig, rate=sr)

## Upload to Google Cloud Storage

In [None]:
# Copy the converted recording into the speech_to_image_data bucket.
!gsutil cp ./voice_file.flac gs://speech_to_image_data/

## Transcribe

In [None]:
from google.cloud import speech_v1 as speech

import IPython
import urllib
from PIL import Image
import requests

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at a key file for the Google Cloud clients.
# NOTE(review): this path is specific to one user's machine - adjust it before
# running the notebook elsewhere.
%set_env GOOGLE_APPLICATION_CREDENTIALS=/Users/danmohad/key.json

In [None]:
# Language tag for speech recognition. BCP-47 tags join the language and the
# region with a hyphen (e.g. "en-US"), not an underscore.
lang_code = f"{lang}-{loc}"
# Location of the uploaded recording in Google Cloud Storage.
audio_uri = "gs://speech_to_image_data/voice_file.flac"

In [None]:
def speech_to_text(config, audio):
    """Transcribe ``audio`` and return the recognized text.

    Creates a ``speech.SpeechClient``, calls ``recognize`` with the given
    ``config`` and ``audio``, prints every result via ``print_sentences`` and
    returns the best transcripts of all results joined into one string.

    Args:
        config: Recognition configuration, passed through to ``recognize``.
        audio: Audio source, passed through to ``recognize``.

    Returns:
        The best transcript of every result, stripped of surrounding
        whitespace and joined with single spaces.

    Raises:
        ValueError: If the response contains no transcript at all, i.e. no
            speech was recognized.
    """
    client = speech.SpeechClient()
    response = client.recognize(config=config, audio=audio)
    print_sentences(response)

    # A response can hold several results, each covering a consecutive portion
    # of the audio; returning only results[0] would drop the rest of the speech.
    transcripts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    if not transcripts:
        raise ValueError(
            "Speech recognition returned no results; "
            "check the recording and the language code."
        )
    return " ".join(piece.strip() for piece in transcripts)


def print_sentences(response):
    """Print the first alternative of every result in ``response``.

    For each entry of ``response.results`` an 80-character dashed rule is
    printed, followed by the transcript and the confidence (formatted as a
    whole percentage) of the entry's first alternative.
    """
    rule = "-" * 80
    for entry in response.results:
        top = entry.alternatives[0]
        spoken, certainty = top.transcript, top.confidence
        print(rule)
        print(f"Transcript: {spoken}")
        print(f"Confidence: {certainty:.0%}")

In [None]:
# Recognition settings: the language code plus automatic punctuation and
# per-word time offsets.
config = {
    "language_code": lang_code,
    "enable_automatic_punctuation": True,
    "enable_word_time_offsets": True,
}
# The audio to transcribe is referenced by its URI.
audio = {"uri": audio_uri}

In [None]:
# Transcribe the uploaded recording; the returned transcript is kept in `txt`.
txt = speech_to_text(config, audio)

## Translate

In [None]:
def translate_text(target, text, source=None):
    """Translate ``text`` into the ``target`` language and return the result.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Both the input text and its translation are printed.

    Args:
        target: Language code to translate into.
        text: The text to translate, as ``str`` or UTF-8 encoded ``bytes``.
            Only a single text is supported, because the result is indexed
            as a single dictionary.
        source: Language code of ``text``. Defaults to the notebook-level
            ``lang`` setting when omitted.

    Returns:
        The translated text as a plain string. When ``source`` equals
        ``target`` the (decoded) input is returned unchanged.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    if source is None:
        source = lang

    if source == target:
        # Nothing to translate, so skip the API round trip entirely.
        translated = text
    else:
        from google.cloud import translate_v2 as translate

        translate_client = translate.Client()
        # format_="text" requests plain text. The API's default "html" format
        # returns HTML-escaped output (e.g. "&#39;" for an apostrophe), which
        # would otherwise end up verbatim in the image prompt.
        result = translate_client.translate(
            text,
            source_language=source,
            target_language=target,
            format_="text",
        )
        translated = result["translatedText"]

    print(f"Text: {text}")
    print(f"Translation: {translated}")
    return translated

In [None]:
# Translate the transcript into English; `txt_tr` is later sent as the image prompt.
txt_tr = translate_text("en", txt)

## Generate image

In [None]:
import os

# The API key can be supplied through the DEEPAI_API_KEY environment variable;
# the quickstart key stays as the fallback so the cell still runs without it.
deepai_api_key = os.environ.get("DEEPAI_API_KEY", 'quickstart-QUdJIGlzIGNvbWluZy4uLi4K')

r = requests.post(
    # Alternative endpoint:
    #"https://api.deepai.org/api/text2img",
    "https://api.deepai.org/api/stable-diffusion",
    data={
        'text': txt_tr,
    },
    headers={'api-key': deepai_api_key},
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout=120,
)
# Fail here on an HTTP error status rather than later on a missing 'output_url'.
r.raise_for_status()

In [None]:
import io
# A bare `import urllib` does not guarantee that the `request` submodule is loaded.
import urllib.request

# Check the response body before indexing it, so a failed generation reports
# what the service actually returned instead of a bare KeyError.
payload = r.json()
if 'output_url' not in payload:
    raise RuntimeError(f"Image generation failed, response was: {payload}")

# Download the image completely and close the connection before decoding it.
with urllib.request.urlopen(payload['output_url'], timeout=60) as image_response:
    im = Image.open(io.BytesIO(image_response.read()))
display(im)

In [None]:
# Index of the image quadrant to keep:
# 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
choice = 3

if choice not in (0, 1, 2, 3):
    # Without this check an invalid choice would leave im1 undefined, or
    # silently re-display the crop from an earlier run of this cell.
    raise ValueError(f"choice must be 0, 1, 2 or 3, got {choice!r}")

w, h = im.size
half_w, half_h = w // 2, h // 2
row, col = divmod(choice, 2)

# The left/top halves end at the midpoint; the right/bottom halves start at
# the midpoint and extend to the full width/height.
left, right = (half_w, w) if col else (0, half_w)
upper, lower = (half_h, h) if row else (0, half_h)
im1 = im.crop((left, upper, right, lower))

display(im1)