# Speech-to-Image Using Google Cloud

- Speech is recorded directly in the notebook
- The recording is uploaded to Google Cloud (GC) Storage
- The GC Speech-to-Text API transcribes the recording using the specified `lang` & `loc` combination
- The GC Translate API translates the transcript into English
- DeepAI's `stable-diffusion` generates images from the translated text
- The desired image is saved


- D. Mohaddes
- October 2022

## Setup

### Language settings

In [None]:
# Two-letter code of the language spoken in the recording; it is also passed
# to the translator as the source language.
lang = "de"
# Region code; joined with `lang` to build the speech-recognition language code.
loc = "DE"

### Google Cloud settings

In [None]:
# Credentials file exported as GOOGLE_APPLICATION_CREDENTIALS further down.
google_creds_path = "/Users/danmohad/key.json"
# Destination bucket for the recording. The trailing slash matters because the
# file name is appended directly to this string to form the audio URI.
gcs_bucket_path = "gs://speech_to_image_data/"

## Execution

### Get audio from microphone

In [None]:
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio

In [None]:
# Media stream restricted to audio (video capture is switched off), despite
# the `camera` name.
camera = CameraStream(constraints=dict(audio=True, video=False))
recorder = AudioRecorder(stream=camera)
# Last expression of the cell, so the notebook renders the recorder widget.
recorder

### Convert to `.flac`

In [None]:
# Write the recording captured by the widget to disk.
rec_file_name = "tmp.webm" # must use .webm
recorder.save(rec_file_name)

# Re-encode the recording as FLAC with ffmpeg. IPython expands the {...}
# placeholders from the notebook variables before running the shell command.
#   -ac 1                      one audio channel (mono)
#   -f flac                    force FLAC output
#   -y                         overwrite an existing output file
#   -hide_banner -loglevel panic   silence ffmpeg's output
# NOTE(review): `-loglevel panic` also hides ordinary conversion errors, so a
# failed conversion can leave a stale file from an earlier run in place.
conv_file_name = "voice_file.flac" # must use .flac
!ffmpeg -i {rec_file_name} -ac 1 -f flac {conv_file_name} -y -hide_banner -loglevel panic

### Upload to GC storage

In [None]:
# Copy the converted recording into the bucket. IPython substitutes the {...}
# placeholders with the notebook variables before the shell command runs.
!gsutil cp {conv_file_name} {gcs_bucket_path}

### Transcribe using GC speech-to-text

In [None]:
from google.cloud import speech_v1 as speech

In [None]:
# Export the credentials file path so the Google Cloud client libraries can
# find it through GOOGLE_APPLICATION_CREDENTIALS.
%set_env GOOGLE_APPLICATION_CREDENTIALS = {google_creds_path}

In [None]:
# Language code sent to the speech API, e.g. "de_DE".
# NOTE(review): the API documentation describes BCP-47 tags, which use a
# hyphen ("de-DE"); confirm the underscore form is accepted.
lang_code = f"{lang}_{loc}"
# Full gs:// URI of the uploaded recording (bucket path + file name).
audio_uri = f"{gcs_bucket_path}{conv_file_name}"

In [None]:
def speech_to_text(config, audio):
    """Transcribe a recording with the Google Cloud Speech-to-Text API.

    Args:
        config: Recognition settings forwarded unchanged to
            ``SpeechClient.recognize``.
        audio: Audio source forwarded unchanged to
            ``SpeechClient.recognize``.

    Returns:
        The best transcript of the whole recording as a single string.

    Raises:
        ValueError: If the API returned no transcript at all, for example
            because the recording was silent or unintelligible.
    """
    client = speech.SpeechClient()
    response = client.recognize(config=config, audio=audio)
    print_sentences(response)

    # Each result covers a consecutive portion of the audio, so the complete
    # transcript is the best alternative of every result joined together,
    # not just the first result.
    pieces = [
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ]
    transcript = " ".join(piece for piece in pieces if piece)
    if not transcript:
        raise ValueError(
            "Speech-to-Text returned no transcript; check the recording "
            "and the language settings."
        )
    return transcript

def print_sentences(response):
    """Print the top-ranked transcript and its confidence for every result.

    Only the first alternative of each entry in ``response.results`` is
    reported; the confidence is shown as a whole-number percentage.
    """
    for segment in response.results:
        top = segment.alternatives[0]
        report = (
            f"Transcript: {top.transcript}",
            f"Confidence: {top.confidence:.0%}",
        )
        print(*report, sep="\n")

In [None]:
# Recognition settings passed to the Speech-to-Text client.
config = {
    "language_code": lang_code,
    "enable_automatic_punctuation": True,
    "enable_word_time_offsets": True,
}
# The audio is referenced by its storage URI rather than sent inline.
audio = {"uri": audio_uri}

In [None]:
# Transcript of the uploaded recording, in the spoken language.
txt = speech_to_text(config=config, audio=audio)

### Translate using GC translate

In [None]:
import six
from google.cloud import translate_v2 as translate

def translate_text(target, text, source=None):
    """Translate text into the target language with the GC Translate API.

    Args:
        target: ISO 639-1 code of the language to translate into. See
            https://g.co/cloud/translate/v2/translate-reference#supported_languages
        text: The text to translate, as ``str`` or UTF-8 encoded ``bytes``.
        source: Language code of ``text``. Defaults to the notebook-level
            ``lang`` setting when omitted.

    Returns:
        The translated text as a plain string.
    """
    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")
    if source is None:
        source = lang

    # Request plain-text output: the API's default HTML format returns
    # apostrophes and quotes as entities such as "&#39;", which would end up
    # in the image prompt and in the saved file name.
    result = translate_client.translate(
        text,
        source_language=source,
        target_language=target,
        format_="text",
    )

    print(f"Text: {result['input']}")
    print(f"Translation: {result['translatedText']}")
    return result["translatedText"]

In [None]:
# English rendering of the transcript; used as the image prompt.
txt_tr = translate_text(target="en", text=txt)

### Generate images using DeepAI's `stable-diffusion`

In [None]:
import io
# `import urllib` alone does not guarantee the `request` submodule is loaded.
import urllib.request

import requests
from PIL import Image

# Ask DeepAI's stable-diffusion endpoint for images of the translated text.
# NOTE(review): the API key is hard-coded; it looks like DeepAI's public
# quickstart key -- prefer loading a personal key from the environment.
r = requests.post(
    "https://api.deepai.org/api/stable-diffusion",
    data={
        'text': txt_tr,
    },
    headers={'api-key': 'quickstart-QUdJIGlzIGNvbWluZy4uLi4K'},
    timeout=300,  # generous limit so a stalled request cannot hang the notebook
)
# Surface HTTP errors (bad key, quota, ...) here instead of as a KeyError below.
r.raise_for_status()
payload = r.json()
if 'output_url' not in payload:
    raise RuntimeError(f"DeepAI response contains no image URL: {payload}")

# Download the image fully and close the connection before decoding it.
with urllib.request.urlopen(payload['output_url'], timeout=60) as resp:
    im = Image.open(io.BytesIO(resp.read()))
display(im)

### Choose an image to save

In [None]:
import re

# Quadrant of the generated image to keep:
# 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
choice = 3

# Without this check an invalid choice would leave `im1` undefined and fail
# later with a NameError.
if choice not in range(4):
    raise ValueError(f"choice must be 0, 1, 2 or 3, got {choice!r}")

w, h = im.size
half_w, half_h = w // 2, h // 2
row, col = divmod(choice, 2)
# The right/bottom quadrants extend to the full width/height, so an odd
# dimension keeps its last pixel row/column.
box = (
    col * half_w,
    row * half_h,
    w if col else half_w,
    h if row else half_h,
)
im1 = im.crop(box)

# The prompt becomes the file name: spaces turn into underscores, and
# characters that are invalid in file names on common systems (path
# separators, quotes, wildcards, control characters) are replaced as well.
file_stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", txt_tr.replace(" ", "_"))
im1.save((file_stem or "image") + ".png")