# Demo the Phonemizer

In [1]:
from glados.TTS import phonemizer
import glados.utils.spoken_text_converter as stc

In [2]:
# Build the objects for the demo cells. `p` is used by the phonemizer demo cell
# that follows; `glados_stc` is not used until the Text-to-Speech section,
# which constructs it again.
p = phonemizer.Phonemizer()
glados_stc = stc.SpokenTextConverter()

In [None]:
%%time
input = "Hello CPU, its 3:15 am! if you'll excuse me I'm GLaDOS, not GLadys."
phonemes = p.convert_to_phonemes(input)
print(phonemes)

# Demo the Text-to-Speech module
### GLaDOS Voice

In [5]:
import sounddevice as sd

from glados.TTS import tts_glados as tts
import glados.utils.spoken_text_converter as stc

In [6]:
# Create the GLaDOS synthesizer and the spoken-text converter once, so the
# timed demo cell below measures only conversion and synthesis.
glados_tts = tts.Synthesizer()
glados_stc = stc.SpokenTextConverter()

In [None]:
%%time
input = "Hello, this is Glados, your fiendish assistant. Please upgrade your GPU!"

# Convert the text to intermediate representation that the TTS model can better pronounce
intermediate = glados_stc.text_to_spoken(input)
print(intermediate)

# Generate the audio to from the text
audio = glados_tts.generate_speech_audio(intermediate)

# Play the audio
sd.play(audio, glados_tts.sample_rate)

### Save audio file

In [None]:
# Write the most recently generated `audio` to disk as a 16-bit PCM WAV file.
# NOTE(review): the sample rate is taken from `glados_tts`, so this assumes
# `audio` still holds the GLaDOS output - confirm if the Kokoro cell ran last.
import soundfile as sf

sf.write(
    file="output.wav",
    data=audio,
    samplerate=glados_tts.sample_rate,
    format="WAV",
    subtype="PCM_16",
)

### Kokoro Voice

Select from:
 - Female
   - **US** - af_alloy, af_aoede, af_bella, af_jessica, af_kore, af_nicole, af_nova, af_river, af_sarah, af_sky
   - **British** - bf_alice, bf_emma, bf_isabella, bf_lily
 - Male
   - **US** - am_adam, am_echo, am_eric, am_fenrir, am_liam, am_michael, am_onyx, am_puck
   - **British** - bm_daniel, bm_fable, bm_george, bm_lewis


In [10]:
import sounddevice as sd

from glados.TTS import tts_kokoro as ktts
import glados.utils.spoken_text_converter as stc

In [11]:
# Create the Kokoro synthesizer from an explicit ONNX model path (relative to
# the working directory) plus its own spoken-text converter.
kokoro_tts = ktts.Synthesizer(model_path="./models/TTS/kokoro-v1.0.fp16.onnx")
kokoro_stc = stc.SpokenTextConverter()

In [None]:
%%time

voice = "af_bella"
input = "Hello, this is Glados, your fiendish assistant. Please upgrade your GPU!"

# Convert the text to intermediate representation that the TTS model can better pronounce
intermediate = kokoro_stc.text_to_spoken(input)
print(intermediate)

# Generate the audio to from the text
audio = kokoro_tts.generate_speech_audio(intermediate, voice=voice)

# Play the audio
sd.play(audio, kokoro_tts.sample_rate)

# Demo the Automatic Speech Recognition system


In [11]:
from glados.ASR import asr

In [12]:
# Create the transcriber once, outside the timed cell below.
transcriber = asr.AudioTranscriber()
# Sample recording to transcribe, given relative to the working directory.
audio_path = "data/0.wav"

In [None]:
%%time
# Transcribe the file named by `audio_path` and print the result;
# %%time reports how long the cell takes.
transcription = transcriber.transcribe_file(audio_path)
print(f"Transcription: {transcription}")

# Demo the Vision System

In [1]:
from pathlib import Path

import numpy as np
from PIL import Image

from glados.Vision import ModelLoader, PromptProcessor, TextGenerator
from glados.Vision.image_preprocessor import ImagePreprocessor

%load_ext autoreload
%autoreload 2

In [2]:
# Open the sample picture used throughout the vision demo (path is relative
# to the working directory).
image = Image.open("data/glados.jpeg")

In [None]:
# Bare last expression: the notebook displays the loaded image as the cell output.
image

In [24]:

# Initialize components
model_dir = Path("./models/Vision")

processor = PromptProcessor()
model_loader = ModelLoader(model_dir=model_dir)
generator = TextGenerator(model_loader, processor)

# Prepare inputs: a single user turn holding an image placeholder and a question
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you describe this image?"}
        ]
    },
]
image = Image.open("data/glados.jpeg")

# Generate with optional streaming
def print_stream(text: str) -> None:
    """Print one piece of generated text without appending a newline."""
    print(text, end='')

# NOTE(review): the output saved with this cell ends in
# `ValueError: cannot broadcast source array for assignment`.
# np.asarray([image]) builds a 4-D array (the saved debug print opens with four
# brackets), whereas a later cell passes a list holding one 3-D array,
# [np.asarray(image)], to processor.preprocess without error.
# TODO confirm which form generate() expects and adjust this call.
response = generator.generate(messages, np.asarray([image]), callback=print_stream)
print(f"\nFinal response: {response}")

image_config {'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
image [[[[ 60  94 104]
   [ 60  94 104]
   [ 60  94 104]
   ...
   [ 42  69  80]
   [ 42  69  80]
   [ 43  70  81]]

  [[ 60  94 104]
   [ 60  94 104]
   [ 60  94 104]
   

ValueError: cannot broadcast source array for assignment

In [25]:
# Run only the prompt/image preprocessing step so its output can be inspected
# in the cells that follow.
processor = PromptProcessor()
image = Image.open("data/glados.jpeg")

prompt = "Can you describe this image?"
image_batch = [np.asarray(image)]  # list holding the single image as an array
x = processor.preprocess(prompt, image_batch)

image_config {'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
image [[[ 60  94 104]
  [ 60  94 104]
  [ 60  94 104]
  ...
  [ 42  69  80]
  [ 42  69  80]
  [ 43  70  81]]

 [[ 60  94 104]
  [ 60  94 104]
  [ 60  94 104]
  ...
  [ 42 

In [29]:
# Check that the preprocessed pixel data is a NumPy array.
# The original compared against a bare name `N` that no cell in this notebook
# defines, so it raises NameError on a fresh kernel; the saved `True` output
# came from leftover kernel state.
isinstance(x['pixel_values'], np.ndarray)

True

In [27]:
# List every entry produced by the preprocessor with its shape and dtype.
for field in x:
    entry = x[field]
    print(field, entry.shape, entry.dtype)

pixel_values (1, 13, 3, 512, 512) float32
pixel_attention_mask (1, 1, 512, 512) bool
input_ids (1, 872) int64
attention_mask (1, 872) int64


In [15]:
from glados.Vision.image_preprocessor import ImagePreprocessor
import json

In [16]:
# Read the image-preprocessing settings and build the preprocessor from them.
# The context manager closes the file as soon as it has been parsed; the
# original passed a bare open() to json.load and never closed the handle.
with open("models/Vision/preprocessor_config.json") as config_file:
    config = json.load(config_file)
print(config)
imp = ImagePreprocessor(config)

{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}


In [17]:
# Run the image preprocessor directly on the image array.
# This rebinds `x`, which an earlier cell assigned from processor.preprocess.
x = imp.preprocess_image(np.asarray(image))

here


In [23]:
# Shape of the raw image as an array. The saved output is (1080, 1920, 3) -
# presumably (height, width, channels); confirm if it matters.
np.asanyarray(image).shape

(1080, 1920, 3)

In [19]:
# Print the shape of each array returned by the image preprocessor.
for field in ('pixel_values', 'pixel_attention_mask'):
    print(x[field].shape)


(1, 13, 3, 512, 512)
(1, 1, 512, 512)


In [None]:
# NOTE(review): this cell holds only two dict literals whose text matches the
# preprocessor config printed earlier in the notebook. It looks like pasted
# output rather than code - consider deleting the cell.
{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}
{'do_convert_rgb': True, 'do_image_splitting': True, 'do_normalize': True, 'do_pad': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'Idefics3ImageProcessor', 'image_std': [0.5, 0.5, 0.5], 'max_image_size': {'longest_edge': 512}, 'processor_class': 'Idefics3Processor', 'resample': 1, 'rescale_factor': 0.00392156862745098, 'size': {'longest_edge': 2048}}


In [None]:
import matplotlib.pyplot as plt

# Pick entry [0, 12] of the preprocessed pixel data and move its first axis
# (size 3 in the saved shape, presumably colour channels) to the end, which is
# the layout plt.imshow expects for colour images.
tile = x['pixel_values'][0, 12]
img = tile.transpose(1, 2, 0)

# normalise image: min-max rescale into [0, 1] for display
lowest, highest = img.min(), img.max()
img = (img - lowest) / (highest - lowest)

plt.imshow(img)

In [None]:
# Same display as the previous cell, for entry [0, 2] of the pixel data.
tile = x['pixel_values'][0, 2]
img = tile.transpose(1, 2, 0)  # move the leading size-3 axis to the end

# normalise image: shift so the minimum is 0, then scale by the value range
offset = img.min()
value_range = img.max() - offset
img = (img - offset) / value_range

plt.imshow(img)

In [None]:
# Shape of entry 11 along the second axis of the pixel data.
# The saved shape is (1, 13, 3, 512, 512): axis 0 has length 1, so the
# original `[11, :, :, :]` raises IndexError. Index axis 0 with 0 first,
# as the plotting cells above do.
x['pixel_values'][0, 11].shape

In [None]:
# Build an all-ones array sized by axes 0, 1, 3 and 4 of the pixel data
# (every axis except axis 2). `.shape` is a plain tuple, so the original
# `shape[0,1,3,4]` raises TypeError; pick the wanted axes one at a time.
ones_shape = tuple(x['pixel_values'].shape[axis] for axis in (0, 1, 3, 4))
np.ones(ones_shape)