In [11]:
from allosaurus.app import read_recognizer
model = read_recognizer()

In [50]:
import os
import yaml
from PIL import Image

class Frames:
    def __init__(self, config_file: str):
        self.folder = os.path.dirname(config_file)
        self.data = yaml.load(open(config_file,'r',encoding='utf8'), yaml.FullLoader)
        self.image_cache = {}
    
    def get_image_filename(self, phoneme: str) -> str:
        for image_data in self.data['images']:
            if phoneme in image_data['symbols']:
                return self.folder + os.path.sep + image_data['image']
        return None

    def get_image(self, phoneme: str) -> Image.Image:
        filename = self.get_image_filename(phoneme)
        if filename is None:
            return None

        if filename in self.image_cache:
            return self.image_cache[filename]
        else:
            image = Image.open(filename)
            self.image_cache[filename] = image
            return image
    
    def get_symbols(self):
        for image_data in self.data['images']:
            for symbol in image_data['symbols']:
                yield symbol

    def has_phoneme_image(self, phoneme: str):
        for image_data in self.data['images']:
            if phoneme in image_data['symbols']:
                return True
        return False

frames = Frames('test_config/config.yml')
list(frames.get_symbols())

['-',
 'a',
 'ɐ',
 'ɑ',
 'ɒ',
 'æ',
 'ɶ',
 'ɛ',
 'œ',
 'ɜ',
 'ɞ',
 'ʌ',
 'e',
 'ɘ',
 't͡ʃʲ',
 'o',
 'ɔ',
 'ø',
 'ɤ',
 'uə',
 'ʂ',
 's',
 'ŋ',
 'l',
 'm',
 'b',
 'p',
 'q',
 'w',
 'u',
 'ɯ',
 'ʊ',
 'ʉ',
 'ɻ̩',
 't͡',
 'r',
 'ʃ',
 'i',
 'ɪ',
 'ɨ',
 'ʏ',
 'ᵻ',
 'y',
 'ɨ']

In [51]:
class Phoneme:
    start_time: float
    duration: float
    phoneme: str

    def __init__(self, start_time: float, duration: float, phoneme: str):
        self.start_time = start_time
        self.duration = duration 
        self.phoneme = phoneme

    def __str__(self):
        return f"Phoneme({self.start_time}, {self.duration}, {self.phoneme})"

In [63]:
result = model.recognize("virus_mixagem_3_chunk22.wav", timestamp=True)
silence_duration_threshold = 0.25
silence_duration = 0.05
phonemes = []
for str_phoneme in result.split('\n'):
    values = str_phoneme.split(' ')
    phonemes.append(Phoneme(float(values[0]), float(values[1]), values[2]))

for i, phoneme in enumerate(phonemes):
    if i == len(phonemes)-1:
        duration = phoneme.duration
    else:
        duration = phonemes[i+1].start_time - phoneme.start_time
    phoneme.duration = duration
    if duration > silence_duration_threshold and phonemes[i-1].phoneme != '-':
        phoneme.duration -= silence_duration
        phonemes.insert(i+1, Phoneme(phoneme.start_time + phoneme.duration, silence_duration, "-"))

# Add silence to end.
phonemes.append(Phoneme(phonemes[-1].start_time + phonemes[-1].duration, 0.1, "-"))

for phoneme in phonemes:
    print(phoneme)

Phoneme(0.06, 0.03, ɳ)
Phoneme(0.09, 0.09, ɒ)
Phoneme(0.18, 0.09000000000000002, u)
Phoneme(0.27, 0.02999999999999997, p)
Phoneme(0.3, 0.18, uə)
Phoneme(0.48, 0.08999999999999997, ʂ)
Phoneme(0.57, 0.2800000000000001, ɻ̩)
Phoneme(0.8500000000000001, 0.04999999999999993, -)
Phoneme(0.9, 0.05999999999999994, m)
Phoneme(0.96, 0.20999999999999996, a)
Phoneme(1.17, 0.09000000000000008, s)
Phoneme(1.26, 0.030000000000000027, l)
Phoneme(1.29, 0.06000000000000005, u)
Phoneme(1.35, 0.05999999999999983, ŋ)
Phoneme(1.41, 0.09000000000000008, ɡ̤)
Phoneme(1.5, 0.2800000000000001, a)
Phoneme(1.78, 0.050000000000000044, -)
Phoneme(1.83, 0.20999999999999996, ɳ)
Phoneme(2.04, 0.08999999999999986, s)
Phoneme(2.13, 0.1499999999999999, ɒ)
Phoneme(2.28, 0.06000000000000005, ɡ)
Phoneme(2.34, 0.18000000000000016, o)
Phoneme(2.52, 0.029999999999999805, b̤)
Phoneme(2.55, 0.4000000000000002, uə)
Phoneme(2.95, 0.04999999999999982, -)
Phoneme(3.0, 0.06000000000000005, s)
Phoneme(3.06, 0.08999999999999986, i)
Phone

In [66]:
available_symbols = list(frames.get_symbols())
def update_phoneme_list(phonemes):
    last_available_phoneme = available_symbols[0]
    for phoneme in phonemes:
        if phoneme.phoneme in available_symbols:
            last_available_phoneme = phoneme.phoneme
        else:
            phoneme.phoneme = last_available_phoneme

updated_phonemes = phonemes.copy()
update_phoneme_list(updated_phonemes)
[phoneme.phoneme for phoneme in updated_phonemes]

['-',
 'ɒ',
 'u',
 'p',
 'uə',
 'ʂ',
 'ɻ̩',
 '-',
 'm',
 'a',
 's',
 'l',
 'u',
 'ŋ',
 'ŋ',
 'a',
 '-',
 '-',
 's',
 'ɒ',
 'ɒ',
 'o',
 'o',
 'uə',
 '-',
 's',
 'i',
 'b',
 'b',
 'b',
 '-',
 't͡ʃʲ',
 '-']

In [15]:
https://media1.thehungryjpeg.com/thumbs2/ori_3489187_fa8c67ce7d99079b813850ea43a09a7adb42c3ec_expressive-cartoon-articulation-mouth-lips-lip-sync-animation-phonem.jpg

SyntaxError: invalid syntax (266652320.py, line 1)

In [64]:
from moviepy.editor import ImageClip, concatenate_videoclips, AudioFileClip

def add_phoneme_with_duration_to_video(clips, phoneme, duration):
    clip = ImageClip(frames.get_image_filename(phoneme), transparent=True, duration=duration)
    clips.append(clip)


def generate_video(phonemes):
    clips = []
    duration = 0.0
    for i, phoneme in enumerate(phonemes):
        if i == len(phonemes)-1:
            duration = phoneme.duration
        else:
            duration = phonemes[i+1].start_time - phoneme.start_time
        if frames.has_phoneme_image(phoneme.phoneme):
            add_phoneme_with_duration_to_video(clips, phoneme.phoneme, duration)
        else:
            # No initial image
            if i == 0:
                add_phoneme_with_duration_to_video(clips, '-', duration)
            else:
                add_phoneme_with_duration_to_video(clips, phonemes[i].phoneme, duration)
    return clips
    

In [65]:
clips = generate_video(updated_phonemes)
video = concatenate_videoclips(clips, method="compose")
audio = AudioFileClip("virus_mixagem_3_chunk22.wav")
video.audio = audio
video.write_videofile("test.mp4", fps=24)

Moviepy - Building video test.mp4.
MoviePy - Writing audio in testTEMP_MPY_wvf_snd.mp3


                                                       

MoviePy - Done.
Moviepy - Writing video test.mp4



                                                            

Moviepy - Done !
Moviepy - video ready test.mp4
