
# Overview

This is a prototype of a pipeline that fixes a slow speech rate by adding speech breaks between sentences.

Known issue: sometimes the resulting waveform is longer than the original length plus the breaks.

Workaround: this results in longer clips that overlap with the following event. The overlap can be seen in the logs, and the speech breaks can be adjusted accordingly.
Add breaks before each sentence and at the end of the sentences, to avoid long, weird pauses and an occasionally truncated end of the clip.

In [None]:
!mkdir /tmp/freespeech

In [None]:
import asyncio

from dataclasses import replace
from typing import Sequence

from freespeech.lib import gdocs, media, speech
from freespeech.types import Event, Language


async def synthesize_events(
    events: Sequence[Event],
    rate: float,
    lang: Language,
    output_dir: str = "/tmp/freespeech",
) -> list[dict]:
    """Synthesize each event at a fixed speech rate and measure the result.

    Events are processed one after another. For every event the chunks are
    joined with a single space, synthesized with the event's voice whose
    ``speech_rate`` is overridden by ``rate``, and the produced file is probed
    for its real duration.

    Args:
        events: Events to synthesize.
        rate: Speech rate applied to every event's voice.
        lang: Language passed through to ``speech.synthesize_text``.
        output_dir: Directory passed through to ``speech.synthesize_text``
            as the destination for generated files.

    Returns:
        One dict per event, in input order, with the keys:
        ``actual`` (probed duration), ``target`` (``event.duration_ms``),
        ``delta`` (target - actual, ``None`` without a target),
        ``ratio`` (actual / target, ``None`` without a positive target),
        ``event``, ``voice`` and ``file``.
    """
    results = []
    for event in events:
        file, voice = await speech.synthesize_text(
            " ".join(event.chunks),
            # NOTE(review): None presumably means "do not fit to a duration" - confirm.
            duration_ms=None,
            voice=replace(event.voice, speech_rate=rate),
            lang=lang,
            output_dir=output_dir,
        )
        # Only the first element of the first sequence returned by probe is used.
        ((audio, *_), *_) = media.probe(file)
        actual = audio.duration_ms
        target = event.duration_ms
        results.append({
            "actual": actual,
            "target": target,
            "delta": target - actual if target is not None else None,
            # A missing or zero target has no meaningful ratio (and would
            # otherwise raise ZeroDivisionError).
            "ratio": actual / target if target else None,
            "event": event,
            "voice": voice,
            "file": file,
        })
    return results


transcript = gdocs.load("https://docs.google.com/document/d/1CVnL1fbBjLOPNAwwfReZuGVm34FPpE7ZKrWMqKOD6lY/edit#")
rates = [0.9, 1.0]
res = [await synthesize_events(transcript.events, rate=rate, lang=transcript.lang) for rate in rates]


In [None]:
import re
import spacy


# Shared spaCy pipeline; fix_breaks() runs text through it and uses its
# "senter" component to obtain sentence boundaries.
nlp = spacy.load("en_core_web_sm")


def fix_breaks(text):
    """Redistribute the break time found in *text* evenly over its sentences.

    Break markers are numbers wrapped in hash signs, e.g. ``#1.5#`` or ``#2#``
    (presumably seconds; confirm against the synthesizer). All markers are
    summed and stripped, the remaining text is split into sentences with the
    pipeline's "senter" component, and every non-empty sentence is prefixed
    with an equal share of the total, formatted with two decimals.

    Args:
        text: Text that may contain break markers.

    Returns:
        The sentences joined by single spaces, each preceded by its break
        marker. If no sentence is found (empty text, or markers only) the
        input is returned unchanged, since there is nothing to redistribute
        the breaks over.
    """
    breaks = re.compile(r"#(\d+(\.\d+)?)#")
    # findall yields one tuple per marker (outer group, optional fraction);
    # only the outer group holds the full number.
    total_breaks = sum(float(value) for value, *_ in breaks.findall(text))
    doc = nlp(breaks.sub("", text))
    senter = nlp.get_pipe("senter")
    sentences = [sentence for span in senter(doc).sents if (sentence := span.text.strip())]

    if not sentences:
        # Avoid dividing by zero when there is no sentence to attach a break to.
        return text

    per_sentence = total_breaks / len(sentences)
    return ' '.join(f"#{per_sentence:.2f}# {sentence}" for sentence in sentences)


# Manual check of fix_breaks: the input carries two markers (10.87 and 1.0,
# 11.87 in total) that should come back split evenly across the sentences.
fix_breaks("#10.87# Well, I wanted to start. Let me begin. #1.0# My name is Sergey, I'm a Product Owner, I've been working with DataArt for more than a year on different projects. Why am I here? Actually, because I believe that each of us can always learn something better. Those who we are today can become better tomorrow. That's actually what I came here with in mind.")



In [None]:
def fix_event(event, actual, target, voice, min_ratio=0.8, margin_ms=200):
    """Pad an event with speech breaks when its audio is much shorter than planned.

    When the synthesized audio covers less than ``min_ratio`` of the target
    duration, the missing time (minus ``margin_ms``) is prepended to the text
    as a break marker and ``fix_breaks`` spreads it over the sentences.

    Args:
        event: Event to fix; its ``chunks`` are joined with single spaces.
        actual: Duration of the synthesized audio, in milliseconds.
        target: Desired duration in milliseconds, or ``None`` if unconstrained.
        voice: Voice stored on the returned event when it is rewritten.
        min_ratio: ``actual / target`` below which the event is rewritten.
        margin_ms: Milliseconds withheld from the added break (presumably
            headroom against overrunning the slot; confirm).

    Returns:
        ``event`` itself when there is no positive target or the audio is long
        enough; otherwise a copy with a single rewritten chunk,
        ``duration_ms=None`` and the given ``voice``.
    """
    # No target, or a non-positive one: nothing to stretch towards, and the
    # ratio below would be undefined (division by zero) or meaningless.
    if target is None or target <= 0:
        return event

    if actual / target >= min_ratio:
        return event

    # For short targets the margin can exceed the gap; a negative marker like
    # "#-0.09#" is not recognised by fix_breaks and would leak into the text.
    pause_ms = max(target - actual - margin_ms, 0)
    text = f"#{pause_ms / 1000:.2f}# {' '.join(event.chunks)}"
    return replace(event, chunks=[fix_breaks(text)], duration_ms=None, voice=voice)


# Mirror the shape of `res`: one list per batch, one (possibly rewritten)
# event per measurement in that batch.
fixed_events = [
    [
        fix_event(
            event=item["event"],
            actual=item["actual"],
            target=item["target"],
            voice=item["voice"],
        )
        for item in batch
    ]
    for batch in res
]