In [5]:
from freespeech.lib import gdocs


transcript = gdocs.load("https://docs.google.com/document/d/1RT3DFxDGZrZpIHc68p-Qy4VpEqk_Pat4Rufv958qbE8/edit#")

In [6]:
transcript.events

[Event(time_ms=170, chunks=['Was ist los, Leute? Willkommen zurück. #0.1#'], duration_ms=2460, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(time_ms=2630, chunks=['Mein Name ist Scott Moss. '], duration_ms=1220, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(time_ms=3850, chunks=['In diesem Video machen wir etwas ganz anderes. #0.2#'], duration_ms=3020, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(time_ms=6870, chunks=['Ich wollte ein wenig Abwechslung in die Sache bringen. '], duration_ms=1680, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(time_ms=8550, chunks=['Ich habe mich ein wenig inspiriert gefühlt. '], duration_ms=1440, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(time_ms=9990, chunks=['Vor ein paar Tagen traf ich einen Freund von mir, der Software-Ingenieur ist, hier in der Bay Area.'], duration_ms=3490, voice=Voice(character='Alan', pitch=0.0, speech_rate=1.2)),
 Event(ti

In [7]:
import re
from itertools import zip_longest

from dataclasses import dataclass
from tempfile import TemporaryDirectory
from freespeech.types import Character, Event, Language
from freespeech.lib import text, speech, audio, media


@dataclass(frozen=True)
class Interval:
    speech_ms: int
    silence_ms: int
    rate: float
    character: Character
    outline: list[str | int]


def merge(a: Interval, b: Interval) -> Interval:
    speech_ms = a.speech_ms + b.speech_ms
    rate = (a.rate * a.speech_ms + b.rate * b.speech_ms) / speech_ms
    silence_ms = a.silence_ms + b.silence_ms

    if a.character != b.character:
        raise ValueError(f"character in a and b should be the same")

    return Interval(
        silence_ms=silence_ms,
        speech_ms=speech_ms,
        rate=rate,
        outline=a.outline + b.outline,
        character=a.character
    )


def adjust(a: Interval, base_rate: float) -> Interval:
    speech_ms = min(
            a.speech_ms + a.silence_ms,  # can't be longer than total duration
            a.speech_ms * (a.rate / base_rate)
        )
    silence_ms = (a.speech_ms + a.silence_ms) - speech_ms
    rate = a.rate * (a.speech_ms / speech_ms)

    return Interval(
        speech_ms=round(speech_ms),
        silence_ms=round(silence_ms),
        rate=rate,
        character=a.character,
        outline=[
            chunk if isinstance(chunk, str) else round(chunk * (silence_ms / a.silence_ms))
            for chunk in a.outline
        ]
    )


def chunk_paragraph(s: str, lang: Language) -> list[str | int]:
    split = re.split(r"#(\d+(\.\d+)?)#", s)
    outline = [
        [
            *text.sentences(s, lang=lang),
            round((float(pause) if pause else 0.0) * 1000)
        ] for s, pause in zip_longest(split[0::3], split[1::3], fillvalue="")
        if s or pause
    ]

    return sum(outline, [])


async def interval(event: Event, lang: Language) -> Interval:
    paragraph = " ".join(event.chunks)
    outline = chunk_paragraph(paragraph, lang)
    with TemporaryDirectory() as tmp_dir:
        clips = [
            audio.strip((await speech.synthesize_text(
                text=chunk,
                duration_ms=None,
                voice=event.voice,
                lang=lang,
                output_dir=tmp_dir
            ))[0]) if isinstance(chunk, str) else chunk
            for chunk in outline
        ]
        speech_ms = sum(media.audio_duration(clip) for clip in clips if not isinstance(clip, int))
        silence_ms = sum(clip for clip in clips if isinstance(clip, int))
        rate = event.voice.speech_rate
        character = event.voice.character

    return Interval(
        speech_ms=speech_ms,
        silence_ms=silence_ms,
        rate=rate,
        character=character,
        outline=outline
    )


In [8]:
from itertools import zip_longest


s = "Was ist los, Leute? Willkommen zurück #0.1#"
chunk_paragraph(s, lang="de-DE")

['Was ist los, Leute?', 'Willkommen zurück', 100]

In [10]:
i = await interval(transcript.events[0], lang=transcript.lang)

In [11]:
adjust(i, 1.2)

Interval(speech_ms=2275, silence_ms=100, rate=1.2, character='Alan', outline=['Was ist los, Leute?', 'Willkommen zurück.', 100])

In [3]:
chunk_paragraph("#1.0# Hello world! #1.0#", lang="en-US")

[1000, 'Hello world!', 1000]

In [30]:
from itertools import permutations


def all_merges(intervals: list[Interval], n: int) -> list[tuple[bool]]:
    return sum((
        list(set(permutations([True] * i + [False] * (len(intervals) - i - 1))))
        for i in range(n)), [])

In [31]:
def merge_intervals(intervals: list[Interval], flags: tuple[bool]) -> list[Interval]:
    intervals = intervals.copy()
    acc = [intervals.pop(0)]

    for interval, should_merge in zip(intervals, flags):
        if not should_merge:
            acc += [interval]
        else:
            acc += [merge(interval, acc.pop())]

    return acc

In [32]:
solutions = [merge_intervals(intervals, flags) for flags in all_merges(intervals, len(intervals))]

In [33]:
solutions

[[Interval(speech_ms=10, pause_ms=0, speech_rate=1.0),
  Interval(speech_ms=10, pause_ms=0, speech_rate=2.0),
  Interval(speech_ms=20, pause_ms=10, speech_rate=1.0),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=20, pause_ms=0, speech_rate=1.5),
  Interval(speech_ms=20, pause_ms=10, speech_rate=1.0),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=10, pause_ms=0, speech_rate=1.0),
  Interval(speech_ms=10, pause_ms=0, speech_rate=2.0),
  Interval(speech_ms=40, pause_ms=10, speech_rate=1.5)],
 [Interval(speech_ms=10, pause_ms=0, speech_rate=1.0),
  Interval(speech_ms=30, pause_ms=10, speech_rate=1.3333333333333333),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=20, pause_ms=0, speech_rate=1.5),
  Interval(speech_ms=40, pause_ms=10, speech_rate=1.5)],
 [Interval(speech_ms=40, pause_ms=10, speech_rate=1.25),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=10, pause_ms=0, 

In [35]:
[[adjust(interval, base_rate=1.3) for interval in intervals] for intervals in solutions]

[[Interval(speech_ms=8, pause_ms=2, speech_rate=1.3),
  Interval(speech_ms=10, pause_ms=0, speech_rate=2.0),
  Interval(speech_ms=15, pause_ms=15, speech_rate=1.3),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=20, pause_ms=0, speech_rate=1.5),
  Interval(speech_ms=15, pause_ms=15, speech_rate=1.3),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=8, pause_ms=2, speech_rate=1.3),
  Interval(speech_ms=10, pause_ms=0, speech_rate=2.0),
  Interval(speech_ms=46, pause_ms=4, speech_rate=1.3000000000000003)],
 [Interval(speech_ms=8, pause_ms=2, speech_rate=1.3),
  Interval(speech_ms=31, pause_ms=9, speech_rate=1.3),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=20, pause_ms=0, speech_rate=1.5),
  Interval(speech_ms=46, pause_ms=4, speech_rate=1.3000000000000003)],
 [Interval(speech_ms=38, pause_ms=12, speech_rate=1.3),
  Interval(speech_ms=20, pause_ms=0, speech_rate=2.0)],
 [Interval(speech_ms=8, pause

In [24]:
merge(
    Interval(10, 0, 1.0),
    Interval(10, 10, 2.0),
)

Interval(speech_ms=20, pause_ms=10, speech_rate=1.5)

In [18]:
adjust_speech_rate(Interval(5, 10, 2.0), 1.0)

Interval(signal_ms=10, silence_ms=5, speech_rate=1.0)