In [1]:
%pip install hypothesis

Collecting hypothesis
  Downloading hypothesis-6.47.3-py3-none-any.whl (387 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.2/387.2 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sortedcontainers<3.0.0,>=2.1.0
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting exceptiongroup>=1.0.0rc8
  Downloading exceptiongroup-1.0.0rc8-py3-none-any.whl (11 kB)
Installing collected packages: sortedcontainers, exceptiongroup, hypothesis
Successfully installed exceptiongroup-1.0.0rc8 hypothesis-6.47.3 sortedcontainers-2.4.0
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
import json
import random
from dataclasses import dataclass, asdict
from pathlib import Path
from string import Formatter
from typing import Dict, Iterator, Literal, Sequence, Tuple, List

from hypothesis.strategies import from_regex

# Number of utterances requested per intent (before de-duplication).
N = 100

# Value pools that the random entity generators sample from.
LANGUAGES = [
    "Russian",
    "English",
    "Spanish",
    "Ukrainian",
    "German",
    "Portuguese",
]
METHODS = [
    "Machine A",
    "Machine B",
    "Subtitles",
]
VOICES = [
    "Alan Turing",
]

# Every slot name that can label a span of an utterance. This must list each
# key of ENTITIES, because those keys are what end up in
# EntityRecord.category.
Entity = Literal["language", "url", "method", "voice"]

# One zero-argument callable per slot name; each call draws a fresh random
# value for that slot.
ENTITIES = {
    "language": lambda: random.choice(LANGUAGES),
    "method": lambda: random.choice(METHODS),
    # Draws a YouTube-style URL from the regex. strip() drops any surrounding
    # whitespace from the generated example (presumably a trailing newline,
    # which `$` permits -- TODO confirm).
    "url": lambda: from_regex(r"^((https://www\.youtube\.com/watch\?v=)|(https://youtu\.be/))[A-Za-z0-9_]+$").example().strip(),
    "voice": lambda: random.choice(VOICES),
}

# Utterance templates grouped by intent. `{name}` placeholders are filled
# with str.format from the generated entity values; a template may use only
# a subset of the available entities.
PHRASES = {
    # Produce a transcript for a video.
    "transcribe": [
        "{url} with {language} {method}",
        "{url} with {language} transcript using {method}",
        "load {url} with {language} {method}",
        "load {url} with {language} transcript using {method}",
        "create transcript from {url} using {language} {method}",
        "create transcript from {url} and use {language} {method}",
        "Create {language} transcript for {url} using {method}",
        "Create {language} transcript for {url} from {method}",
        "transcribe {url} from {language} using {method}",
    ],
    # Dub a video, optionally naming the voice.
    "dub": [
        "dub {url} with {voice}",
        "dub using transcript {url} with {voice}",
        "dub using {url}",
        "hey, {voice} dub me a {url}",
    ],
    # Discard an existing transcript.
    "reset": [
        "reset transcript {url}",
        "reset {url}",
    ],
    # Translate into a target language.
    "translate": [
        "translate {url} to {language}",
    ],
}


@dataclass(frozen=True)
class EntityRecord:
    """Location of one entity value inside an utterance's text.

    Instances are immutable (``frozen=True``).
    """

    # Name of the entity slot the value was generated for.
    category: Entity
    # Index in the utterance text at which the value starts.
    offset: int
    # Length of the value, in characters.
    length: int


@dataclass(frozen=True)
class UtteranceRecord:
    """One labelled example utterance; serialised to JSON via ``asdict``."""

    # Intent label the utterance was generated for.
    intent: str
    # Language tag of the text; create() always passes "en-us".
    language: str
    # The rendered utterance.
    text: str
    # Entity spans located within `text`.
    entities: List[EntityRecord]


def example(template: str, entities: Dict[str, str]) -> Tuple[str, Sequence[EntityRecord]]:
    """Render ``template`` and locate every entity it actually references.

    Offsets are computed while the template is rendered, field by field,
    rather than by searching the finished text for each value. Searching
    would tag entities the template never used whenever their value happens
    to occur in the text, and would report the wrong offset when a value
    also occurs earlier in the text (e.g. ``"reset {url}"`` with url
    ``"set"``).

    Args:
        template: A ``str.format`` template whose replacement fields are
            entity names, e.g. ``"translate {url} to {language}"``.
        entities: Candidate value for each entity name. Names the template
            does not reference are ignored.

    Returns:
        ``(utterance, records)``: the rendered text and one ``EntityRecord``
        per entity referenced by the template, listed in the iteration order
        of ``entities``. A field used more than once is reported at its
        first occurrence.

    Raises:
        KeyError: If the template references a name missing from
            ``entities``.
    """
    formatter = Formatter()
    pieces: List[str] = []
    spans: Dict[str, Tuple[int, int]] = {}  # entity name -> (offset, length)
    cursor = 0  # length of the text rendered so far

    for literal, field, spec, conversion in formatter.parse(template):
        pieces.append(literal)
        cursor += len(literal)
        if field is None:
            # Plain literal text with no replacement field after it.
            continue

        obj, name = formatter.get_field(field, (), entities)
        # A format spec may itself contain replacement fields; expand them
        # the same way str.format would.
        spec = formatter.vformat(spec, (), entities) if spec else ""
        rendered = formatter.format_field(formatter.convert_field(obj, conversion), spec)

        # Keep only the first occurrence of a repeated field.
        spans.setdefault(name, (cursor, len(rendered)))
        pieces.append(rendered)
        cursor += len(rendered)

    utterance = "".join(pieces)
    return utterance, [
        EntityRecord(category=name, offset=spans[name][0], length=spans[name][1])
        for name in entities
        if name in spans
    ]


def create(intent: str, n: int) -> Iterator[UtteranceRecord]:
    """Lazily generate ``n`` random utterances for ``intent``.

    Each utterance picks a random template from ``PHRASES[intent]`` and draws
    a fresh value from every generator in ``ENTITIES``; ``example`` then
    renders the text and locates the entities.

    Args:
        intent: Key into ``PHRASES``.
        n: Number of records to yield.

    Yields:
        ``UtteranceRecord`` instances with ``language`` fixed to ``"en-us"``.

    Raises:
        KeyError: While iterating, if ``intent`` is not a key of ``PHRASES``.
            Nothing is evaluated until the generator is consumed.
    """
    for _ in range(n):
        # Pick the template before drawing values, keeping the order in which
        # the random sources are consumed.
        template = random.choice(PHRASES[intent])
        values = {name: draw() for name, draw in ENTITIES.items()}
        text, entities = example(template, values)
        yield UtteranceRecord(
            intent=intent,
            language="en-us",
            text=text,
            entities=list(entities),
        )


# NOTE: absolute, environment-specific location; adjust when running outside
# the original workspace.
OUTPUT_PATH = Path("/workspaces/freespeech/output/bot-examples.json")

# Generate N utterances for every intent declared in PHRASES (in declaration
# order), so intents added to PHRASES are picked up automatically.
generated = [record for intent in PHRASES for record in create(intent, n=N)]

# De-duplicate on the rendered text; when two records share a text the later
# one wins.
res = {record.text: record for record in generated}

# Create the target directory if needed instead of failing on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_PATH.open("w", encoding="utf-8") as fp:
    json.dump([asdict(record) for record in res.values()], fp, indent=4)