In [132]:
import os
import time
import uuid
from typing import List, Type, TypeVar, Literal, TypedDict, Optional, Annotated
import annotated_types
import json
import logging

import replicate
import requests
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from lumaai import LumaAI
from openai import NOT_GIVEN, OpenAI
from pydantic import BaseModel
import boto3
from dataclasses import dataclass, field

load_dotenv()

True

In [127]:
def get_temp_filename(extension: str) -> str:
    """Return a fresh, unique path under ``./content`` for a generated asset.

    The ``./content`` directory is created on first use so that callers can
    open the returned path for writing on a fresh checkout.

    Args:
        extension: File extension, with or without a leading dot
            (``"jpg"`` and ``".jpg"`` give the same result).

    Returns:
        A path of the form ``./content/<uuid4>.<extension>``.
    """
    # Without this, the first open(filename, "wb") fails when ./content is missing.
    os.makedirs("./content", exist_ok=True)
    return f"./content/{uuid.uuid4()}.{extension.lstrip('.')}"


# API clients used by the generate_* helpers. Each credential is read from the
# process environment; a missing variable is passed on as None.
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

elevenlabs = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

luma = LumaAI(auth_token=os.getenv("LUMAAI_API_KEY"))

# File logger shared by the generate_* helpers; records are written to generate.log.
logger = logging.getLogger("file_logger")
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# logging.getLogger returns the same logger object every time this cell is
# re-run, so reuse a handler that is already attached for generate.log instead
# of stacking a new one (each extra handler would write every record again).
log_path = os.path.abspath("generate.log")
file_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler("generate.log")
    logger.addHandler(file_handler)
file_handler.setFormatter(formatter)



def generate_text(system_prompt: str, user_prompt: str, json_mode: bool = False) -> str:
    """Run one gpt-4o chat completion and return the reply text.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        json_mode: When true, a ``json_object`` response format is requested;
            otherwise ``response_format`` is passed as ``NOT_GIVEN``.

    Returns:
        The message content of the first choice.

    Raises:
        Exception: If the first choice has no (or empty) content.
    """
    logger.info(f"Generating text with system prompt: {system_prompt} and user prompt: {user_prompt}")

    response_format = NOT_GIVEN
    if json_mode:
        response_format = {"type": "json_object"}

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_format=response_format,
    )

    reply = response.choices[0].message.content
    if not reply:
        raise Exception("No response from OpenAI")

    logger.info(f"Response from OpenAI: {reply}")
    return reply


# Type variable restricted to BaseModel subclasses; generate_json uses it so
# that its return type matches the model class it is given.
T = TypeVar("T", bound=BaseModel)


def generate_json(system_prompt: str, user_prompt: str, model: Type[T]) -> T:
    """Request a JSON-mode completion and parse it into ``model``.

    Args:
        system_prompt: Forwarded to ``generate_text``.
        user_prompt: Forwarded to ``generate_text``.
        model: Pydantic model class used to validate the returned JSON text.

    Returns:
        The instance produced by ``model.model_validate_json``.
    """
    raw_json = generate_text(system_prompt, user_prompt, json_mode=True)
    parsed = model.model_validate_json(raw_json)
    return parsed

def upload_to_r2(file_path: str) -> str:
    """Upload a local file to the ``test`` R2 bucket and return its public URL.

    The object key is the base name of ``file_path``. The endpoint and the
    public URL are built from the ``R2_ACCOUNT_ID`` and ``R2_BUCKET_PUB_ID``
    environment variables; the access keys are read from ``R2_ACCESS_KEY_ID``
    and ``R2_SECRET_ACCESS_KEY``.

    Args:
        file_path: Path of the local file to upload.

    Returns:
        ``https://pub-<R2_BUCKET_PUB_ID>.r2.dev/<file name>``.

    Raises:
        RuntimeError: If ``R2_ACCOUNT_ID`` or ``R2_BUCKET_PUB_ID`` is unset or
            empty (they would otherwise be interpolated as the text "None").
    """
    account_id = os.environ.get("R2_ACCOUNT_ID")
    public_bucket_id = os.environ.get("R2_BUCKET_PUB_ID")
    missing = [
        name
        for name, value in (("R2_ACCOUNT_ID", account_id), ("R2_BUCKET_PUB_ID", public_bucket_id))
        if not value
    ]
    if missing:
        raise RuntimeError(f"Missing environment variable(s): {', '.join(missing)}")

    # Built from a local variable: reusing double quotes inside a double-quoted
    # f-string is only valid syntax from Python 3.12 on.
    endpoint_url = f"https://{account_id}.r2.cloudflarestorage.com"

    session = boto3.session.Session()
    s3_client = session.client(
        "s3",
        region_name="auto",
        endpoint_url=endpoint_url,
        aws_access_key_id=os.environ.get("R2_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("R2_SECRET_ACCESS_KEY"),
    )

    filename = os.path.basename(file_path)

    s3_client.upload_file(file_path, "test", filename, ExtraArgs={"ACL": "public-read"})

    return f"https://pub-{public_bucket_id}.r2.dev/{filename}"

def generate_image(prompt: str, character_reference_url: Optional[str] = None) -> str:
    """Generate a 16:9 image with the ``luma/photon`` model and save it locally.

    Args:
        prompt: Text prompt sent to the model.
        character_reference_url: Optional URL sent as the model's
            ``character_reference_url`` input; omitted from the request when
            it is None or empty.

    Returns:
        Path of the written ``.jpg`` file (from ``get_temp_filename``).
    """
    logger.info(f"Generating image with prompt: {prompt} and character reference URL: {character_reference_url}")

    # Named model_input (not input) so the builtin input() is not shadowed.
    model_input = {"prompt": prompt, "aspect_ratio": "16:9"}
    if character_reference_url:
        model_input["character_reference_url"] = character_reference_url

    output = replicate.run(
        "luma/photon",
        input=model_input,
    )

    filename = get_temp_filename("jpg")
    with open(filename, "wb") as file:
        file.write(output.read())

    logger.info(f"Generated image: {filename}")

    return filename


def generate_video(prompt: str, poll_interval: float = 3.0, timeout: Optional[float] = None) -> str:
    """Generate a video with Luma, wait for it to finish, and download it.

    Args:
        prompt: Text prompt for the generation.
        poll_interval: Seconds to wait between two status checks.
        timeout: Maximum number of seconds to wait for the generation to
            complete; ``None`` (the default) waits indefinitely.

    Returns:
        Path of the downloaded ``.mp4`` file (from ``get_temp_filename``).

    Raises:
        RuntimeError: If the generation has no ID, reports state ``failed``,
            or completes without assets / a video URL.
        TimeoutError: If ``timeout`` elapses before the generation completes.
        requests.HTTPError: If the video download returns an error status.
    """
    logger.info(f"Generating video with prompt: {prompt}")

    generation = luma.generations.create(
        prompt=prompt,
    )
    if not generation.id:
        raise RuntimeError("Generation ID is None")
    generation_id = generation.id

    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        generation = luma.generations.get(id=generation_id)
        if generation.state == "completed":
            # Leave immediately: no extra progress line or sleep once done.
            break
        if generation.state == "failed":
            raise RuntimeError(f"Generation failed: {generation.failure_reason}")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"Generation {generation_id} did not complete within {timeout} seconds")
        print("Generating video..")
        time.sleep(poll_interval)

    if not generation.assets:
        raise RuntimeError("Generation has no video")

    video_url = generation.assets.video

    if not video_url:
        raise RuntimeError("Generation has no video URL")

    filename = get_temp_filename("mp4")
    with requests.get(video_url, stream=True, timeout=60) as response:
        # Fail before creating the file so an error page is never saved as .mp4.
        response.raise_for_status()
        with open(filename, "wb") as file:
            # Write in chunks so the whole video is never held in memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    logger.info(f"Generated video: {filename}")

    return filename


def generate_audio(text: str, voice: str = "Brian") -> str:
    """Synthesize ``text`` with ElevenLabs and save the audio locally.

    Args:
        text: The text to speak.
        voice: Voice name passed to ``elevenlabs.generate``.

    Returns:
        Path of the written ``.mp3`` file (from ``get_temp_filename``).
    """
    logger.info(f"Generating audio with text: {text} and voice: {voice}")

    audio_chunks = elevenlabs.generate(text=text, voice=voice, model="eleven_multilingual_v2")

    filename = get_temp_filename("mp3")
    with open(filename, "wb") as file:
        # Writes each chunk yielded by the result, in order.
        file.writelines(audio_chunks)

    logger.info(f"Generated audio: {filename}")

    return filename


In [4]:
# Interpolated into the prompts of the following cells, both as the output
# language ("in {language}") and in "The story should take place in {language}".
language = "German"

In [8]:
# Story premise as free text; it is interpolated into the prompts of the later cells.
idea = generate_text(
    system_prompt="If you are a screenwriter for the TV show Beyond All Belief, or in German X-Faktor das Unfassbare, generate an idea for a short story for that particular TV show. Don't write acts, scenes or anything else. Just a quick idea. Include a twist.",
    user_prompt=f"Return the idea as short text in {language}. The story should take place in {language}.",
)

print(idea)

Ein junger Mann namens Lukas zieht in eine alte, nahezu verlassene Kleinstadt im Schwarzwald, um dem hektischen Stadtleben zu entfliehen. Eines Nachts bemerkt er ungewöhnliche Aktivitäten in einem alten, baufälligen Hotel am Rande der Stadt. Neugierig geworden, beginnt er, nachts das Hotel heimlich zu erkunden und hört dabei immer wieder eine melancholische Melodie, die aus einem alten Klavier zu kommen scheint.

Er erfährt, dass das Hotel seit Jahrzehnten unbewohnt ist, nachdem eine berühmte Pianistin dort unter mysteriösen Umständen verschwand. Fasziniert von der Geschichte, kehrt Lukas jeden Abend zurück, versucht die Quelle der Melodie zu finden und wird das Gefühl nicht los, dass die Pianistin ihm irgendwie nahe ist. Seine Suche nach der Wahrheit wird zur Obsession, bis er eines Nachts ein vergilbtes Foto hinter dem Klavier entdeckt – das Foto zeigt die verschwundene Pianistin … und Lukas selbst, in der Kleidung und dem Stil von vor hundert Jahren.

Der Twist: Lukas ist die Reinka

In [48]:
# The voice names a character may be assigned.
VoiceName = Literal[
    "Aria",
    "Roger",
    "Sarah",
    "Laura",
    "Charlie",
    "George",
    "Callum",
    "River",
    "Liam",
    "Charlotte",
    "Alice",
    "Matilda",
    "Will",
    "Jessica",
    "Eric",
    "Chris",
    "Brian",
    "Daniel",
    "Lily",
    "Bill",
]


class Character(BaseModel):
    """One character of the story, validated from the model's JSON reply."""

    # Identifier used to refer to the character from scenes and dialogue lines.
    id: str
    name: str
    description: str
    # Image-generator style description of the character's looks.
    appearance: str
    voice: VoiceName

class Characters(BaseModel):
    """Top-level JSON object wrapping the list of characters."""

    characters: List[Character]


# Ask the model for the cast and validate the reply against Characters; only
# the inner list is kept. The prompt spells out the expected JSON shape and the
# voice names, which mirror the Literal on Character.voice.
characters = generate_json(
    system_prompt=f"""Generate the characters for a short story.
Return the characters as a JSON:
{{
    "characters": [
        {{
            "id": a unique identifier for the character as a string, e.g the name in lower case (must be unique)
            "name": the name of the character
            "description": a short description of the character
            "appearance": a short description of the character's appearance in style of prompt for an image generator in English, add country of origin, age, gender, hair color, eye color, ethnicity, clothing, etc.,
            "voice": chose of voice for the character, you find a list below.
        }}
    ]
}}
in {language}.
Here the list of voices: (they can all speak in {language}):
1.	Aria: Expressive and middle-aged American female voice, ideal for social media content.
2.	Roger: Confident middle-aged American male voice, suited for social media applications.
3.	Sarah: Soft, young American female voice, perfect for news delivery.
4.	Laura: Upbeat young American female voice, commonly used in social media settings.
5.	Charlie: Natural middle-aged Australian male voice, excellent for conversational purposes.
6.	George: Warm middle-aged British male voice, great for narration.
7.	Callum: Intense middle-aged Transatlantic male voice, tailored for character-based uses.
8.	River: Confident middle-aged American non-binary voice, designed for social media.
9.	Liam: Articulate young American male voice, often used for narration.
10.	Charlotte: Seductive young Swedish female voice, crafted for character work.
11.	Alice: Confident middle-aged British female voice, suitable for news contexts.
12.	Matilda: Friendly middle-aged American female voice, fitting for narration.
13.	Will: Friendly young American male voice, focused on social media.
14.	Jessica: Expressive young American female voice, ideal for conversational projects.
15.	Eric: Friendly middle-aged American male voice, used for conversational settings.
16.	Chris: Casual middle-aged American male voice, great for conversational uses.
17.	Brian: Deep middle-aged American male voice, primarily used for narration.
18.	Daniel: Authoritative middle-aged British male voice, perfect for news delivery.
19.	Lily: Warm middle-aged British female voice, crafted for narration.
20.	Bill: Trustworthy older American male voice, tailored for narration.""",
    user_prompt=f"The story should take place in {language}. Use common names in that language and region. The story is about a {idea}.",
    model=Characters,
).characters

print(characters)

[Character(id='lukas', name='Lukas', description='Ein junger Mann, der dem hektischen Stadtleben entflieht und in eine fast verlassene Kleinstadt im Schwarzwald zieht.', appearance='Deutscher Mann, Anfang 30, mit kurzem braunem Haar und blauen Augen. Ethnie: europäisch. Trägt oft lässige Kleidung, die für Wanderungen im Schwarzwald geeignet ist.', voice='Will'), Character(id='clara', name='Clara', description='Die verschwundene Pianistin, deren Geist noch immer im alten Hotel verweilt.', appearance='Deutsche Frau, um die 25 Jahre alt, mit mittellangem gewelltem blonden Haar und grünen Augen. Ethnie: europäisch. Gekleidet in elegante, antike Abendkleidung aus dem frühen 20. Jahrhundert.', voice='Charlotte'), Character(id='johann', name='Johann', description='Ein älterer Bewohner der Stadt, der Lukas von der geheimnisvollen Geschichte des Hotels erzählt.', appearance='Deutscher Mann, Mitte 70, mit weißem Haar und braunen Augen. Ethnie: europäisch. Trägt eine alte, abgetragene Jacke und o

In [92]:
class FileMapping(TypedDict):
    """Pairs a locally written file with the URL it was uploaded to."""

    filename: str  # local path
    url: str  # URL returned by upload_to_r2

# One portrait per character, keyed by character ID: generate the image
# locally, upload it, and remember both the local path and the public URL.
character_images: dict[str, FileMapping] = {}
for n, character in enumerate(characters):
    print(f"Generating character image for {character.name} ({n + 1} of {len(characters)})")
    file_path = generate_image(f"A neutral portrait of {character.appearance}")
    public_url = upload_to_r2(file_path)
    character_images[character.id] = FileMapping(filename=file_path, url=public_url)

print(character_images)

  0%|          | 0/4 [00:00<?, ?it/s]

Deutscher Mann, Anfang 30, mit kurzem braunem Haar und blauen Augen. Ethnie: europäisch. Trägt oft lässige Kleidung, die für Wanderungen im Schwarzwald geeignet ist.


 25%|██▌       | 1/4 [00:08<00:24,  8.17s/it]

Deutsche Frau, um die 25 Jahre alt, mit mittellangem gewelltem blonden Haar und grünen Augen. Ethnie: europäisch. Gekleidet in elegante, antike Abendkleidung aus dem frühen 20. Jahrhundert.


 50%|█████     | 2/4 [00:15<00:15,  7.70s/it]

Deutscher Mann, Mitte 70, mit weißem Haar und braunen Augen. Ethnie: europäisch. Trägt eine alte, abgetragene Jacke und oft einen Hut.


 75%|███████▌  | 3/4 [00:23<00:07,  7.60s/it]

Deutscher Mann, 50 Jahre alt, mit graumeliertem Haar und einem buschigen Bart. Ethnie: europäisch. Zieht robuste Arbeitskleidung an, die für harte Arbeit geeignet ist.


100%|██████████| 4/4 [00:31<00:00,  7.96s/it]

{'lukas': {'filename': './content/f50eb527-f8a8-4ebc-bdbd-53df2acac234.jpg', 'url': 'https://pub-7fdd542f3dda44bcaef4c36bf4bb48e1.r2.dev/f50eb527-f8a8-4ebc-bdbd-53df2acac234.jpg'}, 'clara': {'filename': './content/8d56ee36-673b-41db-9e8e-f046dcf1314d.jpg', 'url': 'https://pub-7fdd542f3dda44bcaef4c36bf4bb48e1.r2.dev/8d56ee36-673b-41db-9e8e-f046dcf1314d.jpg'}, 'johann': {'filename': './content/e29c0f7a-dcaa-4ed1-b7dc-dc97e82de730.jpg', 'url': 'https://pub-7fdd542f3dda44bcaef4c36bf4bb48e1.r2.dev/e29c0f7a-dcaa-4ed1-b7dc-dc97e82de730.jpg'}, 'hans': {'filename': './content/bdcb3ab6-d9f4-4fcb-98c2-35c2b7ad82bc.jpg', 'url': 'https://pub-7fdd542f3dda44bcaef4c36bf4bb48e1.r2.dev/bdcb3ab6-d9f4-4fcb-98c2-35c2b7ad82bc.jpg'}}





In [50]:
class Act(BaseModel):
    """One act of the story, described as free text."""

    description: str


class Acts(BaseModel):
    """Top-level JSON object wrapping the list of acts."""

    acts: List[Act]

# Ask the model for the acts (the prompt requests a 3 act play) and keep the
# validated list. The characters are passed to the model as their str() form.
acts = generate_json(
    system_prompt=f"""Generate the acts for a short story. Make it a 3 act play. Return the acts as a JSON:
{{
    "acts": [
        {{
            "description": a short description of the act, the act need to be short around 2-3 minutes of screen time. Describe exactly what in the scene happens and how the scene ends. Don't describe the characters.
        }}
    ]
}}
in {language}.""",
    user_prompt=f"The story should take place in {language}. Use common names in that language and region. The story is about a {idea}. The only characters are {', '.join(str(obj) for obj in characters)}.",
    model=Acts,
).acts

# Print each act under a 1-based heading.
for n, act in enumerate(acts):
    print(f"Act {n + 1}:")
    print(act.description)


Act 1:
Act 1: Lukas zieht in die idyllische, doch nahezu verlassene Kleinstadt im Schwarzwald ein, um dem dröhnenden Stadtleben zu entfliehen. Bei einem Spaziergang in der Dämmerung bemerkt er das alte, baufällige Hotel am Rande der Stadt, von dem ein geheimnisvolles Flair ausgeht. In den Abendstunden, als die Lichter der Stadt zurückweichen, sieht er ein schwaches Flackern aus einem der zerschmetterten Fenster und hört die melancholische Melodie eines Klaviers. Neugierig und von der melancholischen Melodie angezogen, beschließt Lukas, das Geheimnis weiter zu erkunden. Der Vorhang fällt, als er zögert und darüber nachdenkt, wie er das Gebäude unbemerkt betreten kann.
Act 2:
Act 2: Lukas nimmt Kontakt zu Johann auf, einem älteren Stadtbewohner, der die Geschichte des verwunschenen Hotels kennt. Johann erzählt ihm von der berühmten Pianistin Clara, die vor Jahrzehnten unter mysteriösen Umständen verschwunden ist. Der junge Mann ist fasziniert und beschließt, das Hotel bei Nacht heimlich 

In [133]:
class Scene(BaseModel):
    """One scene of an act, validated from the model's JSON reply."""

    type: Literal["narration", "dialogue"]
    # Character IDs taking part in the scene; validation requires at least one.
    characters: Annotated[list[str], annotated_types.Len(min_length=1)]
    description: str

class Scenes(BaseModel):
    """Top-level JSON object wrapping the list of scenes."""

    scenes: List[Scene]

# act_scenes[i] holds the scenes generated for acts[i].
act_scenes: List[List[Scene]] = []

for act_n, act in enumerate(acts):
    print(f"Generating scenes for act {act_n + 1} of {len(acts)}")
    # The character-count rules per scene type are stated only in the prompt;
    # the Scene model itself just requires at least one character ID.
    scenes = generate_json(
        system_prompt=f"""Generate the scenes for a short story. Make it 3 scenes. Make it a mix of narration and dialogue. Use narrations sparingly. Return the scenes as a JSON:
{{
    "scenes": [
        {{
            "type": "narration" or "dialogue". A narration is kind of a scenic view or an establishing shot with a video in the background and an off-text. A dialogue is a dialogue between two or more people.,
            "characters": a list of the character IDs that are in the scene. If it is a narration, there must be exactly one character. If its a dialogue, it needs to be a least two characters.,
            "description": a short description of the scene. It should have around 1 minute of screen time. Describe exactly what in the scene happens and how the scene ends. Don't describe the characters. Don't write dialogue or off-text. The scene can not include any characters.
        }}
    ]
}}
    in {language}.""",
        user_prompt=f"The story should take place in {language}. Use common names in that language and region. The story is about a {idea}. The only characters are {', '.join(str(obj) for obj in characters)}. The act you should generate the scenes for is {act.description}.",
        model=Scenes,
    ).scenes
    act_scenes.append(scenes)

# Dump every scene under a 1-based "Act x, Scene y" heading: type, description
# and character IDs, each on its own line.
for n, act in enumerate(act_scenes):
    for m, scene in enumerate(act):
        print(
            f"Act {n + 1}, Scene {m + 1}:",
            scene.type,
            scene.description,
            scene.characters,
            sep="\n",
        )


100%|██████████| 3/3 [00:14<00:00,  4.98s/it]

Act 1, Scene 1:
narration
Die Sonne senkt sich hinter den dichten, uralten Bäumen des Schwarzwalds, als Lukas sein neues Zuhause erreicht – eine beschauliche Kleinstadt, die den Charme vergangener Tage ausstrahlt. Der Reisekoffer in der einen Hand und ein Entschlossenheitsbekundung im Gesicht, wird ihm bewusst, wie winzig und ruhig es hier ist im Gegensatz zu seinem früheren hektischen Leben in der Stadt. Ein leichter Nebel fängt an aufzusteigen und umgibt die Fachwerkhäuser, die sich malerisch aneinanderreihen. Lukas genießt die frische, klare Luft und lässt sich von der erholsamen Stille tragen. Diese Stille weckt in ihm eine neue Hoffnung auf Frieden und Ruhe.
['lukas']
Act 1, Scene 2:
dialogue
Lukas trifft Johann in der kleinen Bäckerei der Stadt. Während sehr Alter und Erfahrung in Johanns Augen zu sehen sind, ist allein der freundliche Bäcker bereit, Neuigkeiten und Geschichten zu erzählen. Beim Kauf von Brot kommt das Gespräch auf das alte, baufällige Hotel am Stadtrand. Johann 




In [121]:
# Short episode title in {language}; shown later as a text overlay.
episode_title = generate_text(
    system_prompt=f"Generate a title for a short story of the series Beyond Belief or X-Faktor das Unfassbare. Return the title in {language}. It must be short. Return it without any quotes.",
    user_prompt=f"The story is about a {idea}.",
)

print(episode_title)

Das Geheimnis der Melodie


In [136]:
@dataclass
class Audio:
    """Reference to an audio file by its file name."""

    filename: str


@dataclass
class Content:
    """A visual file plus the optional audio that accompanies it."""

    filename: str
    audio: Optional[Audio] = None


# NOTE(review): this dataclass reuses the name `Scene`, which the notebook also
# uses for the pydantic model describing generated scenes. From here on the
# name refers to this class; consider renaming one of the two.
@dataclass
class Scene:
    """One entry of the episode timeline: a video or image with its content."""

    type: str  # "video" or "image"
    content: Content


@dataclass
class Appearance:
    """Time window, given by a start and an end time."""

    start: float  # seconds
    end: float  # seconds


@dataclass
class TextOverlay:
    """A piece of text together with the time window it is tied to."""

    text: str
    appearance: Appearance


@dataclass
class SoundEffect:
    """A sound-effect file and its start time."""

    filename: str
    start: float  # seconds


@dataclass
class Episode:
    """Complete description of an episode: music, scenes, overlays and effects."""

    backgroundMusic: Audio
    scenes: List[Scene]
    textOverlays: List[TextOverlay] = field(default_factory=list)
    soundEffects: List[SoundEffect] = field(default_factory=list)

    def to_json(self) -> str:
        """Return the episode, including nested objects, as JSON indented by 4 spaces."""

        def attributes_of(obj):
            # json calls this for any object it cannot encode natively; the
            # object's attribute dictionary is encoded in its place.
            return obj.__dict__

        return json.dumps(self, indent=4, default=attributes_of)

class NarrationResult(BaseModel):
    """Model reply for a narration scene."""

    video_description: str  # prompt text handed to the video generator
    monologue: str  # text spoken over the video

class DialogueLine(BaseModel):
    """One spoken line of a dialogue."""

    character: str  # ID of the speaking character
    text: str

class DialogueResult(BaseModel):
    """Model reply for a dialogue scene."""

    scene_background_description: str  # prompt text handed to the image generator
    lines: List[DialogueLine]

# Start with an empty scene list; the title is overlaid from second 3 to second 8.
episode = Episode(
    backgroundMusic=Audio(filename="background.mp3"),
    scenes=[],
    textOverlays=[TextOverlay(text=episode_title, appearance=Appearance(start=3, end=8))],
)

def _voice_for(character_id: str) -> str:
    """Return the voice of the character with the given ID.

    Raises:
        KeyError: If no entry of ``characters`` has that ID. The IDs come from
            model output, so a mismatch is possible.
    """
    match = next((item for item in characters if item.id == character_id), None)
    if match is None:
        known_ids = [item.id for item in characters]
        raise KeyError(f"Unknown character ID {character_id!r}; known IDs: {known_ids}")
    return match.voice


# Turn every generated scene into timeline entries of the episode:
#   narration -> one generated video with a spoken monologue,
#   dialogue  -> one image per spoken line, showing the speaking character.
for act_n, act in enumerate(act_scenes):
    for scene_n, scene in enumerate(act):
        if scene.type == "narration":
            print(f"Generating narration for scene {scene_n + 1}, act {act_n + 1}")
            narration_result = generate_json(
                system_prompt=f"""Generate a narration for a short story. Return the narration as a JSON:
{{
    "video_description": a prompt for a video generator describing the scene. Must be in English,
    "monologue": a short monologue of the narrator. It should be short 10-20 seconds. It's narrated by the character,
}}
in {language}.""",
                user_prompt=f"The story should take place in {language}. The story is about a {idea}. The only characters are {', '.join(str(obj) for obj in characters)}. The act you is act number {act_n + 1} of {len(act_scenes)}, {acts[act_n].description}. The scene you are generating the narration for is scene number {scene_n + 1} of {len(act)}, {scene.description}.",
                model=NarrationResult,
            )
            # Resolve the narrator's voice first so an unknown character ID
            # fails before the slow video generation is started.
            narrator_voice = _voice_for(scene.characters[0])
            video = generate_video(f"A hyper-realistic video of {narration_result.video_description}")
            # video = "content/7a761974-14ae-468e-bbc4-9996e762fe2e.mp4"  
            print("Generating audio ..")
            audio = generate_audio(
                narration_result.monologue,
                narrator_voice,
            )
            episode.scenes.append(Scene(type="video", content=Content(filename=video, audio=Audio(filename=audio))))
        elif scene.type == "dialogue":
            print(f"Generating dialogue for scene {scene_n + 1}, act {act_n + 1}")
            # "lines" is described as a JSON array because DialogueResult.lines
            # is validated as a list of DialogueLine objects.
            dialogue_result = generate_json(
                system_prompt=f"""Generate a dialogue for a short story. Return the dialogue as a JSON:
{{
    "scene_background_description": a short description of the scene for a video generator in English. That with "The character ..",
    "lines": [
        {{
            "character": the character ID,
            "text": the text of the dialogue line,
        }}
    ]
}}
in {language}.""",
                user_prompt=f"The story should take place in {language}. The story is about a {idea}. The only characters are {', '.join(str(obj) for obj in characters)}. The act you is act number {act_n + 1} of {len(act_scenes)}, {acts[act_n].description}. The scene you are generating the dialogue for is scene number {scene_n + 1} of {len(act)}, {scene.description}.",
                model=DialogueResult,
            )
            # Validate every speaker before any image or audio is generated.
            line_voices = [_voice_for(line.character) for line in dialogue_result.lines]

            # Images are needed for the scene's characters and for any speaker
            # the model used in a line without listing it in the scene;
            # dict.fromkeys removes duplicates while keeping the order.
            image_character_ids = list(
                dict.fromkeys([*scene.characters, *(line.character for line in dialogue_result.lines)])
            )
            missing_portraits = [cid for cid in image_character_ids if cid not in character_images]
            if missing_portraits:
                raise KeyError(f"No portrait in character_images for character IDs: {missing_portraits}")

            character_images_for_scene = {}
            for n, character in enumerate(image_character_ids):
                print(f"Generating character image for {character} ({n + 1} of {len(image_character_ids)})")
                character_images_for_scene[character] = generate_image(dialogue_result.scene_background_description, character_images[character]["url"])
                # character_images_for_scene[character] = (
                #     "content/00af9220-1b61-4b7b-a6c8-504e5d744741.jpg"
                # )
            for n, (line, voice) in enumerate(zip(dialogue_result.lines, line_voices)):
                print(f"Generating audio for {line.character} ({n + 1} of {len(dialogue_result.lines)})")
                audio = generate_audio(
                    line.text,
                    voice,
                )
                # audio = "content/0e901ae0-f622-40f5-aeb0-e2b5a05c821a.mp3"
                episode.scenes.append(
                    Scene(
                        type="image",
                        content=Content(
                            filename=character_images_for_scene[line.character],
                            audio=Audio(filename=audio),
                        ),
                    )
                )

# Persist the assembled episode description next to the notebook.
with open("episode.json", "w", encoding="utf-8") as file:
    file.write(episode.to_json())


Generating narration for scene 1, act 1
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating audio ..
Generating dialogue for scene 2, act 1


Generating character images..: 100%|██████████| 2/2 [00:25<00:00, 12.70s/it]
Generating dialogue..: 100%|██████████| 8/8 [00:23<00:00,  2.93s/it]


Generating narration for scene 3, act 1
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating audio ..
Generating dialogue for scene 1, act 2


Generating character images..: 100%|██████████| 2/2 [00:30<00:00, 15.00s/it]
Generating dialogue..: 100%|██████████| 8/8 [00:17<00:00,  2.18s/it]


Generating narration for scene 2, act 2
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating audio ..
Generating narration for scene 3, act 2
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating audio ..
Generating dialogue for scene 1, act 3


Generating character images..: 100%|██████████| 2/2 [00:25<00:00, 12.82s/it]
Generating dialogue..: 100%|██████████| 9/9 [00:18<00:00,  2.08s/it]


Generating narration for scene 2, act 3
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating video..
Generating audio ..
Generating dialogue for scene 3, act 3


Generating character images..: 100%|██████████| 2/2 [00:25<00:00, 12.93s/it]
Generating dialogue..: 100%|██████████| 14/14 [00:28<00:00,  2.04s/it]
