##### Copyright 2024 Google LLC.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

by [Yousif Ahmed](https://www.linkedin.com/in/yousif-hag-ahmed/).

# Gemini API: Animated Story Video Generation


<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/drive/1bGg17MHQMRke8n0ryH5WOu6uM56HZo-y?usp=sharing"><img src="../images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>


This Colab Notebook demonstrates how to generate an animated story video by:

1. Generating a story sequence as structured output from the Google Gemini API (for character consistency).
2. Generating images for each scene using Google’s Imagen API.
3. Synthesizing narration audio using the Gemini Live API.
4. Creating short video clips (image + audio overlay) for each scene.
5. Combining all clips into one final video.
6. Cleaning up temporary files after processing.


In [2]:
!pip install google-genai


In [3]:
# Import the Google Gen AI SDK that the install cell above provides (`google-genai`).
# `google.generativeai` is a separate, older package and does not offer `genai.Client`.
from google import genai

## Set up your API key

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see the [Authentication](../quickstarts/Authentication.ipynb) quickstart for an example.

* Go to "Tools" -> "command palette" -> "User secrets" in the Colab menu.
* Click "Add a secret".
* In the "Secret name" field, enter GOOGLE_API_KEY.
* In the "Secret value" field, paste your actual Google API key.
* Click "Add" to save the secret.

In [4]:
import os

from google.colab import userdata

# Copy the Colab secret into the process environment; the client further down
# is created without an explicit API key argument.
os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')

## Notebook Information

This notebook, titled **Animated Story Video Generation**, uses multiple Google APIs and open source libraries to produce an animated video from a generated story sequence. The notebook demonstrates the integration of state‑of‑the‑art text, image, and audio generation methods along with video composition using MoviePy.

# **Code Cell 1: Installation and Setup Commands**

In [5]:

!apt-get update -qq && apt-get install -qq locales
!locale-gen en_US.UTF-8
!update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8

!apt-get -qq -y install espeak-ng > /dev/null 2>&1
!pip install -q google-generativeai moviepy Pillow
!pip install -q nest_asyncio

In [6]:

import os
import gc
import time
import json
import numpy as np
from pathlib import Path
from PIL import Image
from scipy.io.wavfile import write
from io import BytesIO
from moviepy.editor import ImageClip, TextClip, CompositeVideoClip, AudioFileClip, concatenate_videoclips
import typing_extensions as typing
from IPython.display import display, Audio
import soundfile as sf
from PIL import Image, ImageDraw, ImageFont
import nest_asyncio
nest_asyncio.apply()
import asyncio
import contextlib
import json
import wave
from IPython import display
from google import genai
from google.genai import types


# Google Generative Models Setup


In [7]:

from google import genai

# Model identifiers used by the cells below.
MODEL = "gemini-2.0-flash-exp"              # Gemini model: story text and Live API narration
IMAGE_MODEL_ID = "imagen-3.0-generate-002"  # Imagen model: scene illustrations

# A single shared client for every API call, pinned to the v1alpha API version.
client = genai.Client(http_options={'api_version': 'v1alpha'})



# SECTION 1: Story Generation


In [8]:
# Structure of a single story scene; used as part of the response schema
# requested from the model.
class StorySegment(typing.TypedDict):
    image_prompt: str  # visual description of the scene (the prompt asks for 20 words or less)
    audio_text: str  # one-sentence dialogue/narration for the scene
    character_description: str  # characters plus an art-style reference (30 words or less)

# Structure of the overall story response; the API call passes
# `list[StoryResponse]` as its response schema.
class StoryResponse(typing.TypedDict):
    complete_story: list[StorySegment]  # the scenes of the story
    pages: int  # NOTE(review): presumably the number of scenes — confirm

def generate_story_sequence(complete_story: str, pages: int) -> tuple[list[StorySegment], dict]:
    """
    Generate a story sequence given a theme and number of scenes.

    Each scene includes:
      - image_prompt: short scene description
      - audio_text: dialogue or narration text
      - character_description: detailed character/background hints

    Args:
        complete_story: Theme the story should be about.
        pages: Number of scenes to request from the model.

    Returns:
        A ``(segments, character_description)`` tuple taken from the first
        story object of the JSON response: its ``complete_story`` list and its
        top-level ``character_description`` value (``{}`` when absent).
        When the response cannot be parsed, or is not a non-empty list, the
        result is ``([], {})`` so callers can always unpack two values.
    """
    response = client.models.generate_content(
        model=MODEL,
        contents=f'''you are an animation video producer. Generate a story sequence about {complete_story} in {pages} scenes (with interactions and characters), 1 sec each scene. Write:

image_prompt: a full description of the scene, the characters in it, and the background in 20 words or less. Progressively shift the scene as the story advances.
audio_text: a one-sentence dialogue/narration for the scene.
character_description: no people ever, only animals and objects. Describe all characters (consistent names, features, clothing, etc.) with an art style reference (e.g., "Pixar style," "photorealistic," "Ghibli") in 30 words or less.
''',
        config={
            'response_mime_type': 'application/json',
            'response_schema': list[StoryResponse]
        }
    )

    try:
        story_data_list = json.loads(response.text)  # response.text holds the JSON text
        if not isinstance(story_data_list, list) or not story_data_list:
            print("Error parsing JSON: expected a non-empty list of stories.")
            return [], {}
        # The schema is a list of stories; only the first one is used.
        story_data = story_data_list[0]
        return story_data.get('complete_story', []), story_data.get('character_description', {})
    except (KeyError, TypeError, IndexError, AttributeError, json.JSONDecodeError) as e:
        # Every failure path returns the same two-element shape as the success path.
        print(f"Error parsing JSON: {e}")
        return [], {}




**Example usage: define a theme and generate scenes.**

* Write your own theme and number of scenes to experiment with it.

In [9]:
# Example (non-interactive):
# theme = "cats and dogs"
# story_segments, _ = generate_story_sequence(theme, 3)

# Get user input for theme and number of scenes in Colab.
theme = input("Enter a theme for your animated story: ")

scenes_answer = input("Enter the number of scenes: ")
try:
    num_scenes = int(scenes_answer)
except ValueError:
    raise ValueError(
        f"The number of scenes must be a whole number, got {scenes_answer!r}."
    ) from None
if num_scenes < 1:
    raise ValueError(f"The number of scenes must be at least 1, got {num_scenes}.")

# Accept both a (segments, extra) tuple and a bare list of segments.
story_result = generate_story_sequence(theme, num_scenes)
story_segments = story_result[0] if isinstance(story_result, tuple) else story_result

# Stop here with a clear message instead of failing later with no clips to assemble.
if not story_segments:
    raise RuntimeError("No story segments were generated; try a different theme or run the cell again.")
print("Generated story segments:", story_segments)


# SECTION 2: Process Each Story Segment and Create Video Clips

In [10]:
@contextlib.contextmanager
def wave_file(filename, channels=1, rate=24000, sample_width=2):
    """Open `filename` as a WAV file for writing and yield the configured writer.

    The writer is set up with `channels` channels, `sample_width` bytes per
    sample and a frame rate of `rate` before it is handed to the caller; the
    file is closed when the `with` block ends.
    """
    with wave.open(filename, "wb") as writer:
        for configure, value in (
            (writer.setnchannels, channels),
            (writer.setsampwidth, sample_width),
            (writer.setframerate, rate),
        ):
            configure(value)
        yield writer


In [11]:



# --- Cell 2: Definitions and setup ---
# Bookkeeping shared by the processing loop and the final cleanup step.
temp_audio_files, temp_image_files = [], []  # paths of temporary audio / image files
video_clips = []  # one video clip per scene, in story order

def generate_audio_live(api_text, output_filename):
    """Generate audio for `api_text` through the Live API and save it as a WAV file.

    The text is sent as a single completed turn with audio as the requested
    response modality. Every data chunk received is concatenated and written
    to `output_filename` via `wave_file` with its default settings.
    NOTE(review): this assumes the streamed bytes are raw PCM matching those
    defaults (1 channel, 24000 Hz, 2 bytes per sample) — confirm.

    Returns:
        `output_filename`, unchanged.
    """
    live_config = {
        "generation_config": {"response_modalities": ["AUDIO"]}
    }

    async def _stream_audio():
        chunks = bytearray()
        # Uses the module-level client created earlier in the notebook.
        async with client.aio.live.connect(model=MODEL, config=live_config) as session:
            # One prompt, marked as the end of the turn.
            await session.send(input=api_text, end_of_turn=True)
            # Accumulate audio data as it streams in; skip messages without data.
            async for message in session.receive():
                if not message.data:
                    continue
                chunks.extend(message.data)
        return bytes(chunks)

    # Drive the coroutine to completion, then dump the bytes into the WAV file.
    pcm_bytes = asyncio.run(_stream_audio())
    with wave_file(output_filename) as wav_out:
        wav_out.writeframes(pcm_bytes)
    return output_filename

# --- Cell 3: Main processing loop ---
# For every scene: generate an illustration, generate the narration audio, then
# pair the two into a video clip whose duration matches the narration.

# Imported under an alias because the imports cell rebinds the name `display`
# to the IPython.display module.
from IPython.display import display as show_inline

# Loop-invariant instruction prepended to every narration request.
audio_negative_prompt = "dont say OK , I will do this or that, just only read this story using voice expressions without introductions or ending ,more segments are comming ,dont say OK , I will do this or that:\n"

for i, segment in enumerate(story_segments):
    # Retrieve details for the current scene.
    image_prompt = segment['image_prompt']
    audio_text_prompt = segment['audio_text']
    audio_text = audio_negative_prompt + audio_text_prompt
    char_desc = segment['character_description']

    print(f"Processing scene {i}:")
    print("Image Prompt:", image_prompt)
    print("Audio Text:", audio_text_prompt)
    print("Character Description:", char_desc)
    print("--------------------------------")

    # -------------------------
    # Image Generation using Google Imagen
    # -------------------------
    combined_prompt = "detailed children book animation style " + image_prompt + " " + char_desc

    result = client.models.generate_images(
        model=IMAGE_MODEL_ID,
        prompt=combined_prompt,
        config={
            "number_of_images": 1,
            "output_mime_type": "image/jpeg",
            "person_generation": "DONT_ALLOW",
            "aspect_ratio": "1:1"
        }
    )

    try:
        if not result.generated_images:
            raise ValueError("No images were generated. The prompt might have been flagged as harmful. Please modify your prompt and try again.")
        # Exactly one image is requested above, so the first entry is the scene image.
        image = Image.open(BytesIO(result.generated_images[0].image.image_bytes))
    except Exception as e:
        # Without an image there is nothing to build a clip from. Skip the scene
        # rather than fall through with an undefined (or previous scene's) image.
        print("Image generation failed ", e)
        print(f"Skipping scene {i}.")
        continue

    image_path = f"image_{i}.png"
    image.save(image_path)
    temp_image_files.append(image_path)
    # Render the image in the notebook output; PIL's Image.show() launches an
    # external viewer instead.
    show_inline(image)

    # -------------------------
    # Audio Generation using Google Live API
    # -------------------------
    audio_path = generate_audio_live(audio_text, f"audio_{i}.wav")
    temp_audio_files.append(audio_path)

    # -------------------------
    # Create Video Clip (Image + Audio)
    # -------------------------
    audio_clip = AudioFileClip(audio_path)

    # ImageClip takes a numpy array; the clip size is inferred from it, and the
    # still image is shown for exactly as long as the narration lasts.
    image_clip = ImageClip(np.array(image)).set_duration(audio_clip.duration)

    # Keep the composite clip (with its audio) in memory for the final assembly.
    composite_clip = CompositeVideoClip([image_clip]).set_audio(audio_clip)
    video_clips.append(composite_clip)


# SECTION 3: Final Video Assembly and Cleanup


In [12]:
# @title
# Concatenate the per-scene clips into one video file. The cleanup in `finally`
# runs whether or not assembling/writing succeeds, so clips are closed and
# temporary files are removed even after a failure.
final_video = None
try:
    if not video_clips:
        raise ValueError("No video clips were created; run the scene-processing cell successfully before assembling the video.")

    final_video = concatenate_videoclips(video_clips)
    output_filename = f"{int(time.time())}_output_video.mp4"
    print("Writing final video to", output_filename)
    final_video.write_videofile(output_filename, fps=24)
finally:
    # Cleanup: Close video clips and remove temporary files.
    if final_video is not None:
        final_video.close()
    for clip in video_clips:
        clip.close()
    for file in temp_audio_files + temp_image_files:
        try:
            os.remove(file)
        except FileNotFoundError:
            # Already gone, e.g. removed by an earlier run of this cell.
            pass

    gc.collect()

## Final Notes

This notebook is designed to run without modifications. It generates an animated story video using multiple Google APIs and open source libraries. Make sure to have a valid API key and to install all the necessary dependencies before running the notebook in Google Colab.

## Next Steps
### Useful API references:

* Learn more about [Structured Outputs](https://ai.google.dev/gemini-api/docs/structured-outputs) in the docs.

* [Imagen pricing](https://ai.google.dev/pricing#2_0flash)

* Imagen prompt guide and [Prompt writing basics](https://ai.google.dev/gemini-api/docs/imagen-prompt-guide)

### Related examples

* [Get started imagen](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Get_started_imagen.ipynb)

* [Get started LiveAPI](https://github.com/Yousif-GO/cookbook/blob/main/quickstarts/Get_started_LiveAPI_tools.ipynb)

### Continue your discovery of the Gemini API

Check out other great Gemini features:

* [video understanding](https://github.com/google-gemini/cookbook/blob/4437c15aa0bcb8f397b49f5b2e549f64e3a0985f/quickstarts/Video_understanding.ipynb)

* [Prompting with video](https://github.com/google-gemini/cookbook/blob/4437c15aa0bcb8f397b49f5b2e549f64e3a0985f/quickstarts/Video.ipynb)
