##### Copyright 2024 Google LLC.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini API: Animated Story Video Generation


<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/drive/1aO2MesYp6NKNIXcEdXhcuSQKvL9rb2Xn?usp=sharing"><img src="../images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>


This Colab Notebook demonstrates how to generate an animated story video by:

1. Generating a story sequence using the Google Gemini API with structured output (for character consistency).
2. Generating images for each scene using Google’s Imagen API.
3. Synthesizing narration audio using Kokoro's KPipeline.
4. Creating short video clips (image + audio overlay) for each scene.
5. Combining all clips into one final video.
6. Cleaning up temporary files after processing.


In [None]:
!pip install -U -q "google-generativeai>=0.7.2"  # Install the Python SDK

# If you are relying on a new feature, be sure to include the minimum
# SDK version here too - e.g. 'google-generativeai>=0.9.9'.

In [None]:
import google.generativeai as genai

## Set up your API key

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see the [Authentication](../quickstarts/Authentication.ipynb) quickstart for an example.

* Go to "Tools" -> "command palette" -> "User secrets" in the Colab menu.
* Click "Add a secret".
* In the "Secret name" field, enter GOOGLE_API_KEY.
* In the "Secret value" field, paste your actual Google API key.
* Click "Add" to save the secret.

In [None]:
from google.colab import userdata

# Read the API key from the Colab secret named GOOGLE_API_KEY and register it
# with the SDK imported as `genai`. GOOGLE_API_KEY is also reused further down
# when the text and image clients are created.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

## Notebook Information

This notebook, titled **Animated Story Video Generation**, uses multiple Google APIs and open source libraries to produce an animated video from a generated story sequence. The notebook demonstrates the integration of state‑of‑the‑art text, image, and audio generation methods along with video composition using MoviePy.

# **Code Cell 1: Installation and Setup Commands**

In [None]:

!apt-get update -qq && apt-get install -qq locales
!locale-gen en_US.UTF-8
!update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8

!pip install -q kokoro>=0.3.4 soundfile
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
!pip install -q google-generativeai moviepy Pillow

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Generating locales (this might take a while)...
  en_US.UTF-8... done
Generation complete.


In [None]:

import os
import gc
import time
import json
import numpy as np
from pathlib import Path
from PIL import Image
from scipy.io.wavfile import write
from io import BytesIO
from moviepy.editor import ImageClip, TextClip, CompositeVideoClip, AudioFileClip, concatenate_videoclips
import typing_extensions as typing
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
from PIL import Image, ImageDraw, ImageFont
# Build the Kokoro text-to-speech pipeline once; the scene loop reuses it.
# lang_code "a" selects American English -- change it for another language.
pipeline_kokoro = KPipeline(lang_code="a")


  if event.key is 'enter':

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]


  WeightNorm.apply(module, name, dim)



kokoro-v1_0.pth:   0%|          | 0.00/327M [00:00<?, ?B/s]


# Google Generative Models Setup


In [None]:


# Import the Google Gen AI SDK under its own alias so it does not overwrite
# the `genai` name bound earlier to `google.generativeai`. Without the alias,
# re-running the API-key cell (which calls `genai.configure`) would fail.
from google import genai as google_genai

# A single client is enough: both the Gemini text model and the Imagen model
# are reached through `client.models`. Both names are kept because later
# cells refer to `text_client` and `image_client` separately.
text_client = google_genai.Client(api_key=GOOGLE_API_KEY)
gemini_model = "gemini-2.0-flash"

# Image generation (Imagen) goes through the same client object.
image_client = text_client
IMAGE_MODEL_ID = "imagen-3.0-generate-002"


  warn(




# SECTION 1: Story Generation


In [None]:
# Define the structure for each story segment.
# One instance describes a single scene; the scene-processing loop reads all
# three keys from every segment.
class StorySegment(typing.TypedDict):
    # Scene description; later combined into the image-generation prompt.
    image_prompt: str
    # Narration/dialogue text; later passed to the text-to-speech pipeline.
    audio_text: str
    # Character and art-style hints; later appended to the image prompt.
    character_description: str

# Define the structure for the overall story response.
# `generate_story_sequence` requests `list[StoryResponse]` as the response
# schema and reads the first element of the returned list.
class StoryResponse(typing.TypedDict):
    # The scenes of the story (a list of segments, despite the name).
    complete_story: list[StorySegment]
    # Scene-count field of the schema.
    # NOTE(review): not read anywhere in the visible code -- confirm it is needed.
    pages: int

def generate_story_sequence(complete_story: str, pages: int) -> tuple[list[StorySegment], dict]:
    """
    Generate a story sequence given a theme and number of scenes.

    Each scene includes:
      - image_prompt: short scene description
      - audio_text: dialogue or narration text
      - character_description: detailed character/background hints

    Args:
        complete_story: Theme of the story; interpolated into the prompt.
        pages: Number of scenes to request from the model.

    Returns:
        A ``(segments, character_description)`` tuple. ``segments`` is the
        'complete_story' list of the first element of the JSON response, and
        ``character_description`` is that element's top-level
        'character_description' value (``{}`` when absent). When the response
        cannot be parsed or has an unexpected shape, ``([], {})`` is returned,
        so callers can always unpack two values.

    Raises:
        Any exception raised by the ``generate_content`` call itself is not
        caught here and propagates to the caller.
    """
    prompt = f'''you are an animation video producer. Generate a story sequence about {complete_story} in {pages} scenes (with interactions and characters), 1 sec each scene. Write:

image_prompt: a full description of the scene, the characters in it, and the background in 20 words or less. Progressively shift the scene as the story advances.
audio_text: a one-sentence dialogue/narration for the scene.
character_description: no people ever, only animals and objects. Describe all characters (consistent names, features, clothing, etc.) with an art style reference (e.g., "Pixar style," "photorealistic," "Ghibli") in 30 words or less.
'''
    response = text_client.models.generate_content(
        model=gemini_model,
        contents=prompt,
        config={
            'response_mime_type': 'application/json',
            'response_schema': list[StoryResponse]
        }
    )

    try:
        # json.loads raises TypeError when response.text is None.
        story_data_list = json.loads(response.text)
        if not isinstance(story_data_list, list) or len(story_data_list) == 0:
            return [], {}

        # The schema is a list of StoryResponse objects; only the first is used.
        story_data = story_data_list[0]
        if not isinstance(story_data, dict):
            print(f"Error parsing JSON: expected an object, got {type(story_data).__name__}")
            return [], {}

        # `or` guards against keys that are present but null in the response.
        segments = story_data.get('complete_story') or []
        character_description = story_data.get('character_description') or {}
        return segments, character_description
    except (KeyError, TypeError, IndexError, AttributeError, json.JSONDecodeError) as e:
        print(f"Error parsing JSON: {e}")
        return [], {}




**Example usage: define a theme and generate scenes.**

* Enter your own theme and number of scenes to experiment with different stories.

In [None]:
# example:
# theme = "cats and dogs"
# story_segments, _ = generate_story_sequence(theme, 3)

# Get user input for theme and number of scenes in Colab.
theme = input("Enter a theme for your animated story: ")

# Validate the scene count up front so a typo produces a clear message
# instead of a bare int() traceback or a request for zero/negative scenes.
num_scenes_text = input("Enter the number of scenes: ").strip()
try:
    num_scenes = int(num_scenes_text)
except ValueError:
    raise ValueError(
        f"The number of scenes must be a whole number, got {num_scenes_text!r}."
    ) from None
if num_scenes < 1:
    raise ValueError(f"The number of scenes must be at least 1, got {num_scenes}.")

# The generator returns (segments, character_description) on success but may
# return a bare list on failure, so accept both shapes rather than unpacking.
story_result = generate_story_sequence(theme, num_scenes)
story_segments = story_result[0] if isinstance(story_result, tuple) else story_result

if not story_segments:
    print("No story segments were generated; try a different theme or re-run this cell.")
print("Generated story segments:", story_segments)

Enter a theme for your animated story: a super hero man
Enter the number of scenes: 1
Generated story segments: [{'audio_text': 'Captain Comet, a shining star, zoomed across the starlit sky, ready for action.', 'character_description': 'Captain Comet is a radiant, metallic-looking comet with a determined face, zooming through space. Art style: Pixar.', 'image_prompt': 'Captain Comet soaring across a vibrant, nebular sky; stars twinkling; dynamic perspective; sense of speed.'}]



# SECTION 2: Process Each Story Segment and Create Video Clips

In [None]:


temp_audio_files = []  # To track temporary audio files
temp_image_files = []  # To track temporary image files
video_clips = []       # To store individual video clips for each scene

# Sample rate (Hz) used for audio playback and for the WAV files written below.
SAMPLE_RATE = 24000

for i, segment in enumerate(story_segments):
    # Retrieve details for the current scene.
    image_prompt = segment['image_prompt']
    audio_text = segment['audio_text']
    char_desc = segment['character_description']

    print(f"Processing scene {i}:")
    print("Image Prompt:", image_prompt)
    print("Audio Text:", audio_text)
    print("Character Description:", char_desc)
    print("--------------------------------")

    # -------------------------
    # Image Generation using Google Imagen
    # -------------------------
    combined_prompt = "detailed children book animation style " + image_prompt + " " + char_desc

    # The request and the decoding share one try block: on any failure the
    # scene is skipped, instead of continuing with an undefined `image`
    # (first scene) or silently reusing the previous scene's image.
    try:
        result = image_client.models.generate_images(
            model=IMAGE_MODEL_ID,
            prompt=combined_prompt,
            config={
                "number_of_images": 1,
                "output_mime_type": "image/jpeg",
                "person_generation": "DONT_ALLOW",
                "aspect_ratio": "1:1"
            }
        )
        if not result.generated_images:
            raise ValueError("No images were generated. The prompt might have been flagged as harmful. Please modify your prompt and try again.")
        # Only one image is requested, so the first entry is the scene image.
        image = Image.open(BytesIO(result.generated_images[0].image.image_bytes))
    except Exception as e:
        print("Image generation error:", e)
        print(f"Skipping scene {i}: no image available.")
        continue

    image_path = f"image_{i}.png"
    image.save(image_path)
    temp_image_files.append(image_path)
    # display() renders the image inline in the notebook; PIL's image.show()
    # tries to launch an external viewer instead.
    display(image)

    # -------------------------
    # Audio Generation using Kokoro's KPipeline
    # -------------------------
    audio_generator = pipeline_kokoro(
        audio_text,
        voice='af_heart',  # Change to a different voice if desired.
        speed=1,
        split_pattern=r'\n+'
    )

    audio_segments = []
    # The generator can yield multiple segments if the text has newlines.
    for idx, (gs, ps, audio) in enumerate(audio_generator):
        if audio is None:
            continue
        # Convert PyTorch tensors to NumPy arrays before any NumPy operation.
        if hasattr(audio, "cpu"):
            audio = audio.detach().cpu().numpy()
        audio = np.asarray(audio)
        if idx == 0:
            display(Audio(data=audio, rate=SAMPLE_RATE, autoplay=True))
        audio_segments.append(audio)

    if not audio_segments:
        print(f"Skipping scene {i}: no audio was generated.")
        continue

    # Join all segments into one waveform (also fine for a single segment).
    final_audio = np.concatenate(audio_segments)

    # Write the final audio waveform to a 16-bit WAV file. Samples are clipped
    # to [-1, 1] first so out-of-range values cannot wrap around in int16.
    audio_path = f"audio_{i}.wav"
    pcm16_audio = (np.clip(final_audio, -1.0, 1.0) * 32767).astype(np.int16)
    write(audio_path, SAMPLE_RATE, pcm16_audio)
    temp_audio_files.append(audio_path)

    # -------------------------
    # Create Video Clip (Image + Audio)
    # -------------------------
    # Opened exactly once: each AudioFileClip holds a reader on the WAV file.
    audio_clip = AudioFileClip(audio_path)

    # Convert PIL Image to numpy array
    np_image = np.array(image)

    # Create ImageClip (size is inferred from np_image); the still image is
    # shown for exactly as long as the narration lasts.
    image_clip = ImageClip(np_image).set_duration(audio_clip.duration)

    # Store composite clip with audio in memory
    composite_clip = CompositeVideoClip([image_clip]).set_audio(audio_clip)
    video_clips.append(composite_clip)


Processing scene 0:
Image Prompt: Captain Comet soaring across a vibrant, nebular sky; stars twinkling; dynamic perspective; sense of speed.
Audio Text: Captain Comet, a shining star, zoomed across the starlit sky, ready for action.
Character Description: Captain Comet is a radiant, metallic-looking comet with a determined face, zooming through space. Art style: Pixar.
--------------------------------


af_heart.pt:   0%|          | 0.00/523k [00:00<?, ?B/s]


# SECTION 3: Final Video Assembly and Cleanup


In [None]:
# @title
# Concatenate the per-scene clips into one MP4, then release clips and delete
# the temporary image/audio files. Cleanup lives in `finally` so it also runs
# when assembling or writing the video fails.
output_filename = f"{int(time.time())}_output_video.mp4"
final_video = None
try:
    if not video_clips:
        raise RuntimeError(
            "No video clips to assemble. Run the scene-processing cell first "
            "and make sure at least one scene was generated successfully."
        )
    final_video = concatenate_videoclips(video_clips)
    print("Writing final video to", output_filename)
    final_video.write_videofile(output_filename, fps=24)
finally:
    # Cleanup: Close video clips and remove temporary files.
    if final_video is not None:
        final_video.close()
    for clip in video_clips:
        clip.close()
    for file in temp_audio_files + temp_image_files:
        # Tolerate files already removed by a previous run of this cell.
        if os.path.exists(file):
            os.remove(file)

    # Called as a statement inside the block so its return value is not
    # echoed as cell output.
    gc.collect()

Writing final video to 1739239079_output_video.mp4
Moviepy - Building video 1739239079_output_video.mp4.
MoviePy - Writing audio in 1739239079_output_videoTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video 1739239079_output_video.mp4





Moviepy - Done !
Moviepy - video ready 1739239079_output_video.mp4


474

## Final Notes

This notebook is designed to run without modifications. It generates an animated story video using multiple Google APIs and open source libraries. Make sure to have a valid API key and to install all the necessary dependencies before running the notebook in Google Colab.

## Next Steps
### Useful API references:

* Learn more about [Structured Outputs](https://ai.google.dev/gemini-api/docs/structured-outputs) in the docs.

* [Imagen pricing](https://ai.google.dev/pricing#2_0flash)

* Imagen prompt guide and [Prompt writing basics](https://ai.google.dev/gemini-api/docs/imagen-prompt-guide)

### Related examples

* [Get_started_imagen](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Get_started_imagen.ipynb)

### Continue your discovery of the Gemini API

Check out other great Gemini features:

* [Video understanding](https://github.com/google-gemini/cookbook/blob/4437c15aa0bcb8f397b49f5b2e549f64e3a0985f/quickstarts/Video_understanding.ipynb)

* [Prompting with video](https://github.com/google-gemini/cookbook/blob/4437c15aa0bcb8f397b49f5b2e549f64e3a0985f/quickstarts/Video.ipynb)
