In [1]:
# Fetch the transcript of a YouTube video

import json
from fetch_transcript.youtube_fetcher import YouTubeTranscriptFetcher


video_id = "RXeOiIDNNek"

# Ask the fetcher for an English or Vietnamese transcript of the video.
fetcher = YouTubeTranscriptFetcher(languages=["en", "vi"])
result = fetcher.fetch(video_id)

# Report only the fields of interest; the transcript text stored under
# result["text"] is shown under the "cleaned_text" key.
transcript_report = {field: result[field] for field in ("video_id", "language", "duration")}
transcript_report["cleaned_text"] = result["text"]

print(json.dumps(transcript_report, indent=2, ensure_ascii=False))



{
  "video_id": "RXeOiIDNNek",
  "language": "English (auto-generated)",
  "duration": {
    "seconds": 6199.12,
    "minutes": 103.32
  },
  "cleaned_text": "[5.92s] big day everyone. Biggest day of the [8.88s] year. Who's fired up? Beth is fired up. [12.56s] doctor rockwell. [16.56s] one more thing. Just have fun out there. [21.36s] okay, phil. [24.08s] i'm getting too old for this stuff. [46.32s] go. [48.32s] it's showtime. [55.44s] heat [69.68s] up [82.48s] here. [88.24s] wow, that was so cool. Good morning. [92.08s] welcome to apple park. We're glad you [94.56s] could join us for what promises to be an [97.12s] action-packed and memorable wwdc. [100.88s] wwdc marks a moment in the year when [104.00s] we're able to celebrate our global [105.92s] developer community. Developers continue [108.88s] to amaze us with the apps they create [110.80s] for our products. Apps that are used by [113.52s] over a billion people around the world. [116.88s] it's important for us to provide this [11

In [2]:
# Generate outline from transcript

from llm.gemini_client import GeminiClient
from llm.prompts import build_outline_prompt, OUTLINE_PROMPT
from schemas.output_format import OutlineOutput
import json

# The outline prompt is built from the transcript text fetched in the first cell.
video_transcript = result["text"]
prompt = build_outline_prompt(video_transcript=video_transcript)

gemini = GeminiClient()

# Request a JSON response described by the OutlineOutput schema.
outline_request_config = {
    "response_mime_type": "application/json",
    "response_json_schema": OutlineOutput.model_json_schema(),
}
response = gemini.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=outline_request_config,
)

# Validate the raw JSON reply into an OutlineOutput instance.
outline = OutlineOutput.model_validate_json(response.text)

print("===== OUTLINE INFORMATION =====")
print(f"Total sections: {len(outline.sections)}")
outline_as_dict = outline.model_dump()
print(json.dumps(outline_as_dict, indent=2, ensure_ascii=False))

===== OUTLINE INFORMATION =====
Total sections: 36
{
  "sections": [
    {
      "section_id": 1,
      "title": "WWDC Welcome and Vision",
      "start": 0.0,
      "end": 141.44,
      "keywords": [
        "WWDC",
        "welcome",
        "developers",
        "intelligence",
        "platforms"
      ]
    },
    {
      "section_id": 2,
      "title": "Apple TV+ 5th Anniversary and New Content",
      "start": 141.44,
      "end": 312.56,
      "keywords": [
        "Apple TV+",
        "anniversary",
        "original shows",
        "movies",
        "entertainment"
      ]
    },
    {
      "section_id": 3,
      "title": "VisionOS 2 Introduction and Core Features",
      "start": 312.56,
      "end": 612.96,
      "keywords": [
        "VisionOS 2",
        "Apple Vision Pro",
        "spatial photos",
        "Mac Virtual Display",
        "gestures"
      ]
    },
    {
      "section_id": 4,
      "title": "VisionOS 2 Developer APIs and Content Creation",
      "start": 

In [3]:
# Segment transcript according to outline


from preprocess.segmenter import TranscriptSegmenter
import json

# Parse the transcript from its timestamps
segmenter = TranscriptSegmenter(video_transcript)

# Segment the transcript according to the outline's sections
outlined_sections = segmenter.segment_by_outline(outline.sections)

print("===== SEGMENTED SECTIONS =====")
print(f"Total sections: {len(outlined_sections)}")
segmented_json = json.dumps(outlined_sections, indent=2, ensure_ascii=False)
print(segmented_json)

===== SEGMENTED SECTIONS =====
Total sections: 36
[
  {
    "section_id": 1,
    "title": "WWDC Welcome and Vision",
    "start": 0.0,
    "end": 141.44,
    "text": "big day everyone. Biggest day of the year. Who's fired up? Beth is fired up. doctor rockwell. one more thing. Just have fun out there. okay, phil. i'm getting too old for this stuff. go. it's showtime. heat up here. wow, that was so cool. Good morning. welcome to apple park. We're glad you could join us for what promises to be an action-packed and memorable wwdc. wwdc marks a moment in the year when we're able to celebrate our global developer community. Developers continue to amaze us with the apps they create for our products. Apps that are used by over a billion people around the world. it's important for us to provide this community with the newest tools and technologies to do their very best work. today, we're going to have some incredible updates to our platforms. And i'm excited that we'll introduce profound new in

In [4]:
# Summarize each section with memory

from llm.gemini_client import GeminiClient
from llm.prompts import build_section_summary_prompt
from schemas.output_format import SectionSummaryOutput


# `memory` carries the previous section's summary into the next prompt;
# it is empty for the first section.
memory = ""
section_summaries = []

for section in outlined_sections:
    section_text = section["text"]

    prompt = build_section_summary_prompt(
        section_text=section_text,
        memory=memory,
        video_language=result["language"],
    )

    # Request a JSON response described by the SectionSummaryOutput schema.
    summary_request_config = {
        "response_mime_type": "application/json",
        "response_json_schema": SectionSummaryOutput.model_json_schema(),
    }
    response = gemini.models.generate_content(
        model="models/gemini-2.5-flash-lite",
        contents=prompt,
        config=summary_request_config,
    )

    # Parse the JSON output
    summary_obj = SectionSummaryOutput.model_validate_json(response.text)

    # Save the result: outline metadata first, then the generated summary
    summary_record = {field: section[field] for field in ("section_id", "title", "start", "end")}
    summary_record["summary"] = summary_obj.summary
    section_summaries.append(summary_record)

    # Update memory with the summary just produced
    memory = summary_obj.summary

print(json.dumps(section_summaries, indent=2, ensure_ascii=False))


[
  {
    "section_id": 1,
    "title": "WWDC Welcome and Vision",
    "start": 0.0,
    "end": 141.44,
    "summary": "The video opens with an energetic welcome to WWDC, emphasizing its significance for the global developer community. Apple highlights the role of developers in creating apps used by over a billion people worldwide and their commitment to providing them with the latest tools and technologies. The presentation promises significant updates to their platforms and the introduction of profound new intelligence capabilities aimed at inspiring developers, delighting users, and enhancing platform usefulness."
  },
  {
    "section_id": 2,
    "title": "Apple TV+ 5th Anniversary and New Content",
    "start": 141.44,
    "end": 312.56,
    "summary": "The section celebrates the fifth anniversary of Apple TV+, highlighting its status as a premier entertainment platform recognized for critically acclaimed and highly-rated original content. It mentions award-winning shows and movie

In [5]:
# Global Summary

from llm.gemini_client import GeminiClient
from llm.prompts import build_global_summary_prompt
from schemas.output_format import GlobalSummaryOutput

# Flatten the per-section summaries into one text block, one entry per
# section, each followed by a blank line.
section_summaries_text = "".join(
    f"Section {sec['section_id']} - {sec['title']}:\n{sec['summary']}\n\n"
    for sec in section_summaries
)

prompt = build_global_summary_prompt(
    section_summaries=section_summaries_text,
    video_language=result["language"],
)

# Request a JSON response described by the GlobalSummaryOutput schema.
global_request_config = {
    "response_mime_type": "application/json",
    "response_json_schema": GlobalSummaryOutput.model_json_schema(),
}
response = gemini.models.generate_content(
    model="models/gemini-2.5-flash-lite",
    contents=prompt,
    config=global_request_config,
)

# Validate the raw JSON reply into a GlobalSummaryOutput instance.
overall_summary = GlobalSummaryOutput.model_validate_json(response.text)

print("===== GLOBAL SUMMARY =====")
overall_as_dict = overall_summary.model_dump()
print(json.dumps(overall_as_dict, indent=2, ensure_ascii=False))

===== GLOBAL SUMMARY =====
{
  "global_summary": "WWDC 2024 showcased significant advancements across Apple's platforms, including major updates to visionOS, iOS 18, iPadOS 18, and macOS Sequoia. A central theme was the introduction of 'Apple Intelligence,' a new personal intelligence system deeply integrated into Apple devices, leveraging generative AI and large language models for enhanced user experiences while prioritizing privacy. Key features include advanced writing and image generation tools, a vastly improved Siri, contextual awareness across apps, and on-device processing with private cloud compute for complex tasks. Platform updates focus on extensive customization for iOS 18 (home screen, Control Center), enhanced privacy and messaging features, redesigned Photos app, and new capabilities for Apple Watch, Apple TV, and AirPods. iPadOS 18 introduces a powerful Calculator with Math Notes and Smart Script in Notes. macOS Sequoia brings iPhone Mirroring, improved window tiling,