# 📹 YouTube Transcript Summarizer

This notebook takes a list of YouTube video URLs, fetches each video's transcript with the `youtube-transcript-api` package, and produces a single combined summary of all the transcripts using the Inception Labs API.

In [10]:
%pip install youtube_transcript_api

Note: you may need to restart the kernel to use updated packages.


In [11]:
# ✅ Required libraries
import os
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from urllib.parse import urlparse, parse_qs
from IPython.display import Markdown, display

In [12]:
# Get API key from environment variable
import os
import pathlib
from dotenv import load_dotenv

# Locate the nearest .env file: start at the working directory and climb
# through its ancestors, loading the first one that exists.
current_dir = pathlib.Path.cwd()
found = False
for directory in (current_dir, *current_dir.parents):
    env_path = directory / '.env'
    if not env_path.exists():
        continue
    print(f"Loading .env from {env_path}")
    load_dotenv(env_path)
    found = True
    break

# The key is read from the process environment (which load_dotenv may have
# just populated); the notebook cannot continue without it.
API_KEY = os.getenv('SECRET__INCEPTION_LABS__API_KEY')
if not API_KEY:
    raise ValueError("SECRET__INCEPTION_LABS__API_KEY is not set. Please set the SECRET__INCEPTION_LABS__API_KEY environment variable.")

Loading .env from /home/superdev/projects/OpenMates/.env


In [None]:
# 🔗 Input: List of YouTube video URLs
# Talks from partners
partner_talk_urls = [
    "https://youtube.com/watch?v=xlEQ6Y3WNNI",
    "https://youtube.com/watch?v=UjboGsztHd8",
    "https://youtube.com/watch?v=67a5yrKH-nI",
    "https://youtube.com/watch?v=Ac4LiuoJT20",
]

# Possibly talks from Anthropic (unconfirmed)
additional_talk_urls = [
    "https://youtube.com/watch?v=XSZP9GhhuAc",
    "https://youtube.com/watch?v=ysPbXH0LpIE",
    "https://youtube.com/watch?v=j8NlbEWAsmc",
    "https://youtube.com/watch?v=HNzH5Us1Rvg",
    "https://youtube.com/watch?v=gv0WHhKelSE",
    "https://youtube.com/watch?v=dRsjO-88nBs",
]

# Full input list, in the order the videos are processed
youtube_urls = partner_talk_urls + additional_talk_urls

In [14]:
# 🔍 Helper function to extract video ID
def extract_video_id(url):
    """Return the YouTube video ID contained in ``url``, or ``None``.

    Recognised forms:

    * ``youtube`` hosts with a ``v`` query parameter (``/watch?v=ID``)
    * ``youtube`` hosts with a path form (``/embed/ID``, ``/shorts/ID``,
      ``/live/ID``, ``/v/ID``)
    * ``youtu.be/ID`` short links

    URLs written without a scheme (``youtube.com/watch?v=ID``) are accepted.

    Args:
        url: The URL to inspect. Non-string or blank values yield ``None``.

    Returns:
        The video ID as a string, or ``None`` when no ID can be extracted.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()
    if not url:
        return None

    try:
        parsed = urlparse(url)
        # Without a scheme urlparse puts the host into `path` and leaves
        # `hostname` as None; re-parse as a network-path reference instead.
        if not parsed.scheme and not parsed.netloc:
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced brackets).
        return None

    segments = [segment for segment in parsed.path.split("/") if segment]

    if "youtube" in host:
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id:
            return video_id
        # Path-based URL forms carry the ID as the second path segment.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]
        return None

    if "youtu.be" in host:
        # Only the first segment is the ID; ignore trailing slashes/segments.
        return segments[0] if segments else None

    return None

In [19]:
# 📝 Fetch transcripts for all videos
# Accumulators: one record per fetched video, the combined markdown handed to
# the summarizer, and the counters reported at the end of the cell.
all_transcripts, all_transcripts_md = [], ""
success_count = failed_count = total_word_count = 0

print(f"🎬 Fetching transcripts for {len(youtube_urls)} videos...")

# Create the transcript client and the formatter used to turn a transcript into text
ytt_api = YouTubeTranscriptApi()
formatter = TextFormatter()

for url in youtube_urls:
    video_id = extract_video_id(url)
    print(f"🔍 Processing URL: {url} (ID: {video_id})")

    if video_id:
        try:
            # Download the transcript and format it to text
            transcript = ytt_api.fetch(video_id)
            formatted_transcript = formatter.format_transcript(transcript)
            word_count = len(formatted_transcript.split())

            # Record the result and extend the running totals
            total_word_count += word_count
            all_transcripts.append(dict(
                video_id=video_id,
                url=url,
                transcript=formatted_transcript,
                word_count=word_count,
            ))

            # One markdown section per video, separated by a horizontal rule
            all_transcripts_md += f"## Video: {url}\n\n{formatted_transcript}\n\n---\n\n"

            success_count += 1
        except Exception as e:
            # Best effort: report the failure and move on to the next video
            print(f"❌ Error fetching transcript for {url}: {e}")
            failed_count += 1
    else:
        # No video ID could be extracted from this URL
        failed_count += 1

# Print summary statistics (guarding against an empty input list)
if youtube_urls:
    success_rate = success_count / len(youtube_urls) * 100
else:
    success_rate = 0
print(f"✅ Successfully fetched {success_count} transcripts ({success_rate:.1f}% success rate)")
print(f"❌ Failed to fetch {failed_count} transcripts")
print(f"📊 Total word count: {total_word_count:,} words")

🎬 Fetching transcripts for 10 videos...
🔍 Processing URL: https://youtube.com/watch?v=xlEQ6Y3WNNI (ID: xlEQ6Y3WNNI)
❌ Error fetching transcript for https://youtube.com/watch?v=xlEQ6Y3WNNI: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=xlEQ6Y3WNNI! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate

In [16]:
# 🧠 Function to summarize all transcripts with Inception Labs
def summarize_all_transcripts(
    transcript_markdown: str,
    model: str = "mercury-coder",
    max_tokens: int = 2000,
    timeout: float = 120,
) -> str:
    """Summarize a set of video transcripts with the Inception Labs chat API.

    Args:
        transcript_markdown: Markdown text containing the transcripts; it is
            embedded verbatim in the prompt.
        model: Model name sent in the request body.
        max_tokens: ``max_tokens`` value sent in the request body.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the API response. On any
        failure (network error, HTTP error status, unexpected response shape)
        a string starting with ``"❌ Error during summarization:"`` is
        returned instead, so the result is always safe to render as markdown.
    """
    prompt = f"""
You are an expert summarizer. Below are transcripts of several YouTube videos in markdown format.

Please extract the key themes, highlights, and insights from the entire set of videos.
Organize your summary using clear markdown structure: use headings, bullet points, and concise descriptions.

---
{transcript_markdown}
---
"""
    try:
        response = requests.post(
            "https://api.inceptionlabs.ai/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {API_KEY}"
            },
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": max_tokens
            },
            # Without a timeout the call can block the notebook indefinitely.
            timeout=timeout,
        )
        # Report HTTP errors explicitly instead of failing later with an
        # opaque KeyError on the missing "choices" field.
        if response.status_code >= 400:
            return f"❌ Error during summarization: HTTP {response.status_code} - {response.text[:500]}"
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Always return a string so callers can pass the result to Markdown().
        return f"❌ Error during summarization: {e}"

In [17]:
# 🧠 Summarize combined transcripts
# Only call the summarizer when at least one transcript was collected.
if all_transcripts_md == "":
    summary = "No transcripts available to summarize."
else:
    print("🧠 Summarizing all transcripts...")
    summary = summarize_all_transcripts(all_transcripts_md)

# 📤 Display final result: a heading followed by the summary body
summary_heading = Markdown("## 🧠 Combined Video Summary")
display(summary_heading)
display(Markdown(summary))

## 🧠 Combined Video Summary

No transcripts available to summarize.