In [4]:
!pip install -U langchain faiss-cpu transformers sentence-transformers youtube-transcript-api langchain-community



Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect

In [8]:
import re
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Instantiate both models once, at cell level, so every helper below shares
# the same loaded weights instead of reloading them on each call.
# `embedding_model` turns text chunks into vectors; `summarizer` is the
# Hugging Face summarization pipeline used to produce the final summary.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

def extract_transcript(video_url):
    """Fetch the transcript of a YouTube video and return it as one string.

    Args:
        video_url: A YouTube link. The 11-character video ID is taken from a
            ``v=`` query parameter, a ``youtu.be/`` short link, or a
            ``/shorts/``, ``/embed/`` or ``/live/`` path.

    Returns:
        The text of every transcript segment, joined with single spaces.

    Raises:
        ValueError: If no video ID can be found in ``video_url``.
        Exception: Whatever the transcript library raises (for example when a
            video has no transcript) is propagated unchanged.
    """
    video_id_match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([a-zA-Z0-9_-]{11})",
        video_url,
    )
    if not video_id_match:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id_match.group(1)
    print(f"📥 Fetching transcript for video ID: {video_id}")

    if hasattr(YouTubeTranscriptApi, "fetch"):
        # Newer youtube-transcript-api releases use an instance-based API whose
        # fetch() result iterates over snippet objects with a `.text` attribute.
        # Prefer it, because the static get_transcript() is gone in recent
        # versions and this notebook installs the package unpinned.
        transcript = YouTubeTranscriptApi().fetch(video_id)
        segments = [snippet.text for snippet in transcript]
    else:
        # Older releases only offer the static call returning a list of dicts.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        segments = [entry['text'] for entry in transcript]
    return " ".join(segments)

def create_vectorstore(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks and index them in a FAISS store.

    Args:
        text: The text to index (in this notebook, a video transcript).
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        A ``(db, chunks)`` tuple: the FAISS vector store built from the chunk
        embeddings, and the list of chunk strings in split order.

    Raises:
        ValueError: If the splitter produces no chunks, because there is then
            nothing to embed or index.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_text(text)
    if not chunks:
        # Fail with a clear message instead of an opaque error from deep
        # inside the index construction.
        raise ValueError("Transcript is empty; nothing to index")

    # The chunks are already plain strings, so they can be embedded directly.
    embeddings = embedding_model.encode(chunks)

    # Pair each chunk with its precomputed embedding.
    text_embedding_pairs = list(zip(chunks, embeddings))

    # Hand FAISS the `encode` callable rather than the model object itself:
    # the store calls this argument on the query string when searching, and
    # the bare SentenceTransformer object cannot be called on raw text.
    db = FAISS.from_embeddings(text_embedding_pairs, embedding_model.encode)
    return db, chunks


def summarize_chunks(chunks, top_k=5):
    """Summarize the first ``top_k`` chunks with the summarization pipeline.

    Args:
        chunks: List of text chunks, in document order.
        top_k: How many leading chunks to include in the summarizer input.

    Returns:
        The generated summary text, or an empty string when there is no text
        to summarize.
    """
    # Only the leading chunks are used, capped at 4096 characters.
    input_text = " ".join(chunks[:top_k])[:4096]
    if not input_text.strip():
        # Nothing to summarize; skip the model call entirely.
        return ""
    # truncation=True lets the pipeline clip inputs that still exceed the
    # model's token limit instead of failing on long input.
    summary = summarizer(
        input_text,
        max_length=150,
        min_length=40,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

def summarize_youtube_video(video_url):
    """Run the whole pipeline for one video and print the resulting summary.

    The transcript is fetched, chunked and indexed, and the chunks are then
    summarized. Progress messages and the summary are written to stdout;
    nothing is returned.

    Args:
        video_url: Link to the YouTube video to summarize.

    Returns:
        None. If fetching the transcript fails, the error is printed and the
        function exits early.
    """
    try:
        full_text = extract_transcript(video_url)
    except Exception as e:
        # Only the transcript fetch is guarded; later failures propagate.
        print(f"❌ Error: {e}")
        return None

    print("🔍 Creating vector store...")
    # The store itself is not needed here, only the chunk list.
    _store, pieces = create_vectorstore(full_text)

    print("✍️ Summarizing transcript...")
    result = summarize_chunks(pieces)

    # Heading and summary are emitted together, separated by a newline.
    print("\n📄 Video Summary:\n", result, sep="\n")

# 👇 Interactive entry point (e.g. for Colab): ask for a video link, trim
# surrounding whitespace from what was pasted, then run the summarizer on it.
video_url = input("🔗 Paste your YouTube video link here: ")
video_url = video_url.strip()
summarize_youtube_video(video_url)


Device set to use cpu


🔗 Paste your YouTube video link here: https://www.youtube.com/watch?v=BmUZ2wp1lM8
📥 Fetching transcript for video ID: BmUZ2wp1lM8
🔍 Creating vector store...




✍️ Summarizing transcript...

📄 Video Summary:

Grav Stars are the most extreme objects in the Universe. They are Cosmic soap bubbles filled with pure energy and with a shell made of the weirdest material that's possible in nature. There might be an object so indestructible extreme and brutal that it could kill black holes.
