In [6]:

!pip install -q youtube-transcript-api sentence-transformers transformers nltk


from youtube_transcript_api import YouTubeTranscriptApi
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt_tab')
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import time

def fetch_transcript_text(video_id):
    """
    Return transcript text as a single string for a YouTube video id.

    Each transcript item contributes its text: items that are dicts with a
    'text' key contribute that value; any other item contributes its ``text``
    attribute, falling back to ``str(item)`` when the attribute is missing.
    The pieces are joined with single spaces.

    Args:
        video_id: YouTube video id passed to ``YouTubeTranscriptApi().fetch``.

    Returns:
        The transcript as one space-joined string.

    Raises:
        RuntimeError: if fetching or reading the transcript fails. The
            original exception is attached as ``__cause__`` and its message
            is included so the failure can be diagnosed.
    """
    try:
        transcript = YouTubeTranscriptApi().fetch(video_id)
        texts = [
            item["text"]
            if isinstance(item, dict) and "text" in item
            else getattr(item, "text", str(item))
            for item in transcript
        ]
        return " ".join(texts)
    except Exception as exc:
        # Keep the RuntimeError contract for callers, but name the video and
        # chain the underlying error instead of discarding it.
        raise RuntimeError(
            f"Failed to fetch transcript for video {video_id!r}: {exc}"
        ) from exc

# SBERT model for embeddings / scoring.
# Loaded once at module level and shared by the relevance scoring, coherence
# scoring and de-duplication steps below (all of them call `sbert.encode`).
sbert_model_name = "all-MiniLM-L6-v2"
sbert = SentenceTransformer(sbert_model_name)

# BART setup (abstractive).
# Tokenizer and model are loaded from the same "facebook/bart-large-cnn"
# checkpoint and are used together by `abstractive_rewrite`.
# NOTE(review): both loads run at import / cell-execution time and presumably
# download weights on first use — confirm caching if startup time matters.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Scoring functions
def compute_relevance_scores(sentences):
    """
    Score each sentence by how close it is to the document as a whole.

    Relevance is the cosine similarity between a sentence embedding and the
    mean of all sentence embeddings, min-max normalized to 0..1 across the
    document.

    Args:
        sentences: list of sentence strings.

    Returns:
        A ``(scores, embeds)`` tuple: ``scores`` is a list of floats with one
        entry per sentence, ``embeds`` is the tensor of sentence embeddings
        returned by the encoder (reusable by later steps). For empty input
        the result is ``([], <empty tensor>)`` and the encoder is not called.
    """
    if not sentences:
        return [], torch.empty(0)

    embeds = sbert.encode(sentences, convert_to_tensor=True, show_progress_bar=False)
    doc_embed = embeds.mean(dim=0, keepdim=True)
    # reshape(-1) instead of squeeze(): with a single sentence squeeze() would
    # collapse the result to a 0-d array and tolist() would return a bare float.
    cos = util.cos_sim(embeds, doc_embed).reshape(-1).cpu().numpy()
    # normalize to 0..1 (epsilon avoids division by zero when all scores match)
    cos = (cos - cos.min()) / (cos.max() - cos.min() + 1e-8)
    return cos.tolist(), embeds

def compute_coherence_scores(sentences, sent_embeds=None):
    """
    Rate how well each sentence follows the one before it.

    The score of sentence i is the cosine similarity between its embedding and
    the embedding of sentence i-1, shifted from [-1, 1] into [0, 1]. The first
    sentence has no predecessor and always scores 0.0.

    Args:
        sentences: list of sentence strings.
        sent_embeds: optional precomputed embeddings, one per sentence; they
            are computed with the shared SBERT model when omitted.

    Returns:
        A list of floats with the same length as ``sentences``.
    """
    embeddings = sent_embeds
    if embeddings is None:
        embeddings = sbert.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    count = len(sentences)
    if count < 2:
        # Nothing to compare against: all zeros (or an empty list).
        return [0.0] * count

    # (sim + 1) / 2 maps the raw cosine similarity from [-1, 1] to [0, 1].
    following = [
        (util.cos_sim(embeddings[pos], embeddings[pos - 1]).item() + 1) / 2.0
        for pos in range(1, count)
    ]
    return [0.0] + following

# Select top-k extractive sentences using combined score
def select_extractive(sentences, top_k=8, alpha=0.7):
    """
    Pick the top-k sentences by a weighted mix of relevance and coherence.

    Args:
        sentences: list of sentence strings in document order.
        top_k: maximum number of sentences to select; values below zero are
            treated as zero and values above ``len(sentences)`` are capped.
        alpha: weight for relevance, (1-alpha) for coherence.

    Returns:
        A ``(selected, top_idx, sent_embeds)`` tuple: the selected sentences
        in their original order, their indices as plain Python ints in
        ascending order, and the sentence embeddings produced during scoring.
    """
    import numpy as np  # kept local, as in the original

    relevance_scores, sent_embeds = compute_relevance_scores(sentences)
    coherence_scores = compute_coherence_scores(sentences, sent_embeds)
    # The relevance result may be a bare scalar when only one sentence is
    # scored; force a 1-D list so it can be zipped with the coherence scores.
    relevance_scores = np.atleast_1d(relevance_scores).tolist()

    final_scores = [
        alpha * r + (1 - alpha) * c
        for r, c in zip(relevance_scores, coherence_scores)
    ]

    # Clamp so a negative top_k cannot turn into a from-the-end slice.
    k = max(0, min(top_k, len(sentences)))
    idx_sorted = np.argsort(final_scores)[::-1]
    # Convert to plain ints (argsort yields numpy integers) and sort to keep
    # the original sentence order.
    top_idx = sorted(int(i) for i in idx_sorted[:k])
    selected = [sentences[i] for i in top_idx]
    return selected, top_idx, sent_embeds

# Abstractive rewrite with BART (input is a concatenated string)
def abstractive_rewrite(text, max_input_tokens=1024, max_output_tokens=500, min_output_tokens=120):
    """
    Feed text to BART and return abstractive summary string.

    Args:
        text: input string to summarize; it is truncated by the tokenizer to
            ``max_input_tokens`` tokens if longer.
        max_input_tokens: tokenizer truncation length.
        max_output_tokens: ``max_length`` passed to generation.
        min_output_tokens: ``min_length`` passed to generation.

    Returns:
        The decoded summary with special tokens removed.
    """
    inputs = bart_tokenizer([text], max_length=max_input_tokens, truncation=True, return_tensors="pt")
    # Put the tensors on whatever device the model lives on so the call keeps
    # working if the model is moved off the CPU.
    inputs = inputs.to(bart_model.device)
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            # Pass the mask explicitly rather than letting generate() infer it.
            attention_mask=inputs["attention_mask"],
            num_beams=5,
            length_penalty=1.0,
            max_length=max_output_tokens,
            min_length=min_output_tokens,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return summary

# 8) Hybrid fusion and deduplication
def hybrid_fusion(extractive_sents, abstractive_summary, dedupe_threshold=0.82):
    """
    Combine abstractive summary (split to sentences) with extractive sentences.

    Candidates are considered in order: abstractive sentences first, then the
    extractive sentences in the order given. A candidate is dropped when its
    embedding has cosine similarity >= ``dedupe_threshold`` with any candidate
    already kept, so abstractive sentences win over near-duplicate extractive
    ones.

    Args:
        extractive_sents: list of extractive sentences, already in the order
            they should appear.
        abstractive_summary: summary text; it is split into sentences here.
        dedupe_threshold: cosine similarity at or above which a candidate is
            treated as a duplicate.

    Returns:
        The kept sentences joined with single spaces (an empty string when
        there are no candidates).
    """
    candidates = list(sent_tokenize(abstractive_summary)) + list(extractive_sents)
    if not candidates:
        return ""

    # Embed every candidate in one batched call and compute all pairwise
    # similarities once, instead of one encoder pass per sentence.
    embeds = sbert.encode(candidates, convert_to_tensor=True, show_progress_bar=False)
    sims = util.cos_sim(embeds, embeds).tolist()

    kept = []
    for i in range(len(candidates)):
        if all(sims[i][j] < dedupe_threshold for j in kept):
            kept.append(i)

    # Abstractive candidates precede extractive ones in `candidates`, so the
    # ascending kept indices already give "abstractive first, then extractive".
    return " ".join(candidates[i] for i in kept)

# 9) Full pipeline function
def summarize_youtube(video_id, top_k=8, alpha=0.7):
    """
    Run the full hybrid summarization pipeline for one YouTube video.

    Steps: fetch the transcript, split it into sentences, select the top
    sentences with the relevance/coherence score, rewrite them with BART, then
    fuse the abstractive and extractive sentences with de-duplication.
    Progress and previews are printed along the way.

    Args:
        video_id: YouTube video id.
        top_k: number of extractive sentences to select.
        alpha: relevance weight used by the extractive scoring.

    Returns:
        A dict with keys "video_id", "extractive_selected_sentences",
        "abstractive_summary", "final_summary" and "time_sec".

    Raises:
        ValueError: if the transcript yields no sentences to summarize.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    t0 = time.perf_counter()
    print(f"Fetching transcript for {video_id} ...")
    text = fetch_transcript_text(video_id)
    print("Transcript length (chars):", len(text))
    # sentence tokenization
    sentences = sent_tokenize(text)
    print("Total sentences:", len(sentences))
    if not sentences:
        # Fail here with a clear message rather than deep inside the scoring.
        raise ValueError(f"Transcript for video {video_id!r} contains no sentences to summarize")

    # extractive selection with coherence-aware scoring
    selected_sents, selected_idx, _ = select_extractive(sentences, top_k=top_k, alpha=alpha)
    print("Selected extractive sentences count:", len(selected_sents))
    # Print plain ints so numpy integer reprs do not clutter the log.
    print("Selected indices:", [int(i) for i in selected_idx])

    # prepare input for BART: join selected sentences (preserve order)
    bart_input = " ".join(selected_sents)
    print("\n--- BART input preview ---\n", bart_input[:800], "\n---\n")

    # abstractive rewriting
    abstractive_summary = abstractive_rewrite(bart_input)
    print("\n--- Abstractive summary (BART) preview ---\n", abstractive_summary[:800], "\n---\n")

    # hybrid fusion
    final_summary = hybrid_fusion(selected_sents, abstractive_summary)
    elapsed = time.perf_counter() - t0

    print("Hybrid summary generated in {:.1f}s".format(elapsed))
    return {
        "video_id": video_id,
        "extractive_selected_sentences": selected_sents,
        "abstractive_summary": abstractive_summary,
        "final_summary": final_summary,
        "time_sec": elapsed,
    }


# Demo run: summarize one video and print the fused summary plus timing.
VIDEO_ID = "lOD_EE96jhM"  # replace with your video id
result = summarize_youtube(video_id=VIDEO_ID, top_k=10, alpha=0.7)

# One print with a newline separator emits the same three lines of output
# as three consecutive print calls.
print(
    "\n===== FINAL HYBRID SUMMARY =====\n",
    result["final_summary"],
    "\n===== DETAILS =====",
    sep="\n",
)
print("Time (s):", result["time_sec"])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Fetching transcript for lOD_EE96jhM ...
Transcript length (chars): 7390
Total sentences: 65
Selected extractive sentences count: 10
Selected indices: [np.int64(5), np.int64(6), np.int64(7), np.int64(14), np.int64(23), np.int64(34), np.int64(39), np.int64(42), np.int64(48), np.int64(62)]

--- BART input preview ---
 So a picture or a graph or a handwritten note that might contain some valuable information, but without a way to convert that visual data into a form the LLM understands, it's gonna be inaccessible, but this is where vision language models come in, or VLMs. So vision language models are multimodal. That means that they can take in text but they can also take in image files as well and interpret their meaning and then generate as a response a text based output. So here we have the model generate a natural language description of an image. So, vision language models, they don't just process images and text. And then the result of all of this is then output, which is a text out