# Automated YouTube video analysis pipeline
This project is a Google Colab–based pipeline that transforms YouTube videos into structured, verifiable knowledge. It accepts a YouTube URL or a direct link to a social media–hosted video, then performs end-to-end processing that includes:

- Video ingestion – Downloads and extracts audio/video from shared links

- Speech-to-text transcription – Converts spoken content into accurate, readable text

- Content analysis – Identifies key topics, claims, and themes

- Automated summarization – Produces a concise, human-readable summary

- Independent fact-checking – Cross-verifies extracted claims against external sources to assess credibility and accuracy

In [None]:
# Install all required libraries
# First command: upgrades bitsandbytes and accelerate and pins transformers to 4.57.6.
# transformers provides the Whisper pipeline and the Llama classes imported in later
# cells; BitsAndBytesConfig (used for 4-bit loading below) relies on bitsandbytes.
!pip install -q --upgrade bitsandbytes accelerate transformers==4.57.6
# Second command: yt-dlp is imported by the audio download cell.
# NOTE(review): openai-whisper and moviepy are not imported by any cell visible in
# this notebook — confirm they are still needed before removing them.
!pip install openai-whisper yt-dlp sentencepiece moviepy -q

In [None]:
# imports
from huggingface_hub import login
from google.colab import userdata
# Authenticate against the Hugging Face Hub.
# The access token is read from google.colab `userdata` under the key 'HF_TOKEN',
# and the login is also stored as a git credential.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

# Download audio from a YouTube URL

In [None]:
import yt_dlp

# Replace with your desired YouTube URL
video_url = "https://www.youtube.com/watch?v=VIDEO_ID"

# Base name and target codec are defined once so that the download template and
# the path handed to the transcription step cannot drift apart.
audio_basename = "youtube_audio"
audio_codec = "mp3"

# Configuration for high-quality audio extraction
ydl_opts = {
    # Prefer an m4a audio stream, then any audio-only stream, then the best overall.
    'format': 'm4a/bestaudio/best',
    # Let yt-dlp fill in the real container extension of whatever stream was
    # selected; the post-processor below then writes "<basename>.<codec>".
    'outtmpl': f'{audio_basename}.%(ext)s',
    # The output name is fixed, so only ever fetch the single video even if the
    # URL also carries a playlist id.
    'noplaylist': True,
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': audio_codec,
    }]
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])

# Path of the converted file produced by the FFmpegExtractAudio post-processor.
audio_file = f"{audio_basename}.{audio_codec}"
print("✅ Audio downloaded and ready for processing.")

# Transcribe Speech to Text

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Run on the first GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in full precision.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
dtype = torch.float16 if cuda_available else torch.float32

model_id = "openai/whisper-large-v3"

# The processor supplies both the tokenizer and the feature extractor for the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Load the speech-to-text model and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    dtype=dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    dtype=dtype,
    device=device,
)

# return_timestamps=True is passed so that long audio files can be handled.
result = pipe(audio_file, return_timestamps=True)

# Keep only the plain transcript text for the downstream LLM prompt.
full_transcript = result["text"]

In [None]:
# Display the complete transcript so it can be inspected before it is embedded
# in the LLM prompt further down.
print(full_transcript)

# Extract the main takeaways and summary using Llama

Sample with tokenizer and streamer

In [None]:
# Hugging Face Hub model id passed to AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained in the generation cell below.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# System instruction: defines the reviewer role and the output format.
system_message = """
You identify key topics, claims, and themes in a YouTube video transcript and generate a transcript review in markdown format without code blocks.
"""

# User prompt: embeds the transcript and asks for fixed, bold section headings.
# The post-processing cell locates the "Myths Debunked", "Action Items" and
# "Independent Analysis" sections by these headings, so they are requested
# explicitly instead of being left to the model's choice of wording.
user_prompt = f"""
Below is the extracted transcript of a YouTube health video blog.
Please write a review of the video with exactly these four sections, in this order,
each introduced by its bold heading on its own line:
**Summary** - a summary of the content
**Myths Debunked** - any myths that are debunked
**Action Items** - action items proposed in the content
**Independent Analysis** - your independent analysis of the content accuracy

Transcription:
{full_transcript}
"""

# Chat-format conversation consumed by tokenizer.apply_chat_template.
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt},
]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

In [None]:
# 4-bit quantization settings handed to AutoModelForCausalLM.from_pretrained:
# NF4 quantization type with double quantization enabled, computing in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load the tokenizer and use the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model before building the inputs so the prompt can be
# placed on the device the model actually landed on (device_map="auto")
# instead of a hard-coded "cuda".
# NOTE: this rebinds `model`, which referred to the Whisper model in the
# transcription cell.
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

# add_generation_prompt=True appends the assistant header to the prompt so the
# model begins its reply directly; without it the prompt ends after the user
# turn and the model has to produce that header on its own.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Stream tokens to the cell output while generating.
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

# Content after summary

In [None]:
import re

# Decode the full output from the model, skipping special tokens
# (kept for display and as the fallback text).
full_output_string = tokenizer.decode(outputs[0], skip_special_tokens=True)

# The assistant header consists of special tokens, so it is absent from
# `full_output_string`. Decode a second time WITHOUT stripping special tokens
# to be able to locate where the assistant's reply begins.
raw_output_string = tokenizer.decode(outputs[0], skip_special_tokens=False)

# Find the start of the assistant's actual response (last header in the text).
assistant_response_start_marker = "<|start_header_id|>assistant<|end_header_id|>"
_, marker_found, text_after_marker = raw_output_string.rpartition(assistant_response_start_marker)
if marker_found:
    # Remove leftover chat-template control tokens such as <|eot_id|>.
    assistant_response = re.sub(r"<\|[^|<>\s]+\|>", "", text_after_marker).strip()
else:
    # Fallback if the marker is not found: use the whole decoded output.
    assistant_response = full_output_string

# Extract and print the block of content after the Summary section.
# A section start is a line that opens with a markdown heading ("#".."######")
# or bold marker ("**" / "__"), optionally preceded by a list bullet or number,
# and whose first words are one of the expected section titles. Matching is
# case-insensitive so "## Myths debunked" and "**Myths Debunked:**" both count.
section_heading_pattern = re.compile(
    r"^[ \t]*(?:(?:[-*+]|\d+[.)])[ \t]+)?"
    r"(?:#{1,6}[ \t]+|\*\*|__)"
    r"[^A-Za-z\n]*"
    r"(?:Myths Debunked|Action Items|Independent Analysis)\b",
    re.IGNORECASE | re.MULTILINE,
)

# search() returns the leftmost match, i.e. the earliest section after the summary.
first_section = section_heading_pattern.search(assistant_response)

content_after_summary_block = ""
if first_section:
    content_after_summary_block = assistant_response[first_section.start():].strip()
    print(f"\n### Content after Summary:\n{content_after_summary_block}")
else:
    print("\nCould not locate the content after the summary section.")