In [None]:
# Run only if not already installed
%pip install youtube-transcript-api pytube pandas

  from .autonotebook import tqdm as notebook_tqdm


/usr/local/bin/python3

🔗 Video URL: https://www.youtube.com/watch?v=dQw4w9WgXcQ
📝 Summary:
 The YouTube video is a recording of Rick Astley's song, "Never Gonna Give You Up."  The transcript is largely the lyrics of the song, repeatedly listing the song's famous refrain:  "Never gonna give you up, never gonna let you down, never gonna run around and desert you."  The video likely features the audio and possibly the music video for the song.



In [1]:
import re
import sys
import pandas as pd
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi

# Show the interpreter path so it is obvious which environment backs this kernel
python_path = sys.executable
print(f"Using Python from: {python_path}")

Using Python from: /usr/local/bin/python3


In [2]:
# Read the topic/URL list; the CSV must already exist at this relative path
link_df = pd.read_csv(filepath_or_buffer="../data/youtube_topic_links.csv")

# Report how many rows were read, then preview the first few
print(f"Loaded {link_df.shape[0]} video links")
link_df.head()


Loaded 5200 video links


Unnamed: 0,topic,url
0,Finance,https://www.youtube.com/watch?v=IKXiyApvKjI
1,Finance,https://www.youtube.com/watch?v=4yohVh4qcas
2,Finance,https://www.youtube.com/watch?v=C_UeYBBogPA
3,Finance,https://www.youtube.com/watch?v=WEDIj9JBTC8
4,Finance,https://www.youtube.com/watch?v=Izw-xaVkO0g


In [3]:
def extract_video_id(url):
    """Pull the 11-character YouTube video ID out of a URL.

    Recognises IDs given as a ``v=`` query value (``watch?v=<id>``) or as a
    path segment (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``, ...).

    Args:
        url: The video URL. Non-string values (e.g. NaN from a CSV with
            missing cells) are accepted and yield ``None``.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    if not isinstance(url, str):
        return None

    # Strip "scheme://host" before searching: a hostname such as
    # "youtube-nocookie.com" starts with 11 ID-alphabet characters right after
    # the "//" and would otherwise be returned as the video ID.
    location = re.sub(r"^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*", "", url.strip())

    # The negative lookahead requires the ID to be exactly 11 characters, so a
    # longer token (e.g. a 24-character channel ID) is not truncated into a
    # bogus video ID.
    match = re.search(r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", location)
    return match.group(1) if match else None

#extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

In [4]:
def _join_transcript_text(entries):
    """Join the text of transcript entries into one space-separated string.

    Handles both entry shapes youtube-transcript-api can produce: plain dicts
    with a "text" key (older releases) and snippet objects exposing a
    ``.text`` attribute (newer releases).
    """
    return " ".join(
        entry["text"] if isinstance(entry, dict) else entry.text
        for entry in entries
    )


def get_transcript(video_id):
    """Return a video's transcript in English as a single string.

    Strategy:
      1. Request an English transcript directly.
      2. If that fails, walk the available transcripts and return the first
         one that can be translated to English.

    Older youtube-transcript-api releases expose static ``get_transcript`` /
    ``list_transcripts`` helpers; newer ones only offer the instance methods
    ``fetch`` / ``list``. Whichever is present on the installed version is used.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The transcript text, or ``None`` if none could be obtained. Failures
        in step 2 are printed rather than raised, so a long batch run is not
        aborted by one bad video.
    """
    try:
        # Step 1: Try direct English transcript
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            entries = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        else:
            entries = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
        return _join_transcript_text(entries)
    except Exception:
        # Deliberate best-effort: no direct English transcript, so fall
        # through to the translation route below.
        pass

    try:
        # Step 2: Try any translatable transcript
        if hasattr(YouTubeTranscriptApi, "list_transcripts"):
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        else:
            transcript_list = YouTubeTranscriptApi().list(video_id)

        for transcript in transcript_list:
            if not transcript.is_translatable:
                continue
            try:
                translated = transcript.translate('en')
                return _join_transcript_text(translated.fetch())
            except Exception as e2:
                # This track failed; keep trying the remaining ones.
                print(f"⚠️ Could not translate transcript for {video_id}: {e2}")

        print(f"⚠️ No translatable transcript found for {video_id}")
        return None
    except Exception as e3:
        print(f"❌ Transcript Error for {video_id}: {e3}")
        return None

In [5]:
import requests

def get_video_title(url, timeout=10):
    """Look up a video's title through YouTube's oEmbed endpoint.

    Args:
        url: The video URL to look up.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang a long batch run.

    Returns:
        The title string, or ``None`` if the request or the response parsing
        fails (the error is printed rather than raised).
    """
    try:
        res = requests.get(
            "https://www.youtube.com/oembed",
            # Passing the video URL via `params` lets requests percent-encode
            # it; interpolated raw, a URL containing "&" (e.g. "...&t=42s")
            # would be split into separate oEmbed query parameters.
            params={"url": url, "format": "json"},
            timeout=timeout,
        )
        res.raise_for_status()
        return res.json()['title']
    except Exception as e:
        print(f"oEmbed Title Error: {e}")
        return None


#get_video_title("https://www.youtube.com/watch?v=dQw4w9WgXcQ")


In [None]:
# Walk every (topic, url) row, fetch its title and transcript, and keep the
# complete records in memory for the save step.
data = []
count = 0  # running tally of videos skipped so far

for i, row in link_df.iterrows():
    topic, url = row["topic"], row["url"]
    print(f"\n[{i+1}] Processing: {url} ({topic})")

    video_id = extract_video_id(url)
    title = get_video_title(url)
    # Without a video ID there is nothing to request a transcript for
    transcript = None if not video_id else get_transcript(video_id)

    if not (title and transcript):
        # Either lookup came back empty: count it and move on
        count += 1
        print(count)
        print("⚠️ Skipped (missing title or transcript)")
        continue

    print(f"✅ {title[:50]} — transcript length: {len(transcript)} chars")
    data.append(dict(topic=topic, url=url, title=title, transcript=transcript))
    print(count)




[1] Processing: https://www.youtube.com/watch?v=IKXiyApvKjI (Finance)
✅ How much money is in your bank account?  🤔💰  #shor — transcript length: 603 chars

[2] Processing: https://www.youtube.com/watch?v=4yohVh4qcas (Finance)
✅ EMERGENCY DEBATE: They Lied About The Economy Reco — transcript length: 163724 chars

[3] Processing: https://www.youtube.com/watch?v=C_UeYBBogPA (Finance)
✅ My honest advice to someone who wants financial fr — transcript length: 17506 chars

[4] Processing: https://www.youtube.com/watch?v=WEDIj9JBTC8 (Finance)
✅ William Ackman: Everything You Need to Know About  — transcript length: 63176 chars

[5] Processing: https://www.youtube.com/watch?v=Izw-xaVkO0g (Finance)
✅ 10 Crucial Personal Finance Lessons That Transform — transcript length: 17926 chars

[6] Processing: https://www.youtube.com/watch?v=Dugn51K_6WA (Finance)
✅ Money and Finance: Crash Course Economics #11 — transcript length: 11889 chars

[7] Processing: https://www.youtube.com/watch?v=NBSy9GrFagE (Fi

KeyboardInterrupt: 

In [14]:
# Persist everything gathered so far to a CSV in the current working directory
csv_filename = "youtube_transcripts.csv"
df = pd.DataFrame(data)
df.to_csv(path_or_buf=csv_filename, index=False)  # index=False: omit the row-number column
print(f"\n📁 Data saved locally to: {csv_filename}")
df.head()



📁 Data saved locally to: youtube_transcripts.csv


Unnamed: 0,topic,url,title,transcript
0,Finance,https://www.youtube.com/watch?v=IKXiyApvKjI,How much money is in your bank account? 🤔💰 #...,how much money is in your bank account zero do...
1,Finance,https://www.youtube.com/watch?v=4yohVh4qcas,EMERGENCY DEBATE: They Lied About The Economy ...,I'm I'm sick of multi-millionaires telling kid...
2,Finance,https://www.youtube.com/watch?v=C_UeYBBogPA,My honest advice to someone who wants financia...,this is my honest advice for anyone who wants ...
3,Finance,https://www.youtube.com/watch?v=WEDIj9JBTC8,William Ackman: Everything You Need to Know Ab...,"Hi, I'm Bill Ackman. I'm the CEO of Pershing S..."
4,Finance,https://www.youtube.com/watch?v=Izw-xaVkO0g,10 Crucial Personal Finance Lessons That Trans...,hey guys welcome back to the channel in this v...
