In [1]:
# %pip install google-api-python-client
# %pip install python-dotenv

In [2]:
from googleapiclient.discovery import build
import pandas as pd
import time
import os
from dotenv import load_dotenv

# Pull the variables defined in the local .env file into the process environment.
load_dotenv()

# Two YouTube Data API keys, read from the environment (None when a variable is unset).
API_KEY_1 = os.environ.get("YOUTUBE_API_KEY_1")
API_KEY_2 = os.environ.get("YOUTUBE_API_KEY_2")

# YouTube API service name and version.
YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION = "youtube", "v3"

# Report whether both keys were found; this only prints, it does not stop execution.
print(
    "Both API keys loaded from .env"
    if all((API_KEY_1, API_KEY_2))
    else "One or both API keys are missing. Check your .env file."
)

Both API keys loaded from .env


In [3]:
# Search queries, grouped by theme; one video search is run per entry.
TOPICS = [
    # --- Economy & Society ---
    "Finance",
    "Economics",
    "Healthcare",
    "Education",
    "Free Speech",
    "Gender Identity",
    # --- Tech & Science ---
    "Technology",
    "Artificial Intelligence",
    "Data Privacy",
    "Space Exploration",
    "Environmental Science",
    # --- Health & Wellness ---
    "Mental Health",
    "Physical Fitness",
    "Diet and Nutrition",
    "Veganism",
    "Parenting",
    # --- Culture & Lifestyle ---
    "Religion",
    "Social Media",
    "Entertainment",
    "Celebrity Culture",
    "Relationships",
    # --- Pop Content ---
    "Gaming",
    "Sports",
    "Music",
    "Movies and TV Shows",
    "Fashion and Style",
]

topic_count = len(TOPICS)
print(f"Loaded {topic_count} topics.")

Loaded 26 topics.


In [4]:
def get_top_video_urls(query, api_key, max_results=200):
    """Collect watch URLs for videos matching a YouTube search query.

    Pages through ``search().list`` results until ``max_results`` URLs have
    been gathered or the API stops returning usable pages.

    Args:
        query: Search string passed as the ``q`` parameter.
        api_key: Developer key used to build the API client.
        max_results: Upper bound on the number of URLs returned. Values of
            zero or less yield an empty list without issuing a search.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<id>`` strings, in the
        order the API returned them. May be shorter than ``max_results``.

    Raises:
        Any exception raised by the API client while building the service or
        executing a request is propagated unchanged.
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    video_urls = []
    next_page_token = None

    while len(video_urls) < max_results:
        search_response = youtube.search().list(
            q=query,
            type="video",
            part="id",
            # The API accepts at most 50 results per page; ask only for what is still needed.
            maxResults=min(50, max_results - len(video_urls)),
            pageToken=next_page_token,
        ).execute()

        items = search_response.get("items", [])
        for item in items:
            # Skip entries without a video id instead of raising KeyError.
            video_id = item.get("id", {}).get("videoId")
            if video_id:
                video_urls.append(f"https://www.youtube.com/watch?v={video_id}")

        next_page_token = search_response.get("nextPageToken")
        # Stop when there is no further page, when the page came back empty
        # (avoids spending quota on a run of empty pages), or when enough URLs
        # were collected (avoids a pointless final sleep).
        if not items or not next_page_token or len(video_urls) >= max_results:
            break

        time.sleep(1)  # Pause between page requests to avoid the rate limit

    return video_urls

print("Search function ready for multiple API keys.")

Search function ready for multiple API keys.


In [7]:
# One {"topic", "url"} record per collected video; reset on every run of this cell.
all_data = []

for i, topic in enumerate(TOPICS):
    # The first 15 topics go through key 1; every later topic uses key 2.
    key_name, api_key = ("API_KEY_1", API_KEY_1) if i < 15 else ("API_KEY_2", API_KEY_2)
    print(f"[{i+1}/{len(TOPICS)}] Collecting videos for: {topic} using {key_name}")

    try:
        urls = get_top_video_urls(topic, api_key, max_results=200)
        print(f"Got {len(urls)} videos for {topic}")
        all_data.extend({"topic": topic, "url": url} for url in urls)
    except Exception as err:
        # Best effort: report the failure and move on to the next topic.
        print(f"Failed to fetch for {topic}: {err}")

print(f"\nDone collecting. Total videos collected: {len(all_data)}")

[1/26] Collecting videos for: Finance using API_KEY_1
Got 200 videos for Finance
[2/26] Collecting videos for: Economics using API_KEY_1
Got 200 videos for Economics
[3/26] Collecting videos for: Healthcare using API_KEY_1
Got 200 videos for Healthcare
[4/26] Collecting videos for: Education using API_KEY_1
Got 200 videos for Education
[5/26] Collecting videos for: Free Speech using API_KEY_1
Got 200 videos for Free Speech
[6/26] Collecting videos for: Gender Identity using API_KEY_1
Got 200 videos for Gender Identity
[7/26] Collecting videos for: Technology using API_KEY_1
Got 200 videos for Technology
[8/26] Collecting videos for: Artificial Intelligence using API_KEY_1
Got 200 videos for Artificial Intelligence
[9/26] Collecting videos for: Data Privacy using API_KEY_1
Got 200 videos for Data Privacy
[10/26] Collecting videos for: Space Exploration using API_KEY_1
Got 200 videos for Space Exploration
[11/26] Collecting videos for: Environmental Science using API_KEY_1
Got 200 videos

In [9]:
# Build the table with explicit columns so the CSV keeps its "topic,url"
# header even when no videos were collected.
df = pd.DataFrame(all_data, columns=["topic", "url"])
csv_filename = "../data/youtube_topic_links.csv"

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
df.to_csv(csv_filename, index=False)

print(f"Saved all video links to {csv_filename}")
df.head()

Saved all video links to ../data/youtube_topic_links.csv


Unnamed: 0,topic,url
0,Finance,https://www.youtube.com/watch?v=IKXiyApvKjI
1,Finance,https://www.youtube.com/watch?v=4yohVh4qcas
2,Finance,https://www.youtube.com/watch?v=C_UeYBBogPA
3,Finance,https://www.youtube.com/watch?v=WEDIj9JBTC8
4,Finance,https://www.youtube.com/watch?v=Izw-xaVkO0g


In [None]:
# optional quick test with one topic

# test_topic = "Gaming"
# test_urls = get_top_video_urls(test_topic, API_KEY_1, max_results=10)
# print(f"🎮 Sample links for {test_topic}:")
# for url in test_urls:
#     print(url)