In [1]:
# import relevant libraries
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import os
import pandas as pd

In [2]:
# youtube api key
api_key = os.getenv("API_KEY")

# build api client
api_name = "youtube"
api_version = "v3"
youtube = build(api_name, api_version, developerKey=api_key)

In [3]:
# search for the first 100 video results for each query
queries = ["animal liberation", "vegan speech", "vegan outreach", "vegan lifestyle", "joey carbstrong", "earthling ed",
            "david ramms", "vegan gains", "wayne hsiung", "mic the vegan"]
video_ids = []

for query in queries:
    next_page_token = None

    for _ in range(2):
        request = youtube.search().list(
            part="id",
            type="video",
            q=query,
            maxResults=50,
            relevanceLanguage="en",
            order="relevance",
            pageToken=next_page_token
        )
        try:
            response = request.execute()
            video_ids += [item["id"].get("videoId") for item in response["items"]]
            next_page_token = response.get("nextPageToken")
        except HttpError as e:
            print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
            raise
        if not next_page_token:
            break

In [4]:
# de-duplicate video list and remove None values
unique_vid_ids = list(set([vid for vid in video_ids if vid is not None]))

# scrape data on unique videos
video_data = []

# collect channel ids
channel_ids = []
for i in range(0, len(unique_vid_ids), 50):
    request = youtube.videos().list(
        part="snippet, contentDetails, statistics",
        id=",".join(unique_vid_ids[i:i+50]),
        maxResults=50
    )
    try:
        response = request.execute()

        for item in response["items"]:
            snippet = item["snippet"]
            content = item["contentDetails"]
            stats = item.get("statistics", {})
            channel_id = snippet["channelId"]

            data = {
                "video_id": item["id"],
                "title": snippet.get("title"),
                "description": snippet.get("description"),
                "published_at": snippet.get("publishedAt"),
                "channel_title": snippet.get("channelTitle"),
                "category_id": snippet.get("categoryId"),
                "tags": snippet.get("tags", []),
                "duration": content.get("duration"),
                "definition": content.get("definition"),
                "caption": content.get("caption"),
                "view_count": int(stats.get("viewCount", 0)),
                "like_count": int(stats.get("likeCount", 0)),
                "comment_count": int(stats.get("commentCount", 0)),
                "channel_id": channel_id
            }
            channel_ids.append(channel_id)
            video_data.append(data)
    except HttpError as e:
        print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
        raise

In [5]:
# convert into a dataframe
video_df = pd.DataFrame(video_data)
video_df.head()

Unnamed: 0,video_id,title,description,published_at,channel_title,category_id,tags,duration,definition,caption,view_count,like_count,comment_count,channel_id
0,JZ5f2BiA2go,Are Low Fat Vegans Wrong?,Stack this discount code on top of Hume’s curr...,2025-05-16T20:56:00Z,Mic the Vegan,28,"[HumeHealth, SmartScale, Weightloss, Vegan, lo...",PT15M33S,hd,False,25587,1808,734,UCGJq0eQZoFSwgcqgxIE9MHw
1,CHXpmOPD9Os,Vegan Milk: India Is Ready For Veganism? With ...,Vegan milk: is India ready for veganism? The r...,2021-06-07T06:36:53Z,David Ramms,24,"[vegan milk india, vegan milk, vegan milk tea,...",PT1H29M38S,hd,False,2985,328,123,UCBhciyIVI2SzjXjIEPN4txg
2,Va2CekNsF9c,What is the greatest benefit of being vegan? #...,Sharing the number one benefit of being vegan ...,2023-04-16T03:13:54Z,HazVegan,24,"[outreach, cube, of, truth, anonymous, for, th...",PT30S,hd,False,4369,58,29,UCplPgkdifNuv0mrtWCqTV9w
3,liLxjPsbgig,"""You're DELUDED!"" Vegan & Farmer CLASH in LIVE...",Things get heated on GB news when Joey Carbstr...,2023-10-13T16:16:50Z,Joey Carbstrong,29,[],PT12M29S,hd,False,19852,1877,1029,UCG6usHVNuRbexyisxE27nDw
4,JMOZpfd7JfE,BLM Activists Discuss Speciesism | Edmonton Ve...,Two BLM Activists were intrigued by the cube o...,2020-08-09T20:03:10Z,Edmonton Vegan Outreach,29,"[Cube of Truth, Anonymous for the Voiceless, S...",PT4M50S,hd,False,458,62,8,UCsU5QtzfvVX2DFcmIYM3aRg


In [6]:
# save to csv
video_df.to_csv("data/videos_unfiltered.csv", index=False)

In [7]:
# scrape data on video categories
category_data = []

request = youtube.videoCategories().list(
        part="snippet",
        regionCode="US"
    )
try:
    response = request.execute()

    for item in response["items"]:
        data = {
            "category_id": item["id"],
            "category": item["snippet"]["title"]
        }
        category_data.append(data)
except HttpError as e:
    print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
    raise

In [8]:
# convert into a dataframe
category_df = pd.DataFrame(category_data)
category_df.head()

Unnamed: 0,category_id,category
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [9]:
# save to csv
category_df.to_csv("data/categories.csv", index=False)

In [10]:
# de-duplicate channel id list
unique_channel_ids = list(set(channel_ids))

# scrape data on unique channels
channel_data = []
for i in range(0, len(unique_channel_ids), 50):
    request = youtube.channels().list(
        part="statistics",
        id=",".join(unique_channel_ids[i:i+50]),
        maxResults=50
    )
    try:
        response = request.execute()

        for item in response["items"]:
            stats = item["statistics"]

            data = {
                "channel_id": item["id"],
                "channel_view_count": int(stats.get("viewCount", 0)),
                "channel_sub_count": int(stats.get("subscriberCount", 0)),
                "channel_video_count": int(stats.get("videoCount", 0))
            }
            channel_data.append(data)
    except HttpError as e:
        print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
        raise

In [11]:
# convert into a dataframe
channel_df = pd.DataFrame(channel_data)
channel_df.head()

Unnamed: 0,channel_id,channel_view_count,channel_sub_count,channel_video_count
0,UCyrXNIBuMX5aynvqnj0mWlQ,485920,2370,314
1,UCQT4gMmiJDWQds0xxaopb1Q,37977,52,84
2,UC3Mz8-a7fjpfgFH9GVVIpbA,4150134,30100,762
3,UCDKJdFer1phQI95UinPZehw,361533299,1450000,250
4,UCGa9KFF6xxu3w3grmYzx6Yw,206992,515,346


In [12]:
# save to csv
channel_df.to_csv("data/channels.csv", index=False)