In [1]:
# import relevant libraries
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import os
import pandas as pd

In [2]:
# youtube api key
api_key = os.getenv("API_KEY")

# build api client
api_name = "youtube"
api_version = "v3"
youtube = build(api_name, api_version, developerKey=api_key)

In [3]:
# search for the first 100 video results for each query
queries = ["animal liberation", "vegan speech", "vegan outreach", "vegan lifestyle", "joey carbstrong", "earthling ed",
            "david ramms", "vegan gains", "wayne hsiung", "mic the vegan"]
video_ids = []

for query in queries:
    next_page_token = None

    for _ in range(2):
        request = youtube.search().list(
            part="id",
            type="video",
            q=query,
            maxResults=50,
            relevanceLanguage="en",
            order="relevance",
            pageToken=next_page_token
        )
        try:
            response = request.execute()
            video_ids += [item["id"].get("videoId") for item in response["items"]]
            next_page_token = response.get("nextPageToken")
        except HttpError as e:
            print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
            raise
        if not next_page_token:
            break

In [4]:
# de-duplicate video list and remove None values
unique_vid_ids = list(set([vid for vid in video_ids if vid is not None]))

# scrape data on unique videos
video_data = []

# collect channel ids
channel_ids = []
for i in range(0, len(unique_vid_ids), 50):
    request = youtube.videos().list(
        part="snippet, contentDetails, statistics",
        id=",".join(unique_vid_ids[i:i+50]),
        maxResults=50
    )
    try:
        response = request.execute()

        for item in response["items"]:
            snippet = item["snippet"]
            content = item["contentDetails"]
            stats = item.get("statistics", {})
            channel_id = snippet["channelId"]

            data = {
                "video_id": item["id"],
                "title": snippet.get("title"),
                "description": snippet.get("description"),
                "published_at": snippet.get("publishedAt"),
                "channel_title": snippet.get("channelTitle"),
                "category_id": snippet.get("categoryId"),
                "tags": snippet.get("tags", []),
                "duration": content.get("duration"),
                "definition": content.get("definition"),
                "caption": content.get("caption"),
                "view_count": int(stats.get("viewCount", 0)),
                "like_count": int(stats.get("likeCount", 0)),
                "comment_count": int(stats.get("commentCount", 0)),
                "channel_id": channel_id
            }
            channel_ids.append(channel_id)
            video_data.append(data)
    except HttpError as e:
        print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
        raise

In [5]:
# convert into a dataframe
video_df = pd.DataFrame(video_data)
video_df.head()

Unnamed: 0,video_id,title,description,published_at,channel_title,category_id,tags,duration,definition,caption,view_count,like_count,comment_count,channel_id
0,i_wWK_n83LE,Wayne Hsiung Jailed,Original reel @DirectActionEverywhere : https:...,2023-11-05T00:21:07Z,SaLADS,22,[],PT34S,hd,False,2509,69,2,UCNZgGEAMXy0mweOYLHZPINw
1,2w5AWKDdV9g,This Speech Will Make You Go Vegan! - Clif Gra...,It's been a couple of years since I last spoke...,2022-04-08T20:56:09Z,Clif Grant,22,[],PT57M33S,hd,False,1556,237,75,UChD3chubIKyNo0Z2TppQ-Fw
2,iVXsjWxaTFQ,Animal Liberation Front (ALF) –Raid Interfauna...,17 March 1990 – ALF liberates more than 100 an...,2021-10-07T18:41:22Z,Alex Punx,29,"[animal liberation front, frente de liberacion...",PT51S,sd,False,1712,85,5,UCi919GSAtsLqFHjMnuWVm9Q
3,tKUv8JrrzSY,Vegan Kills More Animals 🤯| Ted Nugent,Ted Nugent Explains How Vegan Kills More Anima...,2023-01-31T12:30:32Z,Business Bulls,22,"[Business, success, investing, finance, entrep...",PT46S,hd,False,108193,5506,742,UC-YvsfPu6tlCAd8AQUyoWRg
4,yg1qRQt5z7c,Meat Eater Reacts to HEARTBREAKING Activism Fo...,A quick street conversation with a non-vegan m...,2020-11-16T19:00:08Z,Edmonton Vegan Outreach,29,"[Cube of Truth, Anonymous for the Voiceless, E...",PT2M21S,hd,False,378,44,1,UCsU5QtzfvVX2DFcmIYM3aRg


In [None]:
# save to csv
video_df.to_csv("data/videos_unprepared.csv", index=False)

In [7]:
# scrape data on video categories
category_data = []

request = youtube.videoCategories().list(
        part="snippet",
        regionCode="US"
    )
try:
    response = request.execute()

    for item in response["items"]:
        data = {
            "category_id": item["id"],
            "category": item["snippet"]["title"]
        }
        category_data.append(data)
except HttpError as e:
    print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
    raise

In [8]:
# convert into a dataframe
category_df = pd.DataFrame(category_data)
category_df.head()

Unnamed: 0,category_id,category
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [9]:
# save to csv
category_df.to_csv("data/categories.csv", index=False)

In [10]:
# de-duplicate channel id list
unique_channel_ids = list(set(channel_ids))

# scrape data on unique channels
channel_data = []
for i in range(0, len(unique_channel_ids), 50):
    request = youtube.channels().list(
        part="snippet, statistics",
        id=",".join(unique_channel_ids[i:i+50]),
        maxResults=50
    )
    try:
        response = request.execute()

        for item in response["items"]:
            snippet = item["snippet"]
            stats = item["statistics"]

            data = {
                "channel_id": item["id"],
                "channel_name": snippet["title"],
                "channel_view_count": int(stats.get("viewCount", 0)),
                "channel_sub_count": int(stats.get("subscriberCount", 0)),
                "channel_video_count": int(stats.get("videoCount", 0))
            }
            channel_data.append(data)
    except HttpError as e:
        print(f"Error response status code: {e.status_code}, reason: {e.error_details}")
        raise

In [11]:
# convert into a dataframe
channel_df = pd.DataFrame(channel_data)
channel_df.head()

Unnamed: 0,channel_id,channel_name,channel_view_count,channel_sub_count,channel_video_count
0,UCvy1Mv8nsSohI2rMH7kK4ow,The Vegan Society,820334,8120,170
1,UC0iMjvN9QJUhNK5Ti6_FAjw,MehtaEthics,34557,576,37
2,UCV8d4At_1yUUgpsnqyDchrw,That Vegan Couple,26451145,133000,857
3,UCIUwTruYMuyEcnGiD_eHpLg,alvopenrescue,346940,475,45
4,UC7vTq7aY0zIbr8a2Fa0a_Zw,Fitshortie,854344432,1790000,733


In [12]:
# save to csv
channel_df.to_csv("data/channels.csv", index=False)