Here we will collect all the data from the YouTube API and save it to a CSV file in the project root directory.

In [120]:
# Import necessary libraries

import pandas as pd
from googleapiclient.discovery import build
from IPython.display import JSON
from dotenv import load_dotenv
import os

In [121]:
# Load the API key from a .env file so that the key itself is never
# written into the notebook.

load_dotenv()
# os.getenv returns None when API_KEY is not set; apiKey is later passed
# as the developerKey when the YouTube client is built.
apiKey = os.getenv('API_KEY')


Here we will manually list all the YouTube channels that we want to collect data for. To do so, we need both the channel name and the channel ID.

To find a YouTube channel ID:
1. Go to the channel's YouTube page.
2. Right-click anywhere on the page.
3. Select "View page source".
4. Press Ctrl+F.
5. Search for `?channel_id`; the channel ID is the value that follows it.

In [122]:
# YouTube channels to collect data for, written as
# {channel name: channel ID}. Only the IDs are sent
# to the API; the names are labels for the reader.

channelIDDict = {
    "gorgonoid": "UCLfCo17TCjx7qf-JMhQioLQ",
    "noticiasMaromba": "UCmK5h2-a4CquS4nIxDN6j7g",
    "albert": "UCij0YPRA_vGwKwCyn_o9MLw",
}

Now we will create the functions needed to retrieve the data from the API.

In [123]:
def channelIDDictToCommaSeparatedIDString(channelIDDict):
    """Join the channel IDs of a name -> ID dictionary into one string.

    Parameters
    ----------
    channelIDDict : dict
        Maps a channel name to its channel ID string.

    Returns
    -------
    str
        The IDs, in the dictionary's iteration order, separated by ", "
        (for example "id1, id2, id3"). An empty dictionary gives "".
    """
    # str.join takes care of separator placement, so the empty and
    # single-ID cases need no special handling.
    return ", ".join(channelIDDict.values())

In [124]:
def buildYoutubeAPI():
    """Create a client for the "youtube" service, version "v3".

    The module-level ``apiKey`` is passed to ``build`` as the
    developer key.

    Returns
    -------
    The service object returned by ``googleapiclient.discovery.build``.
    """
    return build("youtube", "v3", developerKey=apiKey)

In [125]:
def getAllChannelStatistics(channelIDString):
    """Fetch summary statistics for the channels in ``channelIDString``.

    Parameters
    ----------
    channelIDString : str
        Channel IDs separated by commas; passed unchanged as the ``id``
        argument of ``channels().list``.

    Returns
    -------
    pandas.DataFrame
        One row per channel in the response, with the columns
        channelName, subscriberCount, viewCount, videoCount and
        uploadedVideosPlaylistID. Values are stored exactly as they
        appear in the response. When the response contains no channels
        the frame is empty but still has these columns.
    """
    columns = [
        'channelName',
        'subscriberCount',
        'viewCount',
        'videoCount',
        'uploadedVideosPlaylistID',
    ]

    request = buildYoutubeAPI().channels().list(
        part="snippet,contentDetails,statistics",
        id=channelIDString
    )
    response = request.execute()

    # .get guards against a response without an 'items' key
    # (presumably what the API sends when no channel matches - confirm).
    allChannelStatistics = [
        {
            'channelName': item['snippet']['title'],
            'subscriberCount': item['statistics']['subscriberCount'],
            'viewCount': item['statistics']['viewCount'],
            'videoCount': item['statistics']['videoCount'],
            'uploadedVideosPlaylistID': item['contentDetails']['relatedPlaylists']['uploads'],
        }
        for item in response.get('items', [])
    ]

    # Build the frame once, after all records are collected, so it is
    # defined even when there are no records at all.
    return pd.DataFrame(allChannelStatistics, columns=columns)

Now that we have created `getAllChannelStatistics()`, we can use it to retrieve information about the YouTube channels we entered in the dictionary above. Below, we run a test to check that the channel information is being retrieved correctly.

In [126]:
# Turn the channel dictionary into the single ID string that
# getAllChannelStatistics expects.
commaSeparatedChannelIDs = channelIDDictToCommaSeparatedIDString(channelIDDict)

# One request for all listed channels; the result is a DataFrame.
stats = getAllChannelStatistics(commaSeparatedChannelIDs)

# Bare last expression so the notebook renders the DataFrame.
stats

Unnamed: 0,channelName,subscriberCount,viewCount,videoCount,uploadedVideosPlaylistID
0,Albert Einstein,6860,947591,4,UUij0YPRA_vGwKwCyn_o9MLw
1,GORGONOID,666000,108342455,1576,UULfCo17TCjx7qf-JMhQioLQ
2,Notícias Maromba,717000,22017080,77,UUmK5h2-a4CquS4nIxDN6j7g


In [131]:
# Playlist whose videos will be listed. This ID is the
# uploadedVideosPlaylistID shown for GORGONOID in the channel
# statistics output.
playlistID = 'UULfCo17TCjx7qf-JMhQioLQ'

def getUploadedVideoIDs(playlistID):
    """Collect the video ID of every item in a playlist.

    Parameters
    ----------
    playlistID : str
        ID of the playlist to read, passed as ``playlistId`` to
        ``playlistItems().list``.

    Returns
    -------
    list of str
        The ``contentDetails.videoId`` of each playlist item, in the
        order the API returns them across all result pages.
    """
    # Build the client here rather than reading a global `youtube`
    # variable, which none of the visible notebook cells defines.
    playlistItems = buildYoutubeAPI().playlistItems()

    videoIDs = []

    # Only contentDetails is read below, so it is the only part requested.
    request = playlistItems.list(
        part="contentDetails",
        playlistId=playlistID,
        maxResults=50
    )

    # One loop handles the first page and every following page.
    while request is not None:
        response = request.execute()

        videoIDs.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # list_next builds the request for the next page from the
        # response's nextPageToken and returns None after the last page.
        request = playlistItems.list_next(request, response)

    return videoIDs

# Fetch every video ID in the playlist selected above.
videoIDs = getUploadedVideoIDs(playlistID)

# Number of IDs retrieved.
# NOTE(review): the recorded output (1566) is lower than the videoCount
# shown for this channel in the statistics output (1576) - presumably
# some videos are not listed in the uploads playlist; confirm.
len(videoIDs)

1566