In [1]:
import pandas as pd 
import re
import json


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

In [2]:
# Column order of the DataFrame returned by scrape_youtube (kept stable even when no rows are found).
_VIDEO_COLUMNS = ['video_name', 'video_url', 'video_duration', 'video_short_desc', 'video_count']


def _extract_initial_data(page_source):
    """
    Decodes the ``ytInitialData`` JSON object embedded in a YouTube page.

    Args:
        page_source(str): Raw HTML of the page.

    Returns:
        dict: The decoded ``ytInitialData`` payload.

    Raises:
        IndexError: If no <script> element mentioning ``var ytInitialData`` is present.
        json.JSONDecodeError: If the text following the assignment is not valid JSON.
    """
    marker = 'var ytInitialData = '
    soup = BeautifulSoup(page_source, 'html.parser')
    scripts = [str(tag) for tag in soup.find_all('script') if 'var ytInitialData' in str(tag)]
    if not scripts:
        raise IndexError('no <script> element defining ytInitialData was found in the page')

    # raw_decode stops at the end of the first JSON value, so the trailing
    # ';</script>' (and any line breaks inside the payload) need no manual stripping.
    payload = scripts[0].split(marker)[-1].lstrip()
    data, _ = json.JSONDecoder().raw_decode(payload)
    return data


def _find_video_items(initial_data):
    """
    Returns the list of grid items from the tab that carries a ``richGridRenderer``.

    The tab is located by content instead of by a fixed position in the tab list.

    Raises:
        KeyError: If the expected structure, or a tab with a ``richGridRenderer``, is missing.
    """
    tabs = initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs']
    for tab in tabs:
        grid = tab.get('tabRenderer', {}).get('content', {}).get('richGridRenderer')
        if grid is not None:
            return grid['contents']
    raise KeyError('richGridRenderer')


def _parse_video(item):
    """
    Converts one grid item into a row dict keyed by ``_VIDEO_COLUMNS``.

    Raises:
        KeyError, IndexError: If the item is not a regular video entry or lacks
            a required field (title, duration, url, view count text).
    """
    vid_data = item['richItemRenderer']['content']['videoRenderer']
    vid_title = vid_data['title']['runs'][0]['text']
    vid_duration = vid_data['lengthText']['simpleText']
    vid_url = 'https://www.youtube.com' + vid_data['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']

    # The description snippet is optional: a video without one is kept with an empty description.
    desc_runs = vid_data.get('descriptionSnippet', {}).get('runs', [])
    vid_short_desc = desc_runs[0]['text'] if desc_runs else ''

    # Take the first number in the text and drop its thousands separators, so the
    # parsing does not depend on the wording around it ('9.356 x ditonton', '9,356 views').
    # Text without any digit is counted as 0.
    count_text = vid_data['viewCountText']['simpleText']
    count_match = re.search(r'\d[\d.,\s]*', count_text)
    vid_count = int(re.sub(r'\D', '', count_match.group())) if count_match else 0

    return {'video_name': vid_title,
            'video_url': vid_url,
            'video_duration': vid_duration,
            'video_short_desc': vid_short_desc,
            'video_count': vid_count
           }


def scrape_youtube(username):
    """
    Scrapes the videos listed in the initial page data of a YouTube channel's "videos" page.

    Only the entries embedded in the first page load are read (no scrolling or
    pagination is performed), and the code does not enforce a fixed number of videos.

    Args:
        username(str): The handle of the YouTube channel from which to scrape the videos.
            A leading '@' is accepted and ignored.

    Returns:
        dataframe: One row per video with the columns video_name, video_url,
            video_duration, video_short_desc and video_count. The columns are
            present even when no video could be parsed.

    Raises:
        IndexError: If the page does not contain the ytInitialData script.
        json.JSONDecodeError: If the embedded data is not valid JSON.
        KeyError: If the embedded data does not have the expected layout.
    """
    url = 'https://www.youtube.com/@' + username.lstrip('@') + '/videos'

    # Prevent selenium from opening new chrome window
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--profile-directory=Default')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, otherwise every call leaves a Chrome process behind.
        driver.quit()

    # Parse HTML data from the YouTube page to get a list of video information
    vid_list = _find_video_items(_extract_initial_data(page_source))

    # Iterate through all video information in json format to get the necessary information
    rows_list = []
    for item in vid_list:
        try:
            rows_list.append(_parse_video(item))
        except (KeyError, IndexError):
            # Not a regular video entry (or a required field is missing): skip it.
            continue

    return pd.DataFrame(rows_list, columns=_VIDEO_COLUMNS)

In [3]:
# Scrape the videos page of the channel whose handle is 'Google'
# (scrape_youtube builds the URL as https://www.youtube.com/@<username>/videos).
df = scrape_youtube('Google')
# Preview the first rows of the result as the cell output.
df.head()

Unnamed: 0,video_name,video_url,video_duration,video_short_desc,video_count
0,Our Year of AI Innovation | labs.google,https://www.youtube.com/watch?v=i8AHRYdrvNU,1.06,labs.google is Google's home to experiment wit...,9356
1,One.org | Unlocking the power of data with Dat...,https://www.youtube.com/watch?v=N7YpWLmL6JU,2.1,Discover how ONE.org’s new Data Commons is tra...,8308
2,Accelerating the clean energy transition,https://www.youtube.com/watch?v=DZN2EO7TH-A,1.4,Learn how we’re working with utilities to crea...,22333
3,Google — Say hi to Gemini,https://www.youtube.com/watch?v=Bqn3SNyjsSE,0.31,The Gemini era is here – helping you do more w...,51972936
4,Made by Google Podcast S5E9 | Google AI phone ...,https://www.youtube.com/watch?v=3XpTk4G02Yo,21.41,Join us as we sit down with a special guest fr...,17967
