# TED Talk Webscraper

- Author: Congxin (David) Xu 
- Email: congxin.xu@richmond.edu
- Date: 2020-12-24

In [1]:
import selenium
from selenium.webdriver.chrome.options import Options
import pandas
import requests
from bs4 import BeautifulSoup
from time import sleep

# webdriver options
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")


url = 'https://www.youtube.com/c/TED/videos'
driver = selenium.webdriver.Chrome(options=options)

In [2]:
driver.get(url)

height = driver.execute_script("return document.documentElement.scrollHeight")
previousHeight = -1

while previousHeight < height:
    previousHeight = height
    driver.execute_script(f'window.scrollTo(0,{height + 10000})')
    sleep(1)
    height = driver.execute_script("return document.documentElement.scrollHeight")

vidElements = driver.find_elements_by_id('thumbnail')
vid_urls = []
for v in vidElements:
    vid_urls.append(v.get_attribute('href'))

In [3]:
vid_urls[0:5]

['https://www.youtube.com/watch?v=klXVQsbhFsE',
 'https://www.youtube.com/watch?v=dKob6b8QzkU',
 'https://www.youtube.com/watch?v=TST0CsV8LHI',
 'https://www.youtube.com/watch?v=2hCUq6ScWME',
 'https://www.youtube.com/watch?v=BcOey28XXIw']

In [25]:
len(vid_urls)

3546

In [26]:
title = list()
speaker = list()
release_date = list()
likes = list()
dislikes = list()
views = list()

for link in vid_urls:
    # Create driver
    driver.get(link)
    sleep(2)
    
    # Get page_source
    video = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Title
    title += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
        find_all('yt-formatted-string')[1].get_text().split("|")[0].strip()]
    
    # Speaker
    try:
        video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[1].get_text().split("|")[1].strip()
    except:
        speaker += ['NA']
    else: 
        speaker += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[1].get_text().split("|")[1].strip()]
    
    # Release Date
    release_date += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
        find_all('yt-formatted-string')[2].get_text()]
    
    # Likes
    try:
        video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[3]['aria-label']
    except:
        likes += ['NA']
    else: 
        likes += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[3]['aria-label'].split(" ")[0]]
    # Dislikes
    try:
        video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[4]['aria-label']
    except:
        dislikes += ['NA']
    else: 
        dislikes += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
            find_all('yt-formatted-string')[4]['aria-label'].split(" ")[0]]
    # Views
    views += [video.find_all(class_ = 'style-scope ytd-video-primary-info-renderer')[0].\
        find_all('span')[0].get_text().split(" ")[0]]

KeyError: 'aria-label'

In [39]:
# Create a data frame for return
d = {'Title': title, 'Speaker': speaker, 'Release Date': release_date, 'Likes': likes, 
     'Dislikes': dislikes, 'Views': views}
df = pandas.DataFrame(data=d)
df

Unnamed: 0,Title,Speaker,Release Date,Likes,Dislikes,Views
0,How to foster true diversity and inclusion at ...,Rosalind G. Brewer,"Dec 22, 2020",646,943,34894
1,Why monkeys (and humans) are wired for fairness,Sarah Brosnan,"Dec 21, 2020",1840,88,44702
2,A stellar history of modern astronomy,Emily Levesque,"Dec 18, 2020",1669,46,58948
3,An aerialist on listening to your body's signals,Adie Delaney,"Dec 17, 2020",1601,40,40475
4,A playful exploration of gender performance,Jo Michael Rezes,"Dec 15, 2020",380,112,21306
...,...,...,...,...,...,...
1165,A summer school kids actually want to attend,Karim Abouelnaga,"May 29, 2017",2021,86,81684
1166,There's no shame in taking care of your mental...,Sangu Delle,"May 26, 2017",5689,81,234793
1167,How (and why) Russia hacked the US election,Laura Galante,"May 25, 2017",1637,1913,123174
1168,This is what democracy looks like,Anthony D. Romero,"May 24, 2017",1975,565,90070


In [40]:
driver.quit()
df.to_csv('TED_Talk_Data_From_Youtube.csv')