# TED Talks Scraper

In [10]:
import pickle
import random
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

## Soup Maker

In [2]:
class SoupMaker:
    """Make soup objects and sleep your machine.

    Provides random-length pauses and a helper that downloads a page and
    parses it into a BeautifulSoup object.
    """

    def sleep_short(self):
        """Suspends execution time between .5 - 2 seconds.

        Returns:
            None: the return value of ``time.sleep``.
        """
        return time.sleep(random.uniform(.5, 2))

    def sleep_long(self):
        """Suspends execution time between 4 - 6 seconds.

        Returns:
            None: the return value of ``time.sleep``.
        """
        return time.sleep(random.uniform(4, 6))

    def make_soup(self, url, timeout=30):
        """Returns soup object from a URL.

        Args:
            url (str): Address of the page to request.
            timeout (float or tuple): Seconds to wait for the server,
                forwarded to ``requests.get``. Defaults to 30.

        Returns:
            BeautifulSoup: The response body parsed with the 'lxml' parser.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the timeout expires.
        """
        # generate random user-agent for this request
        headers = {'User-agent': UserAgent().random}
        # without a timeout a stalled connection would block forever
        page = requests.get(url, headers=headers, timeout=timeout)
        return BeautifulSoup(page.content, 'lxml')


## Talk Features

In [42]:
class TalkFeatures(SoupMaker):
    """Class to get TED talk features.

    Each ``get_*`` method takes the soup object of a single talk page and
    returns one feature, read either from a meta tag or with a regular
    expression over ``soup.text``. Unless a method says otherwise, a
    feature that is missing from the page raises an exception (usually
    ``AttributeError`` or ``IndexError``) instead of returning a default.
    """

    def _speakers_frame(self, soup):
        """Returns DataFrame parsed from the first "speakers" JSON array.

        Shared by the speaker-related getters so the extraction lives in
        one place.

        Raises:
            IndexError: If no "speakers" array is found in the page text.
        """
        # NOTE: the match ends at the first '"}]' after '"speakers":', so the
        # extracted text is only valid JSON when the array really ends there
        speakers_json = re.findall(r"(?<=\"speakers\":).*?\"}]", soup.text)[0]
        # wrap in StringIO: passing a literal JSON string to read_json is
        # deprecated in recent pandas versions
        return pd.read_json(StringIO(speakers_json))

    def get_talk_id(self, soup):
        """Returns the talk_id provided by TED (string of digits)."""
        match = re.search(r"(?<=\"current_talk\":)\"(\d+)\"", soup.text)
        return match.group(1)

    def get_title(self, soup):
        """Returns the title of the talk.

        The 'title' meta tag content is split on its FIRST colon and the
        text after it is returned, so titles that contain a colon are kept
        whole. If the content has no colon it is returned as is (stripped).
        """
        content = soup.find(attrs={'name': 'title'}).attrs['content']
        _prefix, colon, title = content.partition(':')
        if not colon:
            return content.strip()
        return title.strip()

    def get_speakers(self, soup):
        """Returns dict of all speakers per talk.

        Returns:
            dict: Maps the row index of each speaker to the full name,
                built from 'firstname', 'middleinitial' and 'lastname'.
        """
        speakers_df = self._speakers_frame(soup)
        full_names = (speakers_df.loc[:, 'firstname'] + ' '
                      + speakers_df.loc[:, 'middleinitial'] + ' '
                      + speakers_df.loc[:, 'lastname'])
        # regex=True must be explicit: since pandas 2.0 str.replace treats
        # the pattern literally by default, which would leave the double
        # space produced by an empty middle initial
        full_names = full_names.str.replace(r'\s+', ' ', regex=True)
        # drop the leading/trailing blank left by an empty first/last name
        full_names = full_names.str.strip()
        return full_names.to_dict()

    def get_occupations(self, soup):
        """Returns dict of the occupation(s) of the speaker(s) per talk.

        Returns:
            dict or None: Maps the row index of each speaker to a list of
                lower-cased occupations (the 'description' split on ', ').
                None when any description is empty.
        """
        descriptions = self._speakers_frame(soup)['description']
        if not descriptions.all():
            return None
        return descriptions.str.lower().str.split(', ').to_dict()

    def get_about_speakers(self, soup):
        """Returns dict with each 'About the Speaker' blurb per talk.

        Returns:
            dict or None: Maps the row index of each speaker to the
                'whotheyare' text. None when any blurb is empty.
        """
        blurbs = self._speakers_frame(soup)['whotheyare']
        if not blurbs.all():
            return None
        return blurbs.to_dict()

    def get_views(self, soup):
        """Returns viewed count per talk (string of digits)."""
        return re.search(r"(?<=\"viewed_count\":)\d+", soup.text).group(0)

    def get_recorded_date(self, soup):
        """Returns date a talk was recorded.

        The value is the part of "recorded_at" that precedes the first 'T'.
        """
        match = re.search(r"(?<=\"recorded_at\":)\"(.*?)T", soup.text)
        return match.group(1)

    def get_published_date(self, soup):
        """Returns date a talk was published in TED.com."""
        upload_tag = soup.find(attrs={'itemprop': 'uploadDate'})
        return upload_tag.attrs['content']

    def get_event(self, soup):
        """Returns name of the event in which the talk was given."""
        return re.search(r"(?<=\"event\":)\"(.*?)\"", soup.text).group(1)

    def get_native_lang(self, soup):
        """Returns native language code for each talk as a string."""
        match = re.search(r'(?<=nativeLanguage\":\")[\w-]+', soup.text)
        return match.group(0)

    def get_available_lang(self, soup):
        """Returns list of all available languages (lang codes) for a talk.

        The codes are de-duplicated and sorted.
        """
        languages = re.findall(r'(?<=languageCode\":\")[\w-]+', soup.text)
        return sorted(set(languages))

    def get_comments_count(self, soup):
        """Return the count of comments per talk.

        Returns:
            str or None: Digits following the first '"count":' in the page
                text, or None when the page has no such entry.
        """
        match = re.search(r"(?<=\"count\":)(\d+)", soup.text)
        if match is None:
            return None
        return match.group(1)

    def get_duration(self, soup):
        """Returns duration of a talk (format ex: 12M43S)"""
        duration_tag = soup.find(attrs={'itemprop': 'duration'}).attrs['content']
        # the meta content carries a 'PT' prefix; keep what follows it
        return duration_tag.split('PT')[1]

    def get_duration_sec(self, soup):
        """Returns duration of a talk in seconds (string of digits)."""
        return re.search(r"(?<=\"duration\":)(\d+)", soup.text).group(1)

    def get_topic_tags(self, soup):
        """Returns list of tags (topics) per talk."""
        match_obj = re.search(r"\"tag\":\"(.*?)\"", soup.text)
        return match_obj.group(1).split(',')

    def get_related_talks(self, soup):
        """Returns dict (keys: id & title) of related talks."""
        related_json = re.search(r"(?<=\"related_talks\":).*?]", soup.text).group(0)
        # wrap in StringIO: passing a literal JSON string to read_json is
        # deprecated in recent pandas versions
        related_df = pd.read_json(StringIO(related_json))
        return related_df.loc[:, ['id', 'title']].to_dict()

    def get_talk_url(self, soup):
        """Returns url for each talk as a string.

        The 'og:url' value is cut before '/transcript'.
        """
        og_url = soup.find(attrs={'property': 'og:url'}).attrs['content']
        return og_url.split('/transcript')[0]

    def get_talk_description(self, soup):
        """Returns description of the talk.

        The text after the first ': ' of the 'og:description' content.
        """
        desc_tag = soup.find(attrs={'property': 'og:description'}).attrs['content']
        return desc_tag.split(': ', 1)[1]

    def get_transcript(self, soup):
        """Returns talk's transcript as a single string.

        Every paragraph inside the transcript grid cells has its whitespace
        collapsed; the paragraphs are then joined with single spaces. An
        empty string is returned when no paragraph is found.
        """
        paragraphs = []
        for div in soup.find_all('div', class_="Grid__cell flx-s:1 p-r:4"):
            for p in div.find_all('p'):
                paragraphs.append(" ".join(p.text.split()))
        return " ".join(paragraphs)


## TED Scraper

In [43]:
class TEDscraper(TalkFeatures):
    """Gets urls and scrapes TED talk data in specified language

    Args:
        lang (str): Language code. Defaults to 'en'.
        urls (list or str): List of talk URLs to be scraped, or 'all'
            (default) to scrape every talk listed on www.ted.com/talks.
        exclude_transcript (bool): Exclude transcript. Defaults to False.

    Attributes:
        lang (str): Language code.
        urls (list or str): URLs to be scraped, or 'all'.
        exclude (bool): Exclude transcript.
        ted_dict (dict): Dict to store ted talk features after scraping
        dict_id (int): Index of the next nested dict in 'ted_dict'.
        failed_counter (int): Counts urls that failed to get scraped
    """

    def __init__(self, lang='en', urls='all', exclude_transcript=False):
        self.lang = lang
        self.urls = urls
        self.exclude = exclude_transcript
        self.ted_dict = {}
        self.dict_id = 0
        self.failed_counter = 0

    def get_languages(self):
        """Returns DataFrame of all language codes supported by TED.

        Returns:
            DataFrame: Columns 'code' and 'language', one row per language
                link found on TED's "our languages" page.
        """
        lang_url = 'https://www.ted.com/participate/translate/our-languages'
        soup = self.make_soup(lang_url)
        lang_list = []
        for tag in soup.find_all('div', class_='h9'):
            # entries without a link carry no language code
            if tag.a is None:
                continue
            lang_code = re.search(r'(?<=\=)[\w-]+', tag.a['href']).group(0)
            lang_list.append([lang_code, tag.text])
        return pd.DataFrame(data=lang_list, columns=['code', 'language'])

    def get_max_page(self):
        """Returns max pagination number from www.ted.com/talks.

        Returns 1 when the page shows no numeric pagination links.
        """
        # make soup from ted.com/talks with specified language
        soup = self.make_soup('https://www.ted.com/talks?language='
                              + self.lang + '&page=1&sort=newest')
        page_nums = [1]
        for element in soup.find_all('a', class_='pagination__item pagination__link'):
            try:
                page_nums.append(int(element.text))
            except ValueError:
                # ignore pagination links whose text is not a number
                continue
        return max(page_nums)

    def get_all_url_paths(self):
        """Returns list of all the talk url paths available in www.ted.com/talks

        Every listing page from 1 to the max page is requested, with a
        second attempt if the first one fails. A page that fails twice is
        reported and skipped.
        """
        url_path_list = []
        # construct url with lang code specified by the user
        talks_url = ('https://www.ted.com/talks?language='
                     + self.lang + '&page=')
        max_attempts = 2
        # pages are numbered from 1 up to and including the max page
        for page in range(1, self.get_max_page() + 1):
            talks_page_url = talks_url + str(page) + '&sort=newest'
            for attempt in range(1, max_attempts + 1):
                # collect per page so a failed attempt cannot leave
                # partial results that the retry would then duplicate
                page_paths = []
                try:
                    soup = self.make_soup(talks_page_url)
                    # delay between searches
                    self.sleep_short()
                    for div in soup.find_all('div', attrs={'class': 'media__image'}):
                        for a in div.find_all('a'):
                            href = a.get('href')
                            # anchors without href cannot be turned into urls
                            if href:
                                page_paths.append(href)
                except Exception as e:
                    if attempt == max_attempts:
                        print(f'failed to get talk urls, exception: {e}, '
                              f'url: {talks_page_url}\n')
                    # delay before continuing
                    self.sleep_long()
                else:
                    url_path_list.extend(page_paths)
                    break
        return url_path_list

    def get_all_urls(self):
        """Returns list of complete urls for each talk's transcript page."""
        lang_query = '?language=' + self.lang
        transcript_query = '/transcript' + lang_query
        return ['https://www.ted.com' + path.replace(lang_query, transcript_query)
                for path in self.get_all_url_paths()]

    def clean_urls(self, urls):
        """Returns list of clean urls from urls the user inputs.

        Each accepted url is reduced to scheme, host, 'talks' and the talk
        slug, then '/transcript?language=<lang>' is appended. Entries that
        do not start with 'https://www.ted.com/talks' are reported and
        skipped.
        """
        clean_urls = []
        for idx, url in enumerate(urls):
            if not (isinstance(url, str)
                    and url.startswith('https://www.ted.com/talks')):
                print(f'bad url @ {idx} >> {url}')
                continue
            # first five '/'-separated parts: 'https:', '', host, 'talks', slug
            base = '/'.join(url.split('/')[:5]).split('?')[0]
            clean_urls.append(base + '/transcript?language=' + self.lang)
        return clean_urls

    def scrape_all_features(self, soup):
        """Scrapes all features to a nested dict.

        The nested dict is stored in ``ted_dict`` under the current
        ``dict_id`` only after every feature was read, so a failing getter
        does not leave a partial entry behind.

        Returns:
            dict: The features of the talk.
        """
        nested_dict = {
            'talk_id': self.get_talk_id(soup),
            'title': self.get_title(soup),
            'speakers': self.get_speakers(soup),
            'occupations': self.get_occupations(soup),
            'about_speakers': self.get_about_speakers(soup),
            'views': self.get_views(soup),
            'recorded_date': self.get_recorded_date(soup),
            'published_date': self.get_published_date(soup),
            'event': self.get_event(soup),
            'native_lang': self.get_native_lang(soup),
            'available_lang': self.get_available_lang(soup),
            'comments': self.get_comments_count(soup),
            'duration': self.get_duration(soup),
            'duration_sec': self.get_duration_sec(soup),
            'topic_tags': self.get_topic_tags(soup),
            'related_talks': self.get_related_talks(soup),
            'talk_url': self.get_talk_url(soup),
            'talk_description': self.get_talk_description(soup),
        }
        # add transcript if param is set to False (default)
        if not self.exclude:
            nested_dict['transcript'] = self.get_transcript(soup)
        self.ted_dict[self.dict_id] = nested_dict
        return nested_dict

    def _scrape_url(self, url, max_attempts=3):
        """Scrapes one talk url, retrying on failure; returns True on success.

        The page is downloaded again on every attempt, because parsing the
        same soup a second time cannot give a different result.
        """
        for attempt in range(1, max_attempts + 1):
            try:
                # make soup
                soup = self.make_soup(url)
                # taste soup (a page without <title> is not a 404 page)
                taster = soup.title.text if soup.title else ''
                if re.search(r'404: Not Found', taster):
                    print(f"\nBad soup! TED might not have this talk available in "
                          f"'{self.lang}'. Check the url\n{url}\n")
                    self.failed_counter += 1
                    return False
                # delay between searches
                self.sleep_short()
                # scrape features and add to a nested dict
                self.scrape_all_features(soup)
            except Exception as e:
                # if the last attempt fails, update the failed counter
                # and print the exception & talk url
                if attempt == max_attempts:
                    self.failed_counter += 1
                    print(f'position: {self.dict_id}, exception: {e}, url: {url}\n')
                    return False
                # delay before another attempt
                self.sleep_long()
            else:
                # indicate successful scrape
                print(self.dict_id, url)
                # add 1 to create a new nested dict
                self.dict_id += 1
                return True
        return False

    def get_data(self):
        """Returns nested dictionary of features from each talk's transcript page.

        Returns:
            dict: ``ted_dict``, keyed by consecutive ints, one nested dict
                per successfully scraped talk.

        Raises:
            TypeError: If 'urls' is neither 'all' nor a list.
        """
        print("Getting all urls...")
        # define url attribute
        if isinstance(self.urls, list):
            urls = self.clean_urls(self.urls)
        elif isinstance(self.urls, str) and self.urls == 'all':
            urls = self.get_all_urls()
        else:
            # fail with a clear message instead of an UnboundLocalError below
            raise TypeError("'urls' param needs to be a list")
        print(f"Scraping {len(urls)} TED talks in '{self.lang}'...")
        # iterate through each ted talk transcript url
        for url in urls:
            self._scrape_url(url)
        print(f"""\nTed.com scraping results:
            \n\t• Successful: {self.dict_id}
            \n\t• Failed: {self.failed_counter}\n""")
        return self.ted_dict

    def to_dataframe(self, ted_dict):
        """Creates DataFrame object from dict.

        Args:
            ted_dict (dict): Nested dict as returned by ``get_data``.

        Returns:
            DataFrame: One row per key of ``ted_dict``.
        """
        return pd.DataFrame.from_dict(ted_dict, orient='index')


# Testing

Output types: see the pandas reference on DataFrame serialization / IO / conversion:
https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#serialization-io-conversion

In [56]:
# Troubleshooting: talk urls to re-scrape while investigating the failure
# below. The line matches the format printed by get_data when a scrape fails
# (message kept verbatim):
# exception: Unmatched ''"' when when decoding 'string', url: https://www.ted.com/talks/michael_anti_behind_the_great_firewall_of_china/transcript?language=bo
unmatched_urls = [
                  'https://www.ted.com/talks/michael_anti_behind_the_great_firewall_of_china/transcript?language=bo',
# other urls, currently disabled (note: these use language=hi, not bo)
#                   'https://www.ted.com/talks/isaac_mizrahi_how_the_button_changed_fashion/transcript?language=hi',
#                   'https://www.ted.com/talks/herman_narula_the_transformative_power_of_video_games/transcript?language=hi'
                 ]


In [57]:
# scraper restricted to the troubleshooting urls, using language code 'bo'
unmatched_scraper = TEDscraper(lang='bo', urls=unmatched_urls)

In [58]:
# run the scrape; the returned nested dict is shown as the cell output
unmatched_scraper.get_data()

Getting all urls...
Scraping 1 TED talks in 'bo'...
0 https://www.ted.com/talks/michael_anti_behind_the_great_firewall_of_china/transcript?language=bo

Ted.com scraping results:
            
	• Successful: 1
            
	• Failed: 0



{0: {'talk_id': '1523',
  'title': 'མ་ཨི་ཁལ་ཨེན་ཊིཿ་རྒྱ་ནག་གི་ཆ་འཕྲིན་ལྕགས་རིའི་ལྟག་རྒྱབ་ཏུ།',
  'speakers': {0: 'Michael Anti'},
  'occupations': {0: ['blogger']},
  'about_speakers': {0: "Michael Anti (Zhao Jing), a key figure in China's new journalism, explores the growing power of the Chinese internet."},
  'views': '1660316',
  'recorded_date': '2012-06-29',
  'published_date': '2012-07-30T15:03:53+00:00',
  'event': 'TEDGlobal 2012',
  'native_lang': 'en',
  'available_lang': ['ar',
   'bg',
   'bo',
   'cs',
   'da',
   'de',
   'el',
   'en',
   'es',
   'fa',
   'fr',
   'fr-ca',
   'he',
   'hu',
   'id',
   'it',
   'ja',
   'ko',
   'nl',
   'pl',
   'pt',
   'pt-br',
   'ro',
   'ru',
   'sl',
   'sq',
   'sr',
   'vi',
   'zh',
   'zh-cn',
   'zh-tw'],
  'comments': '479',
  'duration': '18M51S',
  'duration_sec': '1131',
  'topic_tags': ['Asia',
   'china',
   'global issues',
   'journalism',
   'social media',
   'technology'],
  'related_talks': {'id': {0: 1236, 1: 57

In [46]:
# all features: call every getter once on the same soup to find the one that
# raises.
# NOTE(review): `soup` is not defined in the visible cells; it must already
# exist in the notebook session (presumably created with make_soup) — confirm.
unmatched_scraper.get_talk_id(soup)
unmatched_scraper.get_title(soup)
unmatched_scraper.get_speakers(soup)
unmatched_scraper.get_occupations(soup)
unmatched_scraper.get_about_speakers(soup)
unmatched_scraper.get_views(soup)
unmatched_scraper.get_recorded_date(soup)
unmatched_scraper.get_published_date(soup)
unmatched_scraper.get_event(soup)
unmatched_scraper.get_native_lang(soup)
unmatched_scraper.get_available_lang(soup)
unmatched_scraper.get_comments_count(soup)
unmatched_scraper.get_duration(soup)
unmatched_scraper.get_duration_sec(soup)
unmatched_scraper.get_topic_tags(soup)
unmatched_scraper.get_related_talks(soup)
unmatched_scraper.get_talk_url(soup)
unmatched_scraper.get_talk_description(soup)
unmatched_scraper.get_transcript(soup)

# last expression of the cell: its value is what the notebook displays
unmatched_scraper.get_title(soup)

'ट्रांसफॉर्मेटिव पावर ऑफ वीडियो गेम्स'

In [24]:
def make_soup(url, timeout=30):
    """Returns soup object from a URL.

    Args:
        url (str): Address of the page to request.
        timeout (float or tuple): Seconds to wait for the server, forwarded
            to ``requests.get``. Defaults to 30.

    Returns:
        BeautifulSoup: The response body parsed with the 'lxml' parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # generate random user-agent for this request
    headers = {'User-agent': UserAgent().random}
    # without a timeout a stalled connection would block forever
    page = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(page.content, 'lxml')

In [None]:
# sample talk urls in mixed forms: some end with a trailing slash, others
# with '/transcript'
# NOTE(review): presumably meant as input for TEDscraper(urls=...) — confirm
urls = [
    'https://www.ted.com/talks/sarah_kaminsky_my_father_the_forger/',
    'https://www.ted.com/talks/jorge_drexler_poetry_music_and_identity/transcript',
    'https://www.ted.com/talks/sir_ken_robinson_do_schools_kill_creativity/',
    'https://www.ted.com/talks/paul_mceuen_and_marc_miskin_tiny_robots_with_giant_potential/transcript',
    'https://www.ted.com/talks/antara_raychaudhuri_and_iseult_gillespie_the_legend_of_annapurna_hindu_goddess_of_nourishment/',
    'https://www.ted.com/talks/diana_reiss_peter_gabriel_neil_gershenfeld_and_vint_cerf_the_interspecies_internet_an_idea_in_progress/',
]

In [None]:
# single-url list passed to TEDscraper in the test cell below
url = ['https://www.ted.com/talks/diana_reiss_peter_gabriel_neil_gershenfeld_and_vint_cerf_the_interspecies_internet_an_idea_in_progress/']


In [None]:
# NOTE(review): neither `get_speakers_2` nor `soup` is defined in the visible
# cells; this call relies on earlier notebook state — confirm or remove.
get_speakers_2(soup)

In [None]:
# scrape only the talk in `url`, with the default language ('en') and the
# transcript included
test = TEDscraper(urls=url)
test.get_data()

## Create DataFrame

In [None]:
# Create DataFrame with one row per key of the nested dict
# NOTE(review): `ted_dict` is not assigned in the visible cells; presumably
# it is the dict returned by get_data() — confirm.
df = pd.DataFrame.from_dict(ted_dict, orient='index')

# Pickle DataFrame
# assumes the 'data' directory already exists — TODO confirm
df.to_pickle('data/first_df.pkl')

print(f'Shape: {df.shape}')