In [2]:
# Imports

import hashlib
import json
import sys
from urllib.parse import urljoin

import mechanicalsoup
from bs4 import BeautifulSoup

In [3]:
# Configuration: site root, A-Z index path, and the shared browser session

BASE_URL = "https://www.bbc.co.uk"
SCRAPING_SUFFIX = "/iplayer/a-z/"

# A single browser object is reused by the cells below.
browser = mechanicalsoup.StatefulBrowser()

In [4]:
# Initial browser page: open the A-Z landing page and collect the
# per-letter index links from its navigation bar

browser.open(BASE_URL + SCRAPING_SUFFIX)

navigation = browser.get_current_page().find(
    'ul', attrs={'class': 'scrollable-nav__track'})

# Keep only the entries that actually wrap a link.
navigation_list = []
for nav_item in navigation.find_all('li'):
    if nav_item.a is not None:
        navigation_list.append(nav_item.a['href'])

print(navigation_list)

['/iplayer/a-z/a', '/iplayer/a-z/b', '/iplayer/a-z/c', '/iplayer/a-z/d', '/iplayer/a-z/e', '/iplayer/a-z/f', '/iplayer/a-z/g', '/iplayer/a-z/h', '/iplayer/a-z/i', '/iplayer/a-z/j', '/iplayer/a-z/k', '/iplayer/a-z/l', '/iplayer/a-z/m', '/iplayer/a-z/n', '/iplayer/a-z/o', '/iplayer/a-z/p', '/iplayer/a-z/q', '/iplayer/a-z/r', '/iplayer/a-z/s', '/iplayer/a-z/t', '/iplayer/a-z/u', '/iplayer/a-z/v', '/iplayer/a-z/w', '/iplayer/a-z/y', '/iplayer/a-z/z', '/iplayer/a-z/0-9']


In [5]:

def iplayer_atoz_page_extractor(program_selection):
    '''Extract the listing details of one programme from an A-Z page.

    Argument is the soup tag wrapping a single programme entry.

    Returns a (title, synopsis, latest_episode_url) tuple, where the URL is
    the href of the first link found inside the entry.

    Raises AttributeError if the title or synopsis paragraph is missing and
    TypeError if the entry contains no link, because the lookups are not
    guarded.
    '''
    # Program Title
    title = program_selection.find(
        'p', attrs={'class': 'list-content-item__title'}).get_text()
    # Program Synopsis
    synopsis = program_selection.find(
        'p', attrs={'class': 'list-content-item__synopsis'}).get_text()
    # Link to latest episode
    latest_episode_url = program_selection.find('a', href=True)['href']

    # NOTE: the number of available episodes is not extracted yet. The
    # previous version looked up the 'list-content-item__sublabels' div but
    # never used or returned the result.
    return title, synopsis, latest_episode_url


In [6]:
def get_page(browser, url):
    '''Open ``url`` in ``browser`` and return what it then reports as its current page.'''
    browser.open(url)
    current_page = browser.get_current_page()
    return current_page

In [7]:
def programme_website_extractor(web_page):
    '''Locate the 'Programme website' and 'Credits' links on an episode page.

    Returns (program_website_url, program_credits_url, credits_available).
    program_credits_url is the link's href when a 'Credits' link exists,
    otherwise whatever the lookup returned (None), with credits_available
    False. A missing 'Programme website' link is not guarded and raises.
    '''
    def _link_labelled(label):
        # Both links share the 'lnk' class and differ only by their text.
        return web_page.find('a', attrs={'class': 'lnk'}, text=label)

    program_website_url = _link_labelled('Programme website')['href']
    credits_link = _link_labelled('Credits')

    if credits_link:
        return program_website_url, credits_link['href'], True
    return program_website_url, credits_link, False

In [8]:
def episode_page_extractor(web_page, credits_available):
    '''Print the details found on a programme (or programme credits) page.

    web_page: parsed page to inspect.
    credits_available: when truthy, the credits table is parsed as well.

    Prints genre / sub genre / format, time left to watch and duration, the
    long synopsis and the last-broadcast information, each only when the
    corresponding section is present on the page.

    Always returns 0; building a result dictionary is still a TODO.
    '''
    if credits_available:
        # Collected for the planned result dictionary (see TODO below); it is
        # neither printed nor returned yet.
        credits_dict = _extract_credits(web_page)

    _print_genre_and_format(web_page)
    _print_time_left(web_page)
    _print_long_synopsis(web_page)
    _print_broadcast_info(web_page)

    # TODO: build dictionary

    return 0


def _extract_credits(web_page):
    '''Return {first span text: second span text} for each credits table row.'''
    credits_table = web_page.find('table', attrs={'class': 'table'})
    credits_dict = {}
    if credits_table:
        for row in credits_table.find_all('tr'):
            person = row.find_all('span')
            # Rows with fewer than two spans carry no key/value pair.
            if len(person) > 1:
                credits_dict[person[0].get_text()] = person[1].get_text()
    return credits_dict


def _print_genre_and_format(web_page):
    '''Print the genre, sub genre and format links from the page footer.'''
    genre_format = web_page.find(
        'div', attrs={'class': 'footer__similar b-g-p component'})
    if genre_format is None:
        return

    # One list of [text, href] pairs per nested div.
    genre_format_list = [
        [[x.get_text(), x['href']] for x in group.find_all('a')]
        for group in genre_format.find_all('div')
    ]

    # The first link of the first group is the genre, the remaining links of
    # that group are sub genres, and every link of later groups is a format.
    first_group = genre_format_list[0] if genre_format_list else []
    genre = first_group[:1]
    sub_genre = first_group[1:]
    prog_frmat = [link for group in genre_format_list[1:] for link in group]

    print('genre: ', genre)
    print('sub genre: ', sub_genre)
    print('format: ', prog_frmat)


def _print_time_left(web_page):
    '''Print the remaining availability and the duration of the episode.'''
    left_to_watch = web_page.find(
        'div', attrs={'class': 'grid 1/3@bpw 1/4@bpe'})
    if left_to_watch is None:
        return

    left_to_watch_items = left_to_watch.find_all(
        'p', attrs={'class': 'episode-panel__meta'})

    # The panel is skipped when it holds a div with the meta class.
    if left_to_watch.find(
            'div', attrs={'class': "episode-panel__meta"}) is not None:
        return

    # Both the availability and the duration paragraph are needed; indexing
    # without this check raised IndexError on shorter panels.
    if len(left_to_watch_items) < 2:
        return

    availability = left_to_watch_items[0]
    if availability.span is None:
        days_left = availability.get_text()
    else:
        days_left = availability.span.get_text()
    duration = left_to_watch_items[1].get_text()

    print('Left to watch: ', days_left, 'duration: ', duration)


def _print_long_synopsis(web_page):
    '''Print the paragraphs of the long synopsis, when the page has one.'''
    long_synopsis = web_page.find(
        'div', attrs={'class': 'synopsis-toggle__long'})
    if long_synopsis is None:
        return

    long_synopsis_paragraphs = [
        x.get_text() for x in long_synopsis.find_all('p')
    ]
    print('long_synopsis: ', long_synopsis_paragraphs)


def _print_broadcast_info(web_page):
    '''Print the channel and the date/time of the last broadcast.'''
    main_broadcast = web_page.find(
        'div',
        attrs={
            'class':
            'grid 1/3@bpw2 1/3@bpe map__column map__column--2 map__column--last'
        })
    if main_broadcast is None:
        return

    date_last_aired = main_broadcast.find(
        'span',
        attrs={'class': 'broadcast-event__date text-base timezone--date'})
    time_last_aired = main_broadcast.find(
        'span', attrs={'class': 'timezone--time'})
    channel = main_broadcast.find(
        'div',
        attrs={
            'class':
            'programme__service box-link__elevated micro text--subtle'
        })

    # A channel block without a link used to raise AttributeError.
    channel_anchor = channel.find('a') if channel is not None else None
    if channel_anchor is not None:
        print('Channel: ', channel_anchor.get_text())
        print('Channel link: ', channel_anchor['href'])
    if date_last_aired is not None:
        print('date_aired: ', date_last_aired.get_text())
    if time_last_aired is not None:
        print('time_aired: ', time_last_aired.get_text())

In [9]:
def recommendation_extraction(web_page):
    '''Print every recommendation listed on a programme recommendations page.

    web_page: parsed recommendations page, or None (nothing is printed).

    For each list entry that has a programme body with a heading link, prints
    the link href, its 'resource' attribute (None when absent), the title and
    the synopsis. Returns None.
    '''
    if web_page is None:
        return

    page_items = web_page.find('ol', attrs={'class': 'highlight-box-wrapper'})
    if page_items is None:
        return

    # Announce the list only once it has actually been found.
    print('found list of recommendations')

    for item in page_items.find_all('li'):
        item_info = item.find('div', attrs={'class': 'programme__body'})

        # find_all('li') also yields list items that are not programme
        # entries; skip anything without a programme body and heading link.
        heading_link = None
        if item_info is not None and item_info.h4 is not None:
            heading_link = item_info.h4.a
        if heading_link is None:
            continue

        link_1 = heading_link['href']
        link_2 = heading_link.get('resource')
        title = heading_link.get_text()
        synop = item_info.p.get_text() if item_info.p is not None else ''

        print('recommendations:', link_1, link_2, 'title: ', title, synop)
    
    

In [10]:
def episode_available_extraction(web_page):
    '''Follow a programme page's "episodes" link and print every episode page.

    web_page: parsed programme page. When it has no episodes navigation link
    nothing happens.

    Uses the module-level ``browser`` and ``BASE_URL`` to open the episodes
    page, passes it to ``episode_list_extractor``, then does the same for
    every page linked from the pagination bar, if there is one.
    Returns None.
    '''
    episodes_link = web_page.find(
        'a', attrs={
            'class': 'br-nav__link',
            'data-linktrack': 'nav_episodes'
        })

    if episodes_link is None:
        return

    episodes_link = episodes_link['href']

    print(episodes_link)

    episodes_page = get_page(browser, BASE_URL + episodes_link)

    # Record the address of the first episodes page before navigating away,
    # so pagination hrefs can be resolved against it. The previous version
    # referenced an undefined `curr_url` here.
    curr_url = browser.get_url()

    episode_list_extractor(episodes_page)

    # Previously looked up as the undefined name `pagination`.
    episode_pagination = episodes_page.find(
        'ol', attrs={'class': 'nav nav--banner pagination delta'})
    if episode_pagination is None:
        return

    page_list = episode_pagination.find_all(
        'li', attrs={'class': 'pagination__page'})
    # Entries without a link are skipped.
    page_links = [x.a['href'] for x in page_list if x.a is not None]

    for endpoint in page_links:
        # urljoin copes with both query-only ('?page=2') and path hrefs.
        url = urljoin(curr_url, endpoint)
        episode_list_extractor(get_page(browser, url))

In [11]:
def episode_list_extractor(web_page):
    '''Print the episode count and the details of each episode on a page.

    web_page: parsed episodes page.

    Prints the "available episodes" counter when present, then for every
    episode container its link and link title, one-line synopsis, episode
    number and title, each only when the corresponding element exists.
    Returns None.
    '''
    episodes_available_list = web_page.find(
        'div', attrs={'class': 'br-box-page programmes-page'})

    if episodes_available_list is None:
        episodes_container_list = []
    else:
        episodes_container_list = episodes_available_list.find_all(
            'div',
            attrs={
                'class':
                'programme programme--tv programme--episode block-link highlight-box--list br-keyline br-blocklink-page br-page-linkhover-onbg015--hover'
            })

    available_episodes = web_page.find(
        'span', attrs={'class': 'hidden grid-visible@bpb2 grid-visible@bpw'})
    if available_episodes is not None:
        print('available episodes: ', available_episodes.get_text())

    for item in episodes_container_list:
        # Link and time left (the time left is carried by the link's title).
        item_headder = item.find(
            'div', attrs={'class': 'cta cta__overlay'})
        header_link = item_headder.a if item_headder is not None else None
        if header_link is not None:
            item_link = header_link['href']
            item_time_left = header_link.get('title')
            print(item_link, item_time_left)

        item_body = item.find('div', attrs={'class': 'programme__body'})
        if item_body is None:
            continue

        # Explicit None checks replace the former bare `except: pass` blocks,
        # which also swallowed KeyboardInterrupt and unrelated errors.
        synopsis_tag = item_body.p
        if synopsis_tag is not None:
            episode_oneline_synopsis = synopsis_tag.get_text()
            print('episode_oneline_synopsis: ', episode_oneline_synopsis)

            # Episode position, e.g. 'Episode 1 of 3'; not every entry has it.
            position_tag = synopsis_tag.abbr
            if position_tag is not None and position_tag.get('title') is not None:
                episode_no = position_tag['title']
                print('episode_no: ', episode_no)

        title_tag = item_body.find(
            'span', attrs={'class': 'programme__title gamma'})
        if title_tag is not None:
            episode_title = title_tag.get_text()
            print(episode_title)


In [13]:
def upcoming_episodes(program_website_url=None):
    '''Print the upcoming broadcasts listed for a programme.

    program_website_url: site-relative programme path, such as the first
        value returned by programme_website_extractor (for example
        '/programmes/p02b4jth'). The upcoming-broadcasts suffix is appended
        to it. The previous version built the address from the suffix alone
        (`+ next_on_suffix`), which raised TypeError on every call.

    Uses the module-level ``browser`` and ``BASE_URL``. Prints a dictionary
    keyed by programme id holding title, series, synopsis, link, channel and
    broadcast date/day/time as text. Returns None.

    Raises TypeError when program_website_url is not supplied.
    '''
    if program_website_url is None:
        raise TypeError(
            'upcoming_episodes() requires program_website_url, '
            "e.g. '/programmes/p02b4jth'")

    next_on_suffix = 'broadcasts/upcoming/'
    browser.open(
        BASE_URL + program_website_url.rstrip('/') + '/' + next_on_suffix)
    next_on = browser.get_current_page()
    next_on_section = next_on.find('ol', attrs={'class': 'highlight-box-wrapper'})

    next_up_dict = {}

    # No list on the page means there is nothing to report.
    items = next_on_section.find_all('li') if next_on_section is not None else []

    for item in items:
        broadcast_info = item.find('div', attrs={'class': 'programme__body programme__body--flush'})
        program_info = item.find('div', attrs={'class': 'grid 7/12 2/3@bpb2 3/4@bpw 5/6@bpw2 5/6@bpe'})
        # Skip list items that are not broadcast entries.
        if broadcast_info is None or program_info is None:
            continue

        broadcast_info_tag = broadcast_info.find('div', attrs={'class': 'broadcast-event__time beta'})

        broadcast_date = broadcast_info_tag['title']
        broadcast_day = broadcast_info_tag.find('span', attrs={'class': 'broadcast-event__date text-base timezone--date'}).get_text()
        broadcast_time = broadcast_info_tag.find('span', attrs={'class': 'timezone--time'}).get_text()

        broadcast_channel = broadcast_info.find('div', attrs={'class': 'programme__service box-link__elevated micro text--subtle'})

        channel = broadcast_channel.a.get_text()
        channel_url = broadcast_channel.a['href']

        program_title_info = program_info.a

        program_id = program_info.div['data-pid']
        program_link = program_title_info['href']
        program_title = program_title_info.find('span', attrs={'class': 'programme__title gamma'}).get_text()
        # NOTE(review): presumably not every entry has a subtitle; confirm.
        series_tag = program_title_info.find('span', attrs={'class': 'programme__subtitle centi'})
        series = series_tag.get_text() if series_tag is not None else None
        program_synopsis = program_info.p.get_text()

        # Values are kept as text: the former .encode('utf-8') calls produced
        # bytes objects. The channel link previously referenced the undefined
        # name `channel_link`.
        next_up_dict[program_id] = {'program_title': program_title,
                                    'series': series,
                                    'program_synopsis': program_synopsis.strip(),
                                    'program_link': program_link,
                                    'channel': {'name': channel,
                                                'link': channel_url},
                                    'broadcast': {'date': broadcast_date,
                                                  'day': broadcast_day,
                                                  'time': broadcast_time}}

    print(next_up_dict)

In [14]:
# Crawl every A-Z index page. For each programme listed there, visit its
# latest episode page, its credits (or programme) page, its recommendations
# page and its episodes listing, printing whatever each extractor finds.
# Every get_page call navigates the one shared `browser`, so the order of the
# steps below matters.
for suffix in navigation_list:
    browser.open(BASE_URL + suffix)
    
    # One <li class="grid__item"> per programme on the index page.
    program_selection = browser.get_current_page().find_all(
    'li', attrs={"class": "grid__item"})
    
    for program_box in program_selection: 
        program_title, program_synopsis, latest_episode_url = iplayer_atoz_page_extractor(program_box) 
        
        print(program_title, program_synopsis, latest_episode_url)
        
        # Open the latest episode page; it carries the programme website and
        # credits links.
        
        latest_episode_page = get_page(browser, BASE_URL + latest_episode_url)
        
        program_website_url, program_credits_url, credits_available = programme_website_extractor(latest_episode_page)
        print(program_website_url, program_credits_url, credits_available)
        
        # Use the credits page when there is one, otherwise the programme page.
        if credits_available:
            program_website_page = get_page(browser, BASE_URL + program_credits_url)
        else: 
            program_website_page = get_page(browser, BASE_URL + program_website_url)
        
        # TODO add program webpage extraction for programs without credits for genre 
        
        # TODO assign variables from this function (it currently only prints
        # and returns 0)
        episode_page_extractor(program_website_page, credits_available)
        
        recommendation_page = get_page(browser, BASE_URL + program_website_url + '/recommendations')
        recommendation_extraction(recommendation_page)
        
        # TODO assign variables
        # Re-open the programme page and follow its episodes navigation link.
        episode_available_extraction(get_page(browser, BASE_URL + program_website_url))
        
        
        

Abadas Rhaglen animeiddio i blant ifanc yn canolbwyntio ar gyflwyno geiriau newydd. Animation ... /iplayer/episode/p02f7d2x/abadas-sgi
/programmes/p02b4jth None False
found list of recommendations
/programmes/p02b4jth/episodes
available episodes:  (7)
https://www.bbc.co.uk/iplayer/episode/p02f7d2x 25 o ddyddiau ar ôl i wylio (Sad 23 Chwefror 2019, 09:00)
episode_oneline_synopsis:  
Mae'r Abadas yn chwarae ym mhyllau mwdlyd yr ardd pan ddaw Ben ar eu traws a'u gwahodd ...

Sgi
https://www.bbc.co.uk/iplayer/episode/p02f7cfh 23 o ddyddiau ar ôl i wylio (Iau 21 Chwefror 2019, 09:00)
episode_oneline_synopsis:  
Mae gan Ben air Abada newydd sbon: 'melin wynt'. Ela gaiff ei dewis i fynd i chwilio am...

Melin Wynt
https://www.bbc.co.uk/iplayer/episode/p02dzdxj 18 o ddyddiau ar ôl i wylio (Sad 16 Chwefror 2019, 09:00)
episode_oneline_synopsis:  
Tybed a fydd gair heddiw, 'anrheg' yn helpu Ela gan nad oes ganddi degan arbennig? Ela'...

Anrheg
https://www.bbc.co.uk/iplayer/episode/p02dtfvr 16 o

/programmes/b0bxbvtl/episodes
available episodes:  (3)
https://www.bbc.co.uk/iplayer/episode/b0bxc1sn 2 months left to watch (Thu 28 March 2019, 22:00)
episode_oneline_synopsis:  
3/3 Hercule faces a race against time as he realises that only he holds the key to the case.

episode_no:  Episode 3 of 3
Episode 3
https://www.bbc.co.uk/iplayer/episode/b0bxc1h2 2 months left to watch (Thu 28 March 2019, 22:00)
episode_oneline_synopsis:  
2/3 As the ABC killer promises to spill more blood, Hercule's identity comes into question.

episode_no:  Episode 2 of 3
Episode 2
https://www.bbc.co.uk/iplayer/episode/b0bxbvkg 2 months left to watch (Thu 28 March 2019, 22:00)
episode_oneline_synopsis:  
1/3 1933. Hercule Poirot receives letters threatening murder, but no-one will listen.

episode_no:  Episode 1 of 3
Episode 1
Absolutely Fabulous: The Movie Eddy and Patsy are still living the high life, but a setback sees them going on the run. /iplayer/episode/b08b7blz/absolutely-fabulous-the-movie
/progr

KeyboardInterrupt: 