In [3]:
# Imports

import mechanicalsoup
import json
import hashlib
from bs4 import BeautifulSoup
import sys

In [4]:
# Configuration and shared state

# Scheme and host that the relative links scraped below are appended to.
BASE_URL = 'https://www.bbc.co.uk'

# Path of the iPlayer A-Z index page, appended to BASE_URL.
SCRAPING_SUFFIX = '/iplayer/a-z/'

# Single module-level browser; the functions and cells below navigate it
# through browser.open(...) and read browser.get_current_page().
browser = mechanicalsoup.StatefulBrowser()

In [5]:
# Initial browser page: open the A-Z index and collect the per-page links

browser.open(BASE_URL + SCRAPING_SUFFIX)


# The <ul class="scrollable-nav__track"> element holds the navigation entries.
navigation = browser.get_current_page().find('ul', attrs={'class': 'scrollable-nav__track'})

navigation_list = navigation.find_all('li')

# Keep only the entries that contain an anchor and reduce each one to its
# href; the name is rebound from a list of <li> tags to a list of strings.
navigation_list = [x.a['href'] for x in navigation_list if x.a is not None ]

print(navigation_list)

['/iplayer/a-z/a', '/iplayer/a-z/b', '/iplayer/a-z/c', '/iplayer/a-z/d', '/iplayer/a-z/e', '/iplayer/a-z/f', '/iplayer/a-z/g', '/iplayer/a-z/h', '/iplayer/a-z/i', '/iplayer/a-z/j', '/iplayer/a-z/k', '/iplayer/a-z/l', '/iplayer/a-z/m', '/iplayer/a-z/n', '/iplayer/a-z/o', '/iplayer/a-z/p', '/iplayer/a-z/q', '/iplayer/a-z/r', '/iplayer/a-z/s', '/iplayer/a-z/t', '/iplayer/a-z/u', '/iplayer/a-z/v', '/iplayer/a-z/w', '/iplayer/a-z/x', '/iplayer/a-z/y', '/iplayer/a-z/z', '/iplayer/a-z/0-9']


In [6]:

def iplayer_atoz_page_extractor(program_selection):
    '''Extract the basic details of one programme entry on an A-Z page.

    The argument is the parsed element for a single programme (anything
    exposing a BeautifulSoup-style ``find``).

    Returns a 3-tuple ``(title, synopsis, latest_episode_url)``:
    the text of the ``list-content-item__title`` paragraph, the text of the
    ``list-content-item__synopsis`` paragraph, and the ``href`` of the first
    anchor that has one.

    Raises AttributeError / TypeError if any of those elements is missing.
    '''
    # Program Title
    title = program_selection.find(
        'p', attrs={'class': 'list-content-item__title'}).get_text()
    # Program Synopsis
    synopsis = program_selection.find(
        'p', attrs={'class': 'list-content-item__synopsis'}).get_text()
    # Link to latest episode
    latest_episode_url = program_selection.find('a', href=True)['href']

    # The number of available episodes is not extracted here; the previous
    # 'list-content-item__sublabels' lookup was never used or returned.
    return title, synopsis, latest_episode_url


In [7]:
def get_page(browser, url):
    '''Navigate ``browser`` to ``url`` and return the page it then reports
    as current (the result of ``browser.get_current_page()``).'''
    browser.open(url)
    current_page = browser.get_current_page()
    return current_page

In [8]:
def programme_website_extractor(web_page):
    '''Find the "Programme website" and "Credits" links on a page.

    Both are looked up as ``<a class="lnk">`` elements by their text.

    Returns ``(program_website_url, program_credits_url, credits_available)``.
    ``program_credits_url`` is the link's href when a truthy "Credits"
    element was found; otherwise it is whatever ``find`` returned and
    ``credits_available`` is False.

    Raises TypeError when the "Programme website" link is absent, because
    the lookup result is subscripted directly.
    '''
    website_anchor = web_page.find(
        'a', attrs={'class': 'lnk'}, text='Programme website')
    program_website_url = website_anchor['href']

    credits_anchor = web_page.find(
        'a', attrs={'class': 'lnk'}, text='Credits')

    if credits_anchor:
        return program_website_url, credits_anchor['href'], True
    return program_website_url, credits_anchor, False

In [47]:
def episode_page_extractor(web_page, credits_available):
    '''Print the metadata found on a programme / episode page.

    Work in progress: every extracted value is only printed; nothing is
    assembled into a result yet, so the function always returns ``0``.

    Arguments:
        web_page: parsed page exposing BeautifulSoup-style ``find``.
        credits_available: when true, the credits table is looked up and
            parsed.

    Sections handled, each skipped silently when its element is missing:
    credits, genre/format links, time left to watch and duration, long
    synopsis, and last-broadcast information.

    Returns:
        Always ``0``.
    '''
    # Credits: maps the text of the first <span> in a row to the text of
    # the second one. Parsed but not yet printed or returned.
    credits_dict = {}
    if credits_available:
        credits = web_page.find('table', attrs={'class': 'table'})
        if credits:
            for row in credits.find_all('tr'):
                person = row.find_all('span')
                if len(person) > 1:
                    json_credits = [x.get_text() for x in person]
                    credits_dict[json_credits[0]] = json_credits[1]

    # Genre and format
    genre_format = web_page.find(
        'div', attrs={'class': 'footer__similar b-g-p component'})

    if genre_format is None:
        # Fall back to the alternative footer container.
        genre_format = web_page.find(
            'div',
            attrs={'class': 'islet--horizontal footer__programmes footer__service-island'})

    if genre_format is not None:
        # One list of [text, href] pairs per <div> inside the container.
        genre_format_list = [
            [[x.get_text(), x['href']] for x in group.find_all('a')]
            for group in genre_format.find_all('div')
        ]

        genre_format_dict = {'genre': {}}

        for i, links in enumerate(genre_format_list):
            for j, (label, link) in enumerate(links):
                entry = {str(j): label, 'link': link}
                if i == 0:
                    # First group: link 0 is the main genre, the rest are
                    # sub-genres.
                    # NOTE(review): each later sub-genre replaces the
                    # previous one, so only the last survives - confirm
                    # that this is intended.
                    key = 'main' if j < 1 else 'sub_genre'
                    genre_format_dict['genre'][key] = entry
                elif j < 1:
                    # Any later group: link 0 (re)sets the format entry.
                    genre_format_dict['format'] = entry
                else:
                    genre_format_dict['format']['sub_format'] = entry

        print(genre_format_dict)

    # get time left to watch
    left_to_watch = web_page.find(
        'div', attrs={'class': 'grid 1/3@bpw 1/4@bpe'})

    if left_to_watch is not None:
        left_to_watch_items = left_to_watch.find_all(
            'p', attrs={'class': 'episode-panel__meta'})
        has_meta_div = left_to_watch.find(
            'div', attrs={'class': "episode-panel__meta"}) is not None

        # Two paragraphs are read (availability, then duration); skip the
        # section instead of raising IndexError when fewer are present.
        if not has_meta_div and len(left_to_watch_items) > 1:
            availability = left_to_watch_items[0]
            if availability.span is None:
                days_left = availability.get_text()
            else:
                days_left = availability.span.get_text()
            duration = left_to_watch_items[1].get_text()

            print('Left to watch: ', days_left,'duration: ' ,duration)

    # get long synopsis
    long_synopsis = web_page.find(
        'div', attrs={'class': 'synopsis-toggle__long'})

    if long_synopsis is not None:
        long_synopsis_paragraphs = [
            x.get_text() for x in long_synopsis.find_all('p')
        ]

        print('long_synopsis: ',long_synopsis_paragraphs)

    # Broadcast information
    main_broadcast = web_page.find(
        'div',
        attrs={
            'class':
            'grid 1/3@bpw2 1/3@bpe map__column map__column--2 map__column--last'
        })

    if main_broadcast is not None:
        date_last_aired = main_broadcast.find(
            'span',
            attrs={'class': 'broadcast-event__date text-base timezone--date'})
        time_last_aired = main_broadcast.find(
            'span', attrs={'class': 'timezone--time'})
        channel = main_broadcast.find(
            'div',
            attrs={
                'class':
                'programme__service box-link__elevated micro text--subtle'
            })
        if channel is not None:
            # Look the anchor up once and tolerate a channel box without one.
            channel_anchor = channel.find('a')
            if channel_anchor is not None:
                channel_text = channel_anchor.get_text()
                print('Channel: ', channel_text)
                channel_link = channel_anchor['href']
                print('Channel link: ', channel_link)
        if date_last_aired is not None:
            date_last_aired = date_last_aired.get_text()
            print('date_aired: ', date_last_aired)
        if time_last_aired is not None:
            time_last_aired = time_last_aired.get_text()
            print('time_aired: ', time_last_aired)

    # TODO: build and return a dictionary of the values printed above.
    return 0

In [10]:
def recommendation_extraction(web_page):
    '''Print the recommended programmes listed on a recommendations page.

    Looks for ``<ol class="highlight-box-wrapper">`` and, for every ``<li>``
    in it, prints the heading link's ``href`` and ``resource`` attributes,
    its text, and the text of the entry's first paragraph.

    Nothing is printed when ``web_page`` is None or the list is absent.
    Entries without a ``programme__body`` div, an ``<h4>`` or an anchor are
    skipped. Returns None.
    '''
    if web_page is None:
        return

    page_items = web_page.find('ol', attrs={'class': 'highlight-box-wrapper'})
    if page_items is None:
        return

    # Announce the list only once it is known to exist.
    print('found list of recommendations')

    for item in page_items.find_all('li'):
        item_info = item.find('div', attrs={'class': 'programme__body'})
        if item_info is None or item_info.h4 is None or item_info.h4.a is None:
            continue

        anchor = item_info.h4.a
        link_1 = anchor['href']
        # .get() so that a missing 'resource' attribute yields None.
        link_2 = anchor.get('resource')
        title = anchor.get_text()
        synop = item_info.p.get_text() if item_info.p is not None else ''

        print('recommendations:', link_1, link_2, 'title: ',title, synop)
    
    

In [11]:
def episode_available_extraction(web_page):
    '''Follow a programme page's "episodes" link and list its episodes.

    Finds the ``<a class="br-nav__link" data-linktrack="nav_episodes">``
    link on ``web_page``, prints its href, opens ``BASE_URL + href`` with
    the module-level ``browser`` and passes the resulting page to
    ``episode_list_extractor``. When that page has a pagination list, every
    linked page is opened and extracted as well.

    Does nothing when the episodes link is absent. Returns None.
    '''
    episodes_link = web_page.find(
        'a', attrs={
            'class': 'br-nav__link',
            'data-linktrack': 'nav_episodes'
        })

    if episodes_link is None:
        return

    episodes_link = episodes_link['href']

    print(episodes_link)

    curr_url = BASE_URL + episodes_link
    episodes_page = get_page(browser, curr_url)

    episode_list_extractor(episodes_page)

    episode_pagination = episodes_page.find(
        'ol', attrs={'class': 'nav nav--banner pagination delta'})

    if episode_pagination is None:
        return

    page_list = episode_pagination.find_all(
        'li', attrs={'class': 'pagination__page'})
    # Entries without an anchor are skipped.
    page_links = [x.a['href'] for x in page_list if x.a is not None]

    for endpoint in page_links:
        # NOTE(review): assumes pagination hrefs are relative to the
        # episodes page (e.g. a "?page=N" query string) - confirm against
        # the live markup.
        episode_list_extractor(get_page(browser, curr_url + endpoint))

In [12]:
def episode_list_extractor(web_page):
    '''Print the episodes listed on a programme's episodes page.

    Prints the text of the "available episodes" span when present, then for
    every episode container: the overlay link's ``href`` and ``title``
    attributes, the text of the body's first paragraph, the ``title`` of an
    ``<abbr>`` inside that paragraph, and the episode title.

    Every element is optional: a missing one is skipped rather than raising,
    and no exception is swallowed. Returns None.
    '''
    episodes_available_list = web_page.find(
        'div', attrs={'class': 'br-box-page programmes-page'})
    if episodes_available_list is None:
        # No episode listing on this page.
        return

    episodes_container_list = episodes_available_list.find_all(
        'div',
        attrs={
            'class':
            'programme programme--tv programme--episode block-link highlight-box--list br-keyline br-blocklink-page br-page-linkhover-onbg015--hover'
        })

    available_episodes = web_page.find(
        'span', attrs={'class': 'hidden grid-visible@bpb2 grid-visible@bpw'})
    if available_episodes is not None:
        print('available episodes: ', available_episodes.get_text())

    for item in episodes_container_list:
        # link and time left (both read from the overlay anchor)
        item_headder = item.find(
            'div', attrs={'class': 'cta cta__overlay'})
        overlay_anchor = item_headder.a if item_headder is not None else None
        if overlay_anchor is not None:
            item_link = overlay_anchor.get('href')
            item_time_left = overlay_anchor.get('title')
            print(item_link, item_time_left)

        item_body = item.find('div', attrs={'class': 'programme__body'})
        if item_body is None:
            continue

        # Explicit presence checks replace the former bare ``except: pass``
        # blocks, which also swallowed KeyboardInterrupt.
        paragraph = item_body.p
        if paragraph is not None:
            episode_oneline_synopsis = paragraph.get_text()
            print('episode_oneline_synopsis: ' ,episode_oneline_synopsis)

            abbr = paragraph.abbr
            if abbr is not None and abbr.get('title') is not None:
                episode_no = abbr['title']
                print('episode_no: ', episode_no)

        title_tag = item_body.find(
            'span', attrs={'class': 'programme__title gamma'})
        if title_tag is not None:
            episode_title = title_tag.get_text()
            print(episode_title)


In [13]:
def upcoming_episodes(program_url=None):
    '''Print and return the upcoming broadcasts of a programme.

    Opens ``<program_url>/broadcasts/upcoming/`` with the module-level
    ``browser`` and reads every ``<li>`` of the
    ``<ol class="highlight-box-wrapper">`` list on that page.

    Arguments:
        program_url: full URL of the programme page, for example
            ``BASE_URL + program_website_url``. It is required; the default
            of None only keeps the zero-argument signature importable.

    Returns:
        A dict keyed by the entry's ``data-pid`` attribute. Each value holds
        ``program_title``, ``series``, ``program_synopsis``,
        ``program_link``, ``channel`` (``name``, ``link``) and ``broadcast``
        (``date``, ``day``, ``time``), all UTF-8 encoded. The dict is empty
        when the page has no such list. The same dict is printed.

    Raises:
        ValueError: when ``program_url`` is not supplied.
    '''
    if not program_url:
        raise ValueError(
            'upcoming_episodes() needs the programme page URL, '
            'e.g. BASE_URL + program_website_url')

    next_on_suffix = 'broadcasts/upcoming/'
    # Exactly one slash between the programme URL and the suffix.
    browser.open(program_url.rstrip('/') + '/' + next_on_suffix)
    next_on = browser.get_current_page()
    next_on_section = next_on.find('ol', attrs={'class':'highlight-box-wrapper'})

    next_up_dict = {}

    if next_on_section is None:
        # No list of upcoming broadcasts on the page.
        print(next_up_dict)
        return next_up_dict

    for item in next_on_section.find_all('li'):
        broadcast_info = item.find('div', attrs={'class':'programme__body programme__body--flush'})
        broadcast_info_tag = broadcast_info.find('div', attrs={'class': 'broadcast-event__time beta'})

        broadcast_date = broadcast_info_tag['title']
        broadcast_day = broadcast_info_tag.find('span', attrs={'class':'broadcast-event__date text-base timezone--date'}).get_text()
        broadcast_time = broadcast_info_tag.find('span',attrs = {'class': 'timezone--time'}).get_text()

        broadcast_channel = broadcast_info.find('div', attrs={'class': 'programme__service box-link__elevated micro text--subtle'})

        channel = broadcast_channel.a.get_text()
        channel_url = broadcast_channel.a['href']

        program_info = item.find('div', attrs={'class': 'grid 7/12 2/3@bpb2 3/4@bpw 5/6@bpw2 5/6@bpe'})
        program_title_info = program_info.a

        program_id = program_info.div['data-pid']
        program_link = program_title_info['href']
        program_title = program_title_info.find('span', attrs={'class': 'programme__title gamma'}).get_text()
        # The subtitle span is treated as optional.
        series_tag = program_title_info.find('span', attrs={'class': 'programme__subtitle centi'})
        series = series_tag.get_text() if series_tag is not None else ''
        program_synopsis = program_info.p.get_text()

        next_up_dict[program_id] = {'program_title': program_title.encode('utf-8'),
                                    'series': series.encode('utf-8'),
                                    'program_synopsis': program_synopsis.encode('utf-8').strip(),
                                    'program_link': program_link.encode('utf-8'),
                                    'channel': {'name': channel.encode('utf-8'),
                                              'link': channel_url.encode('utf-8')},
                                    'broadcast': {'date': broadcast_date.encode('utf-8'),
                                                 'day': broadcast_day.encode('utf-8'),
                                                 'time': broadcast_time.encode('utf-8')}}

    print(next_up_dict)
    return next_up_dict

In [48]:
# Main crawl: visit every A-Z index page and, for each programme listed on
# it, open the related pages and run the extractors defined above. All
# navigation goes through the single shared `browser`, so the order of the
# get_page(...) calls below matters.
for suffix in navigation_list:
    browser.open(BASE_URL + suffix)
    
    # Collected before the inner loop navigates the browser elsewhere.
    program_selection = browser.get_current_page().find_all(
    'li', attrs={"class": "grid__item"})
    
    for program_box in program_selection: 
        program_title, program_synopsis, latest_episode_url = iplayer_atoz_page_extractor(program_box) 
        
        print(program_title, program_synopsis, latest_episode_url)
        
        # get latest url page
        
        latest_episode_page = get_page(browser, BASE_URL + latest_episode_url)
        
        program_website_url, program_credits_url, credits_available = programme_website_extractor(latest_episode_page)
        print(program_website_url, program_credits_url, credits_available)
        
        # Prefer the credits link when one was found; otherwise use the
        # programme website link.
        if credits_available:
            program_website_page = get_page(browser, BASE_URL + program_credits_url)
        else: 
            program_website_page = get_page(browser, BASE_URL + program_website_url)
        
        # TODO add program webpage extraction for programs without credits for genre 
        
        # TODO assign variables from this function
        episode_page_extractor(program_website_page, credits_available)
        
        # Recommendations are read from the programme website URL with
        # '/recommendations' appended.
        recommendation_page = get_page(browser, BASE_URL + program_website_url + '/recommendations')
        recommendation_extraction(recommendation_page)
        
        # TODO assign variables
        episode_available_extraction(get_page(browser, BASE_URL + program_website_url))
        
        
        

(u'Abadas', u'Rhaglen animeiddio i blant ifanc yn canolbwyntio ar gyflwyno geiriau newydd. Animation ...', '/iplayer/episode/p02fbwcv/abadas-trwmped')
('/programmes/p02b4jth', None, False)
{'genre': {'main': {'0': u"Children's", 'link': '/programmes/genres/childrens'}}}
found list of recommendations
('recommendations:', '/programmes/p02b4sp5', 'https://www.bbc.co.uk/programmes/p02b4sp5', 'title: ', u'  Yn yr Ardd  \u2014 Cyfres 2, Tylwyth Teg ', u' Mae dant Fflach yn rhydd. Tybed a fydd y tylwyth teg yn galw heibio? Fflach has a wobbl... ')
('recommendations:', '/programmes/p04x702k', 'https://www.bbc.co.uk/programmes/p04x702k', 'title: ', u'  Ynys Broc M\xf4r Lili  \u2014 Cyfres 1, Harbwr cwcis ', u" Mae Lili'n dod o hyd i declyn torri bisgedi ar y traeth ac yn penderfynu y byddai'n bra... ")
('recommendations:', '/programmes/p06y72c5', 'https://www.bbc.co.uk/programmes/p06y72c5', 'title: ', u"  Blero'n Mynd i Ocido  \u2014 Cyfres 2, 9 ", u" Pan fo'r haul yn diflannu, mae Blero a'i ff

('/programmes/b08b7blz', '/programmes/b08b7blz#credits', True)
{'genre': {'main': {'0': u'Comedy', 'link': '/programmes/genres/comedy'}, 'sub_genre': {'1': u'Sitcoms', 'link': '/programmes/genres/comedy/sitcoms'}}, 'format': {'0': u'Films', 'link': '/programmes/formats/films'}}
('Left to watch: ', u'1 day left to watch', 'duration: ', u'\n1 hour, 31 minutes\n            ')
('long_synopsis: ', [u'Worried about her finances when her book fails to find a publisher, Patsy is given the tip that Kate Moss has left her PR company and will be looking for new representation. Things start to go awry when the plot to approach Kate goes horribly wrong, and Patsy and Edina find themselves vilified by the entire nation and having to flee the country.'])
('Channel: ', u'BBC One')
('Channel link: ', 'https://www.bbc.co.uk/bbcone')
('date_aired: ', u"New Year's Day 2019")
('time_aired: ', u'01:50')
found list of recommendations
('recommendations:', '/programmes/p05dsqxl', 'https://www.bbc.co.uk/program

/programmes/b052hdnr/episodes
('available episodes: ', u'(1)')
('https://www.bbc.co.uk/iplayer/episode/b054fm6n', '6 days left to watch (Mon 04 February 2019, 13:00)')
('episode_oneline_synopsis: ', u"\n6/6 Rachel Khoo heads to Malaysia to explore one of the world's most exciting cuisines.\n")
('episode_no: ', 'Episode 6 of 6')
Rachel Khoo's Malaysia
(u'Adam Curtis', u'Documentary films by Adam Curtis.', '/iplayer/episode/p04b183c/adam-curtis-hypernormalisation')
('/programmes/p04bkttz', '/programmes/p04b183c#credits', True)
{'genre': {'main': {'0': u'Factual', 'link': '/programmes/genres/factual'}, 'sub_genre': {'3': u'Politics', 'link': '/programmes/genres/factual/politics'}}, 'format': {'0': u'Documentaries', 'sub_format': {'1': u'Films', 'link': '/programmes/formats/films'}, 'link': '/programmes/formats/documentaries'}}
('long_synopsis: ', [u'We live in a time of great uncertainty and confusion. Events keep happening that seem inexplicable and out of control. Donald Trump, Brexit, 

(u"A.Dot's Story of Grime", u'1Xtra\u2019s A.Dot gets under the skin of the grime scene and sets up the ultimate Clash.', '/iplayer/episode/p04fj2t0/adots-story-of-grime')
('/programmes/p04fj2t0', None, False)
{'genre': {'main': {'0': u'Music', 'link': '/programmes/genres/music'}, 'sub_genre': {'1': u'Hip Hop, RnB & Dancehall', 'link': '/programmes/genres/music/hiphoprnbanddancehall'}}, 'format': {'0': u'Documentaries', 'link': '/programmes/formats/documentaries'}}
('long_synopsis: ', [u"Part of this year's Black & British season, A.Dot\u2019s Story of Grime tells the story of Grime through the art and culture of \u2018the Clash\u2019 \u2013 a war of words between MC\u2019s that leaves the best artist standing.", u"Presented by 1Xtra\u2019s A.Dot, it charts her attempt to set up the ultimate Grime Clash/Royal Rumble - \u2018GRIMEAGGEDON' - allowing her to get under the skin of the modern grime scene along the way. ", u"Offering his backing, contacts and advice to Dotty is clashing expe

('/programmes/b01cc6m6', '/programmes/b01p1127#credits', True)
found list of recommendations
(u'Afoot Again In The Past', u"Investigating Britain's architectural heritage.", '/iplayer/episode/p03lxghc/afoot-again-in-the-past-ickworth-house')
('/programmes/p03lc0cb', '/programmes/p03lxghc#credits', True)
{'genre': {'main': {'0': u'Factual', 'link': '/programmes/genres/factual'}, 'sub_genre': {'link': '/programmes/genres/factual/homesandgardens/homes', '4': u'Homes'}}}
('Left to watch: ', u'\n                                    Available now\n                            ', 'duration: ', u'\n5 minutes\n            ')
('Channel: ', u'BBC Two')
('Channel link: ', 'https://www.bbc.co.uk/bbctwo')
('date_aired: ', u'Fri 28 Feb 2003')
('time_aired: ', u'11:25')
found list of recommendations
/programmes/p03lc0cb/episodes
('available episodes: ', u'(6)')
(u'Africa', u"Eye to eye with the unknown - the world's wildest continent.", '/iplayer/episode/b01rrxdh/africa-the-greatest-show-on-earth')
('/p

('available episodes: ', u'(9)')
('https://www.bbc.co.uk/iplayer/episode/p061t4sq', u'25 o ddyddiau ar \xf4l i wylio (Sul 24 Chwefror 2019, 07:30)')
('episode_oneline_synopsis: ', u"\nM\xf4r-ladron o Ysgol y Ffwrnes, Llanelli sy'n ymuno \xe2 Ben Dant a Cadi i herio Capten Cnec...\n")
Ysgol y Ffwrnes, Llanelli
('https://www.bbc.co.uk/iplayer/episode/p061252p', u'23 o ddyddiau ar \xf4l i wylio (Gwen 22 Chwefror 2019, 07:25)')
('episode_oneline_synopsis: ', u'\nHeddiw, mae mwy o f\xf4r-ladron o Ysgol L\xf4n Las, Abertawe yn ymuno \xe2 Ben Dant a Cadi i her...\n')
Ysgol Lôn Las, Llansamlet (2)
('https://www.bbc.co.uk/iplayer/episode/p060bfn4', u'18 o ddyddiau ar \xf4l i wylio (Sul 17 Chwefror 2019, 07:30)')
('episode_oneline_synopsis: ', u"\nM\xf4r-ladron o Ysgol Rhyd y Grug, Aberfan sy'n ymuno \xe2 Ben Dant a Cadi i herio Capten Cne...\n")
Ysgol Rhyd Y Grug, Aberfan
('https://www.bbc.co.uk/iplayer/episode/p05zlsp3', u'16 o ddyddiau ar \xf4l i wylio (Gwen 15 Chwefror 2019, 07:25)')
('episo

('available episodes: ', u'(2)')
('https://www.bbc.co.uk/iplayer/episode/b04yq4zn', '29 days left to watch (Wed 27 February 2019, 19:00)')
('episode_oneline_synopsis: ', u'\n2/26 Amazing facts about Dotty, a spotted dolphin who lives with close family in the ocean.\n')
('episode_no: ', 'Episode 2 of 26')
Dotty the Spotted Dolphin
('https://www.bbc.co.uk/iplayer/episode/b04xhv00', '22 days left to watch (Wed 20 February 2019, 19:00)')
('episode_oneline_synopsis: ', u'\n1/26 Amazing facts about Rogan, a young joey living in the Australian outback.\n')
('episode_no: ', 'Episode 1 of 26')
Rogan the Red Kangaroo
(u"Alaska: Earth's Frozen Kingdom", u'A look at the stories of pioneering Alaskans over the course of a year.', '/iplayer/episode/b0536hxc/alaska-earths-frozen-kingdom-3-winter')
('/programmes/b0520nyz', '/programmes/b0536hxc#credits', True)
{'genre': {'main': {'0': u'Factual', 'link': '/programmes/genres/factual'}, 'sub_genre': {'2': u'Nature & Environment', 'link': '/programmes/ge

KeyboardInterrupt: 