In [31]:
import requests
from bs4 import BeautifulSoup as bs 
from typing import Tuple, List
import re

In [32]:
MAIN_PAGE_URL = "https://www.bbc.com/news"
WEBSITE_URL = "https://www.bbc.com"

In [33]:
def get_text_from_article(url:str) -> str:
    """Parse bbc article and return text in list of strings"""
    article = requests.get(url)
    soup = bs(article.content, "html.parser")
    body = soup.find_all("div", {'data-component': 'text-block'})
    text = [p.text for p in body] 
    text = "".join(text)
    return text

In [34]:
def get_link_name_pairs_articles_from_main_page() -> List[Tuple[str, str]]:
    """Parses the news articles available on the main page"""
    pattern = re.compile(r'^gs-c-promo-heading')
    page = requests.get(MAIN_PAGE_URL)
    soup = bs(page.content, "html.parser")
    tags = soup.find_all('a',class_ = pattern)
    result = []
    for tag in tags:
        if tag and tag.find('h3') and tag.get('href') and "news" in tag.get('href'):
            result.append((tag.find('h3').get_text(),tag.get('href')))
    
    return result
    

In [35]:
def get_link_name_pairs_articles_from_category_page(url: str) -> List[Tuple[str, str]]:
    """Parse the news articles available for a specific category"""
    page = requests.get(url)
    soup = bs(page.content, "html.parser")
    tags = soup.find_all('div', {"type" : "article"})
    result = []
    for tag in tags:
        result.append((tag.find('a').get_text(),tag.find('a').get('href')))
    return result

In [36]:
get_link_name_pairs_articles_from_category_page("https://bbc.com/news/technology")

[('China to increase curbs on video gaming industry',
  '/news/technology-67801091'),
 ('High-speed train company Hyperloop One shuts down',
  '/news/technology-67801235'),
 ('Spider-Man 2 maker angered by massive hack', '/news/newsbeat-67805736'),
 ("The mystery of Boris Johnson's disappearing WhatsApps",
  '/news/technology-67780595'),
 ('Social media platform X suffers global outages', '/news/business-67783058'),
 ('Porn viewers in EU may have to prove their age',
  '/news/technology-67771794'),
 ('End of an era for electronics giant Toshiba', '/news/business-67757333'),
 ('Nasa beams cat video from deep space with laser',
  '/news/technology-67721671'),
 ('Amazon to make Warhammer 40,000 shows and movies',
  '/news/business-67753779'),
 ('AI cannot patent inventions, UK Supreme Court says',
  '/news/technology-67772177'),
 ('Founder of EV start-up Nikola sentenced for fraud',
  '/news/business-67752125'),
 ('EU takes action against X over disinformation',
  '/news/technology-677492

In [37]:
def get_link_name_pairs_categories() -> List[Tuple[str,str]]:
    """Retreives the categories of the news available on BBC and the corresponding path"""
    main_page = requests.get(MAIN_PAGE_URL)

    soup = bs(main_page.content, "html.parser")
    first_tag = soup.find('div', class_ = 'gs-u-display-none gs-u-display-block@m nw-o-news-wide-navigation')
    tags = first_tag.find_all('a', class_ = 'nw-o-link')
    result = [(tag.get_text(), tag.get('href')) for tag in tags if tag and tag.get_text() and tag.get('href')]
    return result[1::]# skipping the first element since it represents the main page

In [38]:
categories = get_link_name_pairs_categories()

In [40]:
categories

[('Israel-Gaza war', '/news/topics/c2vdnvdg6xxt'),
 ('War in Ukraine', '/news/world-60525350'),
 ('Climate', '/news/topics/cmj34zmwm1zt'),
 ('Video', '/news/av/10462520'),
 ('World', '/news/world'),
 ('UK', '/news/uk'),
 ('Business', '/news/business'),
 ('Tech', '/news/technology'),
 ('Science', '/news/science_and_environment'),
 ('Entertainment & Arts', '/news/entertainment_and_arts'),
 ('Health', '/news/health'),
 ('World News TV', '/news/world_radio_and_tv'),
 ('In Pictures', '/news/in_pictures'),
 ('BBC Verify', '/news/reality_check'),
 ('Newsbeat', '/news/newsbeat')]

In [44]:
link =  WEBSITE_URL + categories[1][1]
link, categories[1][0]

('https://www.bbc.com/news/world-60525350', 'War in Ukraine')

In [47]:
articles = get_link_name_pairs_articles_from_category_page(link)
articles

[('Russia confirms damage to warship in Black Sea',
  '/news/world-europe-67821515'),
 ('Ukraine denies Russian capture of town near Donetsk',
  '/news/world-europe-67820916'),
 ('The Ukrainian frontline city Russia could seize again',
  '/news/world-europe-67801203'),
 ('Ukraine celebrates first Christmas on 25 December',
  '/news/world-europe-67816987'),
 ('Ukraine says it downed three Russian warplanes',
  '/news/world-europe-67809239'),
 ("Ukraine's new Christmas is more than just a date",
  '/news/world-europe-67801204'),
 ('The Ukrainians back at home this Christmas', '/news/newsbeat-67753427'),
 ('War in maps: Ukraine and Russia trade front line blows',
  '/news/world-europe-60506682'),
 ('How Zelensky yachts fable influenced US aid debate',
  '/news/world-us-canada-67766964'),
 ('Can Ukraine recover its occupied territory? Our teams answer your questions',
  '/news/live/world-middle-east-67751758'),
 ('Putin tells Russia his war objectives are unchanged',
  '/news/world-europe-

In [58]:
articles[0][0], get_text_from_article(WEBSITE_URL + articles[0][1])

('Russia confirms damage to warship in Black Sea',
 'Russia has confirmed one of its warships has been damaged in a Ukrainian attack on a Black Sea port.The airstrike took place at Feodosiya in Russian-occupied Crimea early on Tuesday morning. Russia\'s Ministry of Defence said the large landing ship Novocherkassk was struck by Ukrainian aircraft carrying guided missiles.The head of the Ukrainian Air Force said earlier its warplanes had destroyed the ship. One person was killed in the attack, according to the Russian-installed head of Crimea, Sergei Aksyonov. Several others were reportedly hurt. Six buildings were damaged and a small number of people had to be taken to temporary accommodation centres, Mr Aksyonov added.The port\'s transport operations are said to be functioning as normal after the area was cordoned off, while a fire caused by the attack was contained.Footage purportedly showing a huge explosion in the port was shared by Ukrainian air force commander Lt Gen Mykola Olesh

In [None]:
example = "sdfjo\'s"
a = example.replace("\\'s","'s")

In [8]:
categories = get_link_name_pairs_categories()
category_name, category_path = categories[0]
print('Articles for the following category:', category_name, WEBSITE_URL + category_path)
get_link_name_pairs_articles_from_main_page()

Articles for the following category: Israel-Gaza war https://www.bbc.com/news/topics/c2vdnvdg6xxt


[('Russia confirms damage to warship in Black Sea',
  '/news/world-europe-67821515'),
 ('UN says no let-up in Israeli air strikes in Gaza',
  '/news/world-middle-east-67820866'),
 ('Plane held over trafficking fears lands in India',
  '/news/world-europe-67820072'),
 ('Five bodies found in France - police look for father',
  '/news/world-europe-67821163'),
 ("Hanif Kureishi: I've become a reluctant dictator",
  '/news/entertainment-arts-67787423'),
 ('15 celebrity flops and fails of 2023', '/news/entertainment-arts-64315561'),
 ('Ukraine denies Russian capture of town near Donetsk',
  '/news/world-europe-67820916'),
 ('Girl, 7, with brain tumour meets Queen for tea', '/news/uk-67820876'),
 ('Afghans marry in mass ceremony in bid to cut costs',
  '/news/world-asia-67821162'),
 ('Russian opposition leader Navalny found, says team',
  '/news/world-europe-67820068'),
 ('UK has official white Christmas despite mild temperatures',
  '/news/uk-scotland-67815533'),
 ('Russian opposition leader