In [2]:
!pip install bs4 requests

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting requests
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
     ---------------------------------------- 0.0/143.0 kB ? eta -:--:--
     -------------------------------------- 143.0/143.0 kB 2.8 MB/s eta 0:00:00
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.3.1-cp311-cp311-win_amd64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.4-py3-none-any.whl (61 kB)
     ---------------------------------------- 0.0/61.5 k

In [3]:
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag, NavigableString
import requests

In [4]:
# URL of CNN's world news section (not referenced by the cells visible in this part of the notebook).
CNN_NEWS = "https://edition.cnn.com/world"
# Site root: fetched as the front page and used by get_article_links as the base for article links.
MAINPAGE = "https://edition.cnn.com"

In [5]:
def get_soup(url : str, timeout : float = 10.0)-> BeautifulSoup:
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout ``requests`` may wait indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response text.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # crashing later inside one of the extraction helpers.
    response.raise_for_status()
    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ from one machine to the next.
    return BeautifulSoup(response.text, "html.parser")

def get_content(article_soup : BeautifulSoup):
    """Return the article body as one string.

    The text of every tag accepted by ``is_paragraph`` is concatenated in
    document order. Each paragraph is preceded by a single space, so a
    non-empty result starts with a space; an article without matching
    paragraphs yields the empty string.
    """
    paragraph_tags = article_soup.find_all(is_paragraph)
    # Prefix (rather than separate) with a space to keep the leading blank
    # the rest of the notebook's output already shows.
    return "".join(f" {tag.text}" for tag in paragraph_tags)

'''def get_news(article_url : str) -> str:
    article_soup = get_soup(article_url)
    return get_content(article_soup)'''

def is_article(tag) -> bool:
    """Tell whether ``tag`` looks like an article teaser.

    A tag qualifies only when it has both the ``data-open-link`` and the
    ``data-word-count`` attributes.
    """
    required_attributes = ("data-open-link", "data-word-count")
    return all(tag.has_attr(name) for name in required_attributes)


def get_article_links(article_soup : BeautifulSoup):
    """Return the absolute URLs of all article teasers found in ``article_soup``.

    Every tag accepted by ``is_article`` contributes its ``data-open-link``
    value, resolved against ``MAINPAGE``. Links are returned in document
    order and duplicates are kept, so positions in the list match the page.

    Returns:
        A list of URL strings (empty when the page has no matching tags).
    """
    # urljoin instead of plain concatenation: a site-relative link such as
    # "/2023/..." gives the same result as before, while a link that is
    # already absolute or lacks the leading slash is still resolved correctly.
    return [
        urljoin(MAINPAGE, tag.attrs["data-open-link"])
        for tag in article_soup.find_all(is_article)
    ]

def get_headline(article_soup : BeautifulSoup):
    """Return the text of the first ``<h1>`` element of the article.

    The text is returned as-is, including any surrounding whitespace.
    """
    heading_tag = article_soup.find("h1")
    return heading_tag.text

def is_paragraph(tag : Tag) -> bool:
    """Tell whether ``tag`` is a ``<p>`` element carrying ``data-component-name``."""
    # Guard clause: the attribute is checked first, the tag name second.
    if not tag.has_attr("data-component-name"):
        return False
    return tag.name == "p"

def is_author(tag : Tag) -> bool:
    """Return True when ``tag`` carries the ``byline__name`` CSS class.

    The class may appear at any position in the tag's class list. A tag
    without a class, or with an empty class list, is simply not a match.
    """
    # Membership test rather than indexing [0]: it cannot raise IndexError on
    # an empty class list and does not depend on the order of the class tokens.
    return "byline__name" in tag.get_attribute_list("class")

def get_authors(article_soup : BeautifulSoup):
    """Return the ``.string`` of every tag in ``article_soup`` accepted by ``is_author``.

    The result is a list in document order; it is empty when no tag matches.
    """
    return [author_tag.string for author_tag in article_soup.find_all(is_author)]

def is_date(tag : Tag):
    """Return True when ``tag`` carries the ``timestamp`` CSS class.

    The class may appear at any position in the tag's class list. A tag
    without a class, or with an empty class list, is simply not a match.
    """
    # Membership test rather than indexing [0]: it cannot raise IndexError on
    # an empty class list and does not depend on the order of the class tokens.
    return "timestamp" in tag.get_attribute_list("class")

def get_date(article_soup : BeautifulSoup):
    """Return the article's timestamp, e.g. ``'6:59 AM EDT, Wed October 25, 2023'``.

    The first tag accepted by ``is_date`` is read and the
    "time zone, weekday month day, year" portion of its text is extracted.

    Raises:
        AttributeError: If no tag matches ``is_date`` (``find`` then yields
            nothing to read ``.text`` from).
    """
    date_tag = article_soup.find(is_date)
    raw_text = date_tag.text
    # Locate the timestamp by its shape rather than by fixed character
    # offsets: the offsets only hold for one particular label, time width
    # and month-name length.
    match = re.search(
        r"\d{1,2}:\d{2}\s+[AP]M\s+[^,\n]+,\s+\w+\s+\w+\s+\d{1,2},\s+\d{4}",
        raw_text,
    )
    if match:
        # Collapse any run of whitespace inside the match to single spaces.
        return " ".join(match.group(0).split())
    # Unrecognised layout: fall back to the historical fixed slice.
    return raw_text[19:52]

def get_read_time(article_soup : BeautifulSoup):
    """Return the article's read-time label, e.g. ``'6 minute read'``.

    The text of the first ``div`` with class ``headline__sub-description`` is
    searched for a "<number> minute read" phrase, so read times with any
    number of digits are returned in full.

    Raises:
        IndexError: If the page has no ``headline__sub-description`` div.
    """
    sub_description = article_soup.find_all("div", attrs={"class" : ["headline__sub-description"]})[0].text
    # Match the phrase itself instead of slicing fixed offsets: a fixed-width
    # slice cuts off the last character as soon as the number has two digits.
    match = re.search(r"\d+\s+min(?:ute)?s?\s+read", sub_description)
    if match:
        # Collapse any run of whitespace inside the match to single spaces.
        return " ".join(match.group(0).split())
    # Unrecognised layout: fall back to the historical fixed slice.
    return sub_description[15:28]

In [6]:
class Article():
    """Plain container for the fields extracted from one news article.

    Attributes:
        headline: Headline text.
        contents: Body text of the article.
        authors: Author names as passed in by the caller (the notebook passes a list).
        date: Timestamp string of the article.
        read_time: Read-time label such as ``'6 minute read'``.
    """

    def __init__(self, headline, contents, authors, date, read_time):
        """Store the given values unchanged on the instance."""
        self.headline = headline
        self.contents = contents
        self.authors = authors
        self.date = date
        self.read_time = read_time

    def __str__(self):
        """Return headline, authors and read time on one line, followed by the body.

        ``date`` is not part of the printed form.
        """
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.contents} \n"

In [7]:
# Fetch the CNN front page (MAINPAGE) and list the article links found on it.
# The page is downloaded live, so the list changes from run to run.
mainpage_soup = get_soup(MAINPAGE)
articles = get_article_links(mainpage_soup)
print(articles)

['https://edition.cnn.com/israel-gaza-hamas-war-thursday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/26/us/lewiston-maine-shootings-thursday/index.html', 'https://edition.cnn.com/2023/10/24/weather/hurricane-otis-acapulco-mexico/index.html', 'https://edition.cnn.com/2023/10/25/politics/trump-former-aides-turn-against-him/index.html', 'https://edition.cnn.com/2023/10/26/asia/china-shenzhou-17-launch-youngest-crew-scn-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/26/asia/japan-transgender-sterilization-ruling-lgbtq-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/25/sport/tiger-woods-liv-golf-intl-spt-shipnuck-book/index.html', 'https://edition.cnn.com/israel-gaza-hamas-war-thursday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/26/middleeast/israel-hamas-war-ground-offensive-gaza-intl/index.html', 'https://edition.cnn.com/2023/10/25/middleeast/al-jazeera-journalists-family-killed-in-gaza-strike-says-al-jazeera/index.html', 'https://edition.cnn.com/2023/

In [8]:
# Download one article from the list and show the text of its paragraphs.
# ARTICLE can be replaced with the URL of another CNN article; the rest of
# the notebook is intended to work unchanged.
ARTICLE = articles[3]
article_soup = get_soup(ARTICLE)
get_content(article_soup)

' \nDonald Trump’s wealth, power and fame acted like a magnet for new associates keen to enter his orbit. But now, key figures who sought a share of his reflected glory are turning against him to save themselves.\n   \n      The ex-president absorbed a trio of blows Tuesday that worsened his legal peril and underscored how the 2024 election – in which he is the front-runner for the GOP nomination – will play out in the courts rather than traditional voting battlegrounds.\n   \n      In the most significant development, ABC News reported that Trump’s former chief of staff, Mark Meadows, had met federal prosecutors multiple times and had categorically undermined the ex-president’s narrative about a stolen election. Meadows was the gatekeeper to the Oval Office in the critical days when Trump was allegedly plotting to steal the 2020 election after voters rejected his bid for a second term. CNN has reached out to Meadows’ attorney for comment.\n   \n      In another damaging twist, former 

In [9]:
# Show the authors of the article loaded into article_soup.
get_authors(article_soup)

['Stephen Collinson']

In [10]:
# Extract the timestamp text of the article loaded into article_soup.
date_string = get_date(article_soup)

In [11]:
# Display the extracted timestamp.
date_string

'6:59 AM EDT, Wed October 25, 2023'

In [12]:
# Show the read-time label of the article loaded into article_soup.
get_read_time(article_soup)

'6 minute read'

In [13]:
# Extract every field from article_soup and bundle the values in an Article.
headline = get_headline(article_soup)
contents = get_content(article_soup)
authors = get_authors(article_soup)
date = get_date(article_soup)
read_time = get_read_time(article_soup)

article = Article(headline, contents, authors, date, read_time)

In [14]:
# Print the article through Article.__str__: headline, authors and read time, then the body.
print(article)


      Trump rages as former acolytes turn against him under legal heat
      by ['Stephen Collinson']  6 minute read
  
Donald Trump’s wealth, power and fame acted like a magnet for new associates keen to enter his orbit. But now, key figures who sought a share of his reflected glory are turning against him to save themselves.
   
      The ex-president absorbed a trio of blows Tuesday that worsened his legal peril and underscored how the 2024 election – in which he is the front-runner for the GOP nomination – will play out in the courts rather than traditional voting battlegrounds.
   
      In the most significant development, ABC News reported that Trump’s former chief of staff, Mark Meadows, had met federal prosecutors multiple times and had categorically undermined the ex-president’s narrative about a stolen election. Meadows was the gatekeeper to the Oval Office in the critical days when Trump was allegedly plotting to steal the 2020 election after voters rejected his bid for a 