In [93]:
import re

import requests
from bs4 import BeautifulSoup, Tag, NavigableString

In [94]:
# URL of CNN's world-news section.
# NOTE(review): not referenced by any cell visible here.
CNN_NEWS = "https://edition.cnn.com/world"
# Site root; get_article_links prepends it to every link it collects.
MAINPAGE = "https://edition.cnn.com"

In [95]:
def get_soup(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The parsed document.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here rather than handing an error page to the extractors, where
    # it would surface later as a confusing AttributeError/IndexError.
    response.raise_for_status()
    # Name the parser explicitly: letting BeautifulSoup guess emits a warning
    # and can produce different trees depending on which parsers are installed.
    return BeautifulSoup(response.text, "html.parser")

def get_content(article_soup: BeautifulSoup):
    """Concatenate the text of every body paragraph of an article page.

    Paragraphs are the tags accepted by ``is_paragraph``. Each paragraph's
    text is prefixed with a single space, so a non-empty result starts with a
    space; a page without matching paragraphs yields ``""``.
    """
    body_tags = article_soup.find_all(is_paragraph)
    return "".join(f" {tag.text}" for tag in body_tags)

def get_news(article_url: str) -> str:
    """Fetch the article at ``article_url`` and return its body text.

    Thin convenience wrapper: downloads and parses the page with ``get_soup``
    and passes the parsed document to ``get_content``.
    """
    return get_content(get_soup(article_url))

def is_article(tag) -> bool:
    """Filter for ``find_all``: accept tags that carry an article link.

    A tag qualifies when it has both a ``data-open-link`` and a
    ``data-word-count`` attribute.
    """
    if not tag.has_attr("data-open-link"):
        return False
    return tag.has_attr("data-word-count")


def get_article_links(article_soup: BeautifulSoup, unique: bool = False):
    """Collect absolute article URLs from a listing page.

    Args:
        article_soup: Parsed page to scan for tags accepted by ``is_article``.
        unique: When true, drop repeated URLs while keeping first-seen order.
            Defaults to ``False`` so every matching tag yields one entry.

    Returns:
        A list of URLs, each formed by prepending ``MAINPAGE`` to a tag's
        ``data-open-link`` value, in document order.
    """
    links = [
        f"{MAINPAGE}{tag.attrs['data-open-link']}"
        for tag in article_soup.find_all(is_article)
    ]
    if unique:
        # dict keys keep insertion order, so this de-duplicates stably.
        links = list(dict.fromkeys(links))
    return links

def get_headline(article_soup: BeautifulSoup):
    """Return the article headline: the text of the first ``<h1>`` on the page.

    Leading and trailing whitespace (the markup's indentation and newlines)
    is stripped.

    Returns:
        The headline text, or ``""`` when the page has no ``<h1>``.
    """
    heading = article_soup.find("h1")
    if heading is None:
        # No headline element: report "nothing found" instead of crashing
        # with AttributeError on None.
        return ""
    return heading.text.strip()

def is_paragraph(tag: Tag) -> bool:
    """Filter for ``find_all``: accept ``<p>`` tags that have a
    ``data-component-name`` attribute."""
    if not tag.has_attr("data-component-name"):
        return False
    return tag.name == "p"

def is_author(tag: Tag) -> bool:
    """Filter for ``find_all``: accept tags whose first CSS class is
    ``byline__name``.

    Tags without a ``class`` attribute, or with an empty one, are rejected.
    """
    # Compare a one-element slice instead of indexing: the class list may be
    # empty (e.g. ``class=""``), and ``[0]`` on it would raise IndexError and
    # abort the whole search.
    return tag.get_attribute_list("class")[:1] == ["byline__name"]

def get_authors(article_soup: BeautifulSoup):
    """List the author names found on an article page.

    Returns the ``.string`` of every tag accepted by ``is_author``, in
    document order.
    """
    names = []
    for byline in article_soup.find_all(is_author):
        names.append(byline.string)
    return names

def is_date(tag: Tag):
    """Filter for ``find``/``find_all``: accept tags whose first CSS class is
    ``timestamp``.

    Tags without a ``class`` attribute, or with an empty one, are rejected.
    """
    # Compare a one-element slice instead of indexing: the class list may be
    # empty (e.g. ``class=""``), and ``[0]`` on it would raise IndexError and
    # abort the whole search.
    return tag.get_attribute_list("class")[:1] == ["timestamp"]

def get_date(article_soup: BeautifulSoup):
    """Return the timestamp shown on an article page.

    The text of the first tag accepted by ``is_date`` is whitespace-normalised
    and the ``H:MM AM/PM ... YYYY`` portion is extracted by pattern, e.g.
    ``'5:56 AM EDT, Tue October 24, 2023'``. Matching by pattern rather than
    by fixed character offsets keeps the result intact when the hour has two
    digits or the month name has a different length.

    Returns:
        The timestamp string; the whole normalised tag text if it does not
        contain the expected pattern; ``""`` if the page has no timestamp tag.
    """
    date_tag = article_soup.find(is_date)
    if date_tag is None:
        return ""
    # Collapse the markup's newlines/indentation into single spaces.
    text = " ".join(date_tag.text.split())
    # Time of day up to the first following 4-digit year.
    match = re.search(r"\d{1,2}:\d{2}\s?[AP]M\b.*?\b\d{4}\b", text)
    return match.group(0) if match else text

def get_read_time(article_soup: BeautifulSoup):
    """Return the read-time label of an article, e.g. ``'3 minute read'``.

    The label is searched for in the text of the first
    ``<div class="headline__sub-description">``. It is matched by pattern
    rather than by fixed character offsets, so read times with two or more
    digits are returned in full.

    Returns:
        The ``'<N> minute read'`` label; the whole whitespace-normalised block
        text if no such label is present; ``""`` if the block is missing.
    """
    blocks = article_soup.find_all("div", attrs={"class": ["headline__sub-description"]})
    if not blocks:
        return ""
    # Collapse the markup's newlines/indentation into single spaces.
    text = " ".join(blocks[0].text.split())
    match = re.search(r"\d+\s+minutes?\s+read", text)
    return match.group(0) if match else text

In [96]:
class Article:
    """Plain container for the fields scraped from one article page."""

    def __init__(self, headline, contents, authors, date, read_time):
        """Store the given fields unchanged on the instance."""
        self.headline, self.contents = headline, contents
        self.authors = authors
        self.date, self.read_time = date, read_time

    def __str__(self):
        """Render headline, authors and body text (date and read time are
        not part of the rendering)."""
        return f"{self.headline}  by {self.authors} \n {self.contents} \n"

In [97]:
# Fetch the CNN main page (MAINPAGE) and list the article links found on it.
# NOTE(review): the links come from the live page, so the printed list
# changes between runs.
mainpage_soup = get_soup(MAINPAGE)
articles = get_article_links(mainpage_soup)
print(articles)

['https://edition.cnn.com/2023/10/24/middleeast/israel-hamas-gaza-war-tuesday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/23/politics/tom-emmer-2020-election-kfile/index.html', 'https://edition.cnn.com/2023/10/24/politics/paul-whelan-blinken-cnn-interview/index.html', 'https://edition.cnn.com/2023/10/24/energy/iea-oil-gas-coal-demand-peak-2030/index.html', 'https://edition.cnn.com/2023/10/24/us/tyonna-major-us-gun-deaths-gdpr/index.html', 'https://edition.cnn.com/2023/10/24/health/lonely-adults-gallup-poll-wellness/index.html', 'https://edition.cnn.com/2023/10/24/business-food/sand-to-green-desert-morocco-spc-intl/index.html', 'https://edition.cnn.com/2023/10/23/world/apollo-17-moon-age-crystals-scn/index.html', 'https://edition.cnn.com/2023/10/24/middleeast/israel-hostages-freed-lifshitz-cooper-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/24/middleeast/israel-hamas-gaza-war-tuesday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/23/middleeast/gaza-hospitals

In [98]:
# Extract the paragraph text of one specific article. Pointing ARTICLE at
# another CNN article URL should work the same way.
# NOTE(review): articles[3] is simply the fourth link scraped above; which
# article that is depends on the main page at the time of the run.
ARTICLE = articles[3]
article_soup = get_soup(ARTICLE)
get_content(article_soup)

' \n      Global demand for oil, natural gas and coal — and the carbon pollution they generate — are expected to peak later this decade, according to a new report by the International Energy Agency.\n   \n      Driving the shift will be the “phenomenal rise” of clean sources of energy, the Paris-based agency said in its annual World Energy Outlook report, published Tuesday. \n   \n      The agency said it expects there to be nearly 10 times as many electric cars on the road globally by the end of the decade, and for renewables to account for almost half of the global energy mix, up from 30% today. \n   \n      Slowing growth in China will also depress demand for fossil fuels. The world’s second biggest economy — and largest energy consumer — has reached an “inflection point,” the IEA said, with its total energy demand set to peak around the middle of the decade. China is also becoming a “clean energy powerhouse,” and accounted for more than half of electric vehicle sales worldwide last

In [99]:
# List the author names found in the article parsed above.
get_authors(article_soup)

['Anna Cooban']

In [100]:
# Timestamp text extracted from the article parsed above.
date_string = get_date(article_soup)

In [101]:
# Display the extracted timestamp.
date_string

'5:56 AM EDT, Tue October 24, 2023'

In [102]:
# Read-time label taken from the article's headline sub-description block.
get_read_time(article_soup)

'3 minute read'

In [104]:
# Scrape every field from the same parsed page and bundle them into an Article.
headline = get_headline(article_soup)
contents = get_content(article_soup)
authors = get_authors(article_soup)
date = get_date(article_soup)
read_time = get_read_time(article_soup)

article = Article(headline, contents, authors, date, read_time)

In [106]:
# Rendered through Article.__str__: headline, authors and body text.
print(article)


      ‘Unstoppable’ energy transition means demand for oil, gas, and coal set to peak by 2030
      by ['Anna Cooban'] 
  
      Global demand for oil, natural gas and coal — and the carbon pollution they generate — are expected to peak later this decade, according to a new report by the International Energy Agency.
   
      Driving the shift will be the “phenomenal rise” of clean sources of energy, the Paris-based agency said in its annual World Energy Outlook report, published Tuesday. 
   
      The agency said it expects there to be nearly 10 times as many electric cars on the road globally by the end of the decade, and for renewables to account for almost half of the global energy mix, up from 30% today. 
   
      Slowing growth in China will also depress demand for fossil fuels. The world’s second biggest economy — and largest energy consumer — has reached an “inflection point,” the IEA said, with its total energy demand set to peak around the middle of the decade. China is al