##### This notebook implements an entire workflow which scrapes the news articles.

In [60]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [61]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List

In [62]:
CNN_NEWS = "https://edition.cnn.com/world" # this url directs to the world news section 
MAINPAGE = "https://edition.cnn.com"

In [63]:
def get_soup(url : str)-> BeautifulSoup:
    article = requests.get(url).text
    return BeautifulSoup(article)

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE}{result.attrs["href"]}' for result in results] 
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def get_authors(article_soup : BeautifulSoup):
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def get_date(article_soup : BeautifulSoup):
    date_tag = article_soup.find(is_date)
    return date_tag.text[19:52] # this slicing is done in order to just get the parts of the string which are relevant for us

def get_read_time(article_soup : BeautifulSoup):
    read_time_string = article_soup.find_all("div", attrs={"class" : ["headline__sub-description"]})[0].text[15:28]
    return read_time_string # the slice just returns the string with the read time, it shouldn't have two digits though

In [64]:
class Article():
    def __init__(self, headline, contents, authors, date, read_time):
        self.headline = headline
        self.contents = contents
        self.authors = authors
        self.date = date
        self.read_time = read_time
    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.contents} \n"
    
def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)

    return Article(headline, content, author, date, read_time)

In [65]:
mainpage_soup = get_soup(MAINPAGE)
links = get_article_links(mainpage_soup)

In [73]:
#articles : List[Article] = [create_article_from_link(link) for link in links]

article = create_article_from_link(links[2])

article.__dict__


{'headline': '\n      Former Chinese Premier Li Keqiang dead at 68: state media\n    ',
 'authors': ['Simone McCarthy', 'Laura He', 'Steven Jiang'],
 'date': '4:46 AM EDT, Fri October 27, 2023',
 'read_time': '6 minute read'}