##### This notebook implements an entire workflow which scrapes the news articles.

In [112]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [113]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List

In [114]:
CNN_NEWS = "https://edition.cnn.com/world" # this url directs to the world news section 
MAINPAGE = "https://edition.cnn.com"

In [115]:
def get_soup(url : str)-> BeautifulSoup:
    article = requests.get(url).text
    return BeautifulSoup(article)

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE}{result.attrs["href"]}' for result in results] 
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def get_authors(article_soup : BeautifulSoup):
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def get_date(article_soup : BeautifulSoup):
    date_tag = article_soup.find(is_date)
    return date_tag.text[19:52] # this slicing is done in order to just get the parts of the string which are relevant for us

def get_read_time(article_soup : BeautifulSoup):
    read_time_tag = article_soup.find("div", attrs={"class" : ["headline__sub-description"]})   #[15:28] the slicing caused an error so I removed it for testing purposes
    if read_time_tag is None:
        return ""
    else: 
        return read_time_tag.text

In [116]:
class Article():
    def __init__(self, headline, contents, authors, date, read_time, url):
        self.headline = headline
        self.contents = contents
        self.authors = authors
        self.date = date
        self.read_time = read_time
        self.url = url

        # add url (possibly as id)
    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.contents} \n"
    
def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)
    url = link

    return Article(headline, content, author, date, read_time, url)

In [117]:
mainpage_soup = get_soup(MAINPAGE)
links = get_article_links(mainpage_soup)

In [118]:
articles = [create_article_from_link(link) for link in links.copy()]
articles

[<__main__.Article at 0x1b0b4e13810>,
 <__main__.Article at 0x1b0b9166450>,
 <__main__.Article at 0x1b0bbcf2310>,
 <__main__.Article at 0x1b0bc132bd0>,
 <__main__.Article at 0x1b0bec0af90>,
 <__main__.Article at 0x1b0bf749b50>,
 <__main__.Article at 0x1b0c112efd0>,
 <__main__.Article at 0x1b0cee76550>,
 <__main__.Article at 0x1b0e0651b90>,
 <__main__.Article at 0x1b0c6e93ad0>,
 <__main__.Article at 0x1b0fa46acd0>,
 <__main__.Article at 0x1b0eff8bb90>,
 <__main__.Article at 0x1b0a753d790>,
 <__main__.Article at 0x1b0f5e19490>,
 <__main__.Article at 0x1b0f5e3f910>,
 <__main__.Article at 0x1b0ada074d0>,
 <__main__.Article at 0x1b0b4cd3cd0>,
 <__main__.Article at 0x1b0ad728fd0>,
 <__main__.Article at 0x1b0f7ce1310>,
 <__main__.Article at 0x1b0aad7d850>,
 <__main__.Article at 0x1b0ec7ba010>,
 <__main__.Article at 0x1b098da0fd0>,
 <__main__.Article at 0x1b0ac52e2d0>,
 <__main__.Article at 0x1b0e9941150>,
 <__main__.Article at 0x1b0e992f910>,
 <__main__.Article at 0x1b0ac265c10>,
 <__main__.A

In [119]:
# Please work please work please work please work please work please work please work 
article_dicts = [article.__dict__ for article in articles]
article_dicts

[{'headline': '\n      EU stops short of calling for ceasefire in Gaza ahead of UN vote as Gaza crisis escalates\n    ',
  'contents': ' \n      European Union leaders have stopped short of calling for a ceasefire in Gaza, instead appealing\xa0for humanitarian “pauses” to provide aid, as the UN warned its operations were being “paralyzed” by Israel’s bombardment of the besieged enclave.\n   \n      The communique, released after meetings Thursday in Brussels, follows several failed attempts by the UN Security Council to pass a resolution on the Israel-Hamas war, with member states preparing to vote on another draft resolution – this time put forward by Jordan on behalf of Arab states – on Friday.\n   \n      Jordan’s Foreign Minister\xa0Ayman Safadi told diplomats gathered at the United Nations Assembly Hall that “collective punishment is not self-defense.” The resolution calls for a “cessation of hostilities,” the release of hostages, and the rejection of “any attempts at forced trans