##### This part implements an entire workflow which scrapes the news articles.

In [8]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [9]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List

In [10]:
MAINPAGE = "edition.cnn.com"
HTTPS_SUFFIX = "https://"
MAINPAGE_LINK = f"{HTTPS_SUFFIX}{MAINPAGE}"

In [11]:
class Image():
    def __init__(self, url : str, description : str):
        self.url = url
        self.description = description

    def __str__(self):
        return f'Photo description: {self.description}'

class Article():
    def __init__(self, headline : str, contents : str, authors : str, date : str, read_time : str, url : str, image : Image):
        self.headline = headline
        self.contents = contents
        self.authors = authors
        self.date = date
        self.read_time = read_time
        self.url = url
        self.image = image

        # add url (possibly as id)
    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.contents} \n"

In [12]:
def get_soup(url : str)-> BeautifulSoup:
    article = requests.get(url).text
    if article:
        return BeautifulSoup(article)
    else:
        print("Error fetching the soup object")

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE_LINK}{result.attrs["href"]}' for result in results]
    print(results)
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def get_authors(article_soup : BeautifulSoup):
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def get_date(article_soup : BeautifulSoup):
    date_tag = article_soup.find(is_date)
    return date_tag.text[19:52] # this slicing is done in order to just get the parts of the string which are relevant for us

def get_read_time(article_soup : BeautifulSoup):
    read_time_tag = article_soup.find("div", attrs={"class" : ["headline__sub-description"]})   #[15:28] the slicing caused an error so I removed it for testing purposes
    if read_time_tag is None:
        return ""
    else: 
        return read_time_tag.text
    
def get_image(article_soup : BeautifulSoup) -> Image:
    image_tag = article_soup.find(is_image)
    if image_tag is not None:
        return Image(url=image_tag['src'], description=image_tag['alt']) # modify this
    else:
        return Image("", "")

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def is_image(tag : Tag) -> bool:
    return tag.has_attr("src") and tag.has_attr("alt") and tag.name == "img"

def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)
    url = link
    image = get_image(article_soup)

    return Article(headline, content, author, date, read_time, url, image)

In [13]:
mainpage_soup = get_soup(MAINPAGE_LINK)
links = get_article_links(mainpage_soup)


['https://edition.cnn.com/2023/10/31/middleeast/israel-gaza-hamas-war-tuesday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/31/europe/russia-ukraine-donetsk-volnovakha-family-killed-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/30/entertainment/jennifer-aniston-matthew-perry-death/index.html', 'https://edition.cnn.com/2023/10/31/business/carlsberg-russia-business-stolen/index.html', 'https://edition.cnn.com/style/best-celebrity-halloween-costumes-2023/index.html', 'https://edition.cnn.com/travel/article/space-toilet-space-perspective-scn/index.html', 'https://edition.cnn.com/travel/lonely-planets-top-places-to-go-in-2024/index.html', 'https://edition.cnn.com/2023/10/31/sport/hwang-in-tae-nba-referee-south-korea-spt-intl/index.html', 'https://edition.cnn.com/2023/10/30/middleeast/shani-louk-dead-israel-intl/index.html', 'https://edition.cnn.com/2023/10/31/middleeast/israel-gaza-hamas-war-tuesday-intl-hnk/index.html', 'https://edition.cnn.com/2023/10/31/europe/dagestan-

In [14]:
articles = [create_article_from_link(link) for link in links.copy()]
#articles

##### This section filters the scraped articles to exclude articles which have already been previously scraped.

In [15]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
import json
from pandas import read_json

def read_string_from_file(filename : str):
    f = open(filename, "r")
    articles_string = f.read()
    return articles_string

#jsons_string = read_string_from_file("exported_from_database/articles.json")

articles_dataframe = read_json("exported_from_database/articles.json")

In [17]:
print(len(articles))
for article in articles.copy():
    print(article.url[8:])
    if article.url[8:] in articles_dataframe["urlid"]:
        print("Article already present in the database")
        articles.remove(article)
len(articles)

140
edition.cnn.com/2023/10/31/middleeast/israel-gaza-hamas-war-tuesday-intl-hnk/index.html
edition.cnn.com/2023/10/31/europe/russia-ukraine-donetsk-volnovakha-family-killed-intl-hnk/index.html
edition.cnn.com/2023/10/30/entertainment/jennifer-aniston-matthew-perry-death/index.html
edition.cnn.com/2023/10/31/business/carlsberg-russia-business-stolen/index.html
edition.cnn.com/style/best-celebrity-halloween-costumes-2023/index.html
edition.cnn.com/travel/article/space-toilet-space-perspective-scn/index.html
edition.cnn.com/travel/lonely-planets-top-places-to-go-in-2024/index.html
edition.cnn.com/2023/10/31/sport/hwang-in-tae-nba-referee-south-korea-spt-intl/index.html
edition.cnn.com/2023/10/30/middleeast/shani-louk-dead-israel-intl/index.html
edition.cnn.com/2023/10/31/middleeast/israel-gaza-hamas-war-tuesday-intl-hnk/index.html
edition.cnn.com/2023/10/31/europe/dagestan-riot-putin-hamas-balancing-act-analysis-intl-hnk/index.html
edition.cnn.com/2023/10/30/middleeast/gaza-israel-bombar

140

In [18]:
# Please work please work please work please work please work please work please work 
article_dicts = [article.__dict__ for article in articles]
article_dicts

[{'headline': '\n      Gaza population being ‘dehumanized’ UN agency warns as Netanyahu rejects ceasefire calls\n    ',
  'contents': ' \n      The entire population of Gaza is “being dehumanized,” the chief of the main UN agency operating there told the UN Security Council Monday, as pressure intensified on the besieged strip with Israeli Prime Minister Benjamin Netanyahu ruling out a ceasefire, saying “this\xa0is a time for war.”\n   \n      Philippe Lazzarini, commissioner general of the UN Relief and Works Agency (UNRWA) told the Security Council that thousands of children killed in Israeli airstrikes in Gaza in the past three weeks “cannot be collateral damage.”\n   \n      Major UN agencies are calling for a humanitarian ceasefire to allow deliveries of aid  for more than 2 million civilians trapped with scarce supplies of food, water and medical equipment, and for the safe release of 240 hostages that Israel believes are being held by Hamas, the militant group that controls the 

##### This is where I try to connect to the postgres database and execute the insert statements

In [19]:
%pip install psycopg2

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
import psycopg2

In [21]:
conn = psycopg2.connect(dbname="postgres",user="postgres", password="postgres", port="5432", host="localhost")
conn.autocommit = True
cursor = conn.cursor()

In [24]:
print(len(article_dictionary))

7


In [25]:
for article_dictionary in article_dicts:
    try:
        cursor.execute('''INSERT INTO Articles(URLId, Headline, Contents, Authors, UploadDate, ReadTime, ImageURL, ImageDescription) 
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s);''', 
                       (article_dictionary["url"][8:], article_dictionary["headline"], article_dictionary["contents"], article_dictionary["authors"], article_dictionary["date"], article_dictionary["read_time"], article_dictionary["image"].url[8:], article_dictionary["image"].description)
                       )
        
    except Exception:
        print("Duplicate detected, skipping to next article.")

Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.


Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to ne

In [None]:
'''
Notes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.
All textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert
the attributes.
The execute statement does not check whether a record is present in the table or not.
The solution I can think of now is to export the data of the database into a json format.
After that, each time the program is started, the URLIDs are extracted and compared against
the news that are scraped and the articles found in both are removed from the scraped articles.
This ensures that the articles added to the database are the new ones.
'''

'\nNotes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.\nAll textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert\nthe attributes.\nThe execute statement does not check whether a record is present in the table or not.\nThe solution I can think of now is to export the data of the database into a json format.\nAfter that, each time the program is started, the URLIDs are extracted and compared against\nthe news that are scraped and the articles found in both are removed from the scraped articles.\nThis ensures that the articles added to the database are the new ones.\n'