##### This part implements an entire workflow which scrapes the news articles.

In [1]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [2]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List
from datetime import datetime
import pandas as pd

In [3]:
MAINPAGE = "edition.cnn.com"
HTTPS_SUFFIX = "https://"
MAINPAGE_LINK = f"{HTTPS_SUFFIX}{MAINPAGE}"

In [4]:
class Image():
    def __init__(self, url : str, description : str):
        self.url = url
        self.description = description

    def __str__(self):
        return f'Photo description: {self.description}'

class Article():
    def __init__(self, headline : str, content : str, authors : List[str], upload_timestamp : pd.Timestamp, read_time : int, url : str, image : Image):
        self.headline = headline
        self.content = content
        self.authors = authors
        self.upload_timestamp = upload_timestamp
        self.read_time = read_time
        self.url = url
        self.image = image


        self.scraping_timestamp = pd.to_datetime(datetime.now())

    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.content} \n"

In [5]:
def get_soup(url : str)-> BeautifulSoup | None:
    article = requests.get(url).text
    if article:
        return BeautifulSoup(article)
    else:
        print("Error fetching the soup object")

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE_LINK}{result.attrs["href"]}' for result in results]
    print(results)
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def get_authors(article_soup : BeautifulSoup) -> List[str]:
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def get_date(article_soup : BeautifulSoup) -> pd.Timestamp:
    date_tag = article_soup.find(is_date)
    date_string = date_tag.text

    # the following code just extracts the datetime from the given date
    splitted_date = date_string.split(",")
    unstructured_time = splitted_date[0].split("\n") # the time is in the 3rd index, look down
    time = unstructured_time[2].lstrip()
    datetime_string_format = f"{time.split(' ')[0]} {time.split(' ')[1]},{splitted_date[-2]},{splitted_date[-1].rstrip()}"
    print(datetime_string_format)
    datetime_correct = pd.to_datetime(datetime_string_format)
    
    return datetime_correct

# since the read time is not stored in the database, this line is obsolete
def get_read_time(article_soup : BeautifulSoup):
    # a read time of 0 is used to signify an article whose reading time could not be fetched.
    read_time_tag = article_soup.find("div", attrs={"class" : ["headline__sub-description"]})  #[15:28] the slicing caused an error so I removed it for testing purposes
    if read_time_tag is None:
        return ""
    read_time : int = extract_read_time_from_string(read_time_tag.text)
    return read_time

def get_image(article_soup : BeautifulSoup) -> Image:
    image_tag = article_soup.find(is_image)
    if image_tag is not None:
        return Image(url=image_tag['src'], description=image_tag['alt']) # modify this
    else:
        return Image("", "")

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def is_image(tag : Tag) -> bool:
    return tag.has_attr("src") and tag.has_attr("alt") and tag.name == "img"

def extract_read_time_from_string(read_time_string : str) -> int:
    # returns 0 if the read_time integer cannot be successfully extracted
    read_time = 0
    for character in read_time_string:
        try:
            read_time = int(character)
        except:
            pass
    return read_time

def extract_upload_time(upload_time_string : str)-> datetime:
    print(upload_time_string)

def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)
    url = link
    image = get_image(article_soup)

    return Article(headline, content, author, date, read_time, url, image)

In [6]:
mainpage_soup = get_soup(MAINPAGE_LINK)
links = get_article_links(mainpage_soup)

['https://edition.cnn.com/2023/11/08/world/eu-recommends-ukraine-accession-talks-intl/index.html', 'https://edition.cnn.com/2023/11/08/politics/takeaways-republican-debate/index.html', 'https://edition.cnn.com/2023/11/08/politics/ivanka-trump-fraud-trial-takeaways/index.html', 'https://edition.cnn.com/2023/11/09/travel/worlds-newest-island-forms-in-japanese-archipelago-scn/index.html', 'https://edition.cnn.com/2023/11/08/travel/south-korea-bedbug-infestation-intl-hnk/index.html', 'https://edition.cnn.com/2023/11/09/india/india-delhi-toxic-foam-pollution-yamuna-intl-hnk/index.html', 'https://edition.cnn.com/2023/11/09/middleeast/un-rights-chief-israel-hamas-war-crimes-hnk-intl/index.html', 'https://edition.cnn.com/2023/11/08/world/palestinians-fleeing-south-gaza-city-unbearable-situation/index.html', 'https://edition.cnn.com/2023/11/08/politics/biden-administration-israel-hamas-war/index.html', 'https://edition.cnn.com/2023/11/08/opinions/opinion-why-biden-supports-israel-so-wholehearte

In [7]:
articles = []
for link in links:
    try:
        article = create_article_from_link(link)
        articles.append(article)
    except Exception as e:
        print(f"Following error: {str(e)}")

10:31 AM, Wed November 8, 2023
5:23 AM, Thu November 9, 2023
7:59 AM, Thu November 9, 2023
9:52 PM, Wed November 8, 2023
2:01 AM, Thu November 9, 2023
3:34 AM, Thu November 9, 2023
7:38 AM, Thu November 9, 2023
3:46 AM, Thu November 9, 2023
6:18 PM, Wed November 8, 2023
1:47 PM, Wed November 8, 2023
4:00 AM, Thu November 9, 2023
6:34 AM, Thu November 9, 2023
6:34 AM, Thu November 9, 2023
9:14 AM, Thu November 9, 2023
9:14 AM, Thu November 9, 2023
5:23 AM, Thu November 9, 2023
5:23 AM, Thu November 9, 2023
1:00 AM, Thu November 9, 2023
7:59 AM, Thu November 9, 2023
7:13 AM, Thu November 9, 2023
3:34 AM, Thu November 9, 2023
6:56 PM, Wed November 8, 2023
6:52 AM, Thu November 9, 2023
9:15 PM, Wed November 8, 2023
9:52 PM, Wed November 8, 2023
9:52 PM, Wed November 8, 2023
7:46 AM, Thu November 9, 2023
2:01 AM, Thu November 9, 2023
6:26 AM, Thu November 9, 2023
6:07 AM, Thu November 9, 2023
9:27 AM, Thu November 9, 2023
9:27 AM, Thu November 9, 2023
12:07 PM, Wed November 8, 2023
12:07 PM

##### This is where I try to connect to the postgres database and execute the insert statements

In [9]:
%pip install psycopg2

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
import psycopg2, pandas as pd

In [11]:
conn = psycopg2.connect(dbname="postgres",user="postgres", password="postgres", port="5432", host="localhost")
conn.autocommit = True
cursor = conn.cursor()

In [17]:
def filter_articles(articles : List[Article]) -> List[Article]:
    new_articles = []
    articles_already_present = pd.read_sql(sql="SELECT * FROM Articles", con=conn)
    urls_of_old_articles = list(articles_already_present["urlid"])
    print(urls_of_old_articles)
    count_of_old_articles = 0
    for article in articles:
        if article.url[8:] not in urls_of_old_articles:
            new_articles.append(article)
        else:
            count_of_old_articles = count_of_old_articles + 1
    print(f"Count of old articles: {count_of_old_articles}")

    



    return new_articles

new_articles = filter_articles(articles)

['edition.cnn.com/2023/11/08/world/eu-recommends-ukraine-accession-talks-intl/index.html', 'edition.cnn.com/2023/11/08/politics/takeaways-republican-debate/index.html', 'edition.cnn.com/2023/11/08/politics/ivanka-trump-fraud-trial-takeaways/index.html', 'edition.cnn.com/2023/11/09/travel/worlds-newest-island-forms-in-japanese-archipelago-scn/index.html', 'edition.cnn.com/2023/11/08/travel/south-korea-bedbug-infestation-intl-hnk/index.html', 'edition.cnn.com/2023/11/09/india/india-delhi-toxic-foam-pollution-yamuna-intl-hnk/index.html', 'edition.cnn.com/2023/11/09/middleeast/un-rights-chief-israel-hamas-war-crimes-hnk-intl/index.html', 'edition.cnn.com/2023/11/08/world/palestinians-fleeing-south-gaza-city-unbearable-situation/index.html', 'edition.cnn.com/2023/11/08/politics/biden-administration-israel-hamas-war/index.html', 'edition.cnn.com/2023/11/08/opinions/opinion-why-biden-supports-israel-so-wholeheartedly/index.html', 'edition.cnn.com/2023/11/09/opinions/palestinian-lives-humanita

  articles_already_present = pd.read_sql(sql="SELECT * FROM Articles", con=conn)


In [13]:
article_dicts = [article.__dict__ for article in new_articles] # this dictionary only contains the articles that are not in the database already

In [27]:
articles_dataframe = pd.DataFrame(article_dicts)
#articles_dataframe.drop(articles_dataframe.columns[0], axis=1)
articles_dataframe.set_index("url")
articles
# filter existing articles using url because there are multiple articles

Unnamed: 0_level_0,headline,content,authors,upload_timestamp,read_time,image,scraping_timestamp
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
https://edition.cnn.com/2023/11/08/us/alabama-first-execution-nitrogen-gas-2024/index.html,\n Alabama sets a date to execute a death...,\n Alabama has scheduled the nation’s fi...,"[Devon M. Sayers, Emma Tucker]",2023-11-09 09:14:00,3,Photo description: Devon M. Sayers,2023-11-09 14:34:19.184457
https://edition.cnn.com/2023/11/08/us/alabama-first-execution-nitrogen-gas-2024/index.html,\n Alabama sets a date to execute a death...,\n Alabama has scheduled the nation’s fi...,"[Devon M. Sayers, Emma Tucker]",2023-11-09 09:14:00,3,Photo description: Devon M. Sayers,2023-11-09 14:34:19.552694
https://edition.cnn.com/2023/11/09/tech/apple-cofounder-steve-wozniak-hospitalized-intl-hnk/index.html,\n Apple co-founder Steve Wozniak hospita...,\n Apple co-founder Steve Wozniak was ho...,[],2023-11-09 01:00:00,1,Photo description: Co-founder of Apple Steve W...,2023-11-09 14:34:21.169499
https://edition.cnn.com/2023/11/09/europe/russia-missile-civilian-ship-ukraine-odesa-intl/index.html,\n Russia launches deadly missile strike ...,\n A Russian missile struck a cargo ship...,"[Christian Edwards, Kosta Gak, Mariya Knight]",2023-11-09 07:13:00,3,Photo description: Aftermath of Russian missil...,2023-11-09 14:34:22.488991
https://edition.cnn.com/2023/11/08/politics/high-end-brothel-network-arrests/index.html,\n DOJ announces arrests in ‘high-end bro...,\n Three individuals have been arrested ...,[Holmes Lybrand],2023-11-08 18:56:00,3,Photo description: Holmes Lybrand,2023-11-09 14:34:23.807369
...,...,...,...,...,...,...,...
https://edition.cnn.com/travel/heartsong-lodge-dollywood-dolly-parton/index.html,\n Inside Dolly Parton’s new resort lodge...,\n When a petite blonde known for high h...,[Marnie Hunter],2023-11-06 08:03:00,9,Photo description: <strong>Dollywood's HeartSo...,2023-11-09 14:35:25.612046
https://edition.cnn.com/2023/11/05/us/robert-card-lewiston-shooting-maine-yellow-flag-law/index.html,\n The Maine gunman was a ‘textbook case’...,"\n The signs were there: hearing voices,...",[Emma Tucker],2023-11-05 23:36:00,3,Photo description: Police close Lincoln Street...,2023-11-09 14:35:26.222089
https://edition.cnn.com/2023/11/03/asia/canada-china-helicopter-interception-south-china-sea-hnk-intl/index.html,\n Exclusive: Chinese jet fired flares cl...,\n A Chinese warplane fired flares in fr...,[Brad Lendon],2023-11-06 01:00:00,6,Photo description: Brad Lendon,2023-11-09 14:35:27.162391
https://edition.cnn.com/2023/11/02/world/hawaii-akikiki-honeycreeper-extinction-c2e-spc-scn-intl/index.html,\n Mosquitoes are driving these birds to ...,\n Editor’s Note: Call to Earth is a CNN edi...,[Nell Lewis],2023-11-06 04:25:00,4,Photo description: Call to Earth,2023-11-09 14:35:27.870374


In [15]:
for article_dictionary in article_dicts:
    print(article_dictionary)
    cursor.execute('''INSERT INTO Articles(urlId, headline, content, authors, uploadTimestamp, imageURL, imageDescription, scrapingTimeStamp) 
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s);''',
                   (article_dictionary["url"][8:],
                    article_dictionary["headline"],
                    article_dictionary["content"],
                    article_dictionary["authors"],
                    article_dictionary["upload_timestamp"].to_pydatetime(),
                    article_dictionary["image"].url[8:],
                    article_dictionary["image"].description,
                    article_dictionary["scraping_timestamp"].to_pydatetime()
                    )
                   )

{'headline': '\n      Alabama sets a date to execute a death row inmate by nitrogen gas, governor says, a method never used in the US\n    ', 'content': ' \n      Alabama has scheduled the nation’s first execution by nitrogen hypoxia, an alternative to lethal injection, its Republican governor said.\n   \n      Kenneth Eugene Smith’s execution by lethal injection was abruptly canceled in November after the state couldn’t properly set the IV line before the warrant for execution expired. He asked the state to be put to death by nitrogen gas rather than lethal injection after what he called a botched execution.\n   \n      Smith’s execution now is set to take place between January 25 and 26, according to a news release from Gov. Kay Ivey.\n   \n      Death by nitrogen hypoxia deprives the brain and body of oxygen, so the inmate\xa0would\xa0die by suffocation,\xa0according to the Death Penalty Information Center, a non-profit that monitors, analyzes and disseminates information about capi

UniqueViolation: duplicate key value violates unique constraint "articles_pkey"
DETAIL:  Key (urlid)=(edition.cnn.com/2023/11/08/us/alabama-first-execution-nitrogen-gas-2024/index.html) already exists.


In [None]:
'''
Notes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.
All textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert
the attributes.
The execute statement does not check whether a record is present in the table or not.
The solution I can think of now is to export the data of the database into a json format.
After that, each time the program is started, the URLIDs are extracted and compared against
the news that are scraped and the articles found in both are removed from the scraped articles.
This ensures that the articles added to the database are the new ones.
'''

'\nNotes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.\nAll textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert\nthe attributes.\nThe execute statement does not check whether a record is present in the table or not.\nThe solution I can think of now is to export the data of the database into a json format.\nAfter that, each time the program is started, the URLIDs are extracted and compared against\nthe news that are scraped and the articles found in both are removed from the scraped articles.\nThis ensures that the articles added to the database are the new ones.\n'