##### This part implements an entire workflow which scrapes the news articles.

In [7]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [8]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List
from datetime import datetime
import pandas as pd

In [9]:
MAINPAGE = "edition.cnn.com"
HTTPS_SUFFIX = "https://"
MAINPAGE_LINK = f"{HTTPS_SUFFIX}{MAINPAGE}"

In [10]:
class Image():
    def __init__(self, url : str, description : str):
        self.url = url
        self.description = description

    def __str__(self):
        return f'Photo description: {self.description}'

class Article():
    def __init__(self, headline : str, content : str, authors : List[str], upload_timestamp : pd.Timestamp, read_time : int, url : str, image : Image):
        self.headline = headline
        self.content = content
        self.authors = authors
        self.upload_timestamp = upload_timestamp
        self.read_time = read_time
        self.url = url
        self.imageUrl = image.url
        self.description = image.description


        self.scraping_timestamp = pd.to_datetime(datetime.now())

    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.content} \n"

In [11]:
def get_soup(url : str)-> BeautifulSoup | None:
    article = requests.get(url).text
    if article:
        return BeautifulSoup(article)
    else:
        print("Error fetching the soup object")

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE_LINK}{result.attrs["href"]}' for result in results]
    print(results)
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def get_authors(article_soup : BeautifulSoup) -> List[str]:
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def get_date(article_soup : BeautifulSoup) -> pd.Timestamp:
    date_tag = article_soup.find(is_date)
    date_string = date_tag.text

    # the following code just extracts the datetime from the given date
    splitted_date = date_string.split(",")
    unstructured_time = splitted_date[0].split("\n") # the time is in the 3rd index, look down
    time = unstructured_time[2].lstrip()
    datetime_string_format = f"{time.split(' ')[0]} {time.split(' ')[1]},{splitted_date[-2]},{splitted_date[-1].rstrip()}"
    print(datetime_string_format)
    datetime_correct = pd.to_datetime(datetime_string_format)
    
    return datetime_correct

# since the read time is not stored in the database, this line is obsolete
def get_read_time(article_soup : BeautifulSoup):
    # a read time of 0 is used to signify an article whose reading time could not be fetched.
    read_time_tag = article_soup.find("div", attrs={"class" : ["headline__sub-description"]})  #[15:28] the slicing caused an error so I removed it for testing purposes
    if read_time_tag is None:
        return ""
    read_time : int = extract_read_time_from_string(read_time_tag.text)
    return read_time

def get_image(article_soup : BeautifulSoup) -> Image:
    image_tag = article_soup.find(is_image)
    if image_tag is not None:
        return Image(url=image_tag['src'], description=image_tag['alt']) # modify this
    else:
        return Image("", "")

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def is_image(tag : Tag) -> bool:
    return tag.has_attr("src") and tag.has_attr("alt") and tag.name == "img"

def extract_read_time_from_string(read_time_string : str) -> int:
    # returns 0 if the read_time integer cannot be successfully extracted
    read_time = 0
    for character in read_time_string:
        try:
            read_time = int(character)
        except:
            pass
    return read_time

def extract_upload_time(upload_time_string : str)-> datetime:
    print(upload_time_string)

def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)
    url = link
    image = get_image(article_soup)

    return Article(headline, content, author, date, read_time, url, image)

In [12]:
mainpage_soup = get_soup(MAINPAGE_LINK)
links = get_article_links(mainpage_soup)

['https://edition.cnn.com/2023/11/08/world/eu-recommends-ukraine-accession-talks-intl/index.html', 'https://edition.cnn.com/2023/11/08/politics/takeaways-republican-debate/index.html', 'https://edition.cnn.com/2023/11/08/politics/ivanka-trump-fraud-trial-takeaways/index.html', 'https://edition.cnn.com/2023/11/09/travel/worlds-newest-island-forms-in-japanese-archipelago-scn/index.html', 'https://edition.cnn.com/2023/11/08/travel/south-korea-bedbug-infestation-intl-hnk/index.html', 'https://edition.cnn.com/2023/11/09/india/india-delhi-toxic-foam-pollution-yamuna-intl-hnk/index.html', 'https://edition.cnn.com/2023/11/09/politics/israel-pauses-gaza-white-house/index.html', 'https://edition.cnn.com/2023/11/09/middleeast/un-rights-chief-israel-hamas-war-crimes-hnk-intl/index.html', 'https://edition.cnn.com/2023/11/09/opinions/palestinian-lives-humanitarian-catastrophe-gaza-almadhoun/index.html', 'https://edition.cnn.com/2023/11/09/sport/luis-diaz-father-released-eln-spt-intl/index.html', 'ht

In [13]:
articles = []
for link in links:
    try:
        article = create_article_from_link(link)
        articles.append(article)
    except Exception as e:
        print(f"Following error: {str(e)}")

10:31 AM, Wed November 8, 2023
5:23 AM, Thu November 9, 2023
7:59 AM, Thu November 9, 2023
9:52 PM, Wed November 8, 2023
2:01 AM, Thu November 9, 2023
3:34 AM, Thu November 9, 2023
11:26 AM, Thu November 9, 2023
7:38 AM, Thu November 9, 2023
4:00 AM, Thu November 9, 2023
11:30 AM, Thu November 9, 2023
11:30 AM, Thu November 9, 2023
9:14 AM, Thu November 9, 2023
9:14 AM, Thu November 9, 2023
5:23 AM, Thu November 9, 2023
5:23 AM, Thu November 9, 2023
11:39 AM, Thu November 9, 2023
6:34 AM, Thu November 9, 2023
1:00 AM, Thu November 9, 2023
7:59 AM, Thu November 9, 2023
7:13 AM, Thu November 9, 2023
3:34 AM, Thu November 9, 2023
6:56 PM, Wed November 8, 2023
10:13 AM, Thu November 9, 2023
10:25 AM, Thu November 9, 2023
10:25 AM, Thu November 9, 2023
9:52 PM, Wed November 8, 2023
11:32 AM, Thu November 9, 2023
7:46 AM, Thu November 9, 2023
2:01 AM, Thu November 9, 2023
9:27 AM, Thu November 9, 2023
9:27 AM, Thu November 9, 2023
12:07 PM, Wed November 8, 2023
12:07 PM, Wed November 8, 2023

##### This is where I try to connect to the postgres database and execute the insert statements

In [14]:
%pip install psycopg2
%pip install sqlalchemy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import psycopg2, pandas as pd
from sqlalchemy import create_engine
from getpass import getpass

PASSWORD = getpass()


engine = create_engine(f'postgresql://postgres:{PASSWORD}@localhost:5432/postgres')

In [16]:
conn = psycopg2.connect(dbname="postgres",user="postgres", password="postgres", port="5432", host="localhost")
conn.autocommit = True
cursor = conn.cursor()

In [18]:
def filter_articles(articles : List[Article]) -> List[Article]:
    new_articles = []
    articles_already_present = pd.read_sql_table("Articles", con=engine)
    urls_of_old_articles = list(articles_already_present["url"])
    print(urls_of_old_articles)
    count_of_old_articles = 0
    for article in articles:
        if article.url[8:] not in urls_of_old_articles:
            new_articles.append(article)
        else:
            count_of_old_articles = count_of_old_articles + 1
    print(f"Count of old articles: {count_of_old_articles}")

    return new_articles

new_articles = filter_articles(articles) # filtered against existing articles in database

[]
Count of old articles: 0


In [19]:
article_dicts = [article.__dict__ for article in new_articles] # this dictionary only contains the articles that are not in the database already

In [20]:
articles_dataframe = pd.DataFrame(article_dicts)
articles_dataframe.set_index("url")
new_articles_dataframe = articles_dataframe.drop_duplicates(subset="url", keep="first")
#filtered the duplicates out

In [21]:
new_articles_dataframe = new_articles_dataframe.drop(columns=["read_time"])

In [23]:
for named_tuple in new_articles_dataframe.itertuples():
    print(named_tuple)

    cursor.execute('''INSERT INTO Articles (urlId, headline, content, authors, uploadTimestamp, imageURL, imageDescription, scrapingTimeStamp)
                           VALUES (%s, %s, %s, %s, %s, %s, %s, %s);''',
                       (named_tuple.url,
                        named_tuple.headline,
                        named_tuple.content,
                        named_tuple.authors,
                        named_tuple.upload_timestamp,
                        named_tuple.imageUrl,
                        named_tuple.description,
                        named_tuple.scraping_timestamp
                        )
                    )

Pandas(Index=0, headline='\n      Ukraine is ready to start process of joining European Union, Commission says\n    ', content=' \n      \ufeffUkraine’s ambitions of joining the European Union received an important boost on Wednesday when the bloc’s executive body said detailed negotiations should begin next year.\n   \n      The European Commission said in a report that so-called accession talks should finally start,\xa0nearly 18 months since the bloc accepted Ukraine as a candidate state. The same report\xa0recommended that the process should also begin with\xa0Moldova,\xa0which borders Ukraine.\n   \n      On Wednesday, European Commission President Ursula von der Leyen said: “Today is a historic day, because today the Commission recommends that the Council opens accession negotiations with Ukraine and with Moldova.”\n   \n      Von der Leyen was speaking on the same day that the Commission published a report suggesting to EU member states that accession talks should finally start, 

UndefinedTable: relation "articles" does not exist
LINE 1: INSERT INTO Articles 
                    ^


In [None]:
'''for article_dictionary in article_dicts:
    print(article_dictionary)
    cursor.execute('''"""INSERT INTO Articles(urlId, headline, content, authors, uploadTimestamp, imageURL, imageDescription, scrapingTimeStamp) 
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""''',
                   (article_dictionary["url"][8:],
                    article_dictionary["headline"],
                    article_dictionary["content"],
                    article_dictionary["authors"],
                    article_dictionary["upload_timestamp"].to_pydatetime(),
                    article_dictionary["image"].url[8:],
                    article_dictionary["image"].description,
                    article_dictionary["scraping_timestamp"].to_pydatetime()
                    )
                   )
'''

{'headline': '\n      Apple co-founder Steve Wozniak hospitalized in Mexico City, source says\n    ', 'content': ' \n      Apple co-founder Steve Wozniak was hospitalized\xa0in Mexico City\xa0on Wednesday, a source from the organizers of the World Business Forum (WBF), an event he attended in the country’s capital, told CNN En Español.\n   \n      The source said Wozniak, 73, was taken to the hospital at 3 p.m. local time after fainting minutes before his participation at the event.\n   \n      CNN is working to get more details on his current health status.\n   \n      Wozniak\xa0is\xa0the tech genius behind the early Apple computers who worked alongside Steve Jobs to launch and grow the company.\n   \n      Wozniak, or “Woz” as he’s known in Silicon Valley circles, famously designed the Apple I and Apple II computers, which revolutionized personal computing and established Apple as a market leader.\n   \n      Wozniak was the technical engineer behind these systems, while Jobs served

UniqueViolation: duplicate key value violates unique constraint "articles_pkey"
DETAIL:  Key (urlid)=(edition.cnn.com/style/carolyn-bessette-kennedy-fashion-legacy/index.html) already exists.


In [None]:
'''
Notes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.
All textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert
the attributes.
The execute statement does not check whether a record is present in the table or not.
The solution I can think of now is to export the data of the database into a json format.
After that, each time the program is started, the URLIDs are extracted and compared against
the news that are scraped and the articles found in both are removed from the scraped articles.
This ensures that the articles added to the database are the new ones.
'''

'\nNotes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.\nAll textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert\nthe attributes.\nThe execute statement does not check whether a record is present in the table or not.\nThe solution I can think of now is to export the data of the database into a json format.\nAfter that, each time the program is started, the URLIDs are extracted and compared against\nthe news that are scraped and the articles found in both are removed from the scraped articles.\nThis ensures that the articles added to the database are the new ones.\n'