##### This part implements an entire workflow which scrapes the news articles.

In [2]:
# ToDos for 27.10.2002
"""
Get all articles from the mainpage

From the articles, get everything you can get your hands on.

Turn the article to dictionary form
"""

'\nGet all articles from the mainpage\n\nFrom the articles, get everything you can get your hands on.\n\nTurn the article to dictionary form\n'

In [3]:
from bs4 import BeautifulSoup, Tag, NavigableString
import requests
from typing import List
from datetime import datetime

In [4]:
MAINPAGE = "edition.cnn.com"
HTTPS_SUFFIX = "https://"
MAINPAGE_LINK = f"{HTTPS_SUFFIX}{MAINPAGE}"

In [5]:
class Image():
    def __init__(self, url : str, description : str):
        self.url = url
        self.description = description

    def __str__(self):
        return f'Photo description: {self.description}'

class Article():
    def __init__(self, headline : str, contents : str, authors : str, date : str, read_time : str, url : str, image : Image):
        self.headline = headline
        self.contents = contents
        self.authors = authors
        self.date = date
        self.read_time = read_time
        self.url = url
        self.image = image


        self.timestamp = datetime.now()

    
    def __str__(self):
        string : str = ""
        return f"{self.headline}  by {self.authors}  {self.read_time}\n {self.contents} \n"

In [6]:
def get_soup(url : str)-> BeautifulSoup:
    article = requests.get(url).text
    if article:
        return BeautifulSoup(article)
    else:
        print("Error fetching the soup object")

def get_content(article_soup : BeautifulSoup):
    paragraphs = [paragraph.text for paragraph in article_soup.find_all(is_paragraph)]
    string = ""
    for paragraph in paragraphs:
        string = f"{string} {paragraph}"
    return string

def get_article_links(mainpage_soup : BeautifulSoup) -> list:
    results = mainpage_soup.find_all(name="a", attrs={"data-link-type" : "article"})
    results = [f'{MAINPAGE_LINK}{result.attrs["href"]}' for result in results]
    print(results)
    return results

def get_headline(article_soup : BeautifulSoup):
    return article_soup.find("h1").text

def get_authors(article_soup : BeautifulSoup):
    author_tags = article_soup.find_all(is_author)
    names = [tag.string for tag in author_tags]
    return names

def get_date(article_soup : BeautifulSoup):
    date_tag = article_soup.find(is_date)
    date_string = date_tag.text
    date = extract_date_from_string(date_string)
    return date

def get_read_time(article_soup : BeautifulSoup):
    # a read time of 0 is used to signify an article whose reading time could not be fetched.
    read_time_tag = article_soup.find("div", attrs={"class" : ["headline__sub-description"]})  #[15:28] the slicing caused an error so I removed it for testing purposes
    if read_time_tag is None:
        return ""
    read_time : int = extract_read_time_from_string(read_time_tag.text)
    return read_time

def get_image(article_soup : BeautifulSoup) -> Image:
    image_tag = article_soup.find(is_image)
    if image_tag is not None:
        return Image(url=image_tag['src'], description=image_tag['alt']) # modify this
    else:
        return Image("", "")

def is_paragraph(tag : Tag) -> bool:
    return tag.has_attr("data-component-name") and tag.name == "p"

def is_author(tag : Tag) -> bool:
    return tag.get_attribute_list("class")[0] == "byline__name"

def is_date(tag : Tag):
    return tag.get_attribute_list("class")[0] == "timestamp"

def is_image(tag : Tag) -> bool:
    return tag.has_attr("src") and tag.has_attr("alt") and tag.name == "img"

def extract_read_time_from_string(read_time_string : str) -> int:
    # returns 0 if the read_time integer cannot be successfully extracted
    read_time = 0
    for character in read_time_string:
        try:
            read_time =  int(character)
        except:
            pass
    return read_time

def extract_date_from_string(date_string : str):
    # returns 0 if no date could be extracted
    # python function to remove white spaces in front of word
    # FIX
    date = 0
    for i in range(0, len(date_string)+1):
        for j in range(i, len(date_string)+1):
            try:
                date = datetime.strptime(date_string[i:j], "%B %d, %Y")
                break
            except:
                pass
    return date

def create_article_from_link(link : str) ->Article:
    article_soup = get_soup(link)
    headline = get_headline(article_soup)
    content = get_content(article_soup)
    author = get_authors(article_soup)
    date = get_date(article_soup)
    read_time = get_read_time(article_soup)
    url = link
    image = get_image(article_soup)

    return Article(headline, content, author, date, read_time, url, image)

In [7]:
mainpage_soup = get_soup(MAINPAGE_LINK)
links = get_article_links(mainpage_soup)

['https://edition.cnn.com/2023/11/05/europe/zelensky-ukraine-stalemate-criticism-intl/index.html', 'https://edition.cnn.com/2023/11/05/middleeast/israel-strikes-gaza-women-children-health-crisis-mime-intl/index.html', 'https://edition.cnn.com/2023/11/05/africa/gabon-demands-rare-african-mask-back-after-sale-intl/index.html', 'https://edition.cnn.com/2023/11/04/australia/australia-domestic-violence-5-deaths-10-days-intl-hnk/index.html', 'https://edition.cnn.com/travel/article/guy-fawkes-bonfire-night/index.html', 'https://edition.cnn.com/2023/11/05/sport/coco-gauff-jessica-pegula-wta-finals-spt-intl/index.html', 'https://edition.cnn.com/2023/11/05/sport/jayson-tatum-franchise-history-boston-celtics-spt-intl/index.html', 'https://edition.cnn.com/2023/11/05/politics/blinken-makes-unannounced-visit-to-iraq/index.html', 'https://edition.cnn.com/2023/11/05/middleeast/israel-strikes-gaza-women-children-health-crisis-mime-intl/index.html', 'https://edition.cnn.com/2023/11/05/middleeast/blast-a

In [8]:
articles = [create_article_from_link(link) for link in links.copy()]
#delay scraping intensity to not get banned

In [9]:
# Please work please work please work please work please work please work please work 
article_dicts = [article.__dict__ for article in articles]

##### This is where I try to connect to the postgres database and execute the insert statements

In [11]:
%pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.9.9-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Downloading psycopg2-2.9.9-cp311-cp311-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.2 MB 1.7 MB/s eta 0:00:01
   ------- -------------------------------- 0.2/1.2 MB 3.5 MB/s eta 0:00:01
   ------------------- -------------------- 0.6/1.2 MB 4.4 MB/s eta 0:00:01
   ---------------------------------------  1.2/1.2 MB 7.4 MB/s eta 0:00:01
   ---------------------------------------  1.2/1.2 MB 7.4 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 4.9 MB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.9
Note: you may need to restart the kernel to use updated packages.


In [18]:
import psycopg2

In [19]:
conn = psycopg2.connect(dbname="postgres",user="postgres", password="postgres", port="5432", host="localhost")
conn.autocommit = True
cursor = conn.cursor()

OperationalError: connection to server at "localhost" (::1), port 5432 failed: FATAL:  password authentication failed for user "postgres"


In [None]:
for article_dictionary in article_dicts:
    try:
        cursor.execute('''INSERT INTO Articles(URLId, Headline, Contents, Authors, UploadDate, ReadTime, ImageURL, ImageDescription, ScrapingTimeStamp) 
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);''', 
                       (article_dictionary["url"][8:], 
                       article_dictionary["headline"], 
                       article_dictionary["contents"], 
                       article_dictionary["authors"], 
                       article_dictionary["date"], 
                       article_dictionary["read_time"], 
                       article_dictionary["image"].url[8:], 
                       article_dictionary["image"].description, 
                       article_dictionary["timestamp"])
                       )
        
    except Exception as ex:
        print("Duplicate detected, skipping to next article." + str(ex))

Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to next article.
Duplicate detected, skipping to ne

In [None]:
'''
Notes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.
All textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert
the attributes.
The execute statement does not check whether a record is present in the table or not.
The solution I can think of now is to export the data of the database into a json format.
After that, each time the program is started, the URLIDs are extracted and compared against
the news that are scraped and the articles found in both are removed from the scraped articles.
This ensures that the articles added to the database are the new ones.
'''

'\nNotes: The urls are saved without the https:// prefix, seeing as I got an error while doing so.\nAll textual datatypes have been saved as text, so that has to be done better in the future. Best solution is to convert\nthe attributes.\nThe execute statement does not check whether a record is present in the table or not.\nThe solution I can think of now is to export the data of the database into a json format.\nAfter that, each time the program is started, the URLIDs are extracted and compared against\nthe news that are scraped and the articles found in both are removed from the scraped articles.\nThis ensures that the articles added to the database are the new ones.\n'