In [1]:
import pandas as pd
import selenium

from datetime import date
from datetime import datetime

import requests
from bs4 import BeautifulSoup

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

from crawler import scraper, parser, utils
from main_crawler import Crawler, Output, Input

from IPython.display import HTML

In [2]:
# Capture the run's date and timestamp once; the same two values are handed to both Input and Crawler.
today = date.today()
now = datetime.now()

# NOTE(review): this name shadows the built-in `input()` for the rest of the kernel session.
input = Input(today, now)

# Everything the crawler needs is read from the Input object.
chromedriver_loc = input.get_chromedriver_loc()
sources = input.get_sources()
sources_out_of_order = input.get_sources_out_of_order()
sources_elements = input.get_sources_elements()
search_name, search_terms = input.get_search_terms()
# Crawl window. `date_init` is a `date` (note the `.date()`), while `date_end` is a full `datetime`.
# NOTE(review): Python refuses to order a `date` against a `datetime` (TypeError) — confirm Crawler
# normalises the two before comparing them.
date_init = datetime.strptime('20240928', '%Y%m%d').date()
date_end = datetime.now()

output = Output(search_name)

# Arguments are positional, so their order must match Crawler's signature (defined in main_crawler).
# headless=False presumably opens a visible browser window — TODO confirm in Crawler.setup_driver.
crawler = Crawler(chromedriver_loc,
                    sources,
                    sources_out_of_order,
                    sources_elements,
                    search_terms,
                    date_init,
                    date_end,
                    today,
                    now,
                    output,
                    headless=False)

Storing results as -> data/20240928_stju_articles.csv
Storing articles in -> data/articles/


In [21]:
def buttons(journal: str) -> None:
    """Dismiss the cookie and notification pop-ups of ``journal`` when they are on the page.

    Freshly opened pages may show pop-ups that hide the content, so each one that
    is found is clicked through JavaScript.

    Relies on the notebook globals ``crawler`` (for the per-journal XPath table
    ``crawler.sources_elements``) and ``driver`` (the active Selenium WebDriver).

    Args:
        journal: Row label in ``crawler.sources_elements``. Its ``'cookies'`` and
            ``'notifs'`` columns hold the XPath of each button, or ``'-'`` when
            the journal has no such button.

    Side effects:
        Sets ``crawler.cookies_clicked`` / ``crawler.notifs_clicked`` to ``True``
        for every button that was clicked. The flags are only recorded here, not
        consulted: a button that is still on the page is clicked again.
    """
    # Read both locators up front so a missing column fails before anything is clicked.
    cookies_loc = crawler.sources_elements.loc[journal, 'cookies']
    notifs_loc = crawler.sources_elements.loc[journal, 'notifs']

    for xpath, clicked_flag in ((cookies_loc, 'cookies_clicked'),
                                (notifs_loc, 'notifs_clicked')):
        if xpath == '-':
            continue  # this journal has no such button
        # A single lookup: find_elements returns [] instead of raising when the
        # button is absent, and its first item is what find_element would return.
        matches = driver.find_elements(By.XPATH, xpath)
        if not matches:
            continue
        # Click through JS so an overlay intercepting the native click cannot block it.
        driver.execute_script("arguments[0].click();", matches[0])
        setattr(crawler, clicked_flag, True)

def opening_url_actions(journal: str) -> None:  # journal specific !!
    """Run the journal-specific steps needed right after its URL is opened in ``driver``.

    Only ``'periodic'`` has steps defined; any other journal is a no-op.
    Uses the notebook globals ``driver`` and ``buttons``.

    Args:
        journal: Name of the journal whose page was just loaded (sorting, adverts,
            etc. differ per journal).
    """
    if journal == 'periodic':
        # An advertisement interstitial may cover the site; its "access journal"
        # link has to be clicked before the real page can be reached.
        interstitial_link = (By.XPATH, '//div[@class="interstitial__link"]/a')
        try:
            access_journal = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(interstitial_link))
        except TimeoutException:
            # No interstitial appeared within 15 seconds: nothing to click.
            pass
        else:
            driver.execute_script("arguments[0].click();", access_journal)
            # NOTE(review): implicitly_wait configures the driver's element-lookup
            # timeout; it does not pause here. Confirm this is the intent.
            driver.implicitly_wait(15)

        # Pop-ups are handled whether or not the interstitial was shown.
        buttons(journal)



# Altaveu

## Getting content

In [4]:
url = "https://www.altaveu.com/actualitat/successos/turista-circulava-mes-doble-alcohol-permes-dona-fills-menors-cotxe_60635_102.html"

In [5]:
#soup = self.static_methods.get_soup(link)
# Download the article and parse it. `soup` is False when the request fails.
# Parsing lives in the `else` branch so a failed request can neither hit an
# undefined `response` nor silently re-parse a stale one left by an earlier cell.
try:
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException:
    print(f"Cannot access {url} right now. Please try again later.")
    soup = False
else:
    soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# subtitle, content = get_content(self, journal, soup)
opening = ""
if soup.find('div', class_="c-mainarticle__opening"):
    # In l'Altaveu, we have to first find the opening in case there's one, as that is part of the content of the article
    opening = soup.find('div', class_="c-mainarticle__opening").text
#print(soup.find('div', class_="c-mainarticle__body"))
paragraphs = soup.find('div', class_="c-mainarticle__body").find_all('p', recursive=False)
print(len(paragraphs))
for paragraph in paragraphs:
    print(paragraph.section)
    #print(f"\n\nPARAGRAPH:\n{paragraph}")
content = opening + '\n'.join([par.text for par in paragraphs if not par.section])
print(content)

# Bondia

## comments

In [12]:
url = "https://www.bondia.ad/societat/astrie-retreu-la-falta-d-inversio-i-manca-de-resultats-en-habitatge"

In [13]:
# Download the article and parse it. `soup` is False when the request fails.
# Parsing lives in the `else` branch so a failed request can neither hit an
# undefined `response` nor silently re-parse a stale one left by an earlier cell.
try:
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException:
    print(f"Cannot access {url} right now. Please try again later.")
    soup = False
else:
    soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Find the page column holding the comment thread, then collect one <div> per comment inside it.
# NOTE(review): both lookups match on the complete class string, so a change in the site's
# utility classes would break them; `comments_col` would then be None and `.find_all` would fail.
comments_col = soup.find('div', class_="col-span-4 pt-2")
comments = comments_col.find_all('div', class_="flex flex-col gap-2 bg-primary-200 py-4 px-10")
# Quick sanity check: number of comments found.
print(len(comments))

In [None]:
comments[0].find_all('div')[1].text

## content

In [14]:
url = "https://www.bondia.ad/passava-per-aqui/seixanta-euros-pot-ser-poc-per-a-un-europeu-al-vietnam-canvien-vides"

In [15]:
# Download the article and parse it. `soup` is False when the request fails.
# Parsing lives in the `else` branch so a failed request can neither hit an
# undefined `response` nor silently re-parse a stale one left by an earlier cell.
try:
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException:
    print(f"Cannot access {url} right now. Please try again later.")
    soup = False
else:
    soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Article text: take the first <div> inside the article-body container and gather every <p> below it.
paragraphs = soup.find('div', class_="article-body my-5 text-lg").div.find_all('p')
# The article content is the text of those paragraphs, one per line.
content = '\n'.join([par.text for par in paragraphs])

# diari

## comments

In [22]:
driver = crawler.setup_driver()

In [23]:
url = "https://www.diariandorra.ad/nacional/240928/els-contrabandistes-esquiven-llei-trossejant-les-compres_158929.html"

In [24]:
driver.get(url)

In [25]:
shadow_host = driver.find_element(By.CSS_SELECTOR, "hyvor-talk-comments")

In [26]:
# The comments live inside the shadow root of the <hyvor-talk-comments> element
# (`shadow_host`, located in the previous cell).
# Use JavaScript to extract the shadow root's inner HTML
# NOTE(review): `shadowRoot` is null for a closed shadow root, in which case this script would fail.
shadow_dom_html = driver.execute_script("""
    // Access the shadow root and return its inner HTML
    return arguments[0].shadowRoot.innerHTML;
""", shadow_host)

# Create a BeautifulSoup object with the extracted shadow DOM HTML
# From here on `soup` holds only the comment widget's markup, not the whole article page.
soup = BeautifulSoup(shadow_dom_html, "html.parser")

In [30]:
comments = soup.find_all('div', class_="comment")

In [38]:
len(comments)

30

In [81]:
comments[2].find_all('div', class_="comment-meta-left-2")[0].a['href'].split('comment-id=')[1]

'19107408'

In [78]:
comment = comments[3]

In [79]:
comment.find_all('div', class_="comment-meta-left-2")[0].a['href'].split('comment-id=')[1]

'19112152'

In [83]:
comment.find_all('time')[0]['datetime']

'9/28/2024, 11:17:11 AM'

In [89]:
# Parse the comment timestamp format seen in the <time datetime="..."> attribute:
# month/day/year followed by a 12-hour clock with AM/PM.
# The string is deliberately not called `date`: that would overwrite the imported
# `datetime.date` class and break `date.today()` when the setup cell is re-run.
comment_date_str = '9/28/2024, 11:17:11 AM'
datetime.strptime(comment_date_str, "%m/%d/%Y, %I:%M:%S %p")

datetime.datetime(2024, 9, 28, 11, 17, 11)

In [82]:
comment.find_all('span', class_="user-name")[0].text

'Venuts'

In [84]:
comment.find_all('div', class_="comment-content")[0].div.p.text

' Era així fins que al 99 ens van entregar per salvar lo seu.'

In [85]:
# Reply detection: look at the element two levels above the comment. When its first CSS class
# is "comment-replies", the comment is a reply nested under another comment.
reply = comment.parent.parent
# `.get('class', ['-'])` falls back to ['-'] when the element has no class attribute,
# so indexing [0] is always safe.
print(reply.get('class', ['-'])[0])
reply.get('class', ['-'])[0] == "comment-replies"

comment-replies


True

In [87]:
parent_comment = reply.parent
parent_comment.find_all('div', class_="comment-meta-left-2")[0].a['href'].split('comment-id=')[1]

'19107408'

In [88]:
driver.quit()

# periodic

## testing static?

In [90]:
url = "https://www.elperiodic.ad/noticia/108671/cardelus-sortira-en-22a-posicio-a-indonesia"

In [91]:
# Download the article and parse it. `soup` is False when the request fails.
# Parsing lives in the `else` branch so a failed request can neither hit an
# undefined `response` nor silently re-parse a stale one left by an earlier cell.
try:
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException:
    print(f"Cannot access {url} right now. Please try again later.")
    soup = False
else:
    soup = BeautifulSoup(response.text, 'html.parser')

# Bondia

In [None]:
soup = 