# Extracting News from Financial Websites

To collect the news, Google searches were run using queries in the following format:

*   *PETR4 site:infomoney.com.br after:2020-01-01 before:2020-12-31* (Year 2020)

This restricted the search results to news from the specified website published within the given date range, which made it much easier to select the relevant articles.

Some websites also offer a "related news" section, which was very useful for finding additional relevant articles.

In [None]:
# Importing the libraries needed to extract the news
# they have already been installed in the virtual environment
import requests
import time
import re
import csv
import pandas as pd
import dateparser
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth

## InfoMoney

In [None]:
# URLs of the InfoMoney articles to extract (empty by default)
urls_infomoney = []


def extrair_dados_infomoney(url):
    """Fetch an InfoMoney article and extract its title, date and body text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "titulo", "data", "conteudo" and "site",
        or None when the page could not be downloaded. Fields that are not
        found in the HTML are filled with a "... not found" placeholder.
    """
    try:
        resposta = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resposta.raise_for_status()
    except Exception as e:
        print(f"[ERROR] Failed to access {url}: {e}")
        return None

    pagina = BeautifulSoup(resposta.text, "html.parser")

    # Locate the three nodes of interest: first <h1>, first <time>, article body
    no_titulo = pagina.find("h1")
    no_data = pagina.find("time")
    no_artigo = pagina.find("article", attrs={"data-ds-component": "article"})

    # The body text is every <p> inside the article, one paragraph per line
    if no_artigo:
        texto = "\n".join(
            paragrafo.get_text(" ", strip=True)
            for paragrafo in no_artigo.find_all("p")
        )
    else:
        texto = "Content not found"

    return {
        "url": url,
        "titulo": no_titulo.get_text(strip=True) if no_titulo else "Title not found",
        "data": no_data.get_text(strip=True) if no_data else "Date not found",
        "conteudo": texto,
        "site": "Infomoney",
    }

In [None]:
# Creating the empty list that will store the news collected from every website
noticias = []

# Extracting each InfoMoney article and normalising its date to "dd/mm/YYYY HH:MM"
for url in urls_infomoney:
    dados_noticia = extrair_dados_infomoney(url)
    if dados_noticia:
        updated_date = dateparser.parse(dados_noticia["data"], languages=["pt"])
        # dateparser returns None when the text cannot be parsed
        # (e.g. the "Date not found" placeholder); keep the raw text in that case
        if updated_date:
            dados_noticia["data"] = updated_date.strftime("%d/%m/%Y %H:%M")
        noticias.append(dados_noticia)
    # short pause between requests
    time.sleep(1)

## MoneyTimes

In [None]:
# URLs of the Money Times articles to extract (empty by default)
urls_moneytimes = []


def extrair_dados_moneytimes(url):
    """Fetch a Money Times article and extract its title, date and body text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "titulo", "data", "conteudo" and "site",
        or None when the page could not be downloaded. Fields that are not
        found in the HTML are filled with a "... not found" placeholder.
    """
    cabecalhos = {"User-Agent": "Mozilla/5.0"}
    try:
        resposta = requests.get(url, headers=cabecalhos, timeout=10)
        resposta.raise_for_status()
    except Exception as e:
        print(f"[ERROR] Failed to access {url}: {e}")
        return None

    html = BeautifulSoup(resposta.text, "html.parser")
    noticia = {"url": url}

    # Title: first <h1> on the page
    elemento = html.find("h1")
    noticia["titulo"] = elemento.get_text(strip=True) if elemento else "Title not found"

    # Date: span carrying the publication date/time
    elemento = html.find("span", attrs={"class": "single_meta_author_infos_date_time"})
    noticia["data"] = elemento.get_text(strip=True) if elemento else "Date not found"

    # Body: every <p> inside the news text container, one paragraph per line
    elemento = html.find("div", attrs={"class": "single_block_news_text"})
    if elemento:
        linhas = [p.get_text(" ", strip=True) for p in elemento.find_all("p")]
        noticia["conteudo"] = "\n".join(linhas)
    else:
        noticia["conteudo"] = "Content not found"

    noticia["site"] = "Money Times"
    return noticia

In [None]:
# Raw dates on this website look like "22 jul 2020, 15:19"; they are
# normalised to "dd/mm/YYYY HH:MM" below.

# Create the news list only if it does not exist yet, so that items collected
# by previously executed cells are kept instead of being discarded.
if "noticias" not in globals():
    noticias = []

for url in urls_moneytimes:
    dados_noticia = extrair_dados_moneytimes(url)
    if dados_noticia:
        updated_date = dateparser.parse(dados_noticia["data"], languages=["pt"])
        # dateparser returns None when the text cannot be parsed
        # (e.g. the "Date not found" placeholder); keep the raw text in that case
        if updated_date:
            dados_noticia["data"] = updated_date.strftime("%d/%m/%Y %H:%M")
        noticias.append(dados_noticia)
    # short pause between requests
    time.sleep(1)

## Seu Dinheiro

In [10]:
# Load the previously saved news file and turn it back into a list of dicts
# (one dict per row), replacing the current contents of `noticias`
df = pd.read_csv("noticias_petrobras_2024.csv")
noticias = df.to_dict(orient="records")

# Show the last stored record
print(noticias[-1])

{'url': 'https://www.suno.com.br/noticias/petrobras-petr4-define-dividendos-3t24-gss/', 'titulo': 'Petrobras (PETR4) define como pagará dividendos do 3T', 'data': '11/12/2024 11:35', 'conteudo': 'A Petrobras ( PETR4 ) anunciou detalhes sobre a distribuição de R$ 17,12 bilhões em proventos aos acionistas, referentes aos resultados do terceiro trimestre de 2024.\n\nA decisão, aprovada pelo Conselho de Administração da empresa em 7 de novembro, inclui pagamentos em duas parcelas, programados para 2025, e ajustes baseados na taxa Selic.\nAlém, foi confirmado o pagamento de dividendos extraordinários da Petrobras no valor R$ 20 bilhões ainda este ano.\nOs R$ 17,12 bilhões em dividendos da Petrobras , equivalentes a R$ 1,32 por ação ordinária e preferencial, serão pagos da seguinte forma:\nAmbas as parcelas terão seus valores corrigidos pela variação da taxa Selic entre 31 de dezembro de 2024 e as datas de pagamento.\nOs pagamentos sob a forma de JCP da Petrobras estão sujeitos à retenção de

In [None]:
# Fetching this site with plain requests + bs4 returned HTTP 202 (request
# accepted, processing not finished yet), whereas the other sites answer with
# HTTP 200 (request processed and content sent). The page is therefore loaded
# through a Selenium-driven Chrome here.

urls_seudinheiro = []


def extrair_dados_seudinheiro(url):
    """Open a Seu Dinheiro article in headless Chrome and extract its fields.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "titulo", "data", "conteudo" and "site",
        or None when loading or parsing the page raised an exception. Fields
        that are not found in the HTML get a "... not found" placeholder.
    """
    # Browser configuration (drop "--headless" to watch the browser window)
    chrome_options = Options()
    for argumento in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0",
    ):
        chrome_options.add_argument(argumento)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        # fixed pause so the page's JavaScript has time to load
        time.sleep(3)

        pagina = BeautifulSoup(driver.page_source, "html.parser")

        no_titulo = pagina.find("h1")
        no_data = pagina.select_one("div.js-first-letter.single__date-time")
        no_corpo = pagina.find(
            "div",
            class_=["newSingle_content_right", "default"],
            id="js-first-letter",
        )

        # Body text: every <p> inside the content container, one per line
        if no_corpo:
            texto = "\n".join(
                p.get_text(" ", strip=True) for p in no_corpo.find_all("p")
            )
        else:
            texto = "Content not found"

        return {
            "url": url,
            "titulo": no_titulo.get_text(strip=True) if no_titulo else "Title not found",
            "data": no_data.get_text(strip=True) if no_data else "Date not found",
            "conteudo": texto,
            "site": "Seu Dinheiro",
        }

    except Exception as e:
        print(f"[ERROR] Failed to access {url}: {e}")
        return None

    finally:
        # always close the browser, whether extraction succeeded or not
        driver.quit()

In [None]:
# Appending the Seu Dinheiro articles to the news list.
# Raw dates on this website look like "5 de março de 202013:25", so they are
# cleaned up before being parsed and reformatted.

for url in urls_seudinheiro:
    dados_noticia = extrair_dados_seudinheiro(url)
    if dados_noticia:
        # Step 1: drop the " - atualizado ..." suffix and surrounding whitespace
        data_str = re.sub(r"\s+- atualizado.*", "", dados_noticia["data"]).strip()
        # Step 2: insert the missing space between year and time
        # (202012:34 -> 2020 12:34)
        data_str = re.sub(r"(\d{4})(\d{2}:\d{2})", r"\1 \2", data_str)

        updated_date = dateparser.parse(data_str, languages=["pt"])

        # Fall back to the cleaned raw text when the date cannot be parsed
        dados_noticia["data"] = (
            updated_date.strftime("%d/%m/%Y %H:%M") if updated_date else data_str
        )

        noticias.append(dados_noticia)
    time.sleep(1)

## Suno News

In [None]:
# URLs of the Suno Notícias articles to extract (empty by default)
urls_suno = []


def extrair_dados_suno(url):
    """Fetch a Suno Notícias article and extract its title, date and body text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "titulo", "data", "conteudo" and "site",
        or None when the page could not be downloaded. Fields that are not
        found in the HTML are filled with a "... not found" placeholder.
    """
    try:
        resposta = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        resposta.raise_for_status()
    except Exception as e:
        print(f"[ERROR] Failed to access {url}: {e}")
        return None

    documento = BeautifulSoup(resposta.text, "html.parser")

    # Title comes from the first <h1>
    cabecalho = documento.find("h1")
    if cabecalho:
        titulo = cabecalho.get_text(strip=True)
    else:
        titulo = "Title not found"

    # Date comes from the <time itemprop="datePublished"> element
    marca_tempo = documento.find("time", attrs={"itemprop": "datePublished"})
    if marca_tempo:
        data = marca_tempo.get_text(strip=True)
    else:
        data = "Date not found"

    # Body is every <p> inside the first <article>, one paragraph per line
    artigo = documento.find("article")
    if artigo:
        conteudo = "\n".join(
            trecho.get_text(" ", strip=True) for trecho in artigo.find_all("p")
        )
    else:
        conteudo = "Content not found"

    return dict(
        url=url,
        titulo=titulo,
        data=data,
        conteudo=conteudo,
        site="Suno Notícias",
    )

In [None]:
# Appending the Suno Notícias articles to the news list
# (the extracted date text is stored as-is, without reformatting)
for url in urls_suno:
    if dados_noticia := extrair_dados_suno(url):
        noticias.append(dados_noticia)
    # pause between requests
    time.sleep(2)

## Uol Economy

In [None]:
# Example article URL:
# https://economia.uol.com.br/financas-pessoais/noticias/redacao/2020/03/25/bolsa-acao-petrobras-queda-o-que-fazer-vender-comprar-manter.htm

urls_uol = []

def extrair_dados_uol(url):
    """Open a UOL Economia article in headless Chrome and extract its fields.

    The browser is patched with selenium-stealth before the page is requested.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys "url", "titulo", "data", "conteudo" and "site",
        or None when patching the browser, loading the page or parsing it
        raised an exception. Fields that are not found in the HTML get a
        "... not found" placeholder.
    """
    # Selenium settings
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")
    # fixed Chrome profile directory used by every run
    chrome_options.add_argument("--user-data-dir=/tmp/chrome_user_data")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/113.0.0.0 Safari/537.36"
    )

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Applying "stealth" to mask that it's Selenium. This is done inside
        # the try block so that the browser is still closed by the `finally`
        # clause if the patching itself fails.
        stealth(
            driver,
            languages=["pt-BR", "pt", "en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(url)
        time.sleep(3)  # wait for JS loading

        # Get the HTML rendered by the browser
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Extracting the news title
        titulo = soup.find("h1")
        titulo = titulo.get_text(strip=True) if titulo else "Title not found"

        # Extracting the date
        data = soup.find("p", class_=["time"])
        data = data.get_text(strip=True) if data else "Date not found"

        # Main content (paragraphs within the article text container)
        conteudo_div = soup.find("div", class_=["text"])
        if conteudo_div:
            paragrafos = conteudo_div.find_all("p")
            conteudo = "\n".join(p.get_text(" ", strip=True) for p in paragrafos)
        else:
            conteudo = "Content not found"

        return {
            "url": url,
            "titulo": titulo,
            "data": data,
            "conteudo": conteudo,
            "site": "Uol Economia"
        }

    except Exception as e:
        print(f"[ERROR] Failed to access {url}: {e}")
        return None

    finally:
        # always close the browser, whether extraction succeeded or not
        driver.quit()

In [None]:
# Appending the UOL Economia articles to the news list
# (the extracted date text is stored as-is, without reformatting)
for url in urls_uol:
    if dados_noticia := extrair_dados_uol(url):
        noticias.append(dados_noticia)
    # pause between page loads
    time.sleep(2)

## Saving the news to a csv file

In [None]:
# Build a DataFrame from the collected news (one row per news dict)
df = pd.DataFrame(data=noticias)
# Write it to disk as UTF-8 CSV, without the index column
df.to_csv(
    "noticias_petrobras_2024.csv",
    encoding="utf-8",
    index=False,
)

## Counting news by website

In [None]:
def imprimir_quantidade_noticias(nome_arquivo_csv: str):
    """Print the number of news items per website and the overall total.

    Args:
        nome_arquivo_csv: Path of a CSV file that has a "site" column.
    """
    tabela = pd.read_csv(nome_arquivo_csv)

    # Per-website counts (value_counts of the "site" column)
    print(tabela["site"].value_counts())

    # Total number of rows in the file
    total = len(tabela)
    print(f"Total news: {total}")

# imprimir_quantidade_noticias("noticias_final_2020.csv")
# imprimir_quantidade_noticias("noticias_final_2021.csv")
# imprimir_quantidade_noticias("noticias_final_2023.csv")
# imprimir_quantidade_noticias("noticias_final_2024.csv")

site
Suno Notícias    270
Money Times      238
Seu Dinheiro     177
Uol Economia      25
Infomoney         10
Name: count, dtype: int64
Total de notícias: 720
site
Suno Notícias    304
Money Times      220
Seu Dinheiro     198
Infomoney         24
Uol Economia       6
Name: count, dtype: int64
Total de notícias: 752
site
Suno Notícias    292
Money Times      221
Seu Dinheiro     113
Infomoney         42
Uol Economia      22
Name: count, dtype: int64
Total de notícias: 690
site
Suno Notícias    385
Money Times      165
Seu Dinheiro     104
Infomoney         93
Name: count, dtype: int64
Total de notícias: 747
