Coleta de Notícias

In [1]:
# News sources: one listing page about the dollar ("dolar") per outlet.
fonte_g1 = "https://g1.globo.com/economia/dolar/"  # G1 (globo.com)
fonte_cnn = "https://www.cnnbrasil.com.br/tudo-sobre/dolar/"  # CNN Brasil
fonte_folha = "https://www1.folha.uol.com.br/folha-topicos/dolar/"  # Folha

In [8]:
## Importando Bibliotecas para Web Scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
from datetime import datetime
import time
import selenium.webdriver as webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException



In [3]:
## Extracting G1 news with Selenium
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
try:
    driver.get(fonte_g1)
    time.sleep(5)  # Give the page time to load

    # The ad pop-up is not guaranteed to be shown. find_elements returns an
    # empty list when it is absent, whereas find_element would raise and
    # abort the whole cell.
    botoes_fechar = driver.find_elements(By.CSS_SELECTOR, "svg.fc-cancel-icon-svg")
    if botoes_fechar:
        botoes_fechar[0].click()

    # Scroll to the bottom repeatedly so the page loads more news items
    for _ in range(10):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

    html_g1 = driver.page_source
finally:
    # Always close the browser, even when a step above raised
    driver.quit()


In [4]:

soup = BeautifulSoup(html_g1, "html.parser")

# Parsing the G1 data.
# Each field is selected independently, so entries are matched by position.
# Missing attributes become "" instead of raising KeyError.
url_imagem = [img.get("src") or "" for img in soup.select("img.bstn-fd-picture-image")]
titulos = [t.get_text(strip=True) for t in soup.select("div.feed-post-body-title a")]
links = [a.get("href") or "" for a in soup.select("a.feed-post-link")]

# The publication date is read from the article URL: path segments 5, 6 and 7
# (after splitting on "/") are taken as year, month and day.
# NOTE(review): presumably G1 article links look like
# .../noticia/<yyyy>/<mm>/<dd>/<slug> -- links that do not match get "".
data_publicacao = []
for link in links:
    partes = link.split("/")
    if len(partes) > 7 and all(p.isdigit() for p in partes[5:8]):
        data_publicacao.append(f"{partes[7]}/{partes[6]}/{partes[5]}")
    else:
        data_publicacao.append("")

# Saving the data in a DataFrame together with the extraction timestamp
data_extracao = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# pd.DataFrame raises ValueError when the lists differ in length, so shorter
# lists are padded with "" up to the longest one.
colunas = {
    "urlImagem": url_imagem,
    "dataPublicacao": data_publicacao,
    "titulo": titulos,
    "link": links,
}
n_linhas = max(len(valores) for valores in colunas.values())
df_g1 = pd.DataFrame({
    nome: valores + [""] * (n_linhas - len(valores))
    for nome, valores in colunas.items()
})
df_g1["fonte"] = "G1"
df_g1["dataExtracao"] = data_extracao

# df_noticias accumulates the news of every source; G1 is the first one
df_noticias = df_g1.copy()

In [5]:
## Extracting CNN news with Selenium
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
try:
    driver.get(fonte_cnn)
    time.sleep(5)  # Give the page time to load

    # Accept the consent banner when it becomes clickable within 10 seconds.
    # If it never shows up, keep going instead of aborting the cell.
    wait = WebDriverWait(driver, 10)
    try:
        botao_ok = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".btn-agree button")))
        botao_ok.click()
    except TimeoutException:
        print("CNN: consent button not found; continuing without clicking it")

    total_paginas = 2  # Number of listing pages to scrape
    for i in range(total_paginas):
        time.sleep(5)
        html_cnn = driver.page_source
        soup = BeautifulSoup(html_cnn, "html.parser")
        container = soup.select_one('ul[data-section="article_list"]')

        # Image URLs come only from the article list container, when present
        url_imagem = []
        if container:
            url_imagem = [img["src"] for img in container.select("img") if img.get("src")]
        titulos = [t.get_text(strip=True) for t in soup.select("div.flex.flex-col.gap-4 h2")]
        # Skip anchors without href and the link to the live ("ao-vivo") page
        links = [
            a["href"]
            for a in soup.select("div.flex.flex-col.gap-4 a")
            if a.get("href") and a["href"] != 'https://www.cnnbrasil.com.br/ao-vivo/'
        ]

        data_publicacao = []
        for tag in soup.select("time"):
            raw = tag.get("datetime")
            if not raw:
                continue
            # datetime.fromisoformat only accepts a trailing "Z" from
            # Python 3.11 on; normalise it to an explicit UTC offset.
            if raw.endswith("Z"):
                raw = raw[:-1] + "+00:00"
            try:
                data_publicacao.append(datetime.fromisoformat(raw).strftime("%d/%m/%Y"))
            except ValueError:
                # Keep the position with an empty date instead of crashing
                data_publicacao.append("")

        data_extracao = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

        # The fields are selected independently and may differ in length,
        # which would make pd.DataFrame raise; pad shorter lists with "".
        colunas = {
            "urlImagem": url_imagem,
            "dataPublicacao": data_publicacao,
            "titulo": titulos,
            "link": links,
        }
        n_linhas = max(len(valores) for valores in colunas.values())
        df_cnn = pd.DataFrame({
            nome: valores + [""] * (n_linhas - len(valores))
            for nome, valores in colunas.items()
        })
        df_cnn["fonte"] = "CNN"
        df_cnn["dataExtracao"] = data_extracao

        df_noticias = pd.concat([df_noticias, df_cnn], ignore_index=True)

        # Only move on when another page is still going to be scraped
        if i == total_paginas - 1:
            break
        proxima = driver.find_elements(By.CSS_SELECTOR, "a[aria-label='Ir para próxima página']")
        if not proxima:
            break  # No "next page" link: nothing more to visit
        proxima[0].click()
        time.sleep(5)
finally:
    # Always close the browser, even when a step above raised
    driver.quit()


In [None]:


## Extracting Folha news with Selenium
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
try:
    driver.get(fonte_folha)

    wait = WebDriverWait(driver, 20)

    # Try to expand the listing up to 4 times through the "expand" button
    for i in range(4):
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "main .row, main article, main li")))

            # Wait briefly for the expand button instead of polling in a
            # tight loop; when it does not appear, skip this round.
            try:
                botao = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "button.c-button.c-button--expand"))
                )
            except TimeoutException:
                continue

            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", botao)
            time.sleep(0.8)
            try:
                wait.until(EC.element_to_be_clickable((By.XPATH, ".//button[contains(@class,'c-button') or @data-pagination-trigger]")))
            except TimeoutException:
                pass  # Best effort: still attempt the click below

            try:
                botao.click()
            except Exception:
                # Deliberate fallback: if the native click fails for any
                # reason, trigger the click through JavaScript instead.
                driver.execute_script("arguments[0].click();", botao)
        finally:
            time.sleep(2)  # Pause between rounds (runs even after `continue`)

    html_folha = driver.page_source
finally:
    # Always close the browser, even when a step above raised
    driver.quit()


In [None]:
soup = BeautifulSoup(html_folha, "html.parser")

# Each field is selected independently and matched by position later on.
# A missing attribute is stored as "" rather than dropped, so the remaining
# entries of that list do not shift onto other rows.
url_imagem = [img.get("data-src") or "" for img in soup.select("img.c-headline__image")]
titulos = [t.get_text(strip=True) for t in soup.select("h2.c-headline__title")]
links = [a.get("href") or "" for a in soup.select("a.c-headline__url")]

# Convert the publication date to dd/mm/yyyy; entries that are missing or
# not in the "%Y-%m-%d %H:%M:%S" format become "" instead of raising.
data_publicacao = []
for t in soup.select("time.c-headline__dateline[itemprop='datePublished']"):
    bruto = t.get("datetime")
    if not bruto:
        data_publicacao.append("")
        continue
    try:
        data_publicacao.append(
            datetime.strptime(bruto, "%Y-%m-%d %H:%M:%S").strftime("%d/%m/%Y")
        )
    except ValueError:
        data_publicacao.append("")

def pad(lst, n, fill=""):
    """Return a new list: `lst` followed by enough `fill` items to reach length `n`.

    When `lst` already has `n` or more items, nothing is appended.
    """
    faltando = n - len(lst)
    return lst + [fill for _ in range(faltando)]

# All columns must have the same length; shorter lists are padded with ""
n = max(len(url_imagem), len(data_publicacao), len(titulos), len(links))

df_folha = pd.DataFrame({
    "urlImagem":     pad(url_imagem, n, ""),
    "dataPublicacao":pad(data_publicacao, n, ""),
    "titulo":        pad(titulos, n, ""),
    "link":          pad(links, n, ""),
    "fonte":         ["Folha"] * n,
    "dataExtracao":  [datetime.now().strftime("%d/%m/%Y %H:%M:%S")] * n,
})

# Keep only the rows that have an image URL
df_folha = df_folha[df_folha['urlImagem'] != '']

# Append the Folha rows to the accumulated news table. The frame appended
# here must be df_folha; appending df_cnn would duplicate the CNN rows and
# leave Folha out.
df_noticias = pd.concat([df_noticias, df_folha], ignore_index=True)


In [None]:
# Display the accumulated news DataFrame as the cell output
df_noticias