In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
from fake_useragent import UserAgent
from tqdm.notebook import tqdm
import time

def configure_webdriver():
    # Configurações do WebDriver com agente de usuário aleatório
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    
    # Gera um agente de usuário aleatório usando o fake-useragent
    user_agent = UserAgent().random
    chrome_options.add_argument(f'user-agent={user_agent}')
    
    # Inicializa o WebDriver
    return webdriver.Chrome(options=chrome_options)

def fetch_news_for_date(web, pesquisa, data):
    url = f'https://www.google.com/search?q={pesquisa}&sca_esv=8a38f1b162d79def&rlz=1C1CHBD_pt-PTBR1080BR1080&tbs=cdr:1,cd_min:{data},cd_max:{data},lr:lang_1pt&tbm=nws&source=lnt&lr=lang_pt&sa=X&ved=2ahUKEwjy8NbRxaqIAxVJpZUCHVnpHTgQpwV6BAgCEAc&biw=1536&bih=738&dpr=1.25'
    
    web.get(url)
    
    # Espera o carregamento completo da página
    WebDriverWait(web, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.n0jPhd.ynAwRc.MBeuO.nDgy9d'))
    )
    
    soup = BeautifulSoup(web.page_source, 'html.parser')
    
    # Ajuste aqui dependendo do HTML atual
    news = soup.find_all('div', class_='n0jPhd ynAwRc MBeuO nDgy9d')
    
    return [{'Data': data, 'Notícia': new.text} for new in news]

def get_news(pesquisa, dia=None, quant_dias=5, dias=None):
    if dia is None:
        dia = dt.datetime.now()
    pesquisa = pesquisa.replace(' ', '+')
    
    web = configure_webdriver()
    dados = []
    datas_ruins = []

    try:
        # Gera a lista de datas se não for fornecida
        if dias is None:
            dias = [(dia - dt.timedelta(days=i)).strftime('%m/%d/%Y') for i in range(1, quant_dias + 1)]
        
        with tqdm(total=len(dias), desc="Progresso", leave=True) as pbar:
            for passado in dias:
                try:
                    noticias = fetch_news_for_date(web, pesquisa, passado)
                    dados.extend(noticias)
                except Exception as e:
                    datas_ruins.append(passado)
                    print(f'Erro ao processar a data {passado}: {e}')
                
                pbar.update(1)
                
                # Adiciona um delay aleatório para evitar ser bloqueado pelo Google
                time.sleep(2)
    finally:
        web.quit()
    
    # Converte a lista para DataFrame
    return pd.DataFrame(dados), datas_ruins

In [2]:
lista = ['bolsa de valores', 'ibovespa', 'acoes']
for pesquisa in lista:
    df, datas = get_news(pesquisa, quant_dias=3650)
    df['Data'] = pd.to_datetime(df['Data'])
    df.to_csv(f'./data/noticias_{pesquisa}.csv', index=False)
    print(f'Dados salvos em noticias_{pesquisa}.csv')

Progresso:   0%|          | 0/3650 [00:00<?, ?it/s]

Erro ao processar a data 04/14/2019: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7A1E5B5D2+29090]
	(No symbol) [0x00007FF7A1DCE689]
	(No symbol) [0x00007FF7A1C8B1CA]
	(No symbol) [0x00007FF7A1CDEFD7]
	(No symbol) [0x00007FF7A1CDF22C]
	(No symbol) [0x00007FF7A1D297F7]
	(No symbol) [0x00007FF7A1D0672F]
	(No symbol) [0x00007FF7A1D265D9]
	(No symbol) [0x00007FF7A1D06493]
	(No symbol) [0x00007FF7A1CD09B1]
	(No symbol) [0x00007FF7A1CD1B11]
	GetHandleVerifier [0x00007FF7A2178C5D+3295277]
	GetHandleVerifier [0x00007FF7A21C4843+3605523]
	GetHandleVerifier [0x00007FF7A21BA707+3564247]
	GetHandleVerifier [0x00007FF7A1F16EB6+797318]
	(No symbol) [0x00007FF7A1DD980F]
	(No symbol) [0x00007FF7A1DD53F4]
	(No symbol) [0x00007FF7A1DD5580]
	(No symbol) [0x00007FF7A1DC4A1F]
	BaseThreadInitThunk [0x00007FFD7A47257D+29]
	RtlUserThreadStart [0x00007FFD7B86AF28+40]

Erro ao processar a data 11/03/2018: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7A1E5B5D2+29090]
	(No symbol) [0x00007FF7A1DCE689]