In [10]:
# pip install playwright
# playwright install

import csv
import pandas as pd
import asyncio
from typing import List, Dict, Optional
from playwright.async_api import async_playwright, Page
from urllib.parse import urljoin
from datetime import datetime


In [11]:
import asyncio, csv, re
from datetime import datetime
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page

START_URL = "https://www.bbc.com/news/topics/c2vdnvdg6xxt"
TARGET = 100
OUTCSV = "bbc_israel_gaza_noticias.csv"
BASE = "https://www.bbc.com"

def abs_url(href): return urljoin(BASE, href or "")

async def accept_cookies(page: Page):
    for sel in (
        '[data-testid="cookie-banner"] button:has-text("Accept")',
        'button:has-text("I Agree")','button:has-text("Agree")','#bbccookies-continue-button',
    ):
        b = page.locator(sel).first
        if await b.count() and await b.is_visible():
            await b.click(); break

async def wait_heading(page: Page):
    await page.wait_for_selector('h2[data-testid="alaska-title"]', timeout=15000)

async def extract_latest_updates_on_page(page: Page):
    # Extrator geométrico: pega links /news/ VISUALMENTE entre o heading e a paginação
    js = """
    () => {
      const BASE = 'https://www.bbc.com';
      const head = document.querySelector('h2[data-testid="alaska-title"]');
      if (!head) return {items: [], debug: {reason: 'no heading'}};
      const headBottom = head.getBoundingClientRect().bottom + window.scrollY;

      // tenta achar a barra de paginação (nav ou container com botões 1,2,3)
      let pag = document.querySelector('nav[aria-label*="Pagination" i]') ||
                Array.from(document.querySelectorAll('nav, div, section'))
                  .find(n => /Go to page/i.test(n.textContent||'') || /\b1\b.*\b2\b.*\b3\b/.test(n.textContent||''));
      let pagTop = Infinity;
      if (pag) pagTop = pag.getBoundingClientRect().top + window.scrollY;

      const links = Array.from(document.querySelectorAll('a[href*="/news/"]'));
      const filtered = [];

      for (const a of links) {
        const r = a.getBoundingClientRect();
        const y = r.top + window.scrollY;
        if (y > headBottom && y < pagTop) {
          // título
          let title = (a.querySelector('h3,h2')?.textContent || a.textContent || '').trim().replace(/\s+/g,' ');
          if (!title || title.length < 5) continue;

          // item container p/ achar resumo/time
          const container = a.closest('li, article, div[role="listitem"], div, section') || a;
          const p = container.querySelector('p');

          const summary = (p?.textContent || '').trim().replace(/\s+/g,' ');

          try {
            const url = new URL(a.getAttribute('href'), BASE).toString();
            filtered.push({"titulo": title, "url": url, "mini_resumo": summary});
          } catch {}
        }
      }

      // dedupe por URL e remove duplicados do mesmo título
      const seen = new Set();
      const items = [];
      for (const it of filtered) {
        if (!seen.has(it["url"])) { seen.add(it["url"]); items.push(it); }
      }
      return {items, debug: {headBottom, pagTop, totalLinks: links.length, kept: items.length}};
    }
    """
    res = await page.evaluate(js)
    items = res["items"]
    iso = datetime.now().isoformat()
    for it in items: it["timestamp_coleta"] = iso
    print(f'→ between heading/pagination: {res["debug"]["kept"]} of {res["debug"]["totalLinks"]}')
    return items

async def get_max_page(page: Page) -> int:
    # lê todos "Go to page N" e pega o maior
    nums = set()
    btns = page.locator('button[aria-label^="Go to page "]')
    for i in range(await btns.count()):
        lbl = await btns.nth(i).get_attribute("aria-label")
        m = re.search(r"(\d+)$", lbl or "")
        if m: nums.add(int(m.group(1)))
    # fallback: números visíveis no paginador
    nav = page.locator("nav").filter(has_text=re.compile(r"\b1\b"))
    if await nav.count():
        txt = " ".join(await nav.first.all_text_contents())
        for n in re.findall(r"\b\d+\b", txt):
            nums.add(int(n))
    return max(nums) if nums else 1

async def click_page_n(page: Page, n: int) -> bool:
    # garante que o paginador está na tela
    await page.mouse.wheel(0, 99999)
    # 1) força o clique via JS no aria-label "Go to page n"
    ok = await page.evaluate("""
    (n) => {
      const byAria = Array.from(document.querySelectorAll('button[aria-label^="Go to page "]'))
        .find(b => (b.getAttribute('aria-label')||'').trim().endsWith(String(n)));
      if (byAria) { byAria.click(); return true; }
      // fallback por texto visível = n
      const byText = Array.from(document.querySelectorAll('nav button, button'))
        .find(b => (b.textContent||'').trim() === String(n));
      if (byText) { byText.click(); return true; }
      return false;
    }
    """, n)
    if ok:
        await page.wait_for_load_state("domcontentloaded")
        await page.wait_for_timeout(900)
        return True

    # 2) fallback: clica no chevron ">" (próxima)
    chevron = page.locator('nav button[aria-label*="next" i], nav button:has-text(">"), nav button:has-text("›")').first
    if await chevron.count():
        await chevron.click()
        await page.wait_for_load_state("domcontentloaded")
        await page.wait_for_timeout(900)
        return True

    return False

async def scrape_latest_updates(target: int = TARGET):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(START_URL, timeout=60_000)
        await accept_cookies(page)
        await wait_heading(page)

        results, seen = [], set()
        page_no = 1
        max_page = await get_max_page(page)

        while len(results) < target and page_no <= max_page:
            # rola um pouco p/ garantir render
            await page.mouse.wheel(0, 2200); await asyncio.sleep(0.3)
            batch = await extract_latest_updates_on_page(page)
            print(f"Página {page_no} → {len(batch)} itens")
            for it in batch:
                if it["url"] in seen: continue
                seen.add(it["url"]); results.append(it)
                if len(results) >= target: break
            if len(results) >= target: break
            page_no += 1
            if page_no > max_page: break
            # leva o paginador ao viewport e clica no número
            await page.mouse.wheel(0, 9_999); await asyncio.sleep(0.2)
            if not await click_page_n(page, page_no):
                print(f"Não consegui clicar na página {page_no}")
                break

        await browser.close()
        return results[:target]

async def save_csv(rows, path=OUTCSV):
    cols = ["titulo","url","mini_resumo","timestamp_coleta"]
    with open(path,"w",newline="",encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols); w.writeheader()
        for r in rows: w.writerow({k:r.get(k,"") for k in cols})

# Jupyter:
data = await scrape_latest_updates(100)
await save_csv(data, OUTCSV)

  let title = (a.querySelector('h3,h2')?.textContent || a.textContent || '').trim().replace(/\s+/g,' ');


→ between heading/pagination: 9 of 41
Página 1 → 9 itens
→ between heading/pagination: 21 of 41
Página 2 → 21 itens
→ between heading/pagination: 21 of 41
Página 2 → 21 itens
→ between heading/pagination: 22 of 41
Página 3 → 22 itens
→ between heading/pagination: 22 of 41
Página 3 → 22 itens
→ between heading/pagination: 22 of 41
Página 4 → 22 itens
→ between heading/pagination: 22 of 41
Página 4 → 22 itens
→ between heading/pagination: 22 of 41
Página 5 → 22 itens
→ between heading/pagination: 22 of 41
Página 5 → 22 itens
→ between heading/pagination: 22 of 41
Página 6 → 22 itens
→ between heading/pagination: 22 of 41
Página 6 → 22 itens
→ between heading/pagination: 22 of 41
Página 7 → 22 itens
→ between heading/pagination: 22 of 41
Página 7 → 22 itens
→ between heading/pagination: 22 of 41
Página 8 → 22 itens
→ between heading/pagination: 22 of 41
Página 8 → 22 itens
→ between heading/pagination: 22 of 41
Página 9 → 22 itens
→ between heading/pagination: 22 of 41
Página 9 → 22 itens

In [12]:
news = pd.read_csv(OUTCSV)
news.head()

Unnamed: 0,titulo,url,mini_resumo,timestamp_coleta
0,Israel confirms identities of hostages' bodies...,https://www.bbc.com/news/articles/c4gj90j2g8jo,The bodies were identified as those of Amiram ...,2025-10-31T20:25:53.516424
1,Who are the released hostages?,https://www.bbc.com/news/articles/cpvl9k4mw8no,The Israeli military says 20 living hostages h...,2025-10-31T20:25:53.516424
2,UK pledges £4m to clear land mines to help flo...,https://www.bbc.com/news/articles/c9d6x02xdj6o,It will allow a UN body to clear land mines an...,2025-10-31T20:25:53.516424
3,Dependants of some Gazan students can join the...,https://www.bbc.com/news/articles/cly91lj9y47o,The decision is a reversal of the original pol...,2025-10-31T20:25:53.516424
4,Can the Gaza ceasefire deal survive?,https://www.bbc.com/news/articles/ckgk4x5ze3mo,Its prospects depend heavily on the continuing...,2025-10-31T20:25:53.516424


# Extração de Dados de Preços do Petróleo
Este notebook utiliza a função `extrair_dados_petroleo` para obter os preços diários do petróleo (WTI ou Brent) em um intervalo de datas especificado.

In [13]:
# Importar a função de extração
from extrair_dados_petroleo import extrair_dados_petroleo

In [14]:
# Definir os parâmetros para a extração
tipo_serie = "brent"  # Escolha entre "wti" ou "brent"
data_inicio = "2024-01-01"
data_fim = "2025-12-31"

In [15]:
# Chamar a função para extrair os dados
dados_petroleo = extrair_dados_petroleo(tipo_serie, data_inicio, data_fim)

In [16]:
# Visualizar as primeiras linhas do DataFrame
print(dados_petroleo.head())

          ds      price
0 2024-01-02  75.889999
1 2024-01-03  78.250000
2 2024-01-04  77.589996
3 2024-01-05  78.760002
4 2024-01-08  76.120003


In [17]:
# Salvar os dados em um arquivo CSV (opcional)
dados_petroleo.to_csv(f"precos_{tipo_serie}_{data_inicio}_to_{data_fim}.csv", index=False)
print("Arquivo CSV salvo com sucesso!")

Arquivo CSV salvo com sucesso!
