In [None]:
import sys
import os
import pandas as pd
from datetime import datetime

# Put the notebook's parent directory at the front of the import path so the
# `src.*` imports that follow can be resolved when running from this folder.
project_root = os.path.abspath("..")
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.extract_news import ExtractNews
from src.extract_yahoo import ExtractYahoo
from src.pipeline import Pipeline
from src.logger_config import setup_logger
from src.transform import Transform
from src.init_db import init_db
from src.load import Load

# --- Pipeline configuration -------------------------------------------------
# All paths are relative to the notebook's working directory.
RAW_DIR = "../data/raw"              # raw extraction CSVs
PROCESSED_DIR = "../data/processed"  # cleaned / removed-row CSVs
LOG_DIR = "../logs/pipeline"         # handed to setup_logger as log_dir
# NOTE(review): "analytcs" looks like a typo for "analytics", and the saved
# output of this cell reports the database under ../data/database/ instead.
# Confirm the intended location before renaming: an existing database file
# would have to be moved along with it.
DB_PATH = "../data/analytcs/pipeline.db"
SCHEMA_FILE = "../data/database/create_tables.sql"

# Create every output directory up front. The database's parent folder is
# included because sqlite3 does not create missing directories, so opening
# DB_PATH on a fresh checkout would otherwise fail with
# "unable to open database file".
for directory in (RAW_DIR, PROCESSED_DIR, LOG_DIR, os.path.dirname(DB_PATH)):
    os.makedirs(directory, exist_ok=True)

# setup_logger returns the logger together with the log file it writes to;
# log_file is reported at the end of the notebook.
logger, log_file = setup_logger("PipelineLogger", log_dir=LOG_DIR)
logger.info("Início da execução da pipeline")

# Presumably creates the tables described in SCHEMA_FILE inside DB_PATH —
# verify against src.init_db.
init_db(db_path=DB_PATH, schema_file=SCHEMA_FILE)

15-10-25 21:32:47 [INFO] Início da execução da pipeline


✅ Banco inicializado em: ../data/database/pipeline.db


In [2]:
# Pages handed to the news extractor, grouped by source name.
urls = dict(
    g1=[
        "https://g1.globo.com/ce/ceara/",
        "https://g1.globo.com/",
        "https://g1.globo.com/economia/",
        "https://g1.globo.com/politica/",
        "https://g1.globo.com/mundo/",
        "https://g1.globo.com/ciencia-e-saude/",
        "https://g1.globo.com/tecnologia/",
    ],
    folha=["https://www1.folha.uol.com.br/ultimas-noticias/"],
)

# Extraction tasks executed later by the Pipeline.
task_news = ExtractNews(urls)
# NOTE(review): the meaning of tickers=None is defined by ExtractYahoo —
# presumably the tickers are discovered from the quote page; confirm.
task_yahoo = ExtractYahoo(
    "https://finance.yahoo.com/quote/DSPY/",
    tickers=None,
)

In [None]:
# Run both extraction tasks. The result is looked up by task name
# ("ExtractNews" / "ExtractYahoo") in the next cell.
extraction_tasks = [task_news, task_yahoo]
pipeline = Pipeline(extraction_tasks)
resultado = pipeline.run()
logger.info("Extração finalizada com sucesso!")


In [None]:
def _write_raw_csv(frame, task_name):
    """Write ``frame`` to ``RAW_DIR/<task_name>_<timestamp>.csv`` and log the path.

    Returns the path of the file that was written.
    """
    destination = os.path.join(RAW_DIR, f"{task_name}_{timestamp}.csv")
    frame.to_csv(destination, index=False, encoding="utf-8")
    logger.info(f"CSV de {task_name} cru salvo em: {destination}")
    return destination


# A single timestamp tags every file written for this run (it is reused for
# the processed CSVs as well).
timestamp = datetime.now().strftime("%d-%m-%y_%H-%M-%S")

# A task missing from the pipeline result falls back to an empty frame.
df_news = resultado.get("ExtractNews", pd.DataFrame())
df_yahoo = resultado.get("ExtractYahoo", pd.DataFrame())

# Only non-empty extractions are persisted as raw CSVs.
if not df_news.empty:
    csv_news = _write_raw_csv(df_news, "ExtractNews")

if not df_yahoo.empty:
    csv_yahoo = _write_raw_csv(df_yahoo, "ExtractYahoo")

15-10-25 21:02:21 [INFO] CSV de ExtractNews cru salvo em: ../data/raw\ExtractNews_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV de ExtractNews cru salvo em: ../data/raw\ExtractNews_15-10-25_21-02-21.csv
15-10-25 21:02:21 [INFO] CSV de ExtractYahoo cru salvo em: ../data/raw\ExtractYahoo_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV de ExtractYahoo cru salvo em: ../data/raw\ExtractYahoo_15-10-25_21-02-21.csv


In [None]:
def _write_processed_csv(frame, file_stem, log_label):
    """Write ``frame`` to ``PROCESSED_DIR/<file_stem>_<timestamp>.csv`` and log it.

    ``log_label`` is the leading part of the log message. Returns the path of
    the file that was written.
    """
    destination = os.path.join(PROCESSED_DIR, f"{file_stem}_{timestamp}.csv")
    frame.to_csv(destination, index=False, encoding="utf-8")
    logger.info(f"{log_label} salvo em: {destination}")
    return destination


transform = Transform()

# Clean both raw frames. The rows dropped along the way are read back below
# from transform.removed_news / transform.removed_yahoo (presumably filled in
# by the clean_* calls — verify against src.transform).
df_news_clean = transform.clean_news(df_news)
df_yahoo_clean = transform.clean_yahoo(df_yahoo)

# Cleaned data first, then the removed rows, all tagged with the run timestamp.
csv_news_clean = _write_processed_csv(
    df_news_clean, "ExtractNews_CLEAN", "CSV de notícias tratado"
)
csv_yahoo_clean = _write_processed_csv(
    df_yahoo_clean, "ExtractYahoo_CLEAN", "CSV do Yahoo tratado"
)
csv_news_removed = _write_processed_csv(
    transform.removed_news, "ExtractNews_REMOVED", "CSV de notícias removidas"
)
csv_yahoo_removed = _write_processed_csv(
    transform.removed_yahoo, "ExtractYahoo_REMOVED", "CSV de Yahoo removidos"
)

15-10-25 21:02:21 [INFO] CSV de notícias tratado salvo em: ../data/processed\ExtractNews_CLEAN_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV de notícias tratado salvo em: ../data/processed\ExtractNews_CLEAN_15-10-25_21-02-21.csv
15-10-25 21:02:21 [INFO] CSV do Yahoo tratado salvo em: ../data/processed\ExtractYahoo_CLEAN_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV do Yahoo tratado salvo em: ../data/processed\ExtractYahoo_CLEAN_15-10-25_21-02-21.csv
15-10-25 21:02:21 [INFO] CSV de notícias removidas salvo em: ../data/processed\ExtractNews_REMOVED_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV de notícias removidas salvo em: ../data/processed\ExtractNews_REMOVED_15-10-25_21-02-21.csv
15-10-25 21:02:21 [INFO] CSV de Yahoo removidos salvo em: ../data/processed\ExtractYahoo_REMOVED_15-10-25_21-02-21.csv
INFO:PipelineLogger:CSV de Yahoo removidos salvo em: ../data/processed\ExtractYahoo_REMOVED_15-10-25_21-02-21.csv


In [None]:
# Load stage: persist the cleaned frames into the SQLite database at DB_PATH.
loader = Load(DB_PATH)

# Each save_* call returns a pair that is unpacked here. Going by the variable
# names these are (inserted, ignored) row counts — TODO confirm in src.load.
inserted_news, ignored_news = loader.save_news(df_news_clean)
# Instruments are saved before prices; presumably prices reference them —
# verify before reordering these calls.
# NOTE(review): in the saved output of the inspection cell, the `ticker`
# column of both `instruments` and `prices` holds company names (e.g.
# "NVIDIA Corporation") rather than symbols — check the column mapping in Load.
inserted_instr, ignored_instr = loader.save_instruments(df_yahoo_clean)
attempted_prices, _ = loader.save_prices(df_yahoo_clean)

# NOTE(review): this message is logged unconditionally; the counts collected
# above are neither reported nor checked.
logger.info("Carga dos dados concluída com sucesso no SQLite!")

15-10-25 21:02:21 [INFO] Carga dos dados concluída com sucesso no SQLite!
INFO:PipelineLogger:Carga dos dados concluída com sucesso no SQLite!


In [None]:
# Row counts per stage, in the order they are reported.
row_counts = [
    ("Total notícias cru", df_news),
    ("Total registros históricos cru", df_yahoo),
    ("Total notícias tratadas", df_news_clean),
    ("Total registros históricos tratados", df_yahoo_clean),
    ("Total notícias removidas", transform.removed_news),
    ("Total registros históricos removidos", transform.removed_yahoo),
]

print("Pipeline finalizada!")
for caption, frame in row_counts:
    print(f"{caption}: {len(frame)}")

# Preview the first 10 rows of each stage: raw, cleaned, removed.
for frame in (df_news, df_news_clean, transform.removed_news):
    display(frame.head(10))

for frame in (df_yahoo, df_yahoo_clean, transform.removed_yahoo):
    display(frame.head(10))

logger.info(f"Log salvo em: {log_file}")

Pipeline finalizada!
Total notícias cru: 161
Total registros históricos cru: 1260
Total notícias tratadas: 156
Total registros históricos tratados: 1260
Total notícias removidas: 5
Total registros históricos removidos: 0


Unnamed: 0,NoticiaID,titulo,url,lead,dataHora
0,1,"Polícia prende Alex Gardenal, homem com extens...",https://g1.globo.com/ce/ceara/noticia/2025/10/...,Alex Gardenal foi preso novamente na última te...,2025-10-15T16:35:19.903-03:00
1,2,Policiais receberam R$ 300 mil em propina para...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Uma investigação do Ministério Público do Esta...,2025-10-15T13:39:51.220-03:00
2,3,Motociclista morre atropelado por Hilux em For...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um acidente de trânsito resultou na morte de u...,2025-10-15T12:51:37.557-03:00
3,4,Assista às reportagens do CETV 1ª edição,https://g1.globo.com/ce/ceara/edicao/2025/05/2...,,
4,5,Vídeo: polícia cumpre 12 mandados de prisão co...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,"A Polícia Civil do Ceará cumpriu, na madrugada...",2025-10-15T11:17:10.510-03:00
5,6,Vídeo: homem morre durante exercício físico em...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um homem de 44 anos faleceu na noite da última...,2025-10-14T11:26:58.739-03:00
6,7,Assaltante derruba motociclista em movimento e...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um motociclista foi vítima de um assalto na ta...,2025-10-15T10:56:52.579-03:00
7,8,Governo passa a indicar faixa etária para apps...,https://g1.globo.com/politica/noticia/2025/10/...,"O ministro da Justiça e Segurança Pública, Ric...",2025-10-15T15:35:05.673-03:00
8,9,Lula confirma reunião e brinca sobre Trump: 'P...,https://g1.globo.com/politica/noticia/2025/10/...,Presidente Lula confirmou nesta quarta-feira (...,2025-10-15T12:13:56.886-03:00
9,10,O que define um 'serial killer'? Especialistas...,https://g1.globo.com/ciencia/noticia/2025/10/1...,Caso da estudante de Direito apontada pela pol...,2025-10-15T12:14:26.315-03:00


Unnamed: 0,NoticiaID,titulo,url,lead,dataHora
0,1,"Polícia prende Alex Gardenal, homem com extens...",https://g1.globo.com/ce/ceara/noticia/2025/10/...,Alex Gardenal foi preso novamente na última te...,15/10/25 16:35:19
1,2,Policiais receberam R$ 300 mil em propina para...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Uma investigação do Ministério Público do Esta...,15/10/25 13:39:51
2,3,Motociclista morre atropelado por Hilux em For...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um acidente de trânsito resultou na morte de u...,15/10/25 12:51:37
3,4,Assista às reportagens do CETV 1ª edição,https://g1.globo.com/ce/ceara/edicao/2025/05/2...,Sem dados,Sem dados
4,5,Vídeo: polícia cumpre 12 mandados de prisão co...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,"A Polícia Civil do Ceará cumpriu, na madrugada...",15/10/25 11:17:10
5,6,Vídeo: homem morre durante exercício físico em...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um homem de 44 anos faleceu na noite da última...,14/10/25 11:26:58
6,7,Assaltante derruba motociclista em movimento e...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um motociclista foi vítima de um assalto na ta...,15/10/25 10:56:52
7,8,Governo passa a indicar faixa etária para apps...,https://g1.globo.com/politica/noticia/2025/10/...,"O ministro da Justiça e Segurança Pública, Ric...",15/10/25 15:35:05
8,9,Lula confirma reunião e brinca sobre Trump: 'P...,https://g1.globo.com/politica/noticia/2025/10/...,Presidente Lula confirmou nesta quarta-feira (...,15/10/25 12:13:56
9,10,O que define um 'serial killer'? Especialistas...,https://g1.globo.com/ciencia/noticia/2025/10/1...,Caso da estudante de Direito apontada pela pol...,15/10/25 12:14:26


Unnamed: 0,NoticiaID,titulo,url,lead,dataHora,url_hash
0,25,PGR pede que STF reabra inquérito sobre supost...,https://g1.globo.com/politica/noticia/2025/10/...,A PGR pediu nesta quarta-feira (15) que o STF ...,2025-10-15T18:24:42.845-03:00,d964e0cbdf26877aff02d3075a8318d4
1,26,"Correios: queda nas receitas, gastos com pesso...",https://g1.globo.com/economia/noticia/2025/10/...,Correios anunciaram que buscam R$ 20 bilhões e...,2025-10-15T17:10:15.636-03:00,959fa6dcf87a2647189d9ee21f5e0f33
2,31,Governo passa a classificar aplicativos e cria...,https://g1.globo.com/politica/noticia/2025/10/...,"O ministro da Justiça e Segurança Pública, Ric...",2025-10-15T15:35:05.673-03:00,534fb2956b473f51999e5af955b64f83
3,40,Lula confirma reunião de negociação com EUA ne...,https://g1.globo.com/politica/noticia/2025/10/...,Presidente Lula confirmou nesta quarta-feira (...,2025-10-15T12:13:56.886-03:00,59983ededc2697ac4716f4dcdad5e113
4,45,O que define um 'serial killer'? Veja o que di...,https://g1.globo.com/ciencia/noticia/2025/10/1...,Caso da estudante de Direito apontada pela pol...,2025-10-15T12:14:26.315-03:00,4e3b4b9c34a6069de7e051a4b9ed852d


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Empresa,Setor,Industria,Moeda,Pais,Bolsa,Data_Coletada
0,2025-04-16,104.53685,106.776566,100.43736,104.476852,397016900,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
1,2025-04-17,104.436853,104.456855,100.037413,101.477226,292517500,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
2,2025-04-21,98.757571,99.427492,95.028044,96.897812,288501100,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
3,2025-04-22,98.767569,99.797438,97.267758,98.877556,241004800,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
4,2025-04-23,104.506845,104.786816,102.007159,102.697075,247526000,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
5,2025-04-24,103.466977,106.52659,103.097021,106.416603,220815000,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
6,2025-04-25,106.83656,111.905922,105.716706,110.99604,251064700,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
7,2025-04-28,109.676194,110.356109,106.006651,108.716316,207708500,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
8,2025-04-29,107.65645,110.186131,107.426484,109.006279,170444300,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07
9,2025-04-30,104.456859,108.906296,104.066908,108.906296,235044600,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25 21:02:07


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Empresa,Setor,Industria,Moeda,Pais,Bolsa,Data_Coletada
0,16/04/25,104.53685,106.776566,100.43736,104.476852,397016900,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
1,17/04/25,104.436853,104.456855,100.037413,101.477226,292517500,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
2,21/04/25,98.757571,99.427492,95.028044,96.897812,288501100,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
3,22/04/25,98.767569,99.797438,97.267758,98.877556,241004800,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
4,23/04/25,104.506845,104.786816,102.007159,102.697075,247526000,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
5,24/04/25,103.466977,106.52659,103.097021,106.416603,220815000,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
6,25/04/25,106.83656,111.905922,105.716706,110.99604,251064700,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
7,28/04/25,109.676194,110.356109,106.006651,108.716316,207708500,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
8,29/04/25,107.65645,110.186131,107.426484,109.006279,170444300,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25
9,30/04/25,104.456859,108.906296,104.066908,108.906296,235044600,0.0,0.0,NVDA,NVIDIA Corporation,Technology,Semiconductors,USD,United States,NMS,15/10/25


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Empresa,Setor,Industria,Moeda,Pais,Bolsa,Data_Coletada,hash


15-10-25 21:02:21 [INFO] Log salvo em: ../logs/pipeline\PipelineLogger_15-10-25_20-59-43.log
INFO:PipelineLogger:Log salvo em: ../logs/pipeline\PipelineLogger_15-10-25_20-59-43.log


In [None]:
import sqlite3
import pandas as pd



# Sanity check of the load step: for every table, show the first 10 rows and
# the total row count.
tables = ["news", "instruments", "prices"]

conn = sqlite3.connect(DB_PATH)
try:
    for table in tables:
        print(f"===== {table.upper()} =====")
        try:
            # Table names come from the hard-coded list above; SQL identifiers
            # cannot be bound as query parameters, hence the f-strings.
            df = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 10", conn)
            display(df)
            total = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            print(f"Total registros na tabela {table}: {total}\n")
        except Exception as e:
            # Best effort: one unreadable table must not hide the others.
            print(f"Erro ao ler tabela {table}: {e}\n")
finally:
    # Always release the connection, even if the loop is interrupted, so the
    # kernel does not keep the database file open.
    conn.close()

===== NEWS =====


Unnamed: 0,id,noticia_id,titulo,url,lead,data_hora
0,1,1,Vídeo: polícia cumpre 12 mandados de prisão co...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,"A Polícia Civil do Ceará cumpriu, na madrugada...",15/10/25 11:17:10
1,2,2,Vídeo: homem morre durante exercício físico em...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um homem de 44 anos faleceu na noite da última...,14/10/25 11:26:58
2,3,3,Assaltante derruba motociclista em movimento e...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um motociclista foi vítima de um assalto na ta...,15/10/25 10:56:52
3,4,4,Assista às reportagens do CETV 1ª edição,https://g1.globo.com/ce/ceara/edicao/2025/05/2...,Sem dados,Sem dados
4,5,5,Vagões do metrô ficam superlotados com a reduç...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Falhas técnicas em duas composições que operam...,15/10/25 07:22:31
5,6,6,Vídeo: motociclista atropela corredor e foge s...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Um homem foi atropelado enquanto corria com a ...,15/10/25 05:03:06
6,7,7,Calçada em Fortaleza vira 'atalho' para motoci...,https://g1.globo.com/ce/ceara/noticia/2025/10/...,Uma calçada no cruzamento da Rua 15 de Novembr...,15/10/25 08:31:12
7,8,8,Trump autoriza ação secreta da CIA e 'operaçõe...,https://g1.globo.com/mundo/noticia/2025/10/15/...,O governo de Donald Trump autorizou ações secr...,15/10/25 15:43:10
8,9,9,Lula confirma reunião e brinca sobre Trump: 'P...,https://g1.globo.com/politica/noticia/2025/10/...,Presidente Lula confirmou nesta quarta-feira (...,15/10/25 12:13:56
9,10,10,"Sem dinheiro nem para demitir, Correios buscam...",https://g1.globo.com/economia/noticia/2025/10/...,Os Correios anunciaram nesta quarta-feira (15)...,15/10/25 12:51:40


Total registros na tabela news: 255

===== INSTRUMENTS =====


Unnamed: 0,instrument_id,ticker,nome,setor,pais,bolsa
0,1,NVIDIA Corporation,NVIDIA Corporation,Technology,United States,NMS
1,2,Microsoft Corporation,Microsoft Corporation,Technology,United States,NMS
2,3,Apple Inc.,Apple Inc.,Technology,United States,NMS
3,4,"Amazon.com, Inc.","Amazon.com, Inc.",Consumer Cyclical,United States,NMS
4,5,"Tesla, Inc.","Tesla, Inc.",Consumer Cyclical,United States,NMS
5,6,Broadcom Inc.,Broadcom Inc.,Technology,United States,NMS
6,7,"Meta Platforms, Inc.","Meta Platforms, Inc.",Communication Services,United States,NMS
7,8,Berkshire Hathaway Inc. New,Berkshire Hathaway Inc. New,Financial Services,United States,NYQ
8,9,JP Morgan Chase & Co.,JP Morgan Chase & Co.,Financial Services,United States,NYQ
9,10,Eli Lilly and Company,Eli Lilly and Company,Healthcare,United States,NYQ


Total registros na tabela instruments: 20

===== PRICES =====


Unnamed: 0,date,ticker,open,high,low,close,volume,data_coletada
0,2025-04-15,NVIDIA Corporation,110.956042,113.60571,110.486099,112.185883,228966900,15/10/25
1,2025-04-16,NVIDIA Corporation,104.53685,106.776566,100.43736,104.476852,397016900,15/10/25
2,2025-04-17,NVIDIA Corporation,104.436853,104.456855,100.037413,101.477226,292517500,15/10/25
3,2025-04-21,NVIDIA Corporation,98.757571,99.427492,95.028044,96.897812,288501100,15/10/25
4,2025-04-22,NVIDIA Corporation,98.767569,99.797438,97.267758,98.877556,241004800,15/10/25
5,2025-04-23,NVIDIA Corporation,104.506845,104.786816,102.007159,102.697075,247526000,15/10/25
6,2025-04-24,NVIDIA Corporation,103.466977,106.52659,103.097021,106.416603,220815000,15/10/25
7,2025-04-25,NVIDIA Corporation,106.83656,111.905922,105.716706,110.99604,251064700,15/10/25
8,2025-04-28,NVIDIA Corporation,109.676194,110.356109,106.006651,108.716316,207708500,15/10/25
9,2025-04-29,NVIDIA Corporation,107.65645,110.186131,107.426484,109.006279,170444300,15/10/25


Total registros na tabela prices: 2540

