In [5]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from gnews import GNews
import time, re

RAW_NEWS_DIR = Path("../data/raw/noticias"); RAW_NEWS_DIR.mkdir(parents=True, exist_ok=True)
INT_NEWS_DIR = Path("../data/interim/noticias"); INT_NEWS_DIR.mkdir(parents=True, exist_ok=True)
PROC_NEWS_DIR = Path("../data/processed/noticias"); PROC_NEWS_DIR.mkdir(parents=True, exist_ok=True)

# Config inicial: España, ES, resultados en español primero
g = GNews(language='es', country='ES', max_results=100)  # max_results por query


In [6]:
from datetime import datetime, date
import pandas as pd
from tqdm import tqdm
import time

def month_iter(year_start=2024, year_end=2025):
    for y in range(year_start, year_end+1):
        for m in range(1, 13):
            yield y, m

def month_date_range_dt(year:int, month:int):
    # Devuelve objetos datetime.date (no strings)
    start = pd.Timestamp(year=year, month=month, day=1)
    end = (start + pd.offsets.MonthEnd(1))
    return start.date(), end.date()

def fetch_gnews_bbva(year_start=2024, year_end=2025, sleep_s=0.4):
    rows = []
    for y, m in tqdm(list(month_iter(year_start, year_end))):
        start_dt, end_dt = month_date_range_dt(y, m)

        # OBLIGATORIO: asignar objetos date/datetime (no str)
        g.start_date = start_dt
        g.end_date   = end_dt

        query = '("BBVA" OR "Banco Bilbao Vizcaya Argentaria" OR BBVA.MC)'
        try:
            items = g.get_news(query) or []
        except Exception:
            items = []

        for it in items:
            rows.append({
                "dt": it.get("published date"),
                "title": it.get("title"),
                "url": it.get("url"),
                "publisher": (it.get("publisher") or {}).get("title"),
                "desc": it.get("description"),
                "_kw": "BBVA",
                "_source": "gnews",
                "_window": f"{start_dt}..{end_dt}"
            })
        print(f"{y}-{m:02d}: {len(items)}")
        time.sleep(sleep_s)

    df = pd.DataFrame(rows)
    if not df.empty:
        df["dt"] = pd.to_datetime(df["dt"], errors="coerce")
        df = df.dropna(subset=["dt"]).sort_values("dt").reset_index(drop=True)
    return df

news_bbva_2425 = fetch_gnews_bbva(2024, 2025)
print(news_bbva_2425.shape)
news_bbva_2425.head(3)


  0%|          | 0/24 [00:00<?, ?it/s]

2024-01: 100




2024-02: 100


  8%|▊         | 2/24 [02:25<26:33, 72.44s/it]

2024-03: 100


 12%|█▎        | 3/24 [03:35<25:00, 71.44s/it]

2024-04: 100


 17%|█▋        | 4/24 [04:46<23:44, 71.20s/it]

2024-05: 100


 21%|██        | 5/24 [06:01<22:58, 72.55s/it]

2024-06: 100


 25%|██▌       | 6/24 [07:27<23:06, 77.02s/it]

2024-07: 100


 29%|██▉       | 7/24 [09:03<23:35, 83.27s/it]

2024-08: 100


 33%|███▎      | 8/24 [10:42<23:34, 88.42s/it]

2024-09: 100


 38%|███▊      | 9/24 [12:07<21:46, 87.10s/it]

2024-10: 100


 42%|████▏     | 10/24 [13:30<20:05, 86.11s/it]

2024-11: 100


 46%|████▌     | 11/24 [14:54<18:28, 85.29s/it]

2024-12: 100


 50%|█████     | 12/24 [16:21<17:10, 85.85s/it]

2025-01: 100


 54%|█████▍    | 13/24 [17:50<15:54, 86.80s/it]

2025-02: 100


 58%|█████▊    | 14/24 [19:15<14:21, 86.19s/it]

2025-03: 100


 62%|██████▎   | 15/24 [20:45<13:07, 87.52s/it]

2025-04: 0


 67%|██████▋   | 16/24 [20:47<08:13, 61.71s/it]

2025-05: 100


 71%|███████   | 17/24 [22:18<08:12, 70.34s/it]

2025-06: 100


 75%|███████▌  | 18/24 [23:51<07:43, 77.32s/it]

2025-07: 100


 79%|███████▉  | 19/24 [25:36<07:07, 85.46s/it]

2025-08: 100


 83%|████████▎ | 20/24 [27:18<06:02, 90.55s/it]

2025-09: 100


 88%|████████▊ | 21/24 [28:52<04:34, 91.62s/it]

2025-10: 100


 92%|█████████▏| 22/24 [30:17<02:59, 89.70s/it]

2025-11: 57


 96%|█████████▌| 23/24 [31:02<01:16, 76.30s/it]

2025-12: 0


100%|██████████| 24/24 [31:03<00:00, 77.66s/it]

(2157, 8)





Unnamed: 0,dt,title,url,publisher,desc,_kw,_source,_window
0,2024-01-01 08:00:00,BBVA y Jordi Roca fomentan el aprovechamiento ...,https://news.google.com/rss/articles/CBMimgFBV...,Gastroactitud,BBVA y Jordi Roca fomentan el aprovechamiento ...,BBVA,gnews,2024-01-01..2024-01-31
1,2024-01-02 08:00:00,Se filtraron los datos de mis tarjetas de créd...,https://news.google.com/rss/articles/CBMi2wFBV...,Info Viajera,Se filtraron los datos de mis tarjetas de créd...,BBVA,gnews,2024-01-01..2024-01-31
2,2024-01-02 08:00:00,"BBVA Argentina, entre las empresas con mejor r...",https://news.google.com/rss/articles/CBMirAFBV...,Diario Responsable,"BBVA Argentina, entre las empresas con mejor r...",BBVA,gnews,2024-01-01..2024-01-31


In [7]:
from pathlib import Path

RAW_NEWS_DIR = Path("../data/raw/noticias"); RAW_NEWS_DIR.mkdir(parents=True, exist_ok=True)

def save_df(df, path_stem: Path):
    """
    Guarda df en Parquet si hay motor disponible; si falla, guarda CSV.
    path_stem es la ruta SIN extensión (Path(.../"nombre_sin_ext")).
    """
    pq = path_stem.with_suffix(".parquet")
    csv = path_stem.with_suffix(".csv")
    try:
        df.to_parquet(pq, index=False, engine="pyarrow")
        print(f"✔ Guardado Parquet: {pq}")
        return pq
    except Exception as e1:
        try:
            df.to_parquet(pq, index=False)  # por si existe otro engine
            print(f"✔ Guardado Parquet (engine auto): {pq}")
            return pq
        except Exception as e2:
            print(f"⚠ Parquet no disponible ({e1 or e2}). Guardo CSV.")
            df.to_csv(csv, index=False)
            print(f"✔ Guardado CSV: {csv}")
            return csv

def clean_news_basic(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: 
        return df
    out = df.copy()
    # Normalizar strings
    for c in ["title","desc","publisher","url"]:
        if c in out.columns:
            out[c] = out[c].astype(str).str.strip()
    # Deduplicados
    out = out.drop_duplicates(subset=["url"])
    out = out.drop_duplicates(subset=["title","dt"])
    # Mantener columnas clave
    keep = ["dt","title","desc","publisher","url","_kw","_source","_window"]
    out = out[[c for c in keep if c in out.columns]].reset_index(drop=True)
    return out

bbva_raw_clean = clean_news_basic(news_bbva_2425)
_ = save_df(bbva_raw_clean, RAW_NEWS_DIR / "gnews_bbva_2024_2025")
bbva_raw_clean.shape, bbva_raw_clean.head(3)


✔ Guardado Parquet: ..\data\raw\noticias\gnews_bbva_2024_2025.parquet


((2157, 8),
                    dt                                              title  \
 0 2024-01-01 08:00:00  BBVA y Jordi Roca fomentan el aprovechamiento ...   
 1 2024-01-02 08:00:00  Se filtraron los datos de mis tarjetas de créd...   
 2 2024-01-02 08:00:00  BBVA Argentina, entre las empresas con mejor r...   
 
                                                 desc           publisher  \
 0  BBVA y Jordi Roca fomentan el aprovechamiento ...       Gastroactitud   
 1  Se filtraron los datos de mis tarjetas de créd...        Info Viajera   
 2  BBVA Argentina, entre las empresas con mejor r...  Diario Responsable   
 
                                                  url   _kw _source  \
 0  https://news.google.com/rss/articles/CBMimgFBV...  BBVA   gnews   
 1  https://news.google.com/rss/articles/CBMi2wFBV...  BBVA   gnews   
 2  https://news.google.com/rss/articles/CBMirAFBV...  BBVA   gnews   
 
                   _window  
 0  2024-01-01..2024-01-31  
 1  2024-01-01..2024-01-

In [8]:
bbva_raw_clean.assign(month=bbva_raw_clean["dt"].dt.to_period("M")) \
              .groupby("month")["url"].count().tail(12)

month
2024-11    100
2024-12    100
2025-01    100
2025-02    100
2025-03    100
2025-05    100
2025-06    100
2025-07    100
2025-08    100
2025-09    100
2025-10    100
2025-11     57
Freq: M, Name: url, dtype: int64

In [9]:
from pysentimiento import create_analyzer
from tqdm import tqdm
from pathlib import Path
import pandas as pd

INT_NEWS_DIR = Path("../data/interim/noticias"); INT_NEWS_DIR.mkdir(parents=True, exist_ok=True)

analyzer = create_analyzer(task="sentiment", lang="es")  # BETO

def pick_text(row):
    t = (row.get("title") or "").strip()
    d = (row.get("desc") or "").strip()
    txt = (t + ". " + d).strip()
    return txt[:900]  # recorte para velocidad/estabilidad

def infer_sentiment_df(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: 
        return df.copy()
    labs, scores = [], []
    for _, r in tqdm(df.iterrows(), total=len(df)):
        txt = pick_text(r)
        if not txt:
            labs.append(None); scores.append(None); continue
        res = analyzer.predict(txt)
        labs.append(res.output)                 # POS/NEG/NEU
        scores.append(res.probas.get(res.output))
    out = df.copy()
    out["sent_label"] = labs
    out["sent_score"] = scores
    return out

bbva_sent = infer_sentiment_df(bbva_raw_clean)
_ = save_df(bbva_sent, INT_NEWS_DIR / "gnews_bbva_2024_2025_sent")
bbva_sent.head(3)


100%|██████████| 2157/2157 [03:59<00:00,  9.02it/s]

✔ Guardado Parquet: ..\data\interim\noticias\gnews_bbva_2024_2025_sent.parquet





Unnamed: 0,dt,title,desc,publisher,url,_kw,_source,_window,sent_label,sent_score
0,2024-01-01 08:00:00,BBVA y Jordi Roca fomentan el aprovechamiento ...,BBVA y Jordi Roca fomentan el aprovechamiento ...,Gastroactitud,https://news.google.com/rss/articles/CBMimgFBV...,BBVA,gnews,2024-01-01..2024-01-31,POS,0.477571
1,2024-01-02 08:00:00,Se filtraron los datos de mis tarjetas de créd...,Se filtraron los datos de mis tarjetas de créd...,Info Viajera,https://news.google.com/rss/articles/CBMi2wFBV...,BBVA,gnews,2024-01-01..2024-01-31,NEU,0.572798
2,2024-01-02 08:00:00,"BBVA Argentina, entre las empresas con mejor r...","BBVA Argentina, entre las empresas con mejor r...",Diario Responsable,https://news.google.com/rss/articles/CBMirAFBV...,BBVA,gnews,2024-01-01..2024-01-31,NEU,0.55003


In [10]:
from pysentimiento import create_analyzer

analyzer = create_analyzer(task="sentiment", lang="es")
print("OK · pysentimiento cargado")

for t in [
    "BBVA presenta beneficios récord y mejora su guía",
    "El regulador impone una multa millonaria a la entidad",
    "El banco mantiene estable su posición de solvencia",
]:
    r = analyzer.predict(t)
    print(f"{t}\n -> {r.output} {r.probas}\n")


OK · pysentimiento cargado
BBVA presenta beneficios récord y mejora su guía
 -> POS {'NEG': 0.03400544077157974, 'NEU': 0.38048219680786133, 'POS': 0.5855123996734619}

El regulador impone una multa millonaria a la entidad
 -> NEU {'NEG': 0.4376159608364105, 'NEU': 0.5081786513328552, 'POS': 0.05420536920428276}

El banco mantiene estable su posición de solvencia
 -> NEU {'NEG': 0.03560333326458931, 'NEU': 0.7690609693527222, 'POS': 0.19533571600914001}



In [11]:
from pathlib import Path

PROC_NEWS_DIR = Path("../data/processed/noticias"); PROC_NEWS_DIR.mkdir(parents=True, exist_ok=True)

df = bbva_sent.copy()
df["Date"] = df["dt"].dt.date

map_num = {"POS": 1, "NEU": 0, "NEG": -1, None: 0}
df["sent_num"] = df["sent_label"].map(map_num).fillna(0)

# Diario
daily = (df.groupby("Date")
           .agg(n=("url","count"),
                pos=("sent_label", lambda s: (s=="POS").sum()),
                neg=("sent_label", lambda s: (s=="NEG").sum()),
                neu=("sent_label", lambda s: (s=="NEU").sum()),
                sent_mean=("sent_num","mean"))
           .reset_index())

# Semanal (inicio lunes)
tmp = pd.to_datetime(df["Date"])
weekly = (df.assign(week=tmp.dt.to_period("W-MON").dt.start_time)
            .groupby("week")
            .agg(n=("url","count"),
                 pos=("sent_label", lambda s: (s=="POS").sum()),
                 neg=("sent_label", lambda s: (s=="NEG").sum()),
                 neu=("sent_label", lambda s: (s=="NEU").sum()),
                 sent_mean=("sent_num","mean"))
            .reset_index().rename(columns={"week":"Date"}))

# Mensual (inicio de mes)
monthly = (df.assign(month=pd.to_datetime(df["Date"]).values.astype("datetime64[M]"))
             .groupby("month")
             .agg(n=("url","count"),
                  pos=("sent_label", lambda s: (s=="POS").sum()),
                  neg=("sent_label", lambda s: (s=="NEG").sum()),
                  neu=("sent_label", lambda s: (s=="NEU").sum()),
                  sent_mean=("sent_num","mean"))
             .reset_index().rename(columns={"month":"Date"}))

_ = save_df(daily,   PROC_NEWS_DIR / "bbva_daily_2024_2025")
_ = save_df(weekly,  PROC_NEWS_DIR / "bbva_weekly_2024_2025")
_ = save_df(monthly, PROC_NEWS_DIR / "bbva_monthly_2024_2025")

daily.tail(5)


✔ Guardado Parquet: ..\data\processed\noticias\bbva_daily_2024_2025.parquet
✔ Guardado Parquet: ..\data\processed\noticias\bbva_weekly_2024_2025.parquet
✔ Guardado Parquet: ..\data\processed\noticias\bbva_monthly_2024_2025.parquet


Unnamed: 0,Date,n,pos,neg,neu,sent_mean
492,2025-10-28,1,0,0,1,0.0
493,2025-10-29,4,0,1,3,-0.25
494,2025-10-30,8,1,3,4,-0.25
495,2025-11-01,28,2,6,20,-0.142857
496,2025-11-02,29,4,3,22,0.034483


In [15]:
# --- Asegurar mismas fechas y dtypes ---
precios_bbva = pd.read_csv("../data/interim/precios_limpios/BBVA_core_clean.csv", parse_dates=["Date"])

# Convertimos ambos a datetime64[ns] a medianoche (naive)
daily_fix = daily.copy()
daily_fix["Date"] = pd.to_datetime(daily_fix["Date"])           # pasa de object(date) -> datetime64[ns]
precios_bbva["Date"] = pd.to_datetime(precios_bbva["Date"]).dt.normalize()

# --- Merge (left sobre precios, para no perder días de mercado) ---
final_bbva = precios_bbva.merge(daily_fix, on="Date", how="left")

# (Opcional) Rellenos de NaN de noticias cuando no hubo artículos ese día
final_bbva[["n","pos","neg","neu"]] = final_bbva[["n","pos","neg","neu"]].fillna(0).astype("int64")
final_bbva["sent_mean"] = final_bbva["sent_mean"].fillna(0.0)

_ = save_df(final_bbva, Path("../data/processed/noticias") / "bbva_precios_sent_diario_2024_2025")
final_bbva.tail(10)


✔ Guardado Parquet: ..\data\processed\noticias\bbva_precios_sent_diario_2024_2025.parquet


Unnamed: 0,Date,Close,High,Low,Open,Volume,n,pos,neg,neu,sent_mean
6624,2025-10-17,1.6655e+16,173800000000000.0,1.6395e+16,1.6655e+16,32814118.0,25,1,20,4,-0.76
6625,2025-10-20,1.7295e+16,1732000000000000.0,1.679e+16,168700000000000.0,10945260.0,5,2,1,2,0.2
6626,2025-10-21,1.7065e+16,1.73e+16,170050000000000.0,1.727e+16,7222921.0,4,0,0,4,0.0
6627,2025-10-22,170.0,1718000000000000.0,1.685e+16,1.69e+16,7052132.0,4,0,0,4,0.0
6628,2025-10-23,1.696e+16,1.7175e+16,1.69e+16,170.0,4720528.0,0,0,0,0,0.0
6629,2025-10-24,1.69e+16,1.71e+16,1.6735e+16,1.7015e+16,6644829.0,1,0,0,1,0.0
6630,2025-10-27,1.72e+16,1.7235e+16,170050000000000.0,1.705e+16,6597073.0,0,0,0,0,0.0
6631,2025-10-28,1.719e+16,1721500000000000.0,1.704e+16,1707000000000000.0,5314738.0,1,0,0,1,0.0
6632,2025-10-29,1.758e+16,1.7585e+16,1705500000000000.0,1.721e+16,10750147.0,4,0,1,3,-0.25
6633,2025-10-30,1.728e+16,1741000000000000.0,1.705e+16,1.739e+16,10839276.0,8,1,3,4,-0.25


---

In [17]:
import pandas as pd
from pathlib import Path

# Intenta usar el CSV directo de yfinance (el que guardaste en el notebook de precios)
p1 = Path("../data/precios/BBVA.csv")
p2 = Path("../data/interim/precios_limpios/BBVA_core_clean.csv")

if p1.exists():
    precios_bbva = pd.read_csv(p1, parse_dates=["Date"])
    fuente = "yfinance CSV (recomendado)"
else:
    # fallback: intenta el 'core_clean' pero forzaremos saneado de columnas
    precios_bbva = pd.read_csv(p2, parse_dates=["Date"])
    fuente = "core_clean (se saneará)"

print("Fuente precios:", fuente)
print(precios_bbva.dtypes)
precios_bbva.head()

Fuente precios: core_clean (se saneará)
Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume           float64
dtype: object


Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2000-01-03,4115111000000000.0,415574000000000.0,410640500000000.0,4135426000000000.0,8244257.0
1,2000-01-04,4007733000000000.0,4088991000000000.0,3.993223e+16,4062872000000000.0,8522096.0
2,2000-01-05,3917772000000000.0,3990323000000000.0,3900359000000000.0,3.970008e+16,12159826.0
3,2000-01-06,3917772000000000.0,3.917772e+16,3.917772e+16,3.917772e+16,0.0
4,2000-01-07,3967107000000000.0,4001931000000000.0,3926478000000000.0,4001931000000000.0,62261944.0


In [18]:
# Detecta columnas de precio típicas
price_cols = [c for c in ["Open","High","Low","Close","Adj Close"] if c in precios_bbva.columns]

# Si hay magnitudes absurdas, re-parsea fuerte (quita separadores raros) y normaliza
def needs_fix(s):
    try:
        v = pd.Series(s).dropna()
        return (v.abs().gt(1e4).mean() > 0.2)  # si >20% de filas superan 10k, mal
    except Exception:
        return True

if any(needs_fix(precios_bbva[c]) for c in price_cols):
    for c in price_cols:
        precios_bbva[c] = (
            precios_bbva[c]
            .astype(str)
            .str.replace(" ", "", regex=False)
            .str.replace("\u00A0", "", regex=False)     # no-break space
            .str.replace(".", "", regex=False)          # quita miles en formato EU
            .str.replace(",", ".", regex=False)         # coma -> punto
        )
        precios_bbva[c] = pd.to_numeric(precios_bbva[c], errors="coerce")

    # Si aún quedan monstruos, intenta una normalización por orden de magnitud (opcional)
    for c in price_cols:
        v = precios_bbva[c]
        med = v.dropna().median()
        if pd.notna(med) and med > 1000:
            # divide hasta que mediana caiga a rango razonable (0.1–1000)
            while med > 1000:
                v = v / 10
                med = v.dropna().median()
            precios_bbva[c] = v

print(precios_bbva[price_cols].describe(percentiles=[]))

               Open          High           Low         Close
count  6.634000e+03  6.634000e+03  6.634000e+03  6.634000e+03
mean   4.048540e+17  3.786762e+17  4.045265e+17  3.725203e+17
std    9.722699e+17  9.623619e+17  9.590312e+17  9.314793e+17
min    1.200000e-11  1.250000e-11  1.250000e-11  1.300000e-11
50%    5.065551e+02  5.002445e+02  5.025070e+02  5.041849e+02
max    7.038987e+18  7.075268e+18  6.684943e+18  7.121778e+18


In [19]:
# Asegura tipo datetime64[ns] en ambas tablas
precios_bbva["Date"] = pd.to_datetime(precios_bbva["Date"]).dt.normalize()

daily_fix = daily.copy()
daily_fix["Date"] = pd.to_datetime(daily_fix["Date"])  # de object(date) -> datetime64[ns]

final_bbva = precios_bbva.merge(daily_fix, on="Date", how="left")

# Rellenos cuando no hubo noticias ese día
for col in ["n","pos","neg","neu"]:
    if col in final_bbva.columns:
        final_bbva[col] = final_bbva[col].fillna(0).astype("int64")
if "sent_mean" in final_bbva.columns:
    final_bbva["sent_mean"] = final_bbva["sent_mean"].fillna(0.0)

_ = save_df(final_bbva, Path("../data/processed/noticias") / "bbva_precios_sent_diario_2024_2025")
final_bbva.tail(10)


✔ Guardado Parquet: ..\data\processed\noticias\bbva_precios_sent_diario_2024_2025.parquet


Unnamed: 0,Date,Close,High,Low,Open,Volume,n,pos,neg,neu,sent_mean
6624,2025-10-17,1.6655e+18,17.38,1.6395e+18,1.6655e+18,32814118.0,25,1,20,4,-0.76
6625,2025-10-20,1.7295e+18,173.2,1.679e+18,16.87,10945260.0,5,2,1,2,0.2
6626,2025-10-21,1.7065e+18,1.73e+18,17.005,1.727e+18,7222921.0,4,0,0,4,0.0
6627,2025-10-22,1.7e-11,171.8,1.685e+18,1.69e+18,7052132.0,4,0,0,4,0.0
6628,2025-10-23,1.696e+18,1.7175e+18,1.69e+18,1.7e-11,4720528.0,0,0,0,0,0.0
6629,2025-10-24,1.69e+18,1.71e+18,1.6735e+18,1.7015e+18,6644829.0,1,0,0,1,0.0
6630,2025-10-27,1.72e+18,1.7235e+18,17.005,1.705e+18,6597073.0,0,0,0,0,0.0
6631,2025-10-28,1.719e+18,172.15,1.704e+18,170.7,5314738.0,1,0,0,1,0.0
6632,2025-10-29,1.758e+18,1.7585e+18,170.55,1.721e+18,10750147.0,4,0,1,3,-0.25
6633,2025-10-30,1.728e+18,174.1,1.705e+18,1.739e+18,10839276.0,8,1,3,4,-0.25
