In [2]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from gnews import GNews
import time, re

# Folders for raw, interim and processed news data. Paths are relative to the
# working directory; each folder (and any missing parent) is created right away.
RAW_NEWS_DIR = Path("../data/raw/noticias"); RAW_NEWS_DIR.mkdir(parents=True, exist_ok=True)
INT_NEWS_DIR = Path("../data/interim/noticias"); INT_NEWS_DIR.mkdir(parents=True, exist_ok=True)
PROC_NEWS_DIR = Path("../data/processed/noticias"); PROC_NEWS_DIR.mkdir(parents=True, exist_ok=True)

# Initial config: Spain edition, Spanish-language results first.
# This client object is shared (and reconfigured) by the fetch code further down.
g = GNews(language='es', country='ES', max_results=100)  # max_results is the per-query cap


In [4]:
from datetime import datetime, date
import pandas as pd
from tqdm import tqdm
import time

def month_iter(year_start=2024, year_end=2025):
    """
    Yield ``(year, month)`` tuples in chronological order.

    Covers every month (1 through 12) of every year from ``year_start`` to
    ``year_end``, both years included. Yields nothing when ``year_end`` is
    smaller than ``year_start``.
    """
    yield from (
        (year, month)
        for year in range(year_start, year_end + 1)
        for month in range(1, 13)
    )

def month_date_range_dt(year:int, month:int):
    """
    Return the first and last calendar day of a month.

    Both values are ``datetime.date`` objects (not strings), returned as a
    ``(first_day, last_day)`` tuple.
    """
    first_ts = pd.Timestamp(year=year, month=month, day=1)
    # Starting from day 1, one MonthEnd offset lands on the last day of the same month.
    bounds = (first_ts, first_ts + pd.offsets.MonthEnd(1))
    return tuple(ts.date() for ts in bounds)

def fetch_gnews_bbva(year_start=2024, year_end=2025, sleep_s=0.4, query=None, keyword="BBVA"):
    """
    Download Google News items month by month and return them as one DataFrame.

    For every month from January of ``year_start`` through December of
    ``year_end`` the module-level GNews client ``g`` is pointed at that month
    and queried once.

    Side effect: ``g.start_date`` and ``g.end_date`` are overwritten on every
    iteration and are left set to the last window when the function returns.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year to cover (both included).
    sleep_s : float
        Pause, in seconds, after each monthly request.
    query : str, optional
        Search expression handed to ``g.get_news``. When omitted, the BBVA
        query is used.
    keyword : str
        Label written to the ``_kw`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per news item with columns ``dt``, ``title``, ``url``,
        ``publisher``, ``desc``, ``_kw``, ``_source`` and ``_window``. ``dt`` is
        parsed with ``pd.to_datetime``; rows whose date cannot be parsed are
        dropped and the rest are sorted by ``dt``. If no item was fetched at
        all, an empty DataFrame (without columns) is returned.

    Notes
    -----
    A failing request does not abort the run: the error is printed and the
    month contributes zero rows, so a failure can be told apart from a month
    that simply had no news.

    NOTE(review): the number of items per request is capped by the client's
    ``max_results``; a month whose count equals that cap was probably
    truncated -- confirm against the GNews documentation.
    """
    if query is None:
        query = '("BBVA" OR "Banco Bilbao Vizcaya Argentaria" OR BBVA.MC)'

    rows = []
    for y, m in tqdm(list(month_iter(year_start, year_end))):
        start_dt, end_dt = month_date_range_dt(y, m)

        # Required: assign date/datetime objects (not str)
        g.start_date = start_dt
        g.end_date   = end_dt

        try:
            items = g.get_news(query) or []
        except Exception as exc:
            # Best effort: keep going with the next month, but report the failure
            # instead of silently recording it as "0 results".
            print(f"⚠ {y}-{m:02d}: error en la consulta ({type(exc).__name__}: {exc})")
            items = []

        for it in items:
            rows.append({
                "dt": it.get("published date"),
                "title": it.get("title"),
                "url": it.get("url"),
                # publisher may be missing; fall back to an empty dict so .get() is safe
                "publisher": (it.get("publisher") or {}).get("title"),
                "desc": it.get("description"),
                "_kw": keyword,
                "_source": "gnews",
                "_window": f"{start_dt}..{end_dt}"
            })
        print(f"{y}-{m:02d}: {len(items)}")
        time.sleep(sleep_s)

    df = pd.DataFrame(rows)
    if not df.empty:
        # Unparseable dates become NaT and are dropped right after
        df["dt"] = pd.to_datetime(df["dt"], errors="coerce")
        df = df.dropna(subset=["dt"]).sort_values("dt").reset_index(drop=True)
    return df

# Fetch 2024-2025, one request per month (the captured run below took about half an hour),
# then show the resulting shape and a three-row preview.
news_bbva_2425 = fetch_gnews_bbva(2024, 2025)
print(news_bbva_2425.shape)
news_bbva_2425.head(3)


  0%|          | 0/24 [00:00<?, ?it/s]

2024-01: 100




2024-02: 100


  8%|▊         | 2/24 [02:21<25:49, 70.44s/it]

2024-03: 100


 12%|█▎        | 3/24 [03:32<24:46, 70.80s/it]

2024-04: 100


 17%|█▋        | 4/24 [04:54<25:09, 75.47s/it]

2024-05: 100


 21%|██        | 5/24 [06:26<25:47, 81.44s/it]

2024-06: 100


 25%|██▌       | 6/24 [08:04<26:03, 86.84s/it]

2024-07: 100


 29%|██▉       | 7/24 [09:36<25:08, 88.71s/it]

2024-08: 100


 33%|███▎      | 8/24 [11:02<23:25, 87.86s/it]

2024-09: 100


 38%|███▊      | 9/24 [12:31<22:00, 88.03s/it]

2024-10: 100


 42%|████▏     | 10/24 [13:57<20:26, 87.62s/it]

2024-11: 100


 46%|████▌     | 11/24 [15:15<18:17, 84.44s/it]

2024-12: 100


 50%|█████     | 12/24 [16:33<16:29, 82.46s/it]

2025-01: 100


 54%|█████▍    | 13/24 [17:50<14:50, 80.99s/it]

2025-02: 100


 58%|█████▊    | 14/24 [19:13<13:36, 81.68s/it]

2025-03: 100


 62%|██████▎   | 15/24 [20:40<12:29, 83.23s/it]

2025-04: 100


 67%|██████▋   | 16/24 [22:07<11:14, 84.27s/it]

2025-05: 100


 71%|███████   | 17/24 [23:53<10:35, 90.79s/it]

2025-06: 100


 75%|███████▌  | 18/24 [25:16<08:51, 88.58s/it]

2025-07: 100


 79%|███████▉  | 19/24 [26:31<07:01, 84.38s/it]

2025-08: 100


 83%|████████▎ | 20/24 [27:48<05:29, 82.26s/it]

2025-09: 100


 88%|████████▊ | 21/24 [29:07<04:03, 81.17s/it]

2025-10: 100


 92%|█████████▏| 22/24 [30:28<02:42, 81.18s/it]

2025-11: 45


 96%|█████████▌| 23/24 [31:10<01:09, 69.49s/it]

2025-12: 0


100%|██████████| 24/24 [31:11<00:00, 78.00s/it]

(2245, 8)





Unnamed: 0,dt,title,url,publisher,desc,_kw,_source,_window
0,2024-01-01 08:00:00,BBVA y Jordi Roca fomentan el aprovechamiento ...,https://news.google.com/rss/articles/CBMimgFBV...,Gastroactitud,BBVA y Jordi Roca fomentan el aprovechamiento ...,BBVA,gnews,2024-01-01..2024-01-31
1,2024-01-02 08:00:00,"BBVA Argentina, entre las empresas con mejor r...",https://news.google.com/rss/articles/CBMirAFBV...,Diario Responsable,"BBVA Argentina, entre las empresas con mejor r...",BBVA,gnews,2024-01-01..2024-01-31
2,2024-01-02 08:00:00,Se filtraron los datos de mis tarjetas de créd...,https://news.google.com/rss/articles/CBMi2wFBV...,Info Viajera,Se filtraron los datos de mis tarjetas de créd...,BBVA,gnews,2024-01-01..2024-01-31


In [6]:
from pathlib import Path

# Same path the first cell assigns to RAW_NEWS_DIR; presumably repeated so this
# cell can run on its own. The folder is created if it does not exist.
RAW_NEWS_DIR = Path("../data/raw/noticias"); RAW_NEWS_DIR.mkdir(parents=True, exist_ok=True)

def save_df(df, path_stem: Path):
    """
    Save ``df`` as Parquet when possible; otherwise fall back to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist (written without its index).
    path_stem : pathlib.Path
        Destination path WITHOUT extension, e.g. ``Path(...) / "name_without_ext"``.
        ``.parquet`` or ``.csv`` is appended to it.

    Returns
    -------
    pathlib.Path
        Path of the file that was actually written.
    """
    # Append the extension rather than using with_suffix(): with_suffix() replaces
    # whatever follows the last dot, so a stem like "news_v1.2" would be
    # truncated to "news_v1.parquet".
    pq = path_stem.with_name(path_stem.name + ".parquet")
    csv = path_stem.with_name(path_stem.name + ".csv")
    try:
        # The default engine ("auto") already tries pyarrow first and then
        # fastparquet, so a single call covers every available engine.
        df.to_parquet(pq, index=False)
    except Exception as err:
        # Deliberate best effort: any Parquet failure (missing engine, unsupported
        # column types, ...) falls back to CSV, and the real error is shown.
        print(f"⚠ No se pudo guardar Parquet ({type(err).__name__}: {err}). Guardo CSV.")
        df.to_csv(csv, index=False)
        print(f"✔ Guardado CSV: {csv}")
        return csv
    print(f"✔ Guardado Parquet: {pq}")
    return pq

def clean_news_basic(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic clean-up of a news DataFrame: trim text, de-duplicate, keep key columns.

    Steps (each one is skipped if the columns it needs are absent):

    1. ``title``, ``desc``, ``publisher`` and ``url`` are converted to stripped
       strings. Missing values become ``""`` rather than the literal text
       ``"None"`` / ``"nan"`` that a plain ``astype(str)`` would produce.
    2. Rows repeating an already-seen non-empty ``url`` are dropped (first
       occurrence wins). Rows without a URL are not collapsed into one.
    3. Rows repeating an already-seen ``(title, dt)`` pair are dropped.
    4. Only the key columns are kept, in a fixed order, and the index is reset.

    The input frame is never modified; a new DataFrame is always returned.
    """
    if df.empty:
        return df.copy()
    out = df.copy()

    # Normalise text columns
    for c in ["title","desc","publisher","url"]:
        if c in out.columns:
            missing = out[c].isna()
            out[c] = out[c].astype(str).str.strip().mask(missing, "")

    # De-duplicate
    if "url" in out.columns:
        repeated_url = out["url"].ne("") & out.duplicated(subset=["url"])
        out = out[~repeated_url]
    if {"title", "dt"}.issubset(out.columns):
        out = out.drop_duplicates(subset=["title","dt"])

    # Keep the key columns
    keep = ["dt","title","desc","publisher","url","_kw","_source","_window"]
    out = out[[c for c in keep if c in out.columns]].reset_index(drop=True)
    return out

# Clean the fetched news and persist the result under the raw-news folder.
# save_df returns the written path; it is bound to "_" so only the tuple on the
# last line is displayed as the cell's output.
bbva_raw_clean = clean_news_basic(news_bbva_2425)
_ = save_df(bbva_raw_clean, RAW_NEWS_DIR / "gnews_bbva_2024_2025")
bbva_raw_clean.shape, bbva_raw_clean.head(3)


⚠ Parquet no disponible (Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.). Guardo CSV.
✔ Guardado CSV: ..\data\raw\noticias\gnews_bbva_2024_2025.csv


((2245, 8),
                    dt                                              title  \
 0 2024-01-01 08:00:00  BBVA y Jordi Roca fomentan el aprovechamiento ...   
 1 2024-01-02 08:00:00  BBVA Argentina, entre las empresas con mejor r...   
 2 2024-01-02 08:00:00  Se filtraron los datos de mis tarjetas de créd...   
 
                                                 desc           publisher  \
 0  BBVA y Jordi Roca fomentan el aprovechamiento ...       Gastroactitud   
 1  BBVA Argentina, entre las empresas con mejor r...  Diario Responsable   
 2  Se filtraron los datos de mis tarjetas de créd...        Info Viajera   
 
                                                  url   _kw _source  \
 0  https://news.google.com/rss/articles/CBMimgFBV...  BBVA   gnews   
 1  https://news.google.com/rss/articles/CBMirAFBV...  BBVA   gnews   
 2  https://news.google.com/rss/articles/CBMi2wFBV...  BBVA   gnews   
 
                   _window  
 0  2024-01-01..2024-01-31  
 1  2024-01-01..2024-01-

In [7]:
# Sanity check: number of rows per calendar month (derived from "dt"), last 12 months only.
bbva_raw_clean.assign(month=bbva_raw_clean["dt"].dt.to_period("M")) \
              .groupby("month")["url"].count().tail(12)

month
2024-12    100
2025-01    100
2025-02    100
2025-03    100
2025-04    100
2025-05    100
2025-06    100
2025-07    100
2025-08    100
2025-09    100
2025-10    100
2025-11     45
Freq: M, Name: url, dtype: int64

In [1]:
from pysentimiento import create_analyzer
from tqdm import tqdm
from pathlib import Path

# Folder for interim (sentiment-annotated) news; created if it does not exist.
INT_NEWS_DIR = Path("../data/interim/noticias"); INT_NEWS_DIR.mkdir(parents=True, exist_ok=True)

# Spanish sentiment analyzer, created once at module level and used by infer_sentiment_df.
analyzer = create_analyzer(task="sentiment", lang="es")  # BETO

def pick_text(row, max_chars=900):
    """
    Build the text to analyse from a row's title and description.

    Parameters
    ----------
    row : mapping-like
        Anything with a ``.get(key)`` method (a dict or a pandas Series) that may
        hold ``"title"`` and ``"desc"``.
    max_chars : int
        Maximum length of the returned text; longer text is cut for
        speed/stability.

    Returns
    -------
    str
        The stripped title and description joined by ``". "``. Parts that are
        missing, blank or not strings (``None``, NaN) are left out, so a row with
        no usable text yields ``""`` and callers can detect it with ``if not txt``.
    """
    parts = []
    for key in ("title", "desc"):
        value = row.get(key)
        # NaN is a truthy float, so test for real strings instead of truthiness.
        if isinstance(value, str):
            value = value.strip()
            if value:
                parts.append(value)
    return ". ".join(parts)[:max_chars]

def infer_sentiment_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with two extra columns, ``sent_label`` and ``sent_score``.

    Each row's text comes from ``pick_text`` and is classified with the
    module-level ``analyzer``. ``sent_label`` is the predicted output
    (POS/NEG/NEU) and ``sent_score`` is the probability the analyzer reports for
    that label. Rows with no text get ``None`` in both columns. An empty input
    is returned as an unmodified copy.
    """
    annotated = df.copy()
    if df.empty:
        return annotated

    predictions = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = pick_text(row)
        if text:
            pred = analyzer.predict(text)
            predictions.append((pred.output, pred.probas.get(pred.output)))
        else:
            predictions.append((None, None))

    annotated["sent_label"] = [label for label, _ in predictions]
    annotated["sent_score"] = [score for _, score in predictions]
    return annotated

# Annotate the cleaned news with sentiment, save it under the interim folder and preview it.
# NOTE(review): bbva_raw_clean, save_df and pd are not defined in this cell; it
# relies on earlier cells having been run in the same kernel session.
bbva_sent = infer_sentiment_df(bbva_raw_clean)
_ = save_df(bbva_sent, INT_NEWS_DIR / "gnews_bbva_2024_2025_sent")
bbva_sent.head(3)


ModuleNotFoundError: No module named 'pysentimiento'

In [2]:
from pysentimiento import create_analyzer

# Smoke test: build the Spanish sentiment analyzer and run it on three sample
# headlines, printing the predicted label and the probabilities for each.
analyzer = create_analyzer(task="sentiment", lang="es")
print("OK · pysentimiento cargado")

for t in [
    "BBVA presenta beneficios récord y mejora su guía",
    "El regulador impone una multa millonaria a la entidad",
    "El banco mantiene estable su posición de solvencia",
]:
    r = analyzer.predict(t)
    print(f"{t}\n -> {r.output} {r.probas}\n")


ModuleNotFoundError: No module named 'pysentimiento'

In [3]:
import sys, site
# Environment diagnostics: show which Python version, interpreter executable and
# site-packages folders this kernel uses (presumably to find out why the
# pysentimiento import is failing -- check that the package is installed here).
print("Python:", sys.version)
print("Exe   :", sys.executable)
print("Site  :", site.getsitepackages())


Python: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Exe   : C:\Users\lopec\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
Site  : ['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib\\site-packages']
