In [8]:
# Scraping aggiuntivo Google News – Netflix & SVB (con finestre temporali)
# In questo notebook estendiamo il dataset di Google News sugli eventi:
# - Netflix (shock del 19 aprile 2022)
# - Silicon Valley Bank (SVB, crisi di marzo 2023)
# Partiamo da un dataset esistente già pulito (es. `google_news_ultimo.csv`) e
# aggiungiamo nuovi articoli rispettando rigorosamente le finestre temporali.

In [10]:
import pandas as pd

# Carichiamo il dataset
base_file = "google_news_ultimo.csv"

df_existing = pd.read_csv(base_file, parse_dates=["published_utc"])

# Set di link e (titolo, fonte) già presenti = serviranno per evitare duplicati
existing_links = set(df_existing["link"].dropna())
existing_title_source = set(
    zip(
        df_existing["title"].fillna(""),
        df_existing["source"].fillna("")
    )
)

print("Articoli esistenti:", len(df_existing))
print("Link unici esistenti:", len(existing_links))
df_existing.head()


Articoli esistenti: 488
Link unici esistenti: 488


Unnamed: 0,case,title,source,published_utc,link,query,date,out_of_range
0,SVB,Silicon Valley Bank and Signature Bank taken o...,Norton Rose Fulbright,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMizwFBV...,"""Silicon Valley Bank"" regulators",2023-03-10,False
1,SVB,"Silicon Valley Bank Closed by Regulators, FDIC...",The Wall Street Journal,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMizgNBV...,"""Silicon Valley Bank"" regulators",2023-03-10,False
2,SVB,Regulators take over Silicon Valley Bank - Ban...,Banking Dive,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMitwFBV...,"""Silicon Valley Bank"" regulators",2023-03-10,False
3,SVB,Silicon Valley Bank has officially failed afte...,Fortune,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMiigFBV...,"""Silicon Valley Bank"" regulators",2023-03-10,False
4,SVB,Silicon Valley Bank shut down by regulators - ...,Fox Business,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMiiAFBV...,"""Silicon Valley Bank"" regulators",2023-03-10,False


In [30]:
from datetime import date

# Nuove query per ampliare il dataset
NEW_QUERIES = {
    "Netflix": [
        '"Netflix" stock',
        '"Netflix" earnings',
        '"Netflix" subscribers',
        '"Netflix" shares earnings',
        '"Netflix" streaming market stock'
    ],
    "SVB": [
        '"Silicon Valley Bank" collapse',
        '"Silicon Valley Bank" bank',
        '"SVB" banking crisis',
        '"SVB" financial system',
        '"Silicon Valley Bank" regulators'
    ]
}

# Finestre temporali
TIME_WINDOWS = {
    "Netflix": {
        "start": date(2022, 4, 19),
        "end":   date(2022, 5, 3)   # inclusivo
    },
    "SVB": {
        "start": date(2023, 3, 10),
        "end":   date(2023, 3, 17)  # inclusivo
    }
}

TIME_WINDOWS

NEWS_EDITIONS = [
    # Inglese
    {"hl": "en-US", "gl": "US", "ceid": "US:en"},
    {"hl": "en-GB", "gl": "GB", "ceid": "GB:en"},

    # Spagnolo
    {"hl": "es-ES", "gl": "ES", "ceid": "ES:es"},

    # Italiano
    {"hl": "it-IT", "gl": "IT", "ceid": "IT:it"},

    # Tedesco
    {"hl": "de-DE", "gl": "DE", "ceid": "DE:de"},

    # Giapponese
    {"hl": "ja-JP", "gl": "JP", "ceid": "JP:ja"},

    # Coreano
    {"hl": "ko-KR", "gl": "KR", "ceid": "KR:ko"},
]


In [12]:
!pip install feedparser


Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=c8ce7a49e26a0c597fc31c45dd50ba2ec8e0a8be4610317be3f44c853e6ef06d
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.12 sgmllib3k-1.0.0


In [26]:
import feedparser
import urllib.parse
from dateutil import parser as date_parser
from datetime import timezone
import time

def build_rss(query: str, edition: dict) -> str:
    """
    Costruisce l'URL RSS di Google News per una query
    e una specifica edizione linguistica/paese.
    """
    base = "https://news.google.com/rss/search"
    params = (
        f"?q={urllib.parse.quote(query)}"
        f"&hl={edition['hl']}"
        f"&gl={edition['gl']}"
        f"&ceid={edition['ceid']}"
    )
    return base + params

def parse_date(entry):
    """
    Prova a leggere la data di pubblicazione in diversi campi RSS
    e la restituisce in UTC.
    """
    for key in ["published", "pubDate", "updated"]:
        if key in entry:
            dt = date_parser.parse(entry[key])
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            else:
                dt = dt.astimezone(timezone.utc)
            return dt
    return None


In [22]:
for edition in NEWS_EDITIONS:
    feed = feedparser.parse(build_rss(q, edition))


In [31]:
new_articles = []

for case, queries in NEW_QUERIES.items():
    window = TIME_WINDOWS[case]
    filters = CASE_FILTERS[case]

    print(f"\nScraping aggiuntivo – {case} (window: {window['start']} → {window['end']})")

    for q in queries:
        print(f"  Query: {q}")

        # giriamo su TUTTE le edizioni (en, es, de, ja, ko, ...)
        for edition in NEWS_EDITIONS:
            feed = feedparser.parse(build_rss(q, edition))

            for entry in feed.entries:
                # 1. Parse data di pubblicazione
                dt = parse_date(entry)
                if dt is None:
                    continue

                article_date = dt.date()

                # 2. FILTRO TEMPORALE HARD
                if not (window["start"] <= article_date <= window["end"]):
                    continue

                # 3. Titolo
                title = entry.get("title", "").strip()
                if not title:
                    continue

                title_lower = title.lower()

                # 4. FILTRO TEMATICO SUL TITOLO
                # Deve contenere almeno una parola "must"
                if not any(word in title_lower for word in filters["must"]):
                    continue

                # Non deve contenere parole di esclusione
                if any(word in title_lower for word in filters["must_not"]):
                    continue

                # 5. Link
                link = entry.get("link", "").strip()
                if not link:
                    continue

                # 6. DEDUPLICA (solo per link)
                if link in existing_links:
                    continue

                # 7. Fonte
                source = ""
                if "source" in entry and hasattr(entry["source"], "title"):
                    source = entry["source"].title

                # 8. Se passa tutto → salva articolo
                new_articles.append({
                    "case": case,
                    "title": title,
                    "source": source,
                    "published_utc": dt,
                    "link": link,
                    "query": q,
                    "edition_hl": edition["hl"],
                    "edition_gl": edition["gl"],
                })

            # piccola pausa per non stressare troppo Google
            time.sleep(1)

print("\nNuovi articoli trovati (nel range, tematici, non duplicati):", len(new_articles))



Scraping aggiuntivo – Netflix (window: 2022-04-19 → 2022-05-03)
  Query: "Netflix" stock
  Query: "Netflix" earnings
  Query: "Netflix" subscribers
  Query: "Netflix" shares earnings
  Query: "Netflix" streaming market stock

Scraping aggiuntivo – SVB (window: 2023-03-10 → 2023-03-17)
  Query: "Silicon Valley Bank" collapse
  Query: "Silicon Valley Bank" bank
  Query: "SVB" banking crisis
  Query: "SVB" financial system
  Query: "Silicon Valley Bank" regulators

Nuovi articoli trovati (nel range, tematici, non duplicati): 249


In [32]:
import pandas as pd

# Trasformiamo i nuovi articoli in DataFrame
df_new = pd.DataFrame(new_articles)

print("Nuovi articoli trovati:", len(df_new))

# Merge con il dataset esistente
df_final = pd.concat([df_existing, df_new], ignore_index=True)

# Deduplica finale di sicurezza (solo per link)
df_final = df_final.drop_duplicates(subset=["case", "link"]).reset_index(drop=True)

print("Articoli totali nel dataset finale:", len(df_final))
print("\nDistribuzione per case:")
print(df_final["case"].value_counts())

# Salvataggio CSV
output_file = "google_news_ultimissimo.csv"
df_final.to_csv(output_file, index=False)

print("\nCSV salvato correttamente come:", output_file)


Nuovi articoli trovati: 249
Articoli totali nel dataset finale: 691

Distribuzione per case:
case
SVB        529
Netflix    162
Name: count, dtype: int64

CSV salvato correttamente come: google_news_ultimissimo.csv


In [3]:
import pandas as pd

#
# Carichiamo il CSV
#
input_file = "google_news_ultimissimo.csv"
df_clean = pd.read_csv(input_file, parse_dates=["published_utc"])

print("Articoli caricati:", len(df_clean))

# Rimuoviamo colonne di edizione
cols_to_drop = ["edition_hl", "edition_gl"]
df_clean = df_clean.drop(columns=cols_to_drop, errors="ignore")

# Controllo out of range
if "out_of_range" in df_clean.columns:
    n_out = df_clean["out_of_range"].sum()
    print("Articoli fuori range trovati:", n_out)

    if n_out > 0:
        raise ValueError(
            "ATTENZIONE: ci sono articoli fuori range. "
            "Il dataset NON dovrebbe contenerli."
        )

    # Se tutti False, eliminiamo la colonna
    df_clean = df_clean.drop(columns=["out_of_range"])
else:
    print("Colonna out_of_range non presente (ok).")

# Deduplica finale
before = len(df_clean)

df_clean = df_clean.drop_duplicates(subset=["case", "link"]).reset_index(drop=True)

after = len(df_clean)
print("Duplicati rimossi:", before - after)
print("Articoli dopo deduplica:", after)

# Salvataggio
output_file = "google_news_svb_netflix_FINAL_CLEAN.csv"
df_clean.to_csv(output_file, index=False)

print("\nCSV finale pulito salvato come:", output_file)



Articoli caricati: 691
Articoli fuori range trovati: 0
Duplicati rimossi: 0
Articoli dopo deduplica: 691

CSV finale pulito salvato come: google_news_svb_netflix_FINAL_CLEAN.csv


In [5]:
import pandas as pd
import requests
from tqdm import tqdm

# Carichiamo il dataset finale pulito
df = pd.read_csv("google_news_svb_netflix_FINAL_CLEAN.csv", parse_dates=["published_utc"])

print("Articoli nel dataset:", len(df))

# Controllo accessibilità link
TIMEOUT = 10
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetAudit/1.0)"
}

def check_link(url):
    try:
        r = requests.head(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        return True, r.status_code
    except Exception:
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
            return True, r.status_code
        except Exception:
            return False, None

results = []
print("\nControllo accessibilità link...")

for url in tqdm(df["link"], desc="Checking links"):
    ok, status = check_link(url)
    results.append({
        "link_ok": ok,
        "http_status": status
    })

df["link_ok"] = [r["link_ok"] for r in results]
df["http_status"] = [r["http_status"] for r in results]

n_ok = df["link_ok"].sum()
n_total = len(df)
print(f"\nLink accessibili: {n_ok}/{n_total} ({n_ok/n_total*100:.2f}%)")


Articoli nel dataset: 691

Controllo accessibilità link...


Checking links: 100%|██████████| 691/691 [01:21<00:00,  8.52it/s]


Link accessibili: 691/691 (100.00%)





In [6]:
# Conteggio articoli per caso (totali)
print("Articoli per case (totali):")
print(df["case"].value_counts(), "\n")


Articoli per case (totali):
case
SVB        529
Netflix    162
Name: count, dtype: int64 

