In [15]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd

# Carica dataset esistente
# Assicurati che il percorso sia corretto per il tuo file su Google Drive
df_existing = pd.read_csv("/content/drive/Shareddrives/information project/Information Disorder Project/Fase 1 - Raccolta articoli OSINT/Prove/google_news_svb_netflix_multiquery1.csv") # <<< MODIFICA QUESTO PERCORSO

print("Articoli esistenti:", len(df_existing))

# Set di link già presenti
existing_links = set(df_existing["link"].dropna())

# Fallback: titolo + fonte
existing_title_source = set(
    zip(
        df_existing["title"].fillna(""),
        df_existing["source"].fillna("")
    )
)

print("Link unici esistenti:", len(existing_links))


Articoli esistenti: 412
Link unici esistenti: 412


In [18]:
#Inseriamo nuove queries
NEW_QUERIES = {
    "Netflix": [
        '"Netflix" stock crash',
        '"Netflix" shares plunge',
        '"Netflix" Wall Street reaction',
        '"Netflix" earnings shock',
        '"Netflix" subscriber slowdown'
    ],
    "SVB": [
        '"Silicon Valley Bank" regulators',
        '"SVB" bank collapse',
        '"SVB" financial contagion',
        '"Silicon Valley Bank" Fed response',
        '"SVB" systemic risk'
    ]
}


In [19]:
!pip install feedparser


Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=b828c81aaa9d040b02c42c42425f3cc1cfb0cef1b830bd2d29eee904dc0f6a30
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.12 sgmllib3k-1.0.0


In [20]:
import feedparser
import urllib.parse
from dateutil import parser as date_parser
from datetime import timezone
import time


def build_rss(query):
    return f"https://news.google.com/rss/search?q={urllib.parse.quote(query)}"


def parse_date(entry):
    for key in ["published", "pubDate", "updated"]:
        if key in entry:
            dt = date_parser.parse(entry[key])
            return dt.astimezone(timezone.utc) if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
    return None


new_articles = []

for case, queries in NEW_QUERIES.items():
    print(f"\nScraping aggiuntivo – {case}")

    for q in queries:
        print(f"  Query: {q}")
        feed = feedparser.parse(build_rss(q))

        for entry in feed.entries:
            link = entry.get("link", "").strip()
            title = entry.get("title", "").strip()

            source = ""
            if "source" in entry and hasattr(entry["source"], "title"):
                source = entry["source"].title

            # Deduplica
            if link in existing_links:
                continue
            if (title, source) in existing_title_source:
                continue

            dt = parse_date(entry)
            if dt is None:
                continue

            new_articles.append({
                "case": case,
                "title": title,
                "source": source,
                "published_utc": dt,
                "link": link,
                "query": q
            })

        time.sleep(1)

print("\nNuovi articoli trovati:", len(new_articles))



Scraping aggiuntivo – Netflix
  Query: "Netflix" stock crash
  Query: "Netflix" shares plunge
  Query: "Netflix" Wall Street reaction
  Query: "Netflix" earnings shock
  Query: "Netflix" subscriber slowdown

Scraping aggiuntivo – SVB
  Query: "Silicon Valley Bank" regulators
  Query: "SVB" bank collapse
  Query: "SVB" financial contagion
  Query: "Silicon Valley Bank" Fed response
  Query: "SVB" systemic risk

Nuovi articoli trovati: 627


In [21]:
df_new = pd.DataFrame(new_articles)

df_combined = pd.concat([df_existing, df_new], ignore_index=True)

print("Totale articoli dopo merge:", len(df_combined))

# Salva nuovo dataset
df_combined.to_csv(
    "google_news_svb_netflix_extended.csv",
    index=False
)

print("Salvato: google_news_svb_netflix_extended.csv")


Totale articoli dopo merge: 1039
Salvato: google_news_svb_netflix_extended.csv


In [22]:
import pandas as pd

try:
    df_work = df_combined.copy()
    print("Usato df_combined dalla memoria.")
except NameError:
    df_work = pd.read_csv(
        "google_news_svb_netflix_extended.csv",
        parse_dates=["published_utc"]
    )
    print("Caricato google_news_svb_netflix_extended.csv")

print("Numero articoli iniziali:", len(df_work))
df_work.head()


Usato df_combined dalla memoria.
Numero articoli iniziali: 1039


Unnamed: 0,case,title,source,published_utc,link,query
0,Netflix,Netflix shares down more than 20% after losing...,TechCrunch,2022-04-19 07:00:00+00:00,https://news.google.com/rss/articles/CBMitwFBV...,"""Netflix"" subscribers loss"
1,Netflix,Netflix Reports Loss of Subscribers for the Fi...,globaldata.com,2022-04-19 07:00:00+00:00,https://news.google.com/rss/articles/CBMi1AFBV...,"""Netflix"" subscribers loss"
2,Netflix,Netflix to Start “Pulling Back” Content Spend ...,hollywoodreporter.com,2022-04-19 07:00:00+00:00,https://news.google.com/rss/articles/CBMikAFBV...,"""Netflix"" subscriber loss April 2022"
3,Netflix,Netflix just lost $50 billion in market cap - CNN,CNN,2022-04-19 07:00:00+00:00,https://news.google.com/rss/articles/CBMiY0FVX...,"""Netflix"" subscriber loss April 2022"
4,Netflix,"Netflix Loses 200,000 Subscribers in Q1, Predi...",Variety,2022-04-19 07:00:00+00:00,https://news.google.com/rss/articles/CBMiigFBV...,"""Netflix"" subscriber loss April 2022"


In [23]:
import requests
from tqdm import tqdm

TIMEOUT = 10
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetAudit/1.0)"
}

def check_link(url):
    try:
        r = requests.head(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        return True, r.status_code
    except Exception:
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
            return True, r.status_code
        except Exception:
            return False, None

results = []

print("Controllo accessibilità link...")
for url in tqdm(df_work["link"], desc="Checking links"):
    ok, status = check_link(url)
    results.append({
        "link_ok": ok,
        "http_status": status
    })

df_work = pd.concat(
    [df_work.reset_index(drop=True), pd.DataFrame(results)],
    axis=1
)

print("Link accessibili:", df_work["link_ok"].sum(), "/", len(df_work))


Controllo accessibilità link...


Checking links: 100%|██████████| 1039/1039 [02:00<00:00,  8.65it/s]

Link accessibili: 1039 / 1039





In [None]:
before = len(df_work)

# Deduplica primaria: link
df_work = df_work.drop_duplicates(subset=["case", "link"])

# Deduplica secondaria (sicurezza)
df_work = df_work.drop_duplicates(
    subset=["case", "title", "source", "published_utc"]
)

after = len(df_work)

print(f"Duplicati rimossi: {before - after}")
print("Articoli dopo deduplica:", after)


Duplicati rimossi: 54
Articoli dopo deduplica: 968


In [None]:
# Ordinamento finale
#
# Ordine esplicito dei casi
case_order = {"SVB": 0, "Netflix": 1}

df_work["case_order"] = df_work["case"].map(case_order)

df_work = (
    df_work
    .sort_values(by=["case_order", "published_utc"])
    .drop(columns=["case_order"])
    .reset_index(drop=True)
)

df_work.head()


Unnamed: 0,case,title,source,published_utc,link,query,link_ok,http_status
0,SVB,Silicon Valley Bank and Signature Bank taken o...,Norton Rose Fulbright,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMizwFBV...,"""Silicon Valley Bank"" regulators",True,200
1,SVB,"Silicon Valley Bank Closed by Regulators, FDIC...",The Wall Street Journal,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMizgNBV...,"""Silicon Valley Bank"" regulators",True,200
2,SVB,Regulators take over Silicon Valley Bank - Ban...,Banking Dive,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMitwFBV...,"""Silicon Valley Bank"" regulators",True,200
3,SVB,Silicon Valley Bank has officially failed afte...,Fortune,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMiigFBV...,"""Silicon Valley Bank"" regulators",True,200
4,SVB,Silicon Valley Bank shut down by regulators - ...,Fox Business,2023-03-10 08:00:00+00:00,https://news.google.com/rss/articles/CBMiiAFBV...,"""Silicon Valley Bank"" regulators",True,200


In [None]:
df_work = df_work.drop(columns=["link_ok", "http_status"], errors="ignore")

print("Colonne rimanenti:")
print(df_work.columns.tolist())

Colonne rimanenti:
['case', 'title', 'source', 'published_utc', 'link', 'query']


In [None]:
df_work.to_csv("google_news_svb_netflix_FINAL_.csv", index=False)
print("CSV finale pulito salvato.")


CSV finale pulito salvato.


In [None]:
counts_by_case = df_work["case"].value_counts()

print("Numero articoli per caso:\n")
print(counts_by_case)

Numero articoli per caso:

case
Netflix    497
SVB        471
Name: count, dtype: int64


In [None]:
from datetime import date
import pandas as pd

# Forziamo published_utc A Datetime (PASSO CHIAVE)
df_work["published_utc"] = pd.to_datetime(
    df_work["published_utc"],
    errors="coerce",
    utc=True
)

# Controllo rapido
print("Tipo published_utc:", df_work["published_utc"].dtype)

# Creiamo la colonna 'date' (giorno)
df_work["date"] = df_work["published_utc"].dt.date


# Definiamo finestre temporali
TIME_WINDOWS = {
    "Netflix": {
        "start": date(2022, 4, 19),
        "end":   date(2022, 5, 3)
    },
    "SVB": {
        "start": date(2023, 3, 10),
        "end":   date(2023, 3, 17)
    }
}

print("Finestre temporali definite correttamente.")


Tipo published_utc: datetime64[ns, UTC]
Finestre temporali definite correttamente.


In [None]:
def is_out_of_range(row):
    window = TIME_WINDOWS[row["case"]]
    return not (window["start"] <= row["date"] <= window["end"])

df_work["out_of_range"] = df_work.apply(is_out_of_range, axis=1)

df_work["out_of_range"].value_counts()


Unnamed: 0_level_0,count
out_of_range,Unnamed: 1_level_1
False,488
True,480


In [None]:
df_work.groupby(["case", "out_of_range"]).size()


Unnamed: 0_level_0,Unnamed: 1_level_0,0
case,out_of_range,Unnamed: 2_level_1
Netflix,False,154
Netflix,True,343
SVB,False,334
SVB,True,137


In [None]:
out_df = df_work[df_work["out_of_range"] == True][
    ["case", "date", "title", "source", "link"]
].sort_values(by=["case", "date"])

print("Articoli FUORI range temporale:\n")
print(out_df)


Articoli FUORI range temporale:

        case        date                                              title  \
471  Netflix  2016-07-19  Netflix sees subscriber slowdown at home and a...   
472  Netflix  2016-07-20  Netflix suffering from a growth problem - The ...   
473  Netflix  2018-04-17  Here's How Wall Street Is Reacting to Netflix'...   
474  Netflix  2018-07-17  Subscriber growth slowdown spooks Netflix shar...   
475  Netflix  2018-07-17  Netflix’s slowdown sparks fresh fears of video...   
..       ...         ...                                                ...   
204      SVB  2025-09-22  KPMG’s Work for Silicon Valley Bank Exposes Au...   
205      SVB  2025-10-23  Ex-SVB Top Brass Can't Ditch FDIC Suit Over 20...   
206      SVB  2025-10-24  Ex-SVB Executives Must Face FDIC Lawsuit Over ...   
207      SVB  2025-12-04  Former Signature Bank executives launch blockc...   
208      SVB  2026-01-15  Senator Warns: Crypto Threatens Banking Collap...   

                  

In [None]:
df_in_range = df_work[df_work["out_of_range"] == False].copy()

print("Numero totale articoli DENTRO range:", len(df_in_range))
print("\nDistribuzione per case:")
print(df_in_range["case"].value_counts())

output_path = "google_news_svb_netflix_multiquery2.csv"
df_in_range.to_csv(output_path, index=False)

print("\nCSV salvato:", output_path)

Numero totale articoli DENTRO range: 488

Distribuzione per case:
case
SVB        334
Netflix    154
Name: count, dtype: int64

CSV salvato: google_news_.csv
