In [3]:
import sys
from pathlib import Path
import json
import pandas as pd

ROOT = Path.cwd().resolve()
if (ROOT / "src" / "sentiment").exists():
    pass
elif (ROOT.parent / "src" / "sentiment").exists():
    ROOT = ROOT.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.utils import DATA_CLEANED
pd.set_option("display.max_colwidth", None)

In [4]:
processed_files = sorted(DATA_CLEANED.glob("processed_*.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True)
if not processed_files:
    raise FileNotFoundError("No data/cleaned/processed_*.jsonl found. Run: python scripts/run_process.py")
processed_path = processed_files[0]
print(f"Loaded: {processed_path.name}")
rows = []
with open(processed_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError:
            continue
df = pd.DataFrame(rows)

Loaded: processed_20260226_15.jsonl


In [5]:
# Inspect shape and sample
df.head()

Unnamed: 0,posted_at,fetched_at,headline,url,source,reporter,ticker,is_ai_related,is_proxy_partnership,sentiment_vader,sentiment_llm_phi3,sentiment_llm_llama3_2,sentiment_llm_deepseek_r1
0,2026-02-26T14:00:00Z,2026-02-26T15:43:31Z,Figma partners with OpenAI to bake in support for Codex,https://techcrunch.com/2026/02/26/figma-partners-with-openai-to-bake-in-support-for-codex/,TechCrunch,TechCrunch,AAPL,True,True,,0.6,0.3,0.0
1,2026-02-26T14:00:00Z,2026-02-26T15:43:31Z,Figma partners with OpenAI to bake in support for Codex,https://techcrunch.com/2026/02/26/figma-partners-with-openai-to-bake-in-support-for-codex/,TechCrunch,TechCrunch,MSFT,True,True,,0.6,0.3,0.0
2,2026-02-26T14:00:00Z,2026-02-26T15:43:31Z,Figma partners with OpenAI to bake in support for Codex,https://techcrunch.com/2026/02/26/figma-partners-with-openai-to-bake-in-support-for-codex/,TechCrunch,TechCrunch,NVDA,True,True,,0.6,0.3,0.0
3,2026-02-26T12:00:00Z,2026-02-26T15:43:31Z,Instagram now alerts parents if their teen searches for suicide or self-harm content,https://techcrunch.com/2026/02/26/instagram-now-alerts-parents-if-their-teen-searches-for-suicide-or-self-harm-content/,TechCrunch,TechCrunch,META,False,False,,-0.6,-0.8,-0.5
4,2026-02-25T23:49:19Z,2026-02-26T15:43:31Z,Anthropic acquires computer-use AI startup Vercept after Meta poached one of its founders,https://techcrunch.com/2026/02/25/anthropic-acquires-vercept-ai-startup-agents-computer-use-founders-investors/,TechCrunch,TechCrunch,AMZN,True,True,,-0.3,-0.8,0.5


In [6]:
sent_cols = [c for c in df.columns if c.startswith("sentiment_")]
print(df[sent_cols].describe())
if len(sent_cols) >= 2:
    print("\nCorrelation:")
    print(df[sent_cols].corr())


       sentiment_llm_phi3  sentiment_llm_llama3_2  sentiment_llm_deepseek_r1
count           66.000000               66.000000                  66.000000
mean             0.186364               -0.030303                   0.125758
std              0.703264                0.617431                   0.477110
min             -1.000000               -0.800000                  -0.800000
25%             -0.600000               -0.800000                   0.000000
50%              0.600000                0.200000                   0.000000
75%              0.800000                0.400000                   0.550000
max              1.000000                0.800000                   0.900000

Correlation:
                           sentiment_vader  sentiment_llm_phi3  \
sentiment_vader                        NaN                 NaN   
sentiment_llm_phi3                     NaN            1.000000   
sentiment_llm_llama3_2                 NaN            0.804373   
sentiment_llm_deepseek_r1    

## Testing is-ai-related flag

In [7]:
cols = ["headline", "is_ai_related","url"]
headline_flags = df[cols].drop_duplicates(subset=["headline"]).reset_index(drop=True)

headline_flags.loc[headline_flags["is_ai_related"] == False]

Unnamed: 0,headline,is_ai_related,url
1,Instagram now alerts parents if their teen searches for suicide or self-harm content,False,https://techcrunch.com/2026/02/26/instagram-now-alerts-parents-if-their-teen-searches-for-suicide-or-self-harm-content/
5,YouTube's More Affordable Premium Lite Plan Gets New Perks - MacRumors,False,https://www.macrumors.com/2026/02/24/youtube-lite-plan-upgrade/
6,"Apple's 2026 MacBook Pro Refresh Brings Dynamic Island, OLED Screens, and New Touch Gestures - TechPowerUp",False,https://www.techpowerup.com/346724/apples-2026-macbook-pro-refresh-brings-dynamic-island-oled-screens-and-new-touch-gestures
7,"Chevy Corvette ZR1 Sets Car And Driver's New Lightning Lap Record, Beats McLaren Senna By 0.7 Seconds - Jalopnik",False,https://www.jalopnik.com/2109419/chevy-corvette-zr1-sets-car-and-driver-lightning-lap-record/
28,Leaked Documents Show Meta Cracking Down on Access to Abortion Information - Mother Jones,False,https://news.google.com/rss/articles/CBMikwFBVV95cUxQejFKNjNQakpZTUhEZHJmRkVEUDZSVk53VjBES3o5eWZPU1A3emN0WVYzby14RXFKVFJCVGRIdGNqSDA2Q3N2VHg5RnFMNld2bThJQTJfZlpnMzR2cmpPcVhGNGVPdHQ0eVlCSG1rRlNVZlo4bC1pT3dIZ0NZTF8tUzgyZVUxUWY5eWJxZlZwODRzNWc?oc=5


In [8]:
headline_flags.loc[headline_flags["is_ai_related"] == True]

Unnamed: 0,headline,is_ai_related,url
0,Figma partners with OpenAI to bake in support for Codex,True,https://techcrunch.com/2026/02/26/figma-partners-with-openai-to-bake-in-support-for-codex/
2,Anthropic acquires computer-use AI startup Vercept after Meta poached one of its founders,True,https://techcrunch.com/2026/02/25/anthropic-acquires-vercept-ai-startup-agents-computer-use-founders-investors/
3,Nvidia has another record quarter amid record capex spends,True,https://techcrunch.com/2026/02/25/nvidia-earnings-record-capex-spend-ai/
4,Alphabet-owned robotics software company Intrinsic joins Google,True,https://techcrunch.com/2026/02/25/alphabet-owned-robotics-software-company-intrinsic-joins-google/
8,Microsoft adds Copilot data controls to all storage locations - BleepingComputer,True,https://www.bleepingcomputer.com/news/microsoft/microsoft-adds-copilot-data-controls-to-all-storage-locations/
9,Music generator ProducerAI joins Google Labs - TechCrunch,True,https://techcrunch.com/2026/02/24/music-generator-producerai-joins-google-labs/
10,Gemini 3.1 Pro is a powerhouse for deep work — here are 7 prompts that prove it - Tom's Guide,True,https://www.tomsguide.com/ai/gemini-3-1-pro-is-a-powerhouse-for-deep-work-here-are-7-prompts-that-prove-it
11,AirPods as Apple’s first AI wearable product makes so much sense - 9to5Mac,True,https://9to5mac.com/2026/02/23/airpods-as-apples-first-ai-wearable-product-makes-so-much-sense/
12,Scoop: Pentagon takes first step toward blacklisting Anthropic - Axios,True,https://news.google.com/rss/articles/CBMid0FVX3lxTE1XaEdBN1VBSlNtRnNENDJqdzBxal9uQUpmd19TbkVhLS0yY0xaMThaRjN5THNRd3ZyMFktczFlY0JFWnNXNE5KT04xSlJTYnU4U2tCcmdkTk51cGhCc2JLckg5ZzU0aXk3ak5JZmZ5VGFlazBqemFR?oc=5
13,Hacker Used Anthropic’s Claude to Steal Mexican Data Trove - Bloomberg,True,https://news.google.com/rss/articles/CBMiswFBVV95cUxNc3YtZ19Ubk5WZXJHMUdLM0p6XzVzNHJSMHZCNEYxUl9Ca1lCLXdNSkRvQlo2bC11WVIyMU5STWJOd0xrd2d5T0o2dTRWVHNhUml2QWlfMHhGV21pcUFqUkhLQ2RhYUdvS0hWRlE0NWR0Rm16M3JuNlp1aUdpc2U2OEtMWFZwRG5ldXVpMHJ5dWpmSEJNWm1qaHNvdkFUZS1rRmd2REhMUFdobkw2bGtnU0laQQ?oc=5
