In [64]:
from typing import Literal, Optional

import tqdm
import dspy
import pandas as pd

from rich import print as print_pretty

## Data Loading

In [65]:
df = pd.read_json("../data/news-clean.jsonl", lines=True, convert_dates=["publication_date"]).dropna(subset="headline")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                544 non-null    object        
 1   headline          544 non-null    object        
 2   content           544 non-null    object        
 3   publication_date  544 non-null    datetime64[ns]
 4   url               544 non-null    object        
 5   source            544 non-null    object        
dtypes: datetime64[ns](1), object(5)
memory usage: 25.6+ KB


In [66]:
df.head(2)

Unnamed: 0,id,headline,content,publication_date,url,source
0,5a618a2daa89d751b75a0c76732cca3b,Apple urged to remove new AI feature after fal...,The press freedom group Reporters Without Bord...,2024-12-19 20:32:19.900,https://www.cnn.com/2024/12/19/media/apple-int...,CNN
1,89a222209eced5c8fbb71670a8817219,"Cyber Monday 2024 is over, but you can still s...",Best Buy is offering shoppers one more chance ...,2024-12-03 12:33:00.000,https://www.cnn.com/cnn-underscored/deals/best...,CNN


## DSPy Init

In [67]:
lm = dspy.LM('ollama_chat/llama3.1:latest', api_base='http://localhost:7869', api_key='')
dspy.configure(lm=lm)

## Sentiment Analysis

In [68]:
class NewsSentimentAnalysis(dspy.Signature):
    """Classify sentiment of a given news report."""

    headline: str = dspy.InputField()
    contents: str = dspy.InputField()
    company: Literal['APPLE', 'MICROSOFT', 'META', 'NVIDIA', 'OTHER'] = dspy.OutputField()
    sentiment: Literal["POSITIVE", "NEUTRAL", "NEGATIVE"] = dspy.OutputField()
    confidence: float = dspy.OutputField()

In [69]:
clf_sentiment = dspy.ChainOfThought(NewsSentimentAnalysis)
clf_sentiment

self = Predict(StringSignature(headline, contents -> reasoning, company, sentiment, confidence
    instructions='Classify sentiment of a given news report.'
    headline = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Headline:', 'desc': '${headline}'})
    contents = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contents:', 'desc': '${contents}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    company = Field(annotation=Literal['APPLE', 'MICROSOFT', 'META', 'NVIDIA', 'OTHER'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Company:', 'desc': '${company}'})
    sentiment = Field(annotation=Literal['POSITIVE', 'NEUTRAL', 'NEGATIVE'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Sentiment:', 'de

In [70]:
input_index = 1

print_pretty({
    "predicted": clf_sentiment(headline=df.iloc[input_index, 0], contents=df.iloc[input_index, 1]),
    "title": df.iloc[input_index, 0],
})

## Process All News

In [80]:
results = []
for news in tqdm.tqdm(df.itertuples(), total=df.shape[0]):
    try:
        res = clf_sentiment(headline=news.headline, contents=news.content)
        results.append({
            "id": news.id,
            "source": news.source,
            "headline": news.headline,
            "publication_date": news.publication_date,
            "company": res.company,
            "sentiment": res.sentiment,
            "confidence": res.confidence,
        })
    except:
        pass

100%|██████████| 544/544 [00:00<00:00, 704.32it/s]


In [81]:
df_results = pd.DataFrame(results)
df_results.head()

Unnamed: 0,id,source,headline,publication_date,company,sentiment,confidence
0,5a618a2daa89d751b75a0c76732cca3b,CNN,Apple urged to remove new AI feature after fal...,2024-12-19 20:32:19.900,APPLE,NEGATIVE,0.8
1,89a222209eced5c8fbb71670a8817219,CNN,"Cyber Monday 2024 is over, but you can still s...",2024-12-03 12:33:00.000,OTHER,NEUTRAL,0.8
2,37a292b603e0313952c8fd6b623233ca,CNN,21 last-minute Cyber Monday tech deals on prod...,2024-12-03 00:50:00.000,OTHER,POSITIVE,0.9
3,c947aa040c36f2621b7231012e451594,CNN,Shop these 34 last-minute Apple Cyber Monday d...,2024-12-03 05:55:00.000,APPLE,POSITIVE,0.9
4,94d4c11e27bba1525f2d26af836e87a3,CNN,Apple’s AirPods Max hit a new record-low price...,2024-12-03 03:55:00.000,APPLE,POSITIVE,0.9


In [82]:
df_results.to_csv("../data/sentiment.csv", index=None)

In [83]:
df_results["company"].value_counts()

company
OTHER        211
APPLE        126
META          81
MICROSOFT     71
NVIDIA        54
Name: count, dtype: int64