In [17]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv('fomc.csv', engine='python', parse_dates=['Date', 'Release Date'])

cutoff_start_date = pd.to_datetime("2013-01-01")
cutoff_end_date = pd.to_datetime("2024-12-31")
df = df[(df['Release Date'] >= cutoff_start_date) & (df['Release Date'] <= cutoff_end_date)]

df.dropna(subset=['Text'], inplace=True)

def chunk_text(text, tokenizer, max_length=500):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    for i in range(0, len(tokens), max_length - 2):  # Keep room for special tokens
        chunk = tokens[i:i + (max_length - 2)]
        chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]  # Add special tokens
        yield torch.tensor([chunk])

def perform_sentiment_analysis(text):
    finbert = pipeline('sentiment-analysis',
                       model='yiyanghkust/finbert-tone',
                       tokenizer='yiyanghkust/finbert-tone')

    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    sentiments = []
    scores = []

    for chunk in chunk_text(text, tokenizer):
        result = finbert(tokenizer.decode(chunk.squeeze().tolist()))[0]
        sentiments.append(result['label'])
        scores.append(result['score'])
    
    # Averaging scores for overall sentiment
    avg_score = sum(scores) / len(scores)
    overall_sentiment = max(set(sentiments), key=sentiments.count)

    return overall_sentiment, avg_score

df[['Sentiment', 'Score']] = df['Text'].apply(lambda x: perform_sentiment_analysis(x)).apply(pd.Series)

# Calculate Positiveness
positiveness_scores = []
for index, row in df.iterrows():
    if row['Sentiment'] == 'Positive':
        positiveness_scores.append(row['Score'])
    elif row['Sentiment'] == 'Negative':
        positiveness_scores.append(-row['Score'])
    else:
        positiveness_scores.append(0)

df['Positiveness'] = positiveness_scores



print(df[['Date', 'Release Date', 'Type', 'Sentiment', 'Score', 'Positiveness']])
df.to_csv('fomc_statement_with_sentiment.csv', index=False, sep='|')

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set

          Date Release Date       Type Sentiment     Score  Positiveness
2   2024-12-18   2024-12-18  Statement   Neutral  0.997844           0.0
4   2024-11-07   2024-11-26     Minute   Neutral  0.923947           0.0
5   2024-11-07   2024-11-07  Statement   Neutral  0.997992           0.0
6   2024-09-18   2024-09-18  Statement   Neutral  0.973147           0.0
7   2024-09-18   2024-10-09     Minute   Neutral  0.928081           0.0
..         ...          ...        ...       ...       ...           ...
197 2013-03-20   2013-04-10     Minute   Neutral  0.931640           0.0
198 2013-03-20   2013-03-20  Statement   Neutral  0.995403           0.0
199 2013-01-30   2013-02-20     Minute   Neutral  0.944843           0.0
200 2013-01-30   2013-01-30  Statement   Neutral  0.927228           0.0
201 2012-12-12   2013-01-03     Minute   Neutral  0.901510           0.0

[194 rows x 6 columns]


TypeError: unsupported operand type(s) for &: 'Timestamp' and 'DatetimeArray'

In [18]:
# number of positive, negative and neutral sentiments
sentiment_counts = df['Sentiment'].value_counts()

In [19]:
sentiment_counts

Sentiment
Neutral     181
Positive     11
Negative      2
Name: count, dtype: int64