In [11]:
# !pip install torch transformers datasets

In [40]:

from transformers import pipeline
from datasets import load_dataset
import pandas as pd
import ast
from tqdm import tqdm

tqdm.pandas()

## Prepare Data

In [41]:

ds = load_dataset("JanosAudran/financial-reports-sec", "small_full")

In [42]:
df = ds["train"].to_pandas()
COLS = ["sentenceID", "sentence", "docID", "filingDate", "section", "name", "labels", "tickers", "reportDate", "returns"]
df = df[COLS]


# convert the dict val inside the labels col to 3 separate cols for 1d, 5d and 30d window. 
def parse_labels(label):
    if isinstance(label, str):  # Check if the entry is a string
        try:
            return json.loads(label.replace("'", '"'))  # Convert single quotes to double quotes for JSON
        except json.JSONDecodeError as e:
            print(f"Error parsing label: {label} -> {e}")
            return {}  # Return an empty dict if parsing fails
    return label 


labels_df = df['labels'].progress_apply(parse_labels).progress_apply(pd.Series)
labels_df.columns = [f"label_{col}" for col in labels_df.columns]
df = pd.concat([df, labels_df], axis=1).drop(columns=['labels'])

100%|██████████| 200000/200000 [00:00<00:00, 1502960.37it/s]
100%|██████████| 200000/200000 [00:24<00:00, 8140.96it/s] 


In [43]:
df.head()

Unnamed: 0,sentenceID,sentence,docID,filingDate,section,name,tickers,reportDate,returns,label_1d,label_5d,label_30d
0,0000001750_10-K_2020_section_1_0,ITEM 1.BUSINESS General AAR CORP. and its subs...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0
1,0000001750_10-K_2020_section_1_1,"AAR was founded in 1951, organized in 1955 and...",0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0
2,0000001750_10-K_2020_section_1_2,We are a diversified provider of products and ...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0
3,0000001750_10-K_2020_section_1_3,Fiscal 2020 began with strategic initiatives f...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0
4,0000001750_10-K_2020_section_1_4,Our momentum from a successful fiscal 2019 car...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0


## Extract Sentiments

In [44]:
sentiment_pipeline = pipeline("text-classification", model="ProsusAI/finbert", device=0, truncation=True)

In [46]:
# testing on few rows
sentiment_pipeline(df.iloc[0:5]['sentence'].tolist(),)

[{'label': 'neutral', 'score': 0.9483352899551392},
 {'label': 'neutral', 'score': 0.9519177079200745},
 {'label': 'neutral', 'score': 0.8594400882720947},
 {'label': 'positive', 'score': 0.6688447594642639},
 {'label': 'positive', 'score': 0.953032374382019}]

In [47]:
texts = df['sentence'].tolist()

sentiment_labels = []
batch_size = 64

for i in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
    batch_texts = texts[i:i+batch_size]
    batch_results = sentiment_pipeline(batch_texts, truncation=True, batch_size=batch_size)
    sentiment_labels.extend([res['label'] for res in batch_results])

Processing Batches: 100%|██████████| 3125/3125 [02:50<00:00, 18.36it/s]


In [48]:
df['sentiment'] = sentiment_labels

In [49]:
df.to_csv("10k_sentences_small_sentiments.csv")

In [50]:
df['sentiment'].value_counts()

sentiment
neutral     150589
negative     29999
positive     19412
Name: count, dtype: int64

In [51]:
df.head()

Unnamed: 0,sentenceID,sentence,docID,filingDate,section,name,tickers,reportDate,returns,label_1d,label_5d,label_30d,sentiment
0,0000001750_10-K_2020_section_1_0,ITEM 1.BUSINESS General AAR CORP. and its subs...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0,neutral
1,0000001750_10-K_2020_section_1_1,"AAR was founded in 1951, organized in 1955 and...",0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0,neutral
2,0000001750_10-K_2020_section_1_2,We are a diversified provider of products and ...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0,neutral
3,0000001750_10-K_2020_section_1_3,Fiscal 2020 began with strategic initiatives f...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0,positive
4,0000001750_10-K_2020_section_1_4,Our momentum from a successful fiscal 2019 car...,0000001750_10-K_2020,2020-07-21,0,AAR CORP,[AIR],2020-05-31,{'1d': {'closePriceEndDate': 19.01000022888183...,0,1,0,positive
