In [20]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Fetch the punkt sentence-splitting data used by sent_tokenize.
# The call is cheap when the package is already cached locally.
nltk.download('punkt')

# Your full MD&A text goes here. For example:
text = """
Low interest rates lead to increased borrowing
"""

# Load the FinBERT model and tokenizer and wrap them in a classification pipeline.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Step 1: Split the text into sentences
sentences = sent_tokenize(text)

# Step 2: Greedily pack consecutive sentences into chunks whose combined
# word-piece count stays within max_len.
max_len = 510  # Maximum number of content tokens per chunk (leaves room for the 2 special tokens)
chunks = []
current_chunk = []
current_length = 0

for sent in sentences:
    sent_len = len(tokenizer.tokenize(sent))

    # Close the running chunk only when it actually holds something and the
    # next sentence would push it over the limit. The `current_chunk` guard
    # prevents emitting an empty-string chunk when an over-long sentence
    # arrives while the buffer is still empty (e.g. the very first sentence).
    if current_chunk and current_length + sent_len > max_len:
        chunks.append(" ".join(current_chunk))
        current_chunk = []
        current_length = 0

    current_chunk.append(sent)
    current_length += sent_len

# Add the final chunk (if any)
if current_chunk:
    chunks.append(" ".join(current_chunk))

# Step 3: Run FinBERT sentiment analysis on each chunk
results = []
for i, chunk in enumerate(chunks):
    # A single sentence can exceed max_len on its own; truncate at the model
    # limit (content tokens + 2 special tokens) instead of overflowing it.
    result = nlp(chunk, truncation=True, max_length=max_len + 2)
    print(f"Chunk {i+1} sentiment:", result)
    results.append(result)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshverma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0


Chunk 1 sentiment: [{'label': 'positive', 'score': 0.6094936728477478}]


In [None]:
import re
import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------------------------------------------------------
# 1) Load FinBERT model + tokenizer
# -----------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

# Look up each label's logit index from the model config instead of assuming a
# fixed [positive, negative, neutral] order, so the output columns stay correct
# even if the checkpoint's label order differs.
SENTIMENT_LABELS = ("positive", "negative", "neutral")
label_to_idx = {str(name).lower(): int(i) for i, name in model.config.id2label.items()}
missing_labels = set(SENTIMENT_LABELS) - set(label_to_idx)
if missing_labels:
    raise KeyError(f"Model config is missing sentiment labels: {missing_labels}")

# Tunable thresholds used below.
MIN_MDA_CHARS = 20            # rows whose stripped MD&A is shorter than this are skipped
PARAGRAPH_CHAR_THRESHOLD = 50 # only paragraphs strictly longer than this are scored
MAX_TOKENS = 512              # tokenizer truncation limit per paragraph

# -----------------------------------------------------------------------------
# 2) Load your MD&A CSV: item2_extracted_sec.csv
#    It must have at least these three columns:
#       - 'cik'
#       - 'filename'
#       - 'item2'   (the full MD&A text)
#
#    If it has extra columns (date, exchange, etc.) that's fine; we ignore those.
# -----------------------------------------------------------------------------
path_mda = "item2_extracted_sec.csv"
df_mda   = pd.read_csv(path_mda, dtype=str)

# Sanity check: do we see those columns?
required_cols = {"cik", "filename", "item2"}
missing = required_cols - set(df_mda.columns)
if missing:
    raise KeyError(f"Missing columns in {path_mda}: {missing}")

# If some rows are 10-K instead of 10-Q, skip them:
# (We assume all 10-Q filenames contain the substring "_10-Q_")
# na=False treats a missing filename as "not a 10-Q" instead of producing a NaN
# in the mask (which would make the boolean indexing raise); regex=False makes
# the match a plain substring test.
is_10q = df_mda["filename"].str.contains("_10-Q_", regex=False, na=False)
df_mda_10q = df_mda[is_10q].copy().reset_index(drop=True)
print(f"Loaded {len(df_mda)} total rows; {len(df_mda_10q)} rows appear to be 10-Q (filename contains '_10-Q_').")

# -----------------------------------------------------------------------------
# 3) Iterate row by row, run FinBERT on each 'item2' (MD&A) field
# -----------------------------------------------------------------------------
results = []
for idx, row in df_mda_10q.iterrows():
    cik      = row["cik"]
    filename = row["filename"]
    mda_text = row["item2"]

    # Missing values arrive as float NaN (not str); empty and very short texts
    # are both covered by the length test. Skipped rows do not appear in the output.
    if not isinstance(mda_text, str) or len(mda_text.strip()) < MIN_MDA_CHARS:
        print(f"  → SKIPPING (empty/short MD&A) for: CIK={cik}, filename={filename}")
        continue

    # Split the MD&A into "paragraphs" at every run of newlines (each line break
    # starts a new piece, not only blank lines) and drop pieces of 50 chars or fewer.
    paragraphs = [
        para.strip()
        for para in re.split(r"\n+", mda_text)
        if len(para.strip()) > PARAGRAPH_CHAR_THRESHOLD
    ]

    # If you prefer sentence-splitting or sliding windows, swap this out; this is just a simple example.
    # Note: a paragraph longer than MAX_TOKENS is truncated, so only its leading
    # tokens contribute to the score.
    chunk_sentiments = []
    for para in paragraphs:
        encoded = tokenizer(
            para,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_TOKENS
        )
        with torch.no_grad():
            out = model(**encoded)
        # Batch size is 1, so row 0 holds the class probabilities.
        probs = softmax(out.logits, dim=1)[0].tolist()
        chunk_sentiments.append(
            {label: probs[label_to_idx[label]] for label in SENTIMENT_LABELS}
        )

    if chunk_sentiments:
        n_chunks = len(chunk_sentiments)
        avg_pos = sum(d["positive"] for d in chunk_sentiments) / n_chunks
        avg_neg = sum(d["negative"] for d in chunk_sentiments) / n_chunks
        avg_neu = sum(d["neutral"]  for d in chunk_sentiments) / n_chunks
    else:
        # No paragraph passed the length filter: keep the row but leave scores empty.
        avg_pos = avg_neg = avg_neu = None

    results.append({
        "cik": cik,
        "filename": filename,
        "num_chunks": len(chunk_sentiments),
        "avg_positive": avg_pos,
        "avg_negative": avg_neg,
        "avg_neutral":  avg_neu
    })

# -----------------------------------------------------------------------------
# 4) Dump final sentiment scores to CSV
# -----------------------------------------------------------------------------
df_out = pd.DataFrame(results)
df_out.to_csv("item2_sentiment_10Q.csv", index=False)
print("Done. Wrote 'item2_sentiment_10Q.csv' with", len(df_out), "rows.")


Loaded 406 total rows; 372 rows appear to be 10-Q (filename contains '_10-Q_').
  → SKIPPING (empty/short MD&A) for: CIK=827052, filename=20241029_10-Q_edgar_data_827052_0000827052-24-000075.txt


In [2]:
import re
import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------------------------------------------------------
# 1) Load FinBERT model + tokenizer
# -----------------------------------------------------------------------------
# NOTE(review): this cell repeats the pipeline of the preceding cell with a
# renamed text column and output file; consider consolidating into one function.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

# Look up each label's logit index from the model config instead of assuming a
# fixed [positive, negative, neutral] order, so the output columns stay correct
# even if the checkpoint's label order differs.
SENTIMENT_LABELS = ("positive", "negative", "neutral")
label_to_idx = {str(name).lower(): int(i) for i, name in model.config.id2label.items()}
missing_labels = set(SENTIMENT_LABELS) - set(label_to_idx)
if missing_labels:
    raise KeyError(f"Model config is missing sentiment labels: {missing_labels}")

# Tunable thresholds used below.
MIN_MDA_CHARS = 20            # rows whose stripped text is shorter than this are skipped
PARAGRAPH_CHAR_THRESHOLD = 50 # only paragraphs strictly longer than this are scored
MAX_TOKENS = 512              # tokenizer truncation limit per paragraph

# -----------------------------------------------------------------------------
# 2) Load your CSV of pre-extracted Management's Discussion and Analysis
#
#    - The CSV must have at least these three columns:
#        • 'cik'
#        • 'filename'  (must contain "_10-Q_" for actual 10-Q filings)
#        • 'item2'     (the full text of Management's Discussion and Analysis of Financial Condition and Results of Operations)
#
#    If there are extra columns, they will be ignored.
# -----------------------------------------------------------------------------
path_mda = "item2_extracted_sec.csv"
df_all  = pd.read_csv(path_mda, dtype=str)

# Check for required columns
required_cols = {"cik", "filename", "item2"}
missing_cols  = required_cols - set(df_all.columns)
if missing_cols:
    raise KeyError(f"Missing required columns in '{path_mda}': {missing_cols}")

# Keep only rows whose filename indicates a 10-Q (i.e. contains "_10-Q_").
# na=False treats a missing filename as "not a 10-Q" instead of producing a NaN
# in the mask (which would make the boolean indexing raise); regex=False makes
# the match a plain substring test.
is_10q = df_all["filename"].str.contains("_10-Q_", regex=False, na=False)
df_10q = df_all[is_10q].copy().reset_index(drop=True)
print(f"Loaded '{path_mda}' → {len(df_all)} total rows, {len(df_10q)} rows appear to be 10-Q (filename contains '_10-Q_').")

# Rename 'item2' to 'management_discussion' for clarity:
df_10q = df_10q.rename(columns={"item2": "management_discussion"})

# -----------------------------------------------------------------------------
# 3) Run FinBERT on each row's Management's Discussion and Analysis
# -----------------------------------------------------------------------------
results = []

for idx, row in df_10q.iterrows():
    cik      = row["cik"]
    filename = row["filename"]
    text_mda = row["management_discussion"]

    # Missing values arrive as float NaN (not str); empty and very short texts
    # are both covered by the length test. Skipped rows do not appear in the output.
    if not isinstance(text_mda, str) or len(text_mda.strip()) < MIN_MDA_CHARS:
        print(f"  → SKIPPING (empty/short Management’s Discussion) for: CIK={cik}, filename={filename}")
        continue

    # Split the text into "paragraphs" at every run of newlines (each line break
    # starts a new piece, not only blank lines) and drop pieces of 50 chars or fewer.
    paragraphs = [
        para.strip()
        for para in re.split(r"\n+", text_mda)
        if len(para.strip()) > PARAGRAPH_CHAR_THRESHOLD
    ]

    # Note: a paragraph longer than MAX_TOKENS is truncated, so only its leading
    # tokens contribute to the score.
    chunk_sentiments = []
    for para in paragraphs:
        encoded = tokenizer(
            para,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_TOKENS
        )
        with torch.no_grad():
            out = model(**encoded)
        # Batch size is 1, so row 0 holds the class probabilities.
        probs = softmax(out.logits, dim=1)[0].tolist()
        chunk_sentiments.append(
            {label: probs[label_to_idx[label]] for label in SENTIMENT_LABELS}
        )

    if chunk_sentiments:
        n_paragraphs = len(chunk_sentiments)
        avg_pos = sum(d["positive"] for d in chunk_sentiments) / n_paragraphs
        avg_neg = sum(d["negative"] for d in chunk_sentiments) / n_paragraphs
        avg_neu = sum(d["neutral"]  for d in chunk_sentiments) / n_paragraphs
    else:
        # No paragraph passed the length filter: keep the row but leave scores empty.
        avg_pos = avg_neg = avg_neu = None

    results.append({
        "cik": cik,
        "filename": filename,
        "num_paragraphs": len(chunk_sentiments),
        "avg_positive": avg_pos,
        "avg_negative": avg_neg,
        "avg_neutral":  avg_neu
    })

# -----------------------------------------------------------------------------
# 4) Dump final sentiment scores to CSV
# -----------------------------------------------------------------------------
df_out = pd.DataFrame(results)
df_out.to_csv("management_discussion_sentiment_10Q.csv", index=False)
print(f"Done. Wrote 'management_discussion_sentiment_10Q.csv' with {len(df_out)} rows.")


Loaded 'item2_extracted_sec.csv' → 406 total rows, 372 rows appear to be 10-Q (filename contains '_10-Q_').
  → SKIPPING (empty/short Management’s Discussion) for: CIK=827052, filename=20241029_10-Q_edgar_data_827052_0000827052-24-000075.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=1002047, filename=20241125_10-Q_edgar_data_1002047_0000950170-24-130551.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=1037868, filename=20241031_10-Q_edgar_data_1037868_0001037868-24-000053.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=831001, filename=20241107_10-Q_edgar_data_831001_0000831001-24-000134.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=895421, filename=20241104_10-Q_edgar_data_895421_0000895421-24-000491.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=878927, filename=20241106_10-Q_edgar_data_878927_0000950170-24-122440.txt
  → SKIPPING (empty/short Management’s Discussion) for: CIK=65984, filename=20241101