In [1]:
import os
import re
import glob
import nltk
import pandas as pd

# Use the notebook‐only tqdm (no attempt to import ipywidgets)
from tqdm.notebook import tqdm   

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)
from nltk.tokenize import sent_tokenize

# Download NLTK punkt (sentence splitter) if not already present
nltk.download("punkt", quiet=True)


# ─── 2) Item 7 Extractor ─────────────────────────────────────────────────
def extract_item7_from_10k(filepath: str, skip_chars: int = 18000) -> str:
    """
    Reads a raw EDGAR 10-K text file and returns the Item 7 section
    (“Management’s Discussion & Analysis…”) as a single string.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw_text = f.read()

    text_to_search = raw_text[skip_chars:]
    start_pattern = re.compile(r"(?m)^ITEM\s+7(?:\s*[\.\-]|\s).*", re.IGNORECASE)
    m_start = start_pattern.search(text_to_search)
    if not m_start:
        return None
    start_idx = m_start.end()

    end_pattern = re.compile(
        r"(?m)^(ITEM\s+7A(?:\s*[\.\-]|\s).*|ITEM\s+8(?:\s*[\.\-]|\s).*)",
        re.IGNORECASE
    )
    m_end = end_pattern.search(text_to_search, pos=start_idx)
    if not m_end:
        return None
    end_idx = m_end.start()

    return text_to_search[start_idx:end_idx].strip()


# ─── 3) Chunking Utility ────────────────────────────────────────────────
def chunk_text_for_finbert(full_text: str, tokenizer, max_tokens: int = 510) -> list[str]:
    """
    Splits `full_text` into strings of ≤ max_tokens tokens each,
    by sentence‐splitting and accumulating until the limit is hit.
    """
    sentences = sent_tokenize(full_text)
    chunks = []
    current_chunk = []
    current_len = 0

    for sent in sentences:
        sent_tokens = tokenizer.tokenize(sent)
        sent_len = len(sent_tokens)
        if current_len + sent_len > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sent]
            current_len = sent_len
        else:
            current_chunk.append(sent)
            current_len += sent_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


# ─── 4) Main: Sentiment Function ─────────────────────────────────────────
def get_sentiment_for(
    ticker: str,
    quarter: str,
    raw_folder: str = "raw_data/edgar_filings_2024_QTR4",
    model_name: str = "ProsusAI/finbert"
) -> dict:
    """
    1) Finds the 10-K file for <ticker>_<quarter> in `raw_folder/`.
    2) Extracts Item 7 (MD&A).
    3) Splits it into ≤510-token chunks.
    4) Runs FinBERT on each chunk.
    5) Returns aggregated sentiment scores.
    """
    # 4.1) Locate the 10-K file (e.g. “AAPL_2023Q3_10-K.txt”)
    pattern = os.path.join(raw_folder, f"{ticker}_*{quarter}*_10-K.txt")
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(
            f"No 10-K file found for ticker={ticker}, quarter={quarter} in {raw_folder}/"
        )
    txt_path = matches[0]

    # 4.2) Extract Item 7 text
    item7_text = extract_item7_from_10k(txt_path, skip_chars=18000)
    if not item7_text:
        raise ValueError(f"Could not extract MD&A (Item 7) from {txt_path}")

    # 4.3) Load FinBERT model + tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model     = AutoModelForSequenceClassification.from_pretrained(model_name)
    nlp_pipe  = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    # 4.4) Chunk MD&A into ≤510-token pieces
    chunks = chunk_text_for_finbert(item7_text, tokenizer, max_tokens=510)

    # 4.5) Run FinBERT on each chunk and collect scores
    all_pos = []
    all_neu = []
    all_neg = []

    for chunk in tqdm(chunks, desc=f"Scoring {ticker} {quarter}", leave=False):
        out = nlp_pipe(chunk)
        scores = {d["label"].lower(): d["score"] for d in out}
        all_pos.append(scores.get("positive", 0.0))
        all_neu.append(scores.get("neutral",  0.0))
        all_neg.append(scores.get("negative", 0.0))

    # 4.6) Compute averages & net sentiment
    avg_pos = sum(all_pos) / len(all_pos)
    avg_neu = sum(all_neu) / len(all_neu)
    avg_neg = sum(all_neg) / len(all_neg)
    net_sent = avg_pos - avg_neg

    result = {
        "ticker":        ticker.upper(),
        "quarter":       quarter.upper(),
        "num_chunks":    len(chunks),
        "avg_positive":  round(avg_pos,  4),
        "avg_neutral":   round(avg_neu,   4),
        "avg_negative":  round(avg_neg,   4),
        "net_sentiment": round(net_sent,  4)
    }

    print(f"\n=== Sentiment for {ticker.upper()} {quarter.upper()} ===")
    print(f"Chunks analyzed : {result['num_chunks']}")
    print(f"Avg positive    : {result['avg_positive']}")
    print(f"Avg neutral     : {result['avg_neutral']}")
    print(f"Avg negative    : {result['avg_negative']}")
    print(f"Net sentiment   : {result['net_sentiment']}")

    return result

In [21]:
company_ticker = "APPLE"
company_quarter = ""
sentiment_output = get_sentiment_for(company_ticker, company_quarter)

FileNotFoundError: No 10-K file found for ticker=APPLE, quarter= in raw_data/edgar_filings_2024_QTR4/

In [20]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Download punkt for sentence splitting (only need to run once)
nltk.download('punkt')

# Your full MD&A text goes here. For example:
text = """
Low interest rates lead to increased borrowing
"""

# Load FinBERT model and tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Step 1: Split the text into sentences
sentences = sent_tokenize(text)

# Step 2: Chunk sentences without exceeding the token limit
max_len = 510  # Maximum number of tokens per chunk (leave room for special tokens)
chunks = []
current_chunk = []
current_length = 0

for sent in sentences:
    sent_tokens = tokenizer.tokenize(sent)
    sent_len = len(sent_tokens)

    # If adding this sentence stays under max_len, append it
    if current_length + sent_len <= max_len:
        current_chunk.append(sent)
        current_length += sent_len
    else:
        # Save the current chunk and start a new one
        chunks.append(" ".join(current_chunk))
        current_chunk = [sent]
        current_length = sent_len

# Add the final chunk (if any)
if current_chunk:
    chunks.append(" ".join(current_chunk))

# Step 3: Run FinBERT sentiment analysis on each chunk
results = []
for i, chunk in enumerate(chunks):
    result = nlp(chunk)
    print(f"Chunk {i+1} sentiment:", result)
    results.append(result)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshverma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0


Chunk 1 sentiment: [{'label': 'positive', 'score': 0.6094936728477478}]


In [5]:
import os
import re
import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------------------------------------------------------
# 1) Load FinBERT model/tokenizer
# -----------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

# -----------------------------------------------------------------------------
# 2) Load your core metadata: (cik, filing_type, filename), but keep only 10-Q
# -----------------------------------------------------------------------------
df_meta = pd.read_csv("core_table_v2.csv", dtype=str)[
    ["cik", "filing_type", "filename"]
].copy()

# Normalize filing_type (strip whitespace, uppercase)
df_meta["filing_type"] = df_meta["filing_type"].str.strip().str.upper()

# Keep only rows where filing_type == "10-Q"
df_meta_10q = df_meta[df_meta["filing_type"] == "10-Q"].reset_index(drop=True)
print(f"Loaded core_table_v2.csv → {len(df_meta_10q)} rows with filing_type = 10-Q")

# -----------------------------------------------------------------------------
# 3) Load the combined MD&A file (item2_extracted_sec.csv), which has both 10-Q and 10-K.
#    We only want the 10-Q rows, so we filter on 'filename' containing "_10-Q_".
#
#    item2_extracted_sec.csv columns (example): [date, cik, filename, item2, …]
# -----------------------------------------------------------------------------
path_10q = "item2_extracted_sec.csv"
if not os.path.exists(path_10q):
    raise FileNotFoundError(f"Could not find '{path_10q}' in working folder")

# Read in all rows
df_item2 = pd.read_csv(path_10q, dtype=str)

# Keep only those where filename contains "_10-Q_" (i.e. actual 10-Q filings).
df_10q_only = df_item2[df_item2["filename"].str.contains("_10-Q_")].copy()

# Now reduce to exactly the columns we need: cik, filename, and the MD&A text is in 'item2'
df_10q_only = df_10q_only[["cik", "filename", "item2"]].rename(columns={"item2": "mda_text"})
print(f"Loaded item2_extracted_sec.csv → {len(df_10q_only)} total rows with '_10-Q_' in filename")

# -----------------------------------------------------------------------------
# 4) Merge metadata + MD&A on (cik, filename) ONLY
#    (We drop filing_type from the merge keys, because we've already filtered both sides to 10-Q.)
# -----------------------------------------------------------------------------
df_merged = pd.merge(
    df_meta_10q[["cik", "filename"]],  # core metadata, only 10-Q rows
    df_10q_only,                       # pre-extracted MD&A for 10-Q
    how="left",
    on=["cik", "filename"]
)

missing_md = df_merged["mda_text"].isna().sum()
print(f"{missing_md} out of {len(df_merged)} 10-Q filings have no MD&A text in item2_extracted_sec.csv")

# -----------------------------------------------------------------------------
# 5) Run FinBERT on each row’s mda_text
# -----------------------------------------------------------------------------
results = []
for idx, row in df_merged.iterrows():
    cik      = row["cik"]
    filename = row["filename"]
    item2 = row["item2"]

    if not isinstance(item2, str) or item2.strip() == "":
        # Skip if no MD&A text
        print(f"  → SKIPPING (item2) for: CIK={cik}, filename={filename}")
        continue

    # Split into “paragraphs” by blank lines (drop any line shorter than 50 chars)
    paragraphs = [
        para.strip()
        for para in re.split(r"\n+", mda_text)
        if len(para.strip()) > 50
    ]

    chunk_sentiments = []
    for para in paragraphs:
        encoded = tokenizer(
            para,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            out   = model(**encoded)
            probs = softmax(out.logits, dim=1).squeeze().tolist()
            # ProsusAI/finbert’s output order is [positive, negative, neutral]
            chunk_sentiments.append({
                "positive": probs[0],
                "negative": probs[1],
                "neutral":  probs[2],
            })

    if chunk_sentiments:
        avg_pos = sum(d["positive"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neg = sum(d["negative"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neu = sum(d["neutral"]  for d in chunk_sentiments) / len(chunk_sentiments)
    else:
        # In case the MD&A text was present but every paragraph < 50 chars, mark as None
        avg_pos = avg_neg = avg_neu = None

    results.append({
        "cik": cik,
        "filename": filename,
        "num_chunks": len(chunk_sentiments),
        "avg_positive": avg_pos,
        "avg_negative": avg_neg,
        "avg_neutral":  avg_neu
    })

# -----------------------------------------------------------------------------
# 6) Save final sentiment scores to CSV
# -----------------------------------------------------------------------------
df_out = pd.DataFrame(results)
df_out.to_csv("filings_sentiment_10Q.csv", index=False)
print("Done. Wrote 'filings_sentiment_10Q.csv'.")


Loaded core_table_v2.csv → 48 rows with filing_type = 10-Q
Loaded item2_extracted_sec.csv → 372 total rows with '_10-Q_' in filename
48 out of 48 10-Q filings have no MD&A text in item2_extracted_sec.csv


KeyError: 'item2'

In [None]:
import re
import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------------------------------------------------------
# 1) Load FinBERT model + tokenizer
# -----------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

# -----------------------------------------------------------------------------
# 2) Load your MD&A CSV: item2_extracted_sec.csv
#    It must have at least these three columns:
#       - 'cik'
#       - 'filename'
#       - 'item2'   (the full MD&A text)
#
#    If it has extra columns (date, exchange, etc.) that's fine; we ignore those.
# -----------------------------------------------------------------------------
path_mda = "item2_extracted_sec.csv"
df_mda   = pd.read_csv(path_mda, dtype=str)

# Sanity check: do we see those columns?
required_cols = {"cik", "filename", "item2"}
missing = required_cols - set(df_mda.columns)
if missing:
    raise KeyError(f"Missing columns in {path_mda}: {missing}")

# If some rows are 10-K instead of 10-Q, skip them:
# (We assume all 10-Q filenames contain the substring "_10-Q_")
df_mda_10q = df_mda[df_mda["filename"].str.contains("_10-Q_")].copy().reset_index(drop=True)
print(f"Loaded {len(df_mda)} total rows; {len(df_mda_10q)} rows appear to be 10-Q (filename contains '_10-Q_').")

# -----------------------------------------------------------------------------
# 3) Iterate row by row, run FinBERT on each 'item2' (MD&A) field
# -----------------------------------------------------------------------------
results = []
for idx, row in df_mda_10q.iterrows():
    cik      = row["cik"]
    filename = row["filename"]
    mda_text = row["item2"]

    if not isinstance(mda_text, str) or mda_text.strip() == "" or len(mda_text.strip()) < 20:
        # If MD&A is empty or ridiculously short, skip
        print(f"  → SKIPPING (empty/short MD&A) for: CIK={cik}, filename={filename}")
        continue

    # Split the MD&A into “paragraphs” by blank lines, and drop any piece < 50 chars
    paragraphs = [
        para.strip()
        for para in re.split(r"\n+", mda_text)
        if len(para.strip()) > 50
    ]

    # If you prefer sentence‐splitting or sliding windows, swap this out; this is just a simple example.
    chunk_sentiments = []
    for para in paragraphs:
        encoded = tokenizer(
            para,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            out   = model(**encoded)
            probs = softmax(out.logits, dim=1).squeeze().tolist()
            # ProsusAI/finbert’s logits order is [positive, negative, neutral]
            chunk_sentiments.append({
                "positive": probs[0],
                "negative": probs[1],
                "neutral":  probs[2],
            })

    if chunk_sentiments:
        avg_pos = sum(d["positive"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neg = sum(d["negative"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neu = sum(d["neutral"]  for d in chunk_sentiments) / len(chunk_sentiments)
    else:
        # If every “paragraph” was < 50 characters, we mark None (or 0) as you wish
        avg_pos = avg_neg = avg_neu = None

    results.append({
        "cik": cik,
        "filename": filename,
        "num_chunks": len(chunk_sentiments),
        "avg_positive": avg_pos,
        "avg_negative": avg_neg,
        "avg_neutral":  avg_neu
    })

# -----------------------------------------------------------------------------
# 4) Dump final sentiment scores to CSV
# -----------------------------------------------------------------------------
df_out = pd.DataFrame(results)
df_out.to_csv("item2_sentiment_10Q.csv", index=False)
print("Done. Wrote 'item2_sentiment_10Q.csv' with", len(df_out), "rows.")


Loaded 406 total rows; 372 rows appear to be 10-Q (filename contains '_10-Q_').
  → SKIPPING (empty/short MD&A) for: CIK=827052, filename=20241029_10-Q_edgar_data_827052_0000827052-24-000075.txt


In [1]:
import os
import re
import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------------------------------------------------------
# 1) Load FinBERT model/tokenizer
# -----------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model     = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

# -----------------------------------------------------------------------------
# 2) Load your core metadata: (cik, filing_type, filename), but keep only 10-Q
# -----------------------------------------------------------------------------
df_meta = pd.read_csv("core_table_v2.csv", dtype=str)[
    ["cik", "filing_type", "filename"]
].copy()

# Normalize filing_type (strip whitespace, uppercase)
df_meta["filing_type"] = df_meta["filing_type"].str.strip().str.upper()

# Keep only rows where filing_type == "10-Q"
df_meta_10q = df_meta[df_meta["filing_type"] == "10-Q"].reset_index(drop=True)
print(f"Loaded core_table_v2.csv → {len(df_meta_10q)} rows with filing_type = 10-Q")

# -----------------------------------------------------------------------------
# 3) Load the combined MD&A file (item2_extracted_sec.csv), which has both 10-Q and 10-K.
#    We only want the 10-Q rows, so we filter on 'filename' containing "_10-Q_".
#
#    item2_extracted_sec.csv columns (example): [date, cik, filename, item2, …]
# -----------------------------------------------------------------------------
path_10q = "item2_extracted_sec.csv"
if not os.path.exists(path_10q):
    raise FileNotFoundError(f"Could not find '{path_10q}' in working folder")

# Read in all rows
df_all_mda = pd.read_csv(path_10q, dtype=str)

# Keep only those where filename contains "_10-Q_" (i.e. actual 10-Q filings).
df_10q_only = df_all_mda[df_all_mda["filename"].str.contains("_10-Q_")].copy()

# Now reduce to exactly the columns we need: cik, filename, and the MD&A text is in 'item2'
df_10q_only = df_10q_only[["cik", "filename", "item2"]].rename(columns={"item2": "mda_text"})
print(f"Loaded item2_extracted_sec.csv → {len(df_10q_only)} total rows with '_10-Q_' in filename")

# -----------------------------------------------------------------------------
# 4) Merge metadata + MD&A on (cik, filename) ONLY
#    (We drop filing_type from the merge keys, because we've already filtered both sides to 10-Q.)
# -----------------------------------------------------------------------------
df_merged = pd.merge(
    df_meta_10q[["cik", "filename"]],  # core metadata, only 10-Q rows
    df_10q_only,                       # pre-extracted MD&A for 10-Q
    how="left",
    on=["cik", "filename"]
)

missing_md = df_merged["mda_text"].isna().sum()
print(f"{missing_md} out of {len(df_merged)} 10-Q filings have no MD&A text in item2_extracted_sec.csv")

# -----------------------------------------------------------------------------
# 5) Run FinBERT on each row’s mda_text
# -----------------------------------------------------------------------------
results = []
for idx, row in df_merged.iterrows():
    cik      = row["cik"]
    filename = row["filename"]
    mda_text = row["mda_text"]

    if not isinstance(mda_text, str) or mda_text.strip() == "":
        # Skip if no MD&A text
        print(f"  → SKIPPING (no MD&A) for: CIK={cik}, filename={filename}")
        continue

    # Split into “paragraphs” by blank lines (drop any line shorter than 50 chars)
    paragraphs = [
        para.strip()
        for para in re.split(r"\n+", mda_text)
        if len(para.strip()) > 50
    ]

    chunk_sentiments = []
    for para in paragraphs:
        encoded = tokenizer(
            para,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            out   = model(**encoded)
            probs = softmax(out.logits, dim=1).squeeze().tolist()
            # ProsusAI/finbert’s output order is [positive, negative, neutral]
            chunk_sentiments.append({
                "positive": probs[0],
                "negative": probs[1],
                "neutral":  probs[2],
            })

    if chunk_sentiments:
        avg_pos = sum(d["positive"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neg = sum(d["negative"] for d in chunk_sentiments) / len(chunk_sentiments)
        avg_neu = sum(d["neutral"]  for d in chunk_sentiments) / len(chunk_sentiments)
    else:
        # In case the MD&A text was present but every paragraph < 50 chars, mark as None
        avg_pos = avg_neg = avg_neu = None

    results.append({
        "cik": cik,
        "filename": filename,
        "num_chunks": len(chunk_sentiments),
        "avg_positive": avg_pos,
        "avg_negative": avg_neg,
        "avg_neutral":  avg_neu
    })

# -----------------------------------------------------------------------------
# 6) Save final sentiment scores to CSV
# -----------------------------------------------------------------------------
df_out = pd.DataFrame(results)
df_out.to_csv("filings_sentiment_10Q.csv", index=False)
print("Done. Wrote 'filings_sentiment_10Q.csv'.")

Loaded core_table_v2.csv → 48 rows with filing_type = 10-Q
Loaded item2_extracted_sec.csv → 372 total rows with '_10-Q_' in filename
48 out of 48 10-Q filings have no MD&A text in item2_extracted_sec.csv
  → SKIPPING (no MD&A) for: CIK=0001090872, filename=20220303_10-Q_edgar_data_1090872_0001090872-22-000007.txt
  → SKIPPING (no MD&A) for: CIK=0000006281, filename=20220216_10-Q_edgar_data_6281_0000006281-22-000020.txt
  → SKIPPING (no MD&A) for: CIK=0000006951, filename=20220224_10-Q_edgar_data_6951_0000006951-22-000011.txt
  → SKIPPING (no MD&A) for: CIK=0001730168, filename=20220310_10-Q_edgar_data_1730168_0001730168-22-000029.txt
  → SKIPPING (no MD&A) for: CIK=0000014693, filename=20220303_10-Q_edgar_data_14693_0000014693-22-000022.txt
  → SKIPPING (no MD&A) for: CIK=0000711404, filename=20220304_10-Q_edgar_data_711404_0000711404-22-000015.txt
  → SKIPPING (no MD&A) for: CIK=0000016732, filename=20220309_10-Q_edgar_data_16732_0000016732-22-000014.txt
  → SKIPPING (no MD&A) for: CI