In [5]:
cd ..

/Users/efang/Desktop/coding/research/src


In [6]:
import functions

In [1]:
from sentence_transformers import SentenceTransformer, util

# Query word and the candidate words to rank against it
target_word = "king"
word_list = ["queen", "apple", "monarch", "banana", "ruler"]

# Pre-trained sentence-transformer used to embed every word
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the query (a single string) and the candidates (a list) as tensors
target_embedding = model.encode(target_word, convert_to_tensor=True)
word_embeddings = model.encode(word_list, convert_to_tensor=True)

# cos_sim returns a matrix with one row per query; take the only row
cosine_scores = util.cos_sim(target_embedding, word_embeddings)[0]

# (word, score) pairs ordered from most to least similar
results = sorted(
    zip(word_list, cosine_scores.tolist()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ranking, one word per line
for word, score in results:
    print(f"{word}: {score:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


monarch: 0.7360
queen: 0.6807
banana: 0.3950
ruler: 0.3564
apple: 0.2432


In [9]:
import re
from collections import defaultdict
from typing import List, Dict, Optional

from sentence_transformers import SentenceTransformer, util


def extract_exposure_3(
    text: str,
    keywords: List[str],
    threshold: float = 0.7,
    window: int = 5,
    model: "Optional[SentenceTransformer]" = None,
) -> Dict[str, List[str]]:
    """
    Extract context windows around words that are semantically similar to any
    keyword, using cosine similarity on sentence-transformer embeddings.

    Each distinct (lower-cased) word of the text is embedded once and compared
    with every keyword. A word counts as a match when its best cosine
    similarity over all keywords is >= ``threshold``; it is then attributed to
    that single closest keyword.

    Args:
        text (str): Input text. It is tokenized with the regex ``\\w+``
            (runs of letters, digits and underscores); punctuation is dropped.
        keywords (list[str]): Seed keywords (case-insensitive).
        threshold (float): Cosine-similarity cut-off. Higher = stricter.
        window (int): Number of tokens to include on each side of a match.
        model (SentenceTransformer | None): Optional pre-loaded model. If None,
            'all-MiniLM-L6-v2' is loaded on the first call that needs it and
            cached on the function object, so later calls reuse it.

    Returns:
        dict[str, list[str]]: Maps each lower-cased keyword that had at least
        one match to a list of context strings (one per matching token
        occurrence, in text order). Each context is the matched token plus up
        to ``window`` tokens on either side, joined by single spaces in their
        original casing. Returns ``{}`` when the text contains no tokens or
        ``keywords`` is empty.
    """
    # Lower-case keywords up front; matching is case-insensitive.
    kw_lower = [k.lower() for k in keywords]

    # Tokenize the text into "words" (runs of \w characters).
    words = re.findall(r"\w+", text)
    words_lower = [w.lower() for w in words]

    # Nothing to compare: return before loading or calling the model.
    # An empty side would otherwise reach cos_sim / max with empty embeddings.
    if not kw_lower or not words:
        return {}

    if model is None:
        # Loading the model is slow, so keep one default instance on the
        # function object instead of rebuilding it on every call.
        model = getattr(extract_exposure_3, "_default_model", None)
        if model is None:
            model = SentenceTransformer("all-MiniLM-L6-v2")
            extract_exposure_3._default_model = model

    # --- 1. Embed keywords and the unique words in the text ---------------
    kw_emb = model.encode(kw_lower, convert_to_tensor=True)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # batch handed to the encoder is the same from run to run.
    uniq_words = list(dict.fromkeys(words_lower))
    uw_emb = model.encode(uniq_words, convert_to_tensor=True)

    # --- 2. Compute cosine similarities (|uniq_words| x |keywords|) ------
    sim_matrix = util.cos_sim(uw_emb, kw_emb)

    # For every unique word, keep the *closest* keyword and its score.
    # One reduction over the keyword axis replaces a per-row max()/item().
    best_scores, best_idxs = sim_matrix.max(dim=1)
    word2best_kw = {
        word: kw_lower[kw_idx]
        for word, score, kw_idx in zip(
            uniq_words, best_scores.tolist(), best_idxs.tolist()
        )
        if score >= threshold
    }

    # --- 3. Scan the original token stream and collect contexts ----------
    contexts = defaultdict(list)
    for idx, token_low in enumerate(words_lower):
        kw = word2best_kw.get(token_low)
        if kw is None:
            continue
        # Clamp the window to the bounds of the token list.
        start = max(0, idx - window)
        end = min(len(words), idx + window + 1)
        contexts[kw].append(" ".join(words[start:end]))

    return dict(contexts)


In [12]:
from functions import extract_text, extract_exposure, extract_exposure2, csv_to_list, sentiment_score, extract_company_info, calculate_risk_word_percentage
import logging
import os
import json
from pathlib import Path
from typing import List, Union

def model5v1(analyze_path, exposure_csv, n: int, threshold: float = 0.7, model=None):
    """
    Model 5 Version 1 Pipeline:
    - text extraction from earnings call
    - exposure csv to exposure word list
    - embedding-based exposure search (extract_exposure_3) keeping +-n tokens
      of context around each match
    - sentiment analysis on found exposure

    Args:
        analyze_path: Location of the earnings-call file, passed unchanged to
            ``extract_text``.
        exposure_csv: Location of the exposure word-list CSV, passed unchanged
            to ``csv_to_list``.
        n (int): Number of tokens of context kept on each side of a match
            (forwarded as ``window``).
        threshold (float): Cosine-similarity cut-off forwarded to
            ``extract_exposure_3``. The default matches that function's own
            default, so existing three-argument calls behave as before.
        model: Optional pre-loaded SentenceTransformer forwarded to
            ``extract_exposure_3``. Pass one when analysing many files so the
            model is not reloaded for each call; None keeps the callee's
            default behaviour.

    Returns:
        Whatever ``sentiment_score`` returns for the extracted contexts. In the
        recorded run this is a dict mapping each matched keyword to a list of
        dicts with 'text', 'label', 'score' and 'numeric_score' keys.
    """
    # basicConfig only configures the root logger if it has no handlers yet,
    # so repeated calls to this function do not stack handlers.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info("Extracting Text...")
    text = extract_text(analyze_path)

    logging.info("Loading Exposure Word List...")
    exposure_word_list = csv_to_list(exposure_csv)

    logging.info("Calculating Exposure...")
    exposure = extract_exposure_3(
        text,
        exposure_word_list,
        threshold=threshold,
        window=n,
        model=model,
    )

    logging.info("Finding Sentiment...")
    return sentiment_score(exposure)

In [13]:
# Paths are relative to the `src/` working directory (the notebook changes into
# it with `cd ..` in its first cell) rather than hard-coded to one user's home
# directory, so the notebook can run on another machine or checkout.
DATA_DIR = Path("data")

model5v1(
    str(DATA_DIR / "earnings_calls" / "ex1.xml"),
    str(DATA_DIR / "paper_word_sets" / "political_words.csv"),
    5,
)

2025-05-31 11:28:04,878 - INFO - Extracting Text...
2025-05-31 11:28:04,937 - INFO - Loading Exposure Word List...
2025-05-31 11:28:04,939 - INFO - Calculating Exposure...
2025-05-31 11:28:04,940 - INFO - Use pytorch device_name: mps
2025-05-31 11:28:04,941 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.55it/s]
Batches: 100%|██████████| 45/45 [00:00<00:00, 50.33it/s]
2025-05-31 11:28:09,525 - INFO - Finding Sentiment...
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

{'executive': [{'text': 'Ryder System Inc Chairman and CEO Art Garcia Ryder System Inc',
   'label': 'neutral',
   'score': 0.9295322895050049,
   'numeric_score': 0},
  {'text': 'Robert Sanchez Chairman and Chief Executive Officer and Art Garcia Executive',
   'label': 'neutral',
   'score': 0.9297224879264832,
   'numeric_score': 0},
  {'text': 'Executive Officer and Art Garcia Executive Vice President and Chief Financial',
   'label': 'neutral',
   'score': 0.9087579250335693,
   'numeric_score': 0},
  {'text': 'Ryder System Inc Chairman and CEO 3 Good morning everyone and',
   'label': 'positive',
   'score': 0.8000006675720215,
   'numeric_score': 0.8000006675720215},
  {'text': 'Ryder System Inc Chairman and CEO 5 Thanks Art Page 16',
   'label': 'neutral',
   'score': 0.7595441341400146,
   'numeric_score': 0},
  {'text': 'Ryder System Inc Chairman and CEO 7 Thanks Art Turning to',
   'label': 'neutral',
   'score': 0.6535797119140625,
   'numeric_score': 0},
  {'text': 'Ryder S