# Run Instructions

1. Make sure conda is installed
2. In terminal, run 'conda create -n myproject'
3. Activate your environment by running 'conda activate myproject'
4. run 'conda install pip'
5. to get dependencies, run 'pip install -r requirements.txt'

If you are still getting dependency issues, run 
'pip install word_forms sentence_transformers'

After these steps, you should be good to run these! You can select your environment in the upper right hand corner of the jupyter notebook. 

In [1]:
%cd ../../

/Users/efang/Desktop/coding/research


In [2]:
from src.functions.matching.matching_agent import MatchingAgent
from src.abstract_classes.attribute import DocumentAttr
from src.functions.decompose_transcript import extract_presentation_section, extract_qa_section, clean_spoken_content


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/efang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Function to load an xml file into a DocumentAttr object

def load_sample_document(file_path: str) -> DocumentAttr:
    """
    Load a sample XML earnings call transcript and extract its text content
    using the decompose_transcript functions.
    Returns a DocumentAttr object with the text.
    """
    try:
        # Extract presentation and Q&A sections
        presentation_text = extract_presentation_section(file_path)
        qa_text = extract_qa_section(file_path)
        
        # Combine sections
        full_text = presentation_text + "\n\n" + qa_text
        
        # Clean spoken content to remove speaker tags and separators
        cleaned_text = clean_spoken_content(full_text)
        
        return DocumentAttr(document=cleaned_text)
    except Exception as e:
        print(f"Error loading document: {e}")
        return DocumentAttr(document="")

In [4]:
# Initialize Matching Agent

agent = MatchingAgent(
    keywords_file="src/functions/matching/test_keywords.csv",
    document=load_sample_document("data/earnings_calls/ex1.xml")
)

Using device: cpu


In [5]:
# perform cosine similarity

matches = agent.cos_similarity(match_type="hybrid", exclude_duplicates=True)
matches2 = agent.cos_similarity(match_type="hybrid", exclude_duplicates=False)

In [6]:
print(matches)

Cosine Similarity Threshold: 0.7

-------------------- Summary --------------------
Total keywords searched: 4
Total keywords with matches: 2
Total direct matches: 19
Total cosine matches: 13
Total unique matches: 6
Unique matches: ['uncertain', 'impacted', 'impact', 'uncertainty', 'impacting', 'impacts']


Keyword: 'uncertainty' (6 total matches)
  Direct Matches (4):
    - Text: 'uncertainty', Context: 'There's a lot more uncertainty.', Position: 5948
    - Text: 'uncertainty', Context: 'And just given the uncertainty, could you talk about the month-to-month trends that you saw in rental?', Position: 6696
    - Text: 'uncertainty', Context: 'These statements are based on Management's current expectations and are subject to uncertainty and changes in circumstances.', Position: 116
    - Text: 'uncertainty', Context: 'Although commercial rental delivered solid growth, due to a high level of uncertainty regarding the macro environment and somewhat less robust demand conditions with rent

In [7]:
print(matches2)

Cosine Similarity Threshold: 0.7

-------------------- Summary --------------------
Total keywords searched: 4
Total keywords with matches: 2
Total direct matches: 19
Total cosine matches: 54
Total unique matches: 36
Unique matches: ['more uncertainty', 'and uncertain', 'highly uncertain', 'impact that', 'uncertain at', 'uncertainty', 'impacting', 'what impacted', 'its impact', 'the uncertainty', 'negatively impact', 'impacted primarily', 'impacts we', 'of uncertainty', 'impact from', 'impacts from', 'impacting approximately', 'impact', 'and impacts', 'to impact', 'impact on', 'uncertainty you', 'the impact', 'impacted', 'negative impact', 'is impacted', 'impact a', 'uncertain', 'impact in', 'uncertainty and', 'impact of', 'uncertainty could', 'impact to', 'uncertainty regarding', 'to uncertainty', 'impacts']


Keyword: 'uncertainty' (17 total matches)
  Direct Matches (4):
    - Text: 'uncertainty', Context: 'There's a lot more uncertainty.', Position: 5948
    - Text: 'uncertainty', 

In [8]:
agent2 = MatchingAgent(
    keywords_file="data/paper_word_sets/political_words.csv",
    document=load_sample_document("data/earnings_calls/ex1.xml")
)

Using device: cpu


In [9]:
matches = agent2.cos_similarity(match_type="hybrid")

In [18]:
matches.export_to_json("research/results/tester.json")

FileNotFoundError: [Errno 2] No such file or directory: 'research/results/tester.json'

In [19]:
from src.functions.matching.exposure_results import ExposureResults

test_json = ExposureResults.load_json("results/tester.json")

In [None]:
print(test_json)

Cosine Similarity Threshold: 0.7

-------------------- Summary --------------------
Total keywords searched: 57
Total keywords with matches: 7
Total direct matches: 5
Total cosine matches: 11
Total unique matches: 9
Unique matches: ['congress', 'conservatively', 'economy', 'policy', 'conservative', 'economic', 'executive', 'political', 'balance']


Keyword: 'Checks and balances' (2 total matches)
  Cosine Similarity Matches (2):
    - Text: 'balance', Context: 'This will provide us with additional balance sheet flexibility going forward and will be the key driver in restarting anti-dilutive share repurchases.', Score: 0.7201, Position: 3831
    - Text: 'balance', Context: 'We expect to begin repurchases in midyear 2016, but we'll continue to evaluate the appropriate timing, primarily based on our declining balance sheet leverage.', Score: 0.7201, Position: 599

Keyword: 'Congress' (1 total matches)
  Direct Matches (1):
    - Text: 'congress', Context: 'Good morning, and welcome to in 