## Setup

In [None]:
import sys
from pathlib import Path
import importlib

# Make the repository root importable so the `src.*` packages resolve.
# NOTE(review): assumes the kernel's working directory is a direct child of the
# project root (e.g. a `notebooks/` folder) — confirm if launched from elsewhere.
project_root = Path.cwd().parent
_root_entry = str(project_root)
# Drop any earlier copies before inserting, so re-running this cell keeps the
# root at the front of sys.path without stacking duplicate entries.
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

# Import the analyst module first (same load order as before), then the setup class.
import src.analysis.analyst_module
from src.document.abstract_classes.setup_module import SentimentSetup
import toml

# Reload the analyst module so code edits on disk are picked up without a
# kernel restart, then bind `Analyst` from the freshly reloaded module.
importlib.reload(src.analysis.analyst_module)
from src.analysis.analyst_module import Analyst

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arete\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Test 1: Sentiment-Only Batch Processing

Run sentiment analysis on all earnings calls without keyword matching.

In [None]:
# Test 1: score sentiment for every transcript in a directory, with keyword
# matching switched off.
banner = "=" * 80
print(banner)
print("TEST 1: Sentiment-only batch processing")
print(banner)

# Paths for this run, all resolved relative to the project root.
data_root = project_root / "data"
lexicon_file = data_root / "word_sets" / "Garcia_MLWords.xlsx"
transcripts_dir = data_root / "earnings_calls" / "2016"
results_dir = project_root / "results"

# Sentiment configuration: word-list workbook (positive/negative sheets) plus a
# Hugging Face model name.
setup_options = {
    "sheet_name_positive": 'ML_positive_unigram',
    "sheet_name_negative": 'ML_negative_unigram',
    "file_path": str(lexicon_file),
    "hf_model": 'cardiffnlp/twitter-roberta-base-sentiment-latest',
    # NOTE(review): -1 presumably selects the CPU — confirm against SentimentSetup.
    "device": -1,
}
sentiment_setup = SentimentSetup(**setup_options)

# No keyword_path is passed: this analyst is used for sentiment only.
analyst = Analyst(setup=sentiment_setup)

# Process the whole directory; run_matching=False skips keyword matching.
result = analyst.process_directory(
    input_dir=str(transcripts_dir),
    output_dir=str(results_dir),
    run_matching=False,
)

# Keep the batch directory so the next test can reuse it (or set it by hand there).
batch_dir = result['batch_directory']

print(f"\n✓ Batch directory created: {batch_dir}")
print(f"✓ Batch summary CSV: {result['csv_path']}")
print(f"✓ Files processed: {result['num_files_processed']}")

TEST 1: Sentiment-only batch processing


KeyboardInterrupt: 

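## Test 2: Keyword Matching on Existing Batch

Run direct keyword matching against an existing batch directory, without re-running sentiment analysis.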

In [None]:
# Test 2: run direct keyword matching over a batch directory that already
# exists on disk; no sentiment setup is involved.
print("=" * 80)
print("TEST 2: Direct matching on existing batch")
print("=" * 80)

# Existing batch directory to match against (adjust the path as needed).
# It should contain subdirectories, each representing one earnings call.
existing_batch_dir = str(project_root / "results" / "batch_20251030_031511-20251030T201729Z-1-001" / "batch_20251030_031511")

# Or reuse the directory produced by Test 1, if it ran in this session:
# existing_batch_dir = batch_dir

# Fail fast: the path above is hard-coded and timestamped, so it will not exist
# on another machine or after a new batch run.
if not Path(existing_batch_dir).is_dir():
    raise FileNotFoundError(
        f"Batch directory not found: {existing_batch_dir}. "
        "Point existing_batch_dir at an existing batch or run Test 1 first."
    )

# Matching-only run, so the analyst is created without a sentiment setup.
analyst = Analyst()

# Keyword list to match against the transcripts.
keyword_path = str(project_root / "data" / "paper_word_sets" / "political_words.csv")

# similarity="direct" requests exact matching only.
result = analyst.process_existing_batch(
    batch_dir=existing_batch_dir,
    keyword_path=keyword_path,
    similarity="direct",
    transcript_roots=[project_root / "data" / "earnings_calls" / "2016"],
)

print("\n✓ Matching completed!")
print(f"✓ Match ID: {result['match_id']}")
print(f"✓ Files processed: {result['num_files_processed']}")
print(f"✓ Exposure summary CSV: {result['exposure_summary_csv']}")

# A run that touches no files would otherwise look like a success.
if result['num_files_processed'] == 0:
    print(
        "WARNING: 0 files were processed. Check that existing_batch_dir, "
        "keyword_path and transcript_roots point at matching data."
    )


TEST 2: Direct matching on existing batch
Using device: cpu


Matching keywords:   0%|          | 0/26019 [00:00<?, ?file/s]


✓ Matching completed!
✓ Match ID: political_words_20251111_194350
✓ Files processed: 0
✓ Exposure summary CSV: c:\Users\arete\Cursor\research\results\batch_20251030_031511-20251030T201729Z-1-001\batch_20251030_031511\exposure_summary_political_words_20251111_194350.csv
