## Setup

In [1]:
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root and
# put it at the front of the import search path so `src.*` packages resolve.
working_dir = Path.cwd()
project_root = working_dir.parent
sys.path[:0] = [str(project_root)]

from src.analysis.analyst_module import Analyst
from src.document.abstract_classes.setup_module import Setup
import toml

[nltk_data] Downloading package punkt to /Users/efang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Test 1: Sentiment-Only Batch Processing

Run sentiment analysis on every earnings call in `data/earnings_calls/2016`, without keyword matching.

In [None]:
# Banner: rule, title, rule — one item per output line.
print("=" * 80, "TEST 1: Sentiment-only batch processing", "=" * 80, sep="\n")

# Every location this test touches, resolved against the project root.
word_list_path = project_root / "data" / "word_sets" / "Garcia_MLWords.xlsx"
calls_dir = project_root / "data" / "earnings_calls" / "2016"
results_dir = project_root / "results"

# Sentiment configuration: the positive/negative word sheets inside the
# workbook, plus the Hugging Face model to load.
setup = Setup(
    file_path=str(word_list_path),
    sheet_name_positive='ML_positive_unigram',
    sheet_name_negative='ML_negative_unigram',
    hf_model='cardiffnlp/twitter-roberta-base-sentiment-latest',
    device=-1,  # -1 -> run the model on CPU
)

# A sentiment-only analyst is built from the setup alone; no keyword_path.
analyst = Analyst(setup=setup)

# Process the whole input directory with keyword matching switched off
# (noted in the original cell as the default behavior).
result = analyst.fit_directory(
    run_matching=False,
    input_dir=str(calls_dir),
    output_dir=str(results_dir),
)

# Report where the batch landed and how many files went into it.
print(f"\n✓ Batch directory created: {result['batch_directory']}")
print(f"✓ Batch summary CSV: {result['csv_path']}")
print(f"✓ Files processed: {result['num_files_processed']}")

# Keep the batch directory around for the keyword-matching test; it can also
# be assigned by hand to point at a batch from an earlier run.
batch_dir = result['batch_directory']

TEST 1: Sentiment-only batch processing


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


## Test 2: Keyword Matching on Existing Batch

Run keyword matching against the batch directory (`batch_dir`) produced by Test 1.


In [None]:
# Banner: rule, title, rule — one item per output line.
print("=" * 80, "TEST 2: Keyword matching on existing batch", "=" * 80, sep="\n")

# Matching-only runs use a bare Analyst; no Setup is passed.
analyst = Analyst()

# Keyword list to match, located under the project root and handed over as a
# plain string path.
keyword_file = project_root / "data" / "paper_word_sets" / "political_words.csv"
keyword_path = str(keyword_file)

# Match the keywords against the batch stored in `batch_dir` (set by the
# sentiment test, or assigned manually), using cosine similarity.
result1 = analyst.match_existing_batch(
    similarity="cosine",
    keyword_path=keyword_path,
    batch_dir=batch_dir,
)
