In [None]:
# Start from /content and delete any previous checkout so the clone in the
# next cell does not collide with an existing ./research directory.
%cd /content
!rm -rf research

In [None]:
# Fetch the project repository; git creates a ./research directory under the
# current working directory.
!git clone https://github.com/cwruquants/research.git

In [None]:
# Work from the repository root: a later cell derives project_root from the
# current working directory.
%cd /content/research/

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if it can, which one
# sits at index 0.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
    print("GPU name:", gpu_name)

In [None]:
!pip -q install nltk textstat pysentiment2 datasets accelerate ipywidgets word-forms
import nltk
nltk.download('punkt_tab')

In [None]:
import sys

from pathlib import Path


# Put the repository checkout at the front of the import search path.
# The working directory is expected to be /content/research at this point
# (set by the earlier `%cd`), so the cwd doubles as the project root.
project_root = Path.cwd()
sys.path[:0] = [str(project_root)]
print(f"Project root: {project_root}")

from research.src.analysis.analyst_module import Analyst
from research.src.document.abstract_classes.setup_module import SentimentSetup
import toml

In [None]:
# Sentiment-only batch run over one year of earnings-call transcripts stored
# on Google Drive. Everything a reader is likely to change is in the constants
# directly below.
YEAR = "2003"
INPUT_DIR = Path("/content/drive/MyDrive/QUANTS/RESEARCH/EARNINGS_CALLS") / YEAR
OUTPUT_DIR = Path("/content/drive/MyDrive/QUANTS/RESEARCH/RESULTS")

# Fail fast, before any analysis objects are built: the input lives under
# /content/drive, which only exists once Google Drive has been mounted.
if not INPUT_DIR.is_dir():
    raise FileNotFoundError(
        f"Input directory not found: {INPUT_DIR}. "
        "Mount Google Drive at /content/drive before running this cell."
    )

# Setup configuration for sentiment analysis
setup = SentimentSetup(
    sheet_name_positive='ML_positive_unigram',
    sheet_name_negative='ML_negative_unigram',
    ml_wordlist_path=str(project_root / "data" / "word_sets" / "Garcia_MLWords.xlsx"),
    # NOTE(review): presumably a CUDA device index; this will likely fail on a
    # CPU-only runtime -- confirm how SentimentSetup interprets `device`.
    device=0,
    batch_size="auto",
    hf_model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

# Create analyst (no keyword_path needed for sentiment-only)
analyst = Analyst(setups=[setup])

# Not used by this sentiment-only run; kept so a later matching run can pass it.
keyword_path = "/content/research/data/word_sets/risk_paper.csv"

result = analyst.process_directory(
    input_dir=str(INPUT_DIR),
    output_dir=str(OUTPUT_DIR),
    batch_folder_name=YEAR,  # same year as INPUT_DIR, so the two cannot drift apart
    run_sentiment=True,
    matching_method=None,  # Skip matching for now
)

# Matching is skipped above, so report generic completion rather than
# "Matching completed!".
print("\nProcessing completed!")
# NOTE(review): 'match_id' and 'exposure_summary_csv' look matching-related;
# confirm process_directory still returns them when matching_method is None.
print(f"Match ID: {result['match_id']}")
print(f"Files processed: {result['num_files_processed']}")
print(f"Exposure summary CSV: {result['exposure_summary_csv']}")