In [None]:
import pandas as pd
import yfinance as yf
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# --- Robustly set the project root path ---
# The parent of the current working directory is treated as the project root and
# is put on sys.path so the notebook can import from other .py files in the project.
try:
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
except OSError:
    # os.getcwd() raises OSError when the working directory is missing or unreadable.
    # Only that failure is handled here; anything else (including KeyboardInterrupt)
    # is allowed to propagate instead of being silently swallowed.
    print("Could not automatically set project root. Ensure the notebook is in the 'analysis' folder.")
    project_root = '..' # Fallback

# Register the root in both the normal and the fallback case; otherwise the
# fallback value would never reach sys.path and the project imports would fail.
if project_root not in sys.path:
    sys.path.append(project_root)

# --- Import your custom feature engineering functions ---
from analysis.feature_engineering import (
    download_nltk_resources,
    load_mock_data,
    calculate_core_linguistic_features,
    calculate_catalyst_score
)

# --- Download NLTK resources (only needs to run once) ---
# download_nltk_resources is imported from analysis.feature_engineering above;
# which resources it fetches is defined there, not in this notebook.
download_nltk_resources()

# Reached only if every import and the download call above completed without raising.
print("Setup complete. All modules are loaded.")

In [None]:
print("--- Part 1: Validating 'Core' Signal vs. Future Volatility ---")

# Analysis parameters, kept next to the code that uses them.
ROLLING_VOL_WINDOW = 30      # number of daily returns in each rolling volatility estimate
FUTURE_HORIZON_DAYS = 90     # calendar days after a filing that count as its "future" period
TRADING_DAYS_PER_YEAR = 252  # annualisation factor applied to the daily return volatility

try:
    # 1. Load "Core" linguistic features from mock filings
    core_mock_data = load_mock_data('mock_filings.json', project_root)
    core_features_df = calculate_core_linguistic_features(core_mock_data)
    core_features_df['date'] = pd.to_datetime(core_features_df['date'])

    # 2. Get stock data for the required period
    tickers = core_features_df['ticker'].unique().tolist()
    start_date = core_features_df['date'].min() - pd.Timedelta(days=1)
    # We need the full horizon of data *after* the last filing to measure future
    # volatility; one extra day is requested on top of the horizon.
    end_date = core_features_df['date'].max() + pd.Timedelta(days=FUTURE_HORIZON_DAYS + 1)

    raw_prices = yf.download(tickers, start=start_date, end=end_date, auto_adjust=True)

    if raw_prices.empty:
        # Nothing was returned (e.g. unknown or delisted symbols): keep an empty,
        # correctly-shaped frame so the rest of the cell degrades gracefully.
        stock_data_df = pd.DataFrame(columns=['Date', 'ticker', 'Close'])
    else:
        if not isinstance(raw_prices.columns, pd.MultiIndex):
            # Flat columns (single-symbol download): add the ticker level ourselves
            # so the reshaping below is the same for one or many symbols.
            raw_prices.columns = pd.MultiIndex.from_product([raw_prices.columns, tickers[:1]])
        # Name both axes explicitly instead of relying on whatever level names the
        # installed yfinance version emits ('level_1', 'Ticker', ...). This guarantees
        # the long frame always has 'Date' and 'ticker' columns.
        raw_prices.columns = raw_prices.columns.set_names(['field', 'ticker'])
        raw_prices.index = raw_prices.index.rename('Date')
        stock_data_df = raw_prices.stack(level='ticker').reset_index()
        stock_data_df.columns.name = None

    merged_data = []
    if stock_data_df.empty:
        print(f"No price data was returned for tickers: {tickers}")
    else:
        # 3. Calculate historical rolling volatility (annualised std of daily returns)
        stock_data_df['returns'] = stock_data_df.groupby('ticker')['Close'].pct_change()
        stock_data_df['volatility'] = stock_data_df.groupby('ticker')['returns'].transform(
            lambda x: x.rolling(window=ROLLING_VOL_WINDOW).std() * np.sqrt(TRADING_DAYS_PER_YEAR)
        )

        # 4. Merge linguistic features with FUTURE volatility
        # NOTE(review): each rolling value looks back ROLLING_VOL_WINDOW returns, so the
        # values in the first weeks after a filing still include pre-filing returns.
        # Confirm this blend is intended before reading the result as purely "future".
        horizon = pd.Timedelta(days=FUTURE_HORIZON_DAYS)
        for _, row in core_features_df.iterrows():
            call_date = row['date']
            future_period = stock_data_df[
                (stock_data_df['ticker'] == row['ticker']) &
                (stock_data_df['Date'] > call_date) &
                (stock_data_df['Date'] <= call_date + horizon)
            ]

            if not future_period.empty:
                new_row = row.to_dict()
                # mean() skips NaN, i.e. days without a full rolling window are ignored.
                new_row['avg_future_volatility'] = future_period['volatility'].mean()
                merged_data.append(new_row)

    final_core_df = pd.DataFrame(merged_data)

    # 5. Correlation Analysis
    if not final_core_df.empty:
        # Drop the identifier columns if present and restrict to numeric columns so a
        # stray text column cannot make corr() raise.
        core_correlation = (
            final_core_df
            .drop(columns=['ticker', 'date', 'speaker'], errors='ignore')
            .corr(numeric_only=True)
        )

        print("\nCorrelation Matrix for 'Core' Linguistic Features:")
        display(core_correlation[['avg_future_volatility']].style.background_gradient(cmap='viridis'))

    else:
        print("Could not merge core features with volatility data.")

except FileNotFoundError as e:
    # Raised when the mock filings file cannot be found; report it and move on.
    print(e)

In [None]:
print("\n--- Part 2: Validating 'Catalyst' Signal via Event Study ---")

# Analysis parameters, kept next to the code that uses them.
EVENT_WINDOW_DAYS = 30          # calendar days of prices requested on each side of an event
RETURN_HORIZONS_DAYS = (5, 20)  # calendar-day horizons of the post-event returns
BENCHMARK_TICKER = 'SPY'        # benchmark (S&P 500 ETF) used for the market adjustment


def _download_close(symbol, start, end):
    """Download one symbol and return its adjusted close as a date-indexed Series.

    Returns an empty Series when no data comes back. If the 'Close' selection is a
    one-column DataFrame (multi-level columns), it is collapsed to a Series so that
    later `.asof()` lookups yield scalars rather than ticker-labelled Series.
    """
    prices = yf.download(symbol, start=start, end=end, auto_adjust=True)
    if prices.empty:
        return pd.Series(dtype='float64')
    close = prices['Close']
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    return close


def _horizon_return(close, event_date, horizon_days):
    """Simple return between the last close on/before `event_date` and the last
    close on/before `event_date + horizon_days` (calendar days).

    NOTE(review): if the series ends before the horizon date, `asof` falls back to
    the last available close, so the effective horizon is shorter than requested.
    """
    base_price = close.asof(event_date)
    later_price = close.asof(event_date + pd.Timedelta(days=horizon_days))
    return (later_price / base_price) - 1


try:
    # 1. Load "Catalyst" event data
    catalyst_mock_data = load_mock_data('mock_events.json', project_root)
    catalyst_events_df = calculate_catalyst_score(catalyst_mock_data)
    catalyst_events_df['date'] = pd.to_datetime(catalyst_events_df['date'])

    # 2. Analyze the price impact for each event
    event_impacts = []
    for _, event in catalyst_events_df.iterrows():
        ticker = event['ticker']
        event_date = event['date']

        # Define the event window
        start_window = event_date - pd.Timedelta(days=EVENT_WINDOW_DAYS)
        end_window = event_date + pd.Timedelta(days=EVENT_WINDOW_DAYS)

        # Download the stock first; the benchmark is only fetched when it is needed.
        stock_data = _download_close(ticker, start_window, end_window)
        if stock_data.empty:
            continue

        spy_data = _download_close(BENCHMARK_TICKER, start_window, end_window)
        if spy_data.empty:
            # Without benchmark prices no abnormal return can be computed.
            continue

        # Abnormal (market-adjusted) return = stock return minus benchmark return
        impact = event.to_dict()
        for horizon_days in RETURN_HORIZONS_DAYS:
            stock_return = _horizon_return(stock_data, event_date, horizon_days)
            benchmark_return = _horizon_return(spy_data, event_date, horizon_days)
            impact[f'abnormal_return_{horizon_days}d'] = stock_return - benchmark_return
        event_impacts.append(impact)

    event_impact_df = pd.DataFrame(event_impacts)

    # 3. Display Results
    if not event_impact_df.empty:
        print("\nPrice Impact Analysis for 'Catalyst' Events:")

        return_cols = [f'abnormal_return_{horizon_days}d' for horizon_days in RETURN_HORIZONS_DAYS]

        # Select and format columns for display
        display_cols = ['date', 'ticker', 'event_type', 'source', 'Attack_Score'] + return_cols
        # Handle cases where a column might not exist (e.g., Rebuttal_Score)
        if 'Rebuttal_Severity_Score' in event_impact_df.columns:
            display_cols.insert(5, 'Rebuttal_Severity_Score')

        # Filter for columns that actually exist in the dataframe
        display_cols = [col for col in display_cols if col in event_impact_df.columns]

        formatted_df = event_impact_df[display_cols].copy()
        # Render the returns as percentages with two decimals (for display only).
        for return_col in return_cols:
            formatted_df[return_col] = formatted_df[return_col].map('{:.2%}'.format)

        display(formatted_df)

except FileNotFoundError as e:
    # Raised when the mock events file cannot be found; report it and move on.
    print(e)

Unnamed: 0,ticker,date,speaker,complexity_score,sentiment_score,generalizing_score,self_reference_score,future_tense_ratio,past_tense_ratio
0,ENRN,1999-04-12,CEO,9.242647,0.9601,0.025,0.175,0.15,0.15
1,ENRN,1999-04-12,CFO,10.694706,0.9136,0.0,0.153846,0.128205,0.128205
2,ENRN,2000-10-27,CEO,17.305606,0.8225,0.025,0.05,0.2,0.2
3,ENRN,2000-10-27,CFO,18.166667,0.91,0.04,0.02,0.12,0.12
4,AAPL,2023-01-25,CEO,10.243636,0.9359,0.0,0.122449,0.183673,0.183673
