In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def _present_values(df, column):
    """Return the non-missing values of ``column``, or None when unusable.

    None is returned both when ``column`` is absent from ``df`` and when
    every value in it is missing, so callers can skip it with one check.
    """
    if column not in df.columns:
        return None
    values = df[column].dropna()
    return values if len(values) > 0 else None


def _pct_of(part, total):
    """Return ``part / total`` as a percentage, or 0.0 when ``total`` is 0."""
    return (part / total) * 100 if total else 0.0


def generate_cleaned_dataset_summary_stats(df):
    """
    Print summary statistics for the cleaned dataset.

    The report is written to stdout in sections: dataset overview, AI
    factors, Fama-French factors, stock prices and returns, fundamental
    variables, calculated ratios, GICS classification, data quality and
    analysis readiness. Every section looks columns up by name; a column
    that is absent or holds only missing values is skipped.

    Args:
        df: pandas DataFrame to summarise. An empty frame is accepted and
            yields a report without per-column lines.

    Returns:
        True once the report has been printed.
    """
    n_rows = len(df)

    print("CLEANED DATASET SUMMARY STATISTICS")
    print("=" * 80)
    print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

    # Basic dataset overview
    print(f"\nDATASET OVERVIEW")
    print("-" * 50)

    # Panel structure
    if 'Ticker' in df.columns and 'Year' in df.columns:
        n_companies = df['Ticker'].nunique()
        n_years = df['Year'].nunique()
        if n_years:
            year_range = f"{df['Year'].min()}-{df['Year'].max()}"
        else:
            # No non-missing year exists, so there is no range to report.
            year_range = "n/a"

        print(f"   Panel Structure:")
        print(f"      Companies: {n_companies:,}")
        print(f"      Years: {n_years} ({year_range})")
        print(f"      Total observations: {df.shape[0]:,}")

        # Per-company figures divide by the company count, which is zero
        # for an empty frame or an all-missing Ticker column.
        if n_companies:
            print(f"      Average years per company: {df.shape[0]/n_companies:.1f}")

            # Check balance: a company is balanced when it has one
            # non-missing Year row per distinct year in the dataset.
            company_counts = df.groupby('Ticker')['Year'].count()
            perfect_balance = (company_counts == n_years).sum()
            print(f"      Perfectly balanced companies: {perfect_balance:,} "
                  f"({perfect_balance/n_companies:.1%})")

    # Sector distribution (eight most frequent codes)
    if 'sector_gsector' in df.columns:
        print(f"\n   Sector Distribution (GICS):")
        sector_counts = df['sector_gsector'].value_counts().head(8)
        for sector, count in sector_counts.items():
            pct = _pct_of(count, n_rows)
            print(f"      {sector}: {count:,} obs ({pct:.1f}%)")

    # AI factors analysis
    print(f"\nAI FACTORS ANALYSIS")
    print("-" * 50)

    # Display name -> column suffix for each model
    models = {
        'GPT-4o': 'gpt4o',
        'Gemini Flash 1.5': 'flash1_5',
        'Gemini Flash 2.5': 'flash2_5'
    }

    ai_dimensions = ['Strategic Depth', 'Disclosure Sentiment', 'AI Washing Index',
                     'Forward-Looking', 'Talent & Investment']

    for model_name, model_key in models.items():
        print(f"\n   {model_name}:")

        # Categorical distributions, one column per dimension and model
        for dimension in ai_dimensions:
            values = _present_values(df, f"{dimension}_{model_key}")
            if values is None:
                continue
            completeness = _pct_of(len(values), n_rows)
            unique_vals = values.nunique()
            # value_counts() sorts descending, so position 0 is the mode.
            counts = values.value_counts()
            top_value = counts.index[0]
            top_pct = (counts.iloc[0] / len(values)) * 100

            print(f"      {dimension[:20]:20}: {completeness:5.1f}% complete, "
                  f"{unique_vals} categories, top='{top_value}' ({top_pct:.1f}%)")

        # Composite score
        comp_values = _present_values(df, f'Cum_Score_{model_key}')
        if comp_values is not None:
            print(f"      {'Composite Score':20}: mean={comp_values.mean():.2f}, "
                  f"std={comp_values.std():.2f}, range=[{comp_values.min():.1f}, {comp_values.max():.1f}]")

    # Fama-French factors
    print(f"\nFAMA-FRENCH FACTORS")
    print("-" * 50)

    ff_factors = ['ff_mktrf', 'ff_smb', 'ff_hml', 'ff_rmw', 'ff_cma', 'ff_rf', 'ff_umd']

    print("   Point-in-time Factors (at filing date):")
    for factor in ff_factors:
        values = _present_values(df, factor)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        print(f"      {factor:10}: {completeness:5.1f}% complete, "
              f"mean={values.mean():8.4f}, std={values.std():8.4f}")

    # Cumulative RF rates
    rf_cols = ['rf_3m_cumulative', 'rf_6m_cumulative', 'rf_9m_cumulative', 'rf_12m_cumulative']
    print(f"\n   Cumulative Risk-Free Rates:")
    for rf_col in rf_cols:
        values = _present_values(df, rf_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        horizon = rf_col.replace('rf_', '').replace('m_cumulative', '')
        print(f"      {horizon:2}m cumulative: {completeness:5.1f}% complete, "
              f"mean={values.mean():6.4f} ({values.mean()*100:.2f}%)")

    # Stock prices and returns
    print(f"\nSTOCK PRICES & RETURNS")
    print("-" * 50)

    # Stock prices at different time points
    price_cols = ['price_t0', 'price_t3', 'price_t6', 'price_t9', 'price_t12']
    print("   Stock Prices:")
    for price_col in price_cols:
        values = _present_values(df, price_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        horizon = price_col.replace('price_t', '')
        horizon_label = 'filing' if horizon == '0' else f'{horizon}m after'
        print(f"      {horizon_label:12}: {completeness:5.1f}% complete, "
              f"median=${values.median():8.2f}, std=${values.std():8.2f}")

    # Raw returns
    return_cols = ['return_3mo', 'return_6mo', 'return_9mo', 'return_12mo']
    print(f"\n   Raw Returns:")
    for ret_col in return_cols:
        values = _present_values(df, ret_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        horizon = ret_col.replace('return_', '').replace('mo', '')
        print(f"      {horizon:2}m horizon : {completeness:5.1f}% complete, "
              f"mean={values.mean():7.1%}, std={values.std():7.1%}")

    # Excess returns
    excess_cols = ['excess_return_3mo', 'excess_return_6mo', 'excess_return_9mo', 'excess_return_12mo']
    print(f"\n   Excess Returns:")
    for exc_col in excess_cols:
        values = _present_values(df, exc_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        horizon = exc_col.replace('excess_return_', '').replace('mo', '')
        line = (f"      {horizon:2}m horizon : {completeness:5.1f}% complete, "
                f"mean={values.mean():7.1%}")

        # Compare with the raw return of the same horizon. Both means are
        # taken over rows where raw and excess return are both available;
        # otherwise the difference would also reflect which rows happen to
        # be missing in one column but not the other.
        raw_col = exc_col.replace('excess_', '')
        if raw_col in df.columns:
            paired = df[[raw_col, exc_col]].dropna()
            if len(paired) > 0:
                rf_impact = paired[raw_col].mean() - paired[exc_col].mean()
                line += f", RF impact={rf_impact:5.1%}"
        print(line)

    # Fundamental variables
    print(f"\nFUNDAMENTAL VARIABLES")
    print("-" * 50)

    # Column name -> human-readable description
    fund_mapping = {
        'fund_atq': 'Total Assets',
        'fund_niq': 'Net Income',
        'fund_teqq': 'Shareholders Equity',
        'fund_revty': 'Revenue',
        'fund_dlttq': 'Long-term Debt',
        'fund_cheq': 'Cash & Short-term Investments',
        'fund_epsfxq': 'EPS Diluted',
        'fund_cshoq': 'Common Shares Outstanding',
        'fund_oiadpq': 'Operating Income After Depreciation'
    }

    print("   Key Fundamental Variables:")
    for fund_col, description in fund_mapping.items():
        values = _present_values(df, fund_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)

        # EPS columns are printed in fixed-point notation, every other
        # fundamental in scientific notation.
        if 'eps' in fund_col.lower():
            print(f"      {description[:25]:25}: {completeness:5.1f}% complete, "
                  f"median={values.median():8.2f}")
        else:
            print(f"      {description[:25]:25}: {completeness:5.1f}% complete, "
                  f"median={values.median():10.2e}")

    # Calculated ratios
    print(f"\nCALCULATED FINANCIAL RATIOS")
    print("-" * 50)

    # Column name -> human-readable description
    ratio_mapping = {
        'calc_roa': 'Return on Assets',
        'calc_roe': 'Return on Equity',
        'calc_debt_to_assets': 'Debt to Assets',
        'calc_debt_to_equity': 'Debt to Equity',
        'calc_price_to_book': 'Price to Book',
        'calc_price_to_earnings': 'Price to Earnings',
        'calc_market_to_book': 'Market to Book',
        'calc_log_total_assets': 'Log Total Assets',
        'calc_log_market_cap': 'Log Market Cap',
        'calc_profit_margin': 'Profit Margin',
        'calc_operating_margin': 'Operating Margin',
        'calc_asset_turnover': 'Asset Turnover'
    }

    print("   Key Financial Ratios:")
    for ratio_col, description in ratio_mapping.items():
        values = _present_values(df, ratio_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)

        # Quartiles for price-to-earnings and market-to-book, mean/std for
        # the remaining ratios.
        if any(x in ratio_col for x in ['price_to_earnings', 'market_to_book']):
            p25, p50, p75 = values.quantile([0.25, 0.5, 0.75])
            print(f"      {description[:25]:25}: {completeness:5.1f}% complete, "
                  f"p25={p25:6.1f}, p50={p50:6.1f}, p75={p75:6.1f}")
        else:
            print(f"      {description[:25]:25}: {completeness:5.1f}% complete, "
                  f"mean={values.mean():8.3f}, std={values.std():8.3f}")

    # Sector classification
    print(f"\nGICS SECTOR CLASSIFICATION")
    print("-" * 50)

    gics_mapping = {
        'sector_gsector': 'GICS Sector',
        'sector_ggroup': 'GICS Industry Group',
        'sector_gind': 'GICS Industry',
        'sector_gsubind': 'GICS Sub-Industry'
    }

    print("   GICS Classification Levels:")
    for gics_col, description in gics_mapping.items():
        values = _present_values(df, gics_col)
        if values is None:
            continue
        completeness = _pct_of(len(values), n_rows)
        unique_vals = values.nunique()
        print(f"      {description[:20]:20}: {completeness:5.1f}% complete, "
              f"{unique_vals:3d} unique codes")

    # Data quality summary
    print(f"\nDATA QUALITY SUMMARY")
    print("-" * 50)

    # One representative column per category; its non-missing share stands
    # in for the completeness of the whole category.
    representative_columns = [
        (f'AI Factors ({model_name})', f'Strategic Depth_{model_key}')
        for model_name, model_key in models.items()
    ] + [
        ('Fama-French Factors', 'ff_mktrf'),
        ('Stock Prices', 'price_t0'),
        ('Returns (3m)', 'return_3mo'),
        ('Fundamentals', 'fund_atq'),
        ('Calculated Ratios', 'calc_roa'),
    ]

    category_completeness = {
        label: _pct_of(df[column].notna().sum(), n_rows)
        for label, column in representative_columns
        if column in df.columns
    }

    print("   Data Completeness by Category:")
    for category, completeness in category_completeness.items():
        status = "Complete" if completeness > 95 else "Good" if completeness > 80 else "Limited"
        print(f"      {category[:30]:30}: {completeness:5.1f}% {status}")

    # Overall assessment. Without any category column there is nothing to
    # average, so say so instead of printing "nan%" and a rating.
    if category_completeness:
        avg_completeness = np.mean(list(category_completeness.values()))
        print(f"\n   Overall Data Quality: {avg_completeness:.1f}%")

        if avg_completeness > 90:
            quality_rating = "Excellent - Ready for analysis"
        elif avg_completeness > 80:
            quality_rating = "Good - Minor gaps acceptable"
        else:
            quality_rating = "Fair - Consider data imputation"

        print(f"      Rating: {quality_rating}")
    else:
        print(f"\n   Overall Data Quality: n/a (no category columns found)")

    # Analysis readiness summary
    print(f"\nANALYSIS READINESS")
    print("-" * 50)

    # Rows usable for an analysis are those where all of its columns are
    # non-missing at the same time.
    analysis_columns = [
        ('AI Factor to 3m Returns', ['Strategic Depth_gpt4o', 'return_3mo']),
        ('FF Factors to Excess Returns', ['ff_mktrf', 'excess_return_3mo']),
        ('Fundamentals to Ratios', ['fund_atq', 'calc_roa']),
        ('12-month Analysis', ['return_12mo', 'excess_return_12mo']),
    ]

    analysis_readiness = {
        label: df[columns].dropna().shape[0]
        for label, columns in analysis_columns
        if all(column in df.columns for column in columns)
    }

    print("   Usable Observations for Key Analyses:")
    for analysis, count in analysis_readiness.items():
        pct = _pct_of(count, n_rows)
        status = "Ready" if pct > 80 else "Limited" if pct > 70 else "Insufficient"
        print(f"      {analysis[:30]:30}: {count:4,} obs ({pct:5.1f}%) {status}")

    print(f"\nDataset summary complete - ready for analysis")

    return True

# Load and analyze the cleaned dataset
def load_and_analyze_cleaned_dataset(load_path=None):
    """
    Load the cleaned dataset and generate summary statistics.

    Args:
        load_path: Optional path of the CSV file to read. When omitted, the
            thesis' ``final_clean_dataset_filtered.csv`` location is used.

    Returns:
        The loaded DataFrame, or None when the file could not be read or
        the summary report raised an exception. The reason is printed.
    """
    if load_path is None:
        load_path = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/final_clean_dataset_filtered.csv"

    print("Loading cleaned dataset")
    print("=" * 50)
    print(f"Path: {load_path}")

    # Only the read is guarded here, so a failure inside the report is not
    # misreported as a loading problem. OSError covers a missing or
    # unreadable file; ValueError covers pandas' parser, empty-data and
    # decoding errors.
    try:
        df = pd.read_csv(load_path)
    except (OSError, ValueError) as e:
        print(f"Error loading dataset: {e}")
        return None

    print(f"Dataset loaded successfully")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

    # Generate summary statistics. Callers treat None as "analysis failed",
    # so a report error still yields None, but with an accurate message.
    try:
        generate_cleaned_dataset_summary_stats(df)
    except Exception as e:
        print(f"Error generating summary statistics: {e}")
        return None

    return df

# Run the analysis
if __name__ == "__main__":
    # Script entry point: print a banner, run the load-and-report step and
    # finish with a one-line verdict based on whether a frame came back.
    for banner_line in ("Running cleaned dataset summary statistics", "=" * 80):
        print(banner_line)

    df = load_and_analyze_cleaned_dataset()

    verdict = (
        "\nAnalysis failed - check dataset path and file integrity"
        if df is None
        else "\nAnalysis complete - dataset ready for research"
    )
    print(verdict)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from pathlib import Path

# load cleaned dataset (reuse df_clean when an earlier cell left it in the
# notebook namespace, otherwise read the CSV and normalise the id columns)
load_path = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/final_clean_dataset_filtered.csv"
if 'df_clean' not in globals():
    df = pd.read_csv(load_path)
    # ids are left-padded with zeros to a fixed width of 10 / 6 characters
    df['CIK'] = df['CIK'].astype(str).str.zfill(10)
    df['gvkey'] = df['gvkey'].astype(str).str.zfill(6)
else:
    df = df_clean.copy()

# create output directory
output_dir = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/"
os.makedirs(output_dir, exist_ok=True)

print("gics mapping and hierarchical analysis")
print("=" * 60)

# define gics mappings (standard gics names)
gics_sector_names = {
    10: "Energy",
    15: "Materials", 
    20: "Industrials",
    25: "Consumer Discretionary",
    30: "Consumer Staples",
    35: "Health Care",
    40: "Financials",
    45: "Information Technology",
    50: "Communication Services",
    55: "Utilities",
    60: "Real Estate"
}

# gics industry group mapping (level 2) - key groups
gics_group_names = {
    1010: "Energy",
    1510: "Materials",
    2010: "Capital Goods", 2020: "Commercial & Professional Services", 2030: "Transportation",
    2510: "Automobiles & Components", 2520: "Consumer Durables & Apparel", 2530: "Consumer Services", 2540: "Media", 2550: "Retailing",
    3010: "Food & Staples Retailing", 3020: "Food, Beverage & Tobacco", 3030: "Household & Personal Products",
    3510: "Health Care Equipment & Services", 3520: "Pharmaceuticals, Biotechnology & Life Sciences",
    4010: "Banks", 4020: "Diversified Financials", 4030: "Insurance",
    4510: "Software & Services", 4520: "Technology Hardware & Equipment", 4530: "Semiconductors & Semiconductor Equipment",
    5010: "Telecommunication Services", 5020: "Media & Entertainment",
    5510: "Utilities",
    6010: "Real Estate"
}

# create company mapping with names
print("creating comprehensive gics mapping")

# get unique companies with sector information (first row per gvkey)
company_sectors = df[['gvkey', 'CIK', 'Ticker', 'Company Name', 'Sector', 
                     'sector_gsector', 'sector_ggroup', 'sector_gind', 'sector_gsubind']].drop_duplicates(subset=['gvkey'])

# add gics names
company_sectors['GICS_Sector_Code'] = company_sectors['sector_gsector']
company_sectors['GICS_Sector_Name'] = company_sectors['sector_gsector'].map(gics_sector_names)

company_sectors['GICS_Group_Code'] = company_sectors['sector_ggroup'] 

# The group mapping above only lists key groups. groupby() drops rows whose
# key is NaN, so a company with a code missing from the mapping would
# silently vanish from the group distribution and distort the percentages.
# Such codes get a fallback label instead; rows without a code stay NaN.
# NOTE(review): the fallback assumes group codes are numeric - confirm.
group_codes = company_sectors['sector_ggroup']
fallback_group_names = group_codes.map(
    lambda code: f"Unmapped group {int(float(code))}", na_action='ignore'
)
company_sectors['GICS_Group_Name'] = group_codes.map(gics_group_names).fillna(fallback_group_names)

print(f"mapped gics names for {len(company_sectors):,} companies")

# create distribution tables for each level

print("\ncreating distribution tables")

# sector distribution (level 1): companies per (sector code, Sector label)
sector_dist = company_sectors.groupby(['GICS_Sector_Code', 'Sector']).size().reset_index(name='Company_Count')
sector_dist['Percentage'] = (sector_dist['Company_Count'] / sector_dist['Company_Count'].sum() * 100).round(2)
sector_dist = sector_dist.sort_values('Company_Count', ascending=False)

# industry group distribution (level 2)
group_dist = company_sectors.groupby(['GICS_Group_Code', 'GICS_Group_Name', 'Sector']).size().reset_index(name='Company_Count')
group_dist['Percentage'] = (group_dist['Company_Count'] / group_dist['Company_Count'].sum() * 100).round(2)
group_dist = group_dist.sort_values('Company_Count', ascending=False)

print(f"created distribution tables:")
print(f"   sectors: {len(sector_dist)} categories")
print(f"   industry groups: {len(group_dist)} categories") 

# create visualizations
print(f"\ncreating visualizations")

plt.style.use('default')
sns.set_palette("husl")

# sector distribution bar chart
plt.figure(figsize=(14, 8))
bars = plt.bar(range(len(sector_dist)), sector_dist['Company_Count'], 
               color='steelblue', alpha=0.8, edgecolor='navy', linewidth=0.5)

plt.title('Company Distribution by GICS Sector\n', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('\nGICS Sector', fontsize=12, fontweight='bold')
plt.ylabel('Number of Companies\n', fontsize=12, fontweight='bold')

# set x-axis labels (use sector names)
plt.xticks(range(len(sector_dist)), sector_dist['Sector'], rotation=45, ha='right')

# add value labels on bars (count and share, just above each bar)
for i, bar in enumerate(bars):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 1,
             f'{int(height)}\n({sector_dist.iloc[i]["Percentage"]:.1f}%)',
             ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()
# os.path.join keeps the path valid whether or not output_dir ends in a slash
plt.savefig(os.path.join(output_dir, 'gics_sector_distribution.png'), dpi=300, bbox_inches='tight')
plt.close()

# industry group distribution bar chart (top 15)
plt.figure(figsize=(16, 10))
top_groups = group_dist.head(15)

bars = plt.bar(range(len(top_groups)), top_groups['Company_Count'], 
               color='lightcoral', alpha=0.8, edgecolor='darkred', linewidth=0.5)

plt.title('Company Distribution by GICS Industry Group (Top 15)\n', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('\nGICS Industry Group', fontsize=12, fontweight='bold')
plt.ylabel('Number of Companies\n', fontsize=12, fontweight='bold')

# set x-axis labels (truncate if too long); the name is converted to str
# once so the length test and the slice act on the same value
group_labels = []
for name in top_groups['GICS_Group_Name']:
    label = str(name)
    group_labels.append(label[:25] + '...' if len(label) > 25 else label)
plt.xticks(range(len(top_groups)), group_labels, rotation=45, ha='right')

# add value labels on bars
for i, bar in enumerate(bars):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
             f'{int(height)}\n({top_groups.iloc[i]["Percentage"]:.1f}%)',
             ha='center', va='bottom', fontsize=8, fontweight='bold')

plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'gics_group_distribution.png'), dpi=300, bbox_inches='tight')
plt.close()

print(f"created visualizations:")
print(f"   gics_sector_distribution.png")
print(f"   gics_group_distribution.png")

print(f"\ngics analysis complete")
print(f"all files saved to: {output_dir}")

print(f"\ngics hierarchy overview:")
print(f"   level 1 - sectors: {len(sector_dist)} categories")
print(f"   level 2 - industry groups: {len(group_dist)} categories")

print(f"\nfiles created:")
print(f"   visualizations:")
print(f"      • gics_sector_distribution.png")
print(f"      • gics_group_distribution.png")

print(f"\ntop 5 by level:")
print(f"sectors: {', '.join(sector_dist.head()['Sector'].tolist())}")
print(f"groups: {', '.join([str(name) for name in group_dist.head()['GICS_Group_Name'].tolist() if pd.notna(name)])}")

print(f"\nready for thesis appendix and methodology section")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from pathlib import Path

# load cleaned dataset
load_path = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/final_clean_dataset_filtered.csv"
df = pd.read_csv(load_path)

# ensure proper formatting (ids left-padded with zeros to 10 / 6 characters)
df['CIK'] = df['CIK'].astype(str).str.zfill(10)
df['gvkey'] = df['gvkey'].astype(str).str.zfill(6)

# create output directory
output_dir = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/"
os.makedirs(output_dir, exist_ok=True)

print("ai factor summary analysis")
print("=" * 60)
print(f"output directory: {output_dir}")
print(f"dataset: {df.shape[0]:,} observations, {df['gvkey'].nunique():,} unique companies")

# define ai factor columns with numeric versions; each list is ordered like
# model_names / model_short below (gpt4o, flash1_5, flash2_5)
factor_columns = {
    'Strategic Depth': ['Strategic Depth_gpt4o_Numeric', 'Strategic Depth_flash1_5_Numeric', 'Strategic Depth_flash2_5_Numeric'],
    'Disclosure Sentiment': ['Disclosure Sentiment_gpt4o_Numeric', 'Disclosure Sentiment_flash1_5_Numeric', 'Disclosure Sentiment_flash2_5_Numeric'],
    'Risk - Own Adoption': ['Risk - Own Adoption_gpt4o_Numeric', 'Risk - Own Adoption_flash1_5_Numeric', 'Risk - Own Adoption_flash2_5_Numeric'],
    'Risk - External Threats': ['Risk - External Threats_gpt4o_Numeric', 'Risk - External Threats_flash1_5_Numeric', 'Risk - External Threats_flash2_5_Numeric'],
    'Risk - Non-Adoption': ['Risk - Non-Adoption_gpt4o_Numeric', 'Risk - Non-Adoption_flash1_5_Numeric', 'Risk - Non-Adoption_flash2_5_Numeric'],
    'Forward-Looking': ['Forward-Looking_gpt4o_Numeric', 'Forward-Looking_flash1_5_Numeric', 'Forward-Looking_flash2_5_Numeric'],
    'Talent & Investment': ['Talent & Investment_gpt4o_Numeric', 'Talent & Investment_flash1_5_Numeric', 'Talent & Investment_flash2_5_Numeric'],
    'AI Washing Index': ['AI Washing Index_gpt4o_Numeric', 'AI Washing Index_flash1_5_Numeric', 'AI Washing Index_flash2_5_Numeric']
}

model_names = ['GPT-4o', 'Gemini 1.5 Flash', 'Gemini 2.5 Flash']
model_short = ['gpt4o', 'flash1_5', 'flash2_5']

print(f"analyzing {len(factor_columns)} AI factors across {len(model_names)} LLM models")

# create comprehensive factor summary
print("\ncreating factor summary statistics")

# map numeric values to letter grades (1=E, 2=D, 3=C, 4=B, 5=A)
grade_map = {5: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

factor_summary = {}
all_factor_stats = []

for factor, columns in factor_columns.items():
    factor_data = []
    
    # pair every column with the model it belongs to
    for col, model_name, model_code in zip(columns, model_names, model_short):
        if col not in df.columns:
            continue

        data = df[col].dropna()

        if len(data) == 0:
            print(f"no data for {col}")
            continue

        # calculate key metrics
        mean_val = data.mean()
        median_val = data.median()
        std_val = data.std()
        min_val = data.min()
        max_val = data.max()

        # A median can fall half-way between two scores. round() rounds
        # halves to the nearest even number, which would grade a median of
        # 2.5 as D but 3.5 as B; rounding halves up keeps grading uniform.
        median_grade = grade_map.get(int(np.floor(median_val + 0.5)), 'N/A')

        # count distribution of scores
        score_counts = data.value_counts().sort_index(ascending=False)
        total_scores = len(data)

        # share of each score in percent (total_scores > 0 at this point)
        percentages = {
            score: round(score_counts.get(score, 0) / total_scores * 100, 1)
            for score in (5, 4, 3, 2, 1)
        }

        factor_data.append({
            'Model': model_name,
            'Model_Code': model_code,
            'Observations': int(total_scores),
            'Mean': round(mean_val, 2),
            'Median': round(median_val, 1),
            'Median_Grade': median_grade,
            'Std_Dev': round(std_val, 2),
            'Min': int(min_val),
            'Max': int(max_val),
            'Grade_A_Pct': percentages[5],
            'Grade_B_Pct': percentages[4],
            'Grade_C_Pct': percentages[3],
            'Grade_D_Pct': percentages[2],
            'Grade_E_Pct': percentages[1]
        })

        all_factor_stats.append({
            'Factor': factor,
            'Model': model_name,
            'Model_Code': model_code,
            'Mean': round(mean_val, 2),
            'Median_Grade': median_grade,
            'Observations': int(total_scores)
        })
    
    factor_summary[factor] = pd.DataFrame(factor_data)

# create overall summary dataframe (one row per factor and model)
all_stats_df = pd.DataFrame(all_factor_stats)

print("factor summary statistics created")

# print factor summaries
print("\nfactor summary by dimension")
print("-" * 60)

for factor, summary_df in factor_summary.items():
    if not summary_df.empty:
        print(f"\n{factor}:")
        display_cols = ['Model', 'Observations', 'Mean', 'Median_Grade', 'Std_Dev', 'Grade_A_Pct', 'Grade_B_Pct', 'Grade_C_Pct', 'Grade_D_Pct', 'Grade_E_Pct']
        display_df = summary_df[display_cols].copy()
        # shorter headers so the table fits on one console line
        display_df.columns = ['Model', 'Obs', 'Mean', 'Med.Grade', 'StdDev', '%A', '%B', '%C', '%D', '%E']
        print(display_df.to_string(index=False))

print(f"\nai factor analysis complete")
# NOTE(review): no statement in this cell writes a file to output_dir.
print(f"files saved to: {output_dir}")

print(f"\nanalysis overview:")
print(f"   factors analyzed: {len(factor_columns)}")
print(f"   llm models: {len(model_names)}")
print(f"   total observations: {df.shape[0]:,}")
print(f"   unique companies: {df['gvkey'].nunique():,}")

print(f"\nready for thesis empirical analysis and methodology documentation")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from pathlib import Path

# Store the identifier columns as fixed-width, zero-padded strings.
# NOTE(review): astype(str) turns a float such as 1234.0 into '1234.0' before
# padding - confirm both columns are integer- or string-typed upstream.
for _id_col, _pad_width in (('CIK', 10), ('gvkey', 6)):
    df[_id_col] = df[_id_col].astype(str).str.zfill(_pad_width)

# Destination folder for everything this cell writes; created if missing.
output_dir = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/"
os.makedirs(output_dir, exist_ok=True)

print("ai factor summary analysis")
print("=" * 60)
print(f"output directory: {output_dir}")
print(f"dataset: {df.shape[0]:,} observations, {df['gvkey'].nunique():,} unique companies")

# Display names and column-name codes of the LLMs (parallel lists, same order).
model_names = ['GPT-4o', 'Gemini 1.5 Flash', 'Gemini 2.5 Flash']
model_short = ['gpt4o', 'flash1_5', 'flash2_5']

# Each factor maps to its score columns, one per model, named
# "<factor>_<model code>_Numeric" and listed in model_short order.
factor_columns = {
    factor_name: [f"{factor_name}_{code}_Numeric" for code in model_short]
    for factor_name in (
        'Strategic Depth',
        'Disclosure Sentiment',
        'Risk - Own Adoption',
        'Risk - External Threats',
        'Risk - Non-Adoption',
        'Forward-Looking',
        'Talent & Investment',
        'AI Washing Index',
    )
}

print(f"analyzing {len(factor_columns)} AI factors across {len(model_names)} LLM models")

# Build per-factor summary statistics.
#
# Results:
#   factor_summary   - dict: factor name -> DataFrame with one row per model
#                      (count, mean, median, letter grade, std, min/max and
#                      the percentage of scores at each grade A-E).
#   all_factor_stats - flat list of the headline numbers for every
#                      factor/model pair; wrapped into all_stats_df below.
print("\ncreating factor summary statistics")

# Numeric score -> letter grade. Defined once; it does not change per column.
grade_map = {5: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

factor_summary = {}
all_factor_stats = []

for factor, columns in factor_columns.items():
    factor_data = []

    # columns, model_names and model_short are parallel: same position, same model
    for col, model_name, model_code in zip(columns, model_names, model_short):
        if col not in df.columns:
            # report the gap instead of dropping the model silently
            print(f"column not found: {col}")
            continue

        data = df[col].dropna()
        if data.empty:
            print(f"no data for {col}")
            continue

        # calculate key metrics
        mean_val = data.mean()
        median_val = data.median()
        std_val = data.std()
        min_val = data.min()
        max_val = data.max()

        # Map the median to a letter grade, rounding halves up. The built-in
        # round() rounds halves to the nearest even integer, so a 2.5 median
        # went down to D while a 3.5 median went up to B.
        median_grade = grade_map.get(int(np.floor(median_val + 0.5)), 'N/A')

        # share of each grade among the non-missing scores, in percent
        score_counts = data.value_counts()
        total_scores = len(data)  # > 0 here: empty columns were skipped above
        percentages = {
            score: round(score_counts.get(score, 0) / total_scores * 100, 1)
            for score in grade_map
        }

        factor_data.append({
            'Model': model_name,
            'Model_Code': model_code,
            'Observations': int(total_scores),
            'Mean': round(mean_val, 2),
            'Median': round(median_val, 1),
            'Median_Grade': median_grade,
            'Std_Dev': round(std_val, 2),
            'Min': int(min_val),
            'Max': int(max_val),
            'Grade_A_Pct': percentages[5],
            'Grade_B_Pct': percentages[4],
            'Grade_C_Pct': percentages[3],
            'Grade_D_Pct': percentages[2],
            'Grade_E_Pct': percentages[1],
        })

        # add to overall stats
        all_factor_stats.append({
            'Factor': factor,
            'Model': model_name,
            'Model_Code': model_code,
            'Mean': round(mean_val, 2),
            'Median_Grade': median_grade,
            'Observations': int(total_scores),
        })

    factor_summary[factor] = pd.DataFrame(factor_data)

all_stats_df = pd.DataFrame(all_factor_stats)

print("factor summary statistics created")

# Console report: one compact table per factor that produced any rows.
print("\nfactor summary by dimension")
print("-" * 60)

# Summary column -> abbreviated header used in the printed tables.
short_headers = {
    'Model': 'Model',
    'Observations': 'Obs',
    'Mean': 'Mean',
    'Median_Grade': 'Med.Grade',
    'Std_Dev': 'StdDev',
    'Grade_A_Pct': '%A',
    'Grade_B_Pct': '%B',
    'Grade_C_Pct': '%C',
    'Grade_D_Pct': '%D',
    'Grade_E_Pct': '%E',
}

for factor, summary_df in factor_summary.items():
    if summary_df.empty:
        continue
    print(f"\n{factor}:")
    display_cols = list(short_headers)
    # rename() returns a new frame, so the stored summary keeps its long names
    display_df = summary_df[display_cols].rename(columns=short_headers)
    print(display_df.to_string(index=False))

# Stacked bar charts: one panel per factor, one bar per model, with the
# bar split into the percentage of scores at each grade (A at the bottom).
print(f"\ncreating visualizations")

plt.style.use('default')
sns.set_palette("husl")

fig, axes = plt.subplots(2, 4, figsize=(20, 12))
axes = axes.flatten()

colors = ['#2E8B57', '#32CD32', '#FFD700', '#FF8C00', '#DC143C']
grade_labels = ['A', 'B', 'C', 'D', 'E']
grade_cols = [f'Grade_{label}_Pct' for label in grade_labels]

# Panels that actually received bars. Used to place the legend on a panel
# that has labelled artists and to remove every panel that stayed empty.
drawn_axes = []

# zip() stops at the last available panel, so surplus factors are not drawn
for ax, (factor, summary_df) in zip(axes, factor_summary.items()):
    if summary_df.empty:
        continue

    models = summary_df['Model'].tolist()
    grade_data = [summary_df[grade_col].tolist() for grade_col in grade_cols]

    # running top of the stack for each model's bar
    bottom = np.zeros(len(models))
    for grade_pct, color, label in zip(grade_data, colors, grade_labels):
        ax.bar(models, grade_pct, bottom=bottom, color=color,
               label=f'Grade {label}', alpha=0.8)
        bottom += grade_pct

    ax.set_title(f'{factor}', fontsize=11, fontweight='bold')
    ax.set_ylabel('Percentage', fontsize=9)
    ax.set_ylim(0, 100)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.tick_params(axis='y', labelsize=8)
    ax.grid(axis='y', alpha=0.3)

    # label the A and B segments when they are tall enough (> 5 %) to hold text
    for model_idx, (pct_a, pct_b) in enumerate(zip(grade_data[0], grade_data[1])):
        if pct_a > 5:
            ax.text(model_idx, pct_a / 2, f'{pct_a:.0f}%',
                    ha='center', va='center', fontsize=7, fontweight='bold', color='white')

        if pct_b > 5:
            # the B segment sits on top of the A segment
            ax.text(model_idx, pct_a + pct_b / 2, f'{pct_b:.0f}%',
                    ha='center', va='center', fontsize=7, fontweight='bold', color='white')

    drawn_axes.append(ax)

# Legend goes on the last panel that has bars. A fixed axes[7] can be empty,
# or be deleted just below when there are fewer than eight factors.
if drawn_axes:
    drawn_axes[-1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)

# remove every panel without data, including those of factors with no rows
for ax in axes:
    if ax not in drawn_axes:
        fig.delaxes(ax)

plt.suptitle('AI Factor Distributions Across LLM Models', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{output_dir}ai_factor_distributions.png', dpi=300, bbox_inches='tight')
plt.close()

# Heatmap of the median grade (as a 1-5 number) for every factor x model.
print("creating model comparison heatmap...")

# Letter grade -> numeric score; built once for the whole table.
grade_to_num = {'A': 5, 'B': 4, 'C': 3, 'D': 2, 'E': 1}

heatmap_data = []
factor_names = []

for factor, summary_df in factor_summary.items():
    if summary_df.empty:
        continue

    factor_names.append(factor)
    row_data = []
    for model_code in model_short:
        model_data = summary_df[summary_df['Model_Code'] == model_code]
        median_grade = model_data['Median_Grade'].iloc[0] if not model_data.empty else None
        # A missing model or an 'N/A' grade becomes NaN, which seaborn leaves
        # blank. Filling in 3 would draw a grade C that was never measured.
        row_data.append(grade_to_num.get(median_grade, np.nan))
    heatmap_data.append(row_data)

heatmap_df = pd.DataFrame(heatmap_data, columns=model_names, index=factor_names, dtype=float)

if heatmap_df.empty:
    print("no median grades available - heatmap skipped")
else:
    plt.figure(figsize=(10, 8))
    # vmin/vmax pin the colour scale to the 1-5 range named on the colour bar
    sns.heatmap(heatmap_df, annot=True, cmap='RdYlGn', center=3, vmin=1, vmax=5,
                cbar_kws={'label': 'Grade (1=E, 5=A)'}, fmt='.1f',
                linewidths=0.5)
    plt.title('AI Factor Median Grades by Model', fontsize=14, fontweight='bold')
    plt.xlabel('LLM Model', fontsize=12)
    plt.ylabel('AI Factor', fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'{output_dir}ai_factor_model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

# Grouped bar chart: number of scores at each grade, pooled over all factors,
# with one bar per model.
print("creating overall grade distribution...")

plt.figure(figsize=(14, 8))

grade_order = ['A', 'B', 'C', 'D', 'E']
score_to_grade = {5: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

# Count the real scores in df. Expanding the rounded percentages with
# int(pct) truncated every value (12.9 -> 12, 0.9 -> 0) and put summed
# percentage points on an axis labelled 'Count'.
grade_records = []
for factor, columns in factor_columns.items():
    # columns and model_names are parallel: same position, same model
    for model_name, col in zip(model_names, columns):
        if col not in df.columns:
            continue
        score_counts = df[col].value_counts()  # NaN is excluded by default
        for score, grade in score_to_grade.items():
            grade_records.append({
                'Grade': grade,
                'Model': model_name,
                'Count': int(score_counts.get(score, 0)),
            })

# one row per (grade, model), summed across factors; sort=False keeps model order
grade_dist_df = (
    pd.DataFrame(grade_records, columns=['Grade', 'Model', 'Count'])
    .groupby(['Grade', 'Model'], as_index=False, sort=False)['Count']
    .sum()
)

if not grade_dist_df.empty:
    # explicit order keeps A..E on the axis even when a grade has no scores
    sns.barplot(data=grade_dist_df, x='Grade', y='Count', hue='Model',
                order=grade_order, palette='viridis')
    plt.legend(title='LLM Model', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title('Overall Grade Distribution Across All AI Factors', fontsize=14, fontweight='bold')
plt.xlabel('Grade', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.tight_layout()
plt.savefig(f'{output_dir}ai_factor_overall_grades.png', dpi=300, bbox_inches='tight')
plt.close()

# Closing report: the image files written above plus the headline numbers.
report_lines = (
    "created visualizations:",
    "   ai_factor_distributions.png",
    "   ai_factor_model_comparison.png",
    "   ai_factor_overall_grades.png",
    "\nai factor analysis complete!",
    f"all files saved to: {output_dir}",
    "\nanalysis overview:",
    f"   factors analyzed: {len(factor_columns)}",
    f"   llm models: {len(model_names)}",
    f"   total observations: {df.shape[0]:,}",
    f"   unique companies: {df['gvkey'].nunique():,}",
    "\nperfect for thesis empirical analysis and methodology documentation!",
)
for report_line in report_lines:
    print(report_line)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from pathlib import Path

# Store the identifier columns as fixed-width, zero-padded strings.
# NOTE(review): astype(str) turns a float such as 1234.0 into '1234.0' before
# padding - confirm both columns are integer- or string-typed upstream.
for _id_col, _pad_width in (('CIK', 10), ('gvkey', 6)):
    df[_id_col] = df[_id_col].astype(str).str.zfill(_pad_width)

# Destination folder for everything this cell writes; created if missing.
output_dir = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/"
os.makedirs(output_dir, exist_ok=True)

print("ai factor summary analysis")
print("=" * 60)
print(f"output directory: {output_dir}")
print(f"dataset: {df.shape[0]:,} observations, {df['gvkey'].nunique():,} unique companies")

# Display names and column-name codes of the LLMs (parallel lists, same order).
model_names = ['GPT-4o', 'Gemini 1.5 Flash', 'Gemini 2.5 Flash']
model_short = ['gpt4o', 'flash1_5', 'flash2_5']

# Each factor maps to its score columns, one per model, named
# "<factor>_<model code>_Numeric" and listed in model_short order.
factor_columns = {
    factor_name: [f"{factor_name}_{code}_Numeric" for code in model_short]
    for factor_name in (
        'Strategic Depth',
        'Disclosure Sentiment',
        'Risk - Own Adoption',
        'Risk - External Threats',
        'Risk - Non-Adoption',
        'Forward-Looking',
        'Talent & Investment',
        'AI Washing Index',
    )
}

print(f"analyzing {len(factor_columns)} AI factors across {len(model_names)} LLM models")

# Build per-factor summary statistics.
#
# Results:
#   factor_summary   - dict: factor name -> DataFrame with one row per model
#                      (count, mean, median, letter grade, std, min/max and
#                      the percentage of scores at each grade A-E).
#   all_factor_stats - flat list of the headline numbers for every
#                      factor/model pair; wrapped into all_stats_df below.
print("\ncreating factor summary statistics")

# Numeric score -> letter grade. Defined once; it does not change per column.
grade_map = {5: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

factor_summary = {}
all_factor_stats = []

for factor, columns in factor_columns.items():
    factor_data = []

    # columns, model_names and model_short are parallel: same position, same model
    for col, model_name, model_code in zip(columns, model_names, model_short):
        if col not in df.columns:
            # report the gap instead of dropping the model silently
            print(f"column not found: {col}")
            continue

        data = df[col].dropna()
        if data.empty:
            print(f"no data for {col}")
            continue

        # calculate key metrics
        mean_val = data.mean()
        median_val = data.median()
        std_val = data.std()
        min_val = data.min()
        max_val = data.max()

        # Map the median to a letter grade, rounding halves up. The built-in
        # round() rounds halves to the nearest even integer, so a 2.5 median
        # went down to D while a 3.5 median went up to B.
        median_grade = grade_map.get(int(np.floor(median_val + 0.5)), 'N/A')

        # share of each grade among the non-missing scores, in percent
        score_counts = data.value_counts()
        total_scores = len(data)  # > 0 here: empty columns were skipped above
        percentages = {
            score: round(score_counts.get(score, 0) / total_scores * 100, 1)
            for score in grade_map
        }

        factor_data.append({
            'Model': model_name,
            'Model_Code': model_code,
            'Observations': int(total_scores),
            'Mean': round(mean_val, 2),
            'Median': round(median_val, 1),
            'Median_Grade': median_grade,
            'Std_Dev': round(std_val, 2),
            'Min': int(min_val),
            'Max': int(max_val),
            'Grade_A_Pct': percentages[5],
            'Grade_B_Pct': percentages[4],
            'Grade_C_Pct': percentages[3],
            'Grade_D_Pct': percentages[2],
            'Grade_E_Pct': percentages[1],
        })

        # add to overall stats
        all_factor_stats.append({
            'Factor': factor,
            'Model': model_name,
            'Model_Code': model_code,
            'Mean': round(mean_val, 2),
            'Median_Grade': median_grade,
            'Observations': int(total_scores),
        })

    factor_summary[factor] = pd.DataFrame(factor_data)

all_stats_df = pd.DataFrame(all_factor_stats)

print("factor summary statistics created")

# Console report: one compact table per factor that produced any rows.
print("\nfactor summary by dimension")
print("-" * 60)

# Summary column -> abbreviated header used in the printed tables.
short_headers = {
    'Model': 'Model',
    'Observations': 'Obs',
    'Mean': 'Mean',
    'Median_Grade': 'Med.Grade',
    'Std_Dev': 'StdDev',
    'Grade_A_Pct': '%A',
    'Grade_B_Pct': '%B',
    'Grade_C_Pct': '%C',
    'Grade_D_Pct': '%D',
    'Grade_E_Pct': '%E',
}

for factor, summary_df in factor_summary.items():
    if summary_df.empty:
        continue
    print(f"\n{factor}:")
    display_cols = list(short_headers)
    # rename() returns a new frame, so the stored summary keeps its long names
    display_df = summary_df[display_cols].rename(columns=short_headers)
    print(display_df.to_string(index=False))

# Stacked bar charts: one panel per factor, one bar per model, with the
# bar split into the percentage of scores at each grade (A at the bottom).
print(f"\ncreating visualizations")

plt.style.use('default')
sns.set_palette("husl")

fig, axes = plt.subplots(2, 4, figsize=(20, 12))
axes = axes.flatten()

colors = ['#2E8B57', '#32CD32', '#FFD700', '#FF8C00', '#DC143C']
grade_labels = ['A', 'B', 'C', 'D', 'E']
grade_cols = [f'Grade_{label}_Pct' for label in grade_labels]

# Panels that actually received bars. Used to place the legend on a panel
# that has labelled artists and to remove every panel that stayed empty.
drawn_axes = []

# zip() stops at the last available panel, so surplus factors are not drawn
for ax, (factor, summary_df) in zip(axes, factor_summary.items()):
    if summary_df.empty:
        continue

    models = summary_df['Model'].tolist()
    grade_data = [summary_df[grade_col].tolist() for grade_col in grade_cols]

    # running top of the stack for each model's bar
    bottom = np.zeros(len(models))
    for grade_pct, color, label in zip(grade_data, colors, grade_labels):
        ax.bar(models, grade_pct, bottom=bottom, color=color,
               label=f'Grade {label}', alpha=0.8)
        bottom += grade_pct

    ax.set_title(f'{factor}', fontsize=11, fontweight='bold')
    ax.set_ylabel('Percentage', fontsize=9)
    ax.set_ylim(0, 100)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.tick_params(axis='y', labelsize=8)
    ax.grid(axis='y', alpha=0.3)

    # label the A and B segments when they are tall enough (> 5 %) to hold text
    for model_idx, (pct_a, pct_b) in enumerate(zip(grade_data[0], grade_data[1])):
        if pct_a > 5:
            ax.text(model_idx, pct_a / 2, f'{pct_a:.0f}%',
                    ha='center', va='center', fontsize=7, fontweight='bold', color='white')

        if pct_b > 5:
            # the B segment sits on top of the A segment
            ax.text(model_idx, pct_a + pct_b / 2, f'{pct_b:.0f}%',
                    ha='center', va='center', fontsize=7, fontweight='bold', color='white')

    drawn_axes.append(ax)

# Legend goes on the last panel that has bars. A fixed axes[7] can be empty,
# or be deleted just below when there are fewer than eight factors.
if drawn_axes:
    drawn_axes[-1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)

# remove every panel without data, including those of factors with no rows
for ax in axes:
    if ax not in drawn_axes:
        fig.delaxes(ax)

plt.suptitle('AI Factor Distributions Across LLM Models', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{output_dir}ai_factor_distributions.png', dpi=300, bbox_inches='tight')
plt.close()

# Heatmap of the median grade (as a 1-5 number) for every factor x model.
print("creating model comparison heatmap...")

# Letter grade -> numeric score; built once for the whole table.
grade_to_num = {'A': 5, 'B': 4, 'C': 3, 'D': 2, 'E': 1}

heatmap_data = []
factor_names = []

for factor, summary_df in factor_summary.items():
    if summary_df.empty:
        continue

    factor_names.append(factor)
    row_data = []
    for model_code in model_short:
        model_data = summary_df[summary_df['Model_Code'] == model_code]
        median_grade = model_data['Median_Grade'].iloc[0] if not model_data.empty else None
        # A missing model or an 'N/A' grade becomes NaN, which seaborn leaves
        # blank. Filling in 3 would draw a grade C that was never measured.
        row_data.append(grade_to_num.get(median_grade, np.nan))
    heatmap_data.append(row_data)

heatmap_df = pd.DataFrame(heatmap_data, columns=model_names, index=factor_names, dtype=float)

if heatmap_df.empty:
    print("no median grades available - heatmap skipped")
else:
    plt.figure(figsize=(10, 8))
    # vmin/vmax pin the colour scale to the 1-5 range named on the colour bar
    sns.heatmap(heatmap_df, annot=True, cmap='RdYlGn', center=3, vmin=1, vmax=5,
                cbar_kws={'label': 'Grade (1=E, 5=A)'}, fmt='.1f',
                linewidths=0.5)
    plt.title('AI Factor Median Grades by Model', fontsize=14, fontweight='bold')
    plt.xlabel('LLM Model', fontsize=12)
    plt.ylabel('AI Factor', fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'{output_dir}ai_factor_model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

# Grouped bar chart: number of scores at each grade, pooled over all factors,
# with one bar per model.
print("creating overall grade distribution...")

plt.figure(figsize=(14, 8))

grade_order = ['A', 'B', 'C', 'D', 'E']
score_to_grade = {5: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

# Count the real scores in df. Expanding the rounded percentages with
# int(pct) truncated every value (12.9 -> 12, 0.9 -> 0) and put summed
# percentage points on an axis labelled 'Count'.
grade_records = []
for factor, columns in factor_columns.items():
    # columns and model_names are parallel: same position, same model
    for model_name, col in zip(model_names, columns):
        if col not in df.columns:
            continue
        score_counts = df[col].value_counts()  # NaN is excluded by default
        for score, grade in score_to_grade.items():
            grade_records.append({
                'Grade': grade,
                'Model': model_name,
                'Count': int(score_counts.get(score, 0)),
            })

# one row per (grade, model), summed across factors; sort=False keeps model order
grade_dist_df = (
    pd.DataFrame(grade_records, columns=['Grade', 'Model', 'Count'])
    .groupby(['Grade', 'Model'], as_index=False, sort=False)['Count']
    .sum()
)

if not grade_dist_df.empty:
    # explicit order keeps A..E on the axis even when a grade has no scores
    sns.barplot(data=grade_dist_df, x='Grade', y='Count', hue='Model',
                order=grade_order, palette='viridis')
    plt.legend(title='LLM Model', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title('Overall Grade Distribution Across All AI Factors', fontsize=14, fontweight='bold')
plt.xlabel('Grade', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.tight_layout()
plt.savefig(f'{output_dir}ai_factor_overall_grades.png', dpi=300, bbox_inches='tight')
plt.close()

# Build two flat tables from factor_summary:
#   factor_summary_final - one row per factor/model pair
#   model_comparison_df  - one row per model, averaged over its factors
print(f"\ncreating summary tables")

# Explicit schemas: a frame built from an empty list has no columns, and the
# 'Model' lookup further down would then raise KeyError.
factor_table_columns = [
    'Factor', 'Model', 'Observations', 'Mean_Score', 'Median_Grade', 'Std_Dev',
    'Pct_Grade_A', 'Pct_Grade_B', 'Pct_Grade_C', 'Pct_High_Quality',
]
model_table_columns = [
    'Model', 'Avg_Mean_Score', 'Avg_Pct_Grade_A', 'Avg_Pct_Grade_B',
    'Avg_Pct_High_Quality', 'Most_Common_Grade', 'Total_Observations',
]

# factor summary table
factor_summary_table = []
for factor, summary_df in factor_summary.items():
    for _, row in summary_df.iterrows():
        factor_summary_table.append({
            'Factor': factor,
            'Model': row['Model'],
            'Observations': row['Observations'],
            'Mean_Score': row['Mean'],
            'Median_Grade': row['Median_Grade'],
            'Std_Dev': row['Std_Dev'],
            'Pct_Grade_A': row['Grade_A_Pct'],
            'Pct_Grade_B': row['Grade_B_Pct'],
            'Pct_Grade_C': row['Grade_C_Pct'],
            # "high quality" = share of A and B grades combined
            'Pct_High_Quality': row['Grade_A_Pct'] + row['Grade_B_Pct'],
        })

factor_summary_final = pd.DataFrame(factor_summary_table, columns=factor_table_columns)

# model comparison table
model_comparison = []
for model_name in model_names:
    model_data = factor_summary_final[factor_summary_final['Model'] == model_name]
    if model_data.empty:
        continue

    # most frequent median grade across this model's factors; computed once
    grade_modes = model_data['Median_Grade'].mode()

    model_comparison.append({
        'Model': model_name,
        'Avg_Mean_Score': round(model_data['Mean_Score'].mean(), 2),
        'Avg_Pct_Grade_A': round(model_data['Pct_Grade_A'].mean(), 1),
        'Avg_Pct_Grade_B': round(model_data['Pct_Grade_B'].mean(), 1),
        'Avg_Pct_High_Quality': round(model_data['Pct_High_Quality'].mean(), 1),
        # 'N/A' rather than an invented 'C' if no mode can be determined
        'Most_Common_Grade': grade_modes.iloc[0] if not grade_modes.empty else 'N/A',
        'Total_Observations': int(model_data['Observations'].sum()),
    })

model_comparison_df = pd.DataFrame(model_comparison, columns=model_table_columns)

print(f"summary tables prepared for word document")

# Per-year statistics for each factor, based on the first listed column of
# the factor (the GPT-4o scores). Result: yearly_analysis maps a factor name
# to a DataFrame with one row per year that has at least one score.
print(f"\nanalyzing ai factor evolution over time")

yearly_analysis = {}

for factor, columns in factor_columns.items():
    gpt4o_col = columns[0]
    if gpt4o_col not in df.columns:
        continue

    yearly_data = []
    for year in sorted(df['Year'].unique()):
        year_data = df.loc[df['Year'] == year, gpt4o_col].dropna()
        total = len(year_data)
        if total == 0:
            # no scores recorded for this year
            continue

        grade_counts = year_data.value_counts()
        # share (in %) of each numeric grade within the year
        grade_pcts = {
            grade: grade_counts.get(grade, 0) / total * 100
            for grade in (5, 4, 3, 2, 1)
        }

        yearly_data.append({
            'Year': year,
            'Mean_Score': round(year_data.mean(), 2),
            'Median_Score': round(year_data.median(), 1),
            'Observations': int(total),
            'Pct_Grade_A': round(grade_pcts[5], 1),
            'Pct_Grade_B': round(grade_pcts[4], 1),
            'Pct_Grade_C': round(grade_pcts[3], 1),
            # grades 5 and 4 (A and B) together
            'Pct_High_Quality': round(grade_pcts[5] + grade_pcts[4], 1),
        })

    yearly_analysis[factor] = pd.DataFrame(yearly_data)

print(f"year-over-year analysis created for {len(yearly_analysis)} factors")

print("creating ai evolution over time analysis...")

# Years present in the data. They drive the title and the x ticks, so the
# chart stays correct when the sample period differs from 2020-2024.
plot_years = sorted(df['Year'].dropna().unique())
year_span = f"{plot_years[0]}-{plot_years[-1]}" if plot_years else "n/a"

# One line chart per metric, one line per factor.
# Fields: (yearly column, marker, title, y label, y limits, output file name)
evolution_plots = [
    ('Mean_Score', 'o', f'AI Factor Mean Scores Evolution ({year_span})',
     'Mean Score (1-5)', (1, 5), 'ai_mean_scores_evolution.png'),
    ('Pct_High_Quality', 's', 'High-Quality AI Disclosures Evolution - Grade A + B %',
     'Percentage (%)', (0, 100), 'ai_high_quality_evolution.png'),
]

for metric, marker, title, ylabel, ylim, filename in evolution_plots:
    plt.figure(figsize=(12, 8))
    for factor, yearly_df in yearly_analysis.items():
        # a line needs at least two yearly points
        if len(yearly_df) > 1:
            plt.plot(yearly_df['Year'], yearly_df[metric], marker=marker,
                     linewidth=3, markersize=8, label=factor, alpha=0.8)

    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel('Year', fontsize=12, fontweight='bold')
    plt.ylabel(ylabel, fontsize=12, fontweight='bold')
    if plot_years:
        # one tick per year; the default locator can place ticks at half years
        plt.xticks(plot_years)
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.ylim(*ylim)
    plt.tight_layout()
    plt.savefig(f'{output_dir}{filename}', dpi=300, bbox_inches='tight')
    plt.close()

# Average the yearly mean score and the yearly A+B share across all factors
# and plot both on one chart with two y axes.
all_years_summary = []
years = sorted(df['Year'].unique())

# A factor without any yearly rows is stored as a column-less frame, and
# indexing it by 'Year' raises KeyError, so such factors are left out.
populated_yearly = [ydf for ydf in yearly_analysis.values() if not ydf.empty]

for year in years:
    year_means = []
    year_high_quality = []

    for yearly_df in populated_yearly:
        year_data = yearly_df[yearly_df['Year'] == year]
        if not year_data.empty:
            year_means.append(year_data['Mean_Score'].iloc[0])
            year_high_quality.append(year_data['Pct_High_Quality'].iloc[0])

    if year_means:
        all_years_summary.append({
            'Year': year,
            'Avg_Mean_Score': np.mean(year_means),
            'Avg_High_Quality': np.mean(year_high_quality)
        })

if all_years_summary:
    # NOTE: this rebinds summary_df, which earlier loops use as loop variable
    summary_df = pd.DataFrame(all_years_summary)

    # title years come from the plotted data instead of a fixed "2020-2024"
    year_span = f"{summary_df['Year'].min()}-{summary_df['Year'].max()}"

    fig, ax1 = plt.subplots(figsize=(12, 8))
    ax2 = ax1.twinx()  # second y axis sharing the same x axis

    ax1.plot(summary_df['Year'], summary_df['Avg_Mean_Score'],
             color='steelblue', marker='o', linewidth=4, markersize=10,
             label='Average Score')
    ax2.plot(summary_df['Year'], summary_df['Avg_High_Quality'],
             color='darkred', marker='s', linewidth=4, markersize=10,
             label='% High Quality')

    ax1.set_title(f'Overall AI Disclosure Quality Evolution ({year_span})', fontsize=16, fontweight='bold')
    ax1.set_xlabel('Year', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Average Score (1-5)', color='steelblue', fontsize=12, fontweight='bold')
    ax2.set_ylabel('% High Quality (A+B)', color='darkred', fontsize=12, fontweight='bold')
    # one tick per year; the default locator can place ticks at half years
    ax1.set_xticks(summary_df['Year'].tolist())
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim(1, 5)
    ax2.set_ylim(0, 100)

    # merge the handles of both axes into a single legend
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left', fontsize=12)

    plt.tight_layout()
    plt.savefig(f'{output_dir}ai_overall_quality_evolution.png', dpi=300, bbox_inches='tight')
    plt.close()

# Closing report, part 1: files written and headline numbers of the run.
report_lines = (
    "created individual temporal analysis visualizations:",
    "   ai_mean_scores_evolution.png",
    "   ai_high_quality_evolution.png",
    "   ai_overall_quality_evolution.png",
    "\nai factor analysis complete!",
    f"all files saved to: {output_dir}",
    "\nanalysis overview:",
    f"   factors analyzed: {len(factor_columns)}",
    f"   llm models: {len(model_names)}",
    f"   total observations: {df.shape[0]:,}",
    f"   unique companies: {df['gvkey'].nunique():,}",
    "\nfiles created:",
    "   core visualizations:",
    "      • ai_factor_distributions.png - grade distributions by factor",
    "      • ai_factor_model_comparison.png - model comparison heatmap",
    "      • ai_factor_overall_grades.png - overall grade distribution",
    "   evolution analysis:",
    "      • ai_mean_scores_evolution.png - mean scores over time",
    "      • ai_high_quality_evolution.png - high quality trends",
    "      • ai_overall_quality_evolution.png - overall quality evolution",
)
for report_line in report_lines:
    print(report_line)

# Part 2: one line per model from the model comparison table.
print("\nmodel performance highlights:")
for _, row in model_comparison_df.iterrows():
    print(f"   {row['Model']:20}: avg score {row['Avg_Mean_Score']:.1f}/5.0, {row['Avg_Pct_High_Quality']:.1f}% A+B grades")

for closing_line in (
    "\nperfect for thesis empirical analysis and methodology documentation!",
    "ready to proceed with factor-based return prediction models",
):
    print(closing_line)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Folder that receives every figure written by this cell. The default is the
# original author's iCloud folder; set THESIS_SUMMARY_STATS_DIR to run the
# notebook on another machine without editing the code.
output_dir = os.environ.get(
    "THESIS_SUMMARY_STATS_DIR",
    "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/",
)
# File names are built below as f'{output_dir}name.png', so the folder must end
# with a path separator; joining with "" appends one only when it is missing.
output_dir = os.path.join(output_dir, "")
os.makedirs(output_dir, exist_ok=True)

print("inter-model agreement analysis")
print("=" * 60)

# define factor columns
# every factor maps to its three numeric score columns, named
# '<factor>_<model suffix>_Numeric' and ordered gpt4o, flash1_5, flash2_5
_factor_labels = (
    'Strategic Depth',
    'Disclosure Sentiment',
    'Risk - Own Adoption',
    'Risk - External Threats',
    'Risk - Non-Adoption',
    'Forward-Looking',
    'Talent & Investment',
    'AI Washing Index',
)
_model_suffixes = ('gpt4o', 'flash1_5', 'flash2_5')

factor_columns = {
    label: [f'{label}_{suffix}_Numeric' for suffix in _model_suffixes]
    for label in _factor_labels
}

# calculate inter-model agreement for each factor
agreement_stats = {}
overall_stats = []

print("calculating inter-model agreement statistics...")

# (label, position of first model, position of second model); positions index
# into each factor's column list
pairings = [
    ('GPT-4o vs Gemini 1.5', 0, 1),
    ('GPT-4o vs Gemini 2.5', 0, 2),
    ('Gemini 1.5 vs Gemini 2.5', 1, 2),
]

for factor, columns in factor_columns.items():
    # factors without exactly three score columns present in df are skipped silently
    if len(columns) != 3 or any(col not in df.columns for col in columns):
        continue

    # keep only rows where all three models produced a score
    valid_data = df[columns].dropna()
    n_valid = len(valid_data)
    if n_valid == 0:
        print(f"no valid data for {factor}")
        continue

    factor_entry = {}
    pair_rows = []
    for label, left_pos, right_pos in pairings:
        left_scores = valid_data[columns[left_pos]]
        right_scores = valid_data[columns[right_pos]]

        # share of rows with identical grades / grades at most one step apart
        exact_pct = (left_scores == right_scores).mean() * 100
        within1_pct = (abs(left_scores - right_scores) <= 1).mean() * 100
        # Series.corr with its default method
        pair_corr = left_scores.corr(right_scores)

        # rounded values go to the per-factor report ...
        factor_entry[label] = {
            '% Exact Match': round(exact_pct, 1),
            '% Within 1 Grade': round(within1_pct, 1),
            'Correlation': round(pair_corr, 3)
        }
        # ... unrounded values go to the overall summary
        pair_rows.append({
            'Factor': factor,
            'Comparison': label,
            'Exact_Match': exact_pct,
            'Within_1': within1_pct,
            'Correlation': pair_corr
        })

    factor_entry['Valid Observations'] = n_valid
    agreement_stats[factor] = factor_entry

    # add to overall stats
    overall_stats.extend(pair_rows)

# create a dataframe for easy visualization
# One row per (factor, model comparison), flattened from agreement_stats.
# NOTE: the per-factor dict is deliberately not called `stats`: that name is
# bound to scipy.stats elsewhere in this notebook, and rebinding it to a dict
# here would break any later `stats.<function>(...)` call that relies on it.
agreement_rows = []
for factor, factor_stats in agreement_stats.items():
    for comparison, values in factor_stats.items():
        if comparison == 'Valid Observations':
            # bookkeeping entry (a plain count), not a model comparison
            continue
        agreement_rows.append({
            'Factor': factor,
            'Model Comparison': comparison,
            'Exact Match %': values['% Exact Match'],
            'Within 1 Grade %': values['% Within 1 Grade'],
            'Correlation': values['Correlation']
        })

# explicit columns keep the frame's schema intact even when there are no rows
agreement_df = pd.DataFrame(
    agreement_rows,
    columns=['Factor', 'Model Comparison', 'Exact Match %', 'Within 1 Grade %', 'Correlation'],
)

# print agreement statistics
print("\ninter-model agreement statistics:")
print("-" * 60)

for factor in factor_columns:
    # only factors that produced statistics are reported
    if factor not in agreement_stats:
        continue
    factor_data = agreement_df.loc[agreement_df['Factor'] == factor]
    if factor_data.empty:
        continue

    print(f"\n{factor}:")
    print(f"   valid observations: {agreement_stats[factor]['Valid Observations']:,}")
    for comparison, exact_pct, within1_pct, corr in zip(
        factor_data['Model Comparison'],
        factor_data['Exact Match %'],
        factor_data['Within 1 Grade %'],
        factor_data['Correlation'],
    ):
        print(f"   {comparison:20}: exact={exact_pct:5.1f}%, within1={within1_pct:5.1f}%, corr={corr:5.3f}")

# calculate overall statistics
# overall_df (unrounded per-pair values) is reused by the plotting code further down
if overall_stats:
    overall_df = pd.DataFrame(overall_stats)
    mean_exact = overall_df['Exact_Match'].mean()
    mean_within1 = overall_df['Within_1'].mean()
    mean_corr = overall_df['Correlation'].mean()
    print("\noverall agreement summary:")
    print(f"   average exact match: {mean_exact:.1f}%")
    print(f"   average within 1 grade: {mean_within1:.1f}%")
    print(f"   average correlation: {mean_corr:.3f}")

# create visualizations
print("\ncreating agreement visualizations...")

plt.style.use('default')
sns.set_palette("viridis")


def _save_agreement_heatmap(value_column, cmap, number_format, title, file_name, **scale_kwargs):
    """Draw one metric of agreement_df as a factor x comparison heatmap and save it.

    value_column doubles as the colour-bar label; scale_kwargs are passed
    through to sns.heatmap (vmin / vmax / center). Returns the pivoted table.
    """
    pivot = agreement_df.pivot_table(
        index='Factor',
        columns='Model Comparison',
        values=value_column
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(pivot, annot=True, cmap=cmap, fmt=number_format,
                cbar_kws={'label': value_column}, **scale_kwargs)
    plt.title(title, fontsize=14, fontweight='bold', pad=20)
    plt.xlabel('Model Comparison', fontsize=12, fontweight='bold')
    plt.ylabel('AI Factor', fontsize=12, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'{output_dir}{file_name}', dpi=300, bbox_inches='tight')
    plt.close()
    return pivot


# nothing is drawn when no factor produced agreement rows
if not agreement_df.empty:
    # exact match heatmap
    agreement_pivot = _save_agreement_heatmap(
        'Exact Match %', 'YlGnBu', '.1f',
        'Exact Match Agreement Between LLM Models\nAI Factor Assessment',
        'model_agreement_exact_match.png',
        vmin=0, vmax=100,
    )

    # within 1 grade heatmap
    within1_pivot = _save_agreement_heatmap(
        'Within 1 Grade %', 'RdYlGn', '.1f',
        'Agreement Within 1 Grade Between LLM Models\nAI Factor Assessment',
        'model_agreement_within1_grade.png',
        vmin=0, vmax=100,
    )

    # correlation heatmap (colour scale fixed to 0..1, centred on 0.5)
    correlation_pivot = _save_agreement_heatmap(
        'Correlation', 'coolwarm', '.3f',
        'Correlation Between LLM Model Assessments\nAI Factor Scores',
        'model_agreement_correlations.png',
        vmin=0, vmax=1, center=0.5,
    )

# summary bar chart
if overall_stats:
    # mean of each metric per model comparison, averaged over all factors
    comparison_summary = overall_df.groupby('Comparison').agg(
        dict.fromkeys(['Exact_Match', 'Within_1', 'Correlation'], 'mean')
    ).reset_index()

    fig, panels = plt.subplots(1, 3, figsize=(16, 6))

    bar_positions = range(len(comparison_summary))
    # break the label at "vs" so it fits under a narrow bar
    tick_labels = [comp.replace(' vs ', '\nvs\n') for comp in comparison_summary['Comparison']]

    # (column, bar colour, title, y-axis label, y-axis maximum, label offset, label format)
    panel_specs = [
        ('Exact_Match', 'steelblue', 'Average Exact Match %', 'Percentage', 100, 1, '{:.1f}%'),
        ('Within_1', 'darkgreen', 'Average Within 1 Grade %', 'Percentage', 100, 1, '{:.1f}%'),
        ('Correlation', 'darkred', 'Average Correlation', 'Correlation', 1, 0.02, '{:.3f}'),
    ]

    for panel, (column, colour, title, y_label, y_max, offset, label_format) in zip(panels, panel_specs):
        bars = panel.bar(bar_positions, comparison_summary[column],
                         color=colour, alpha=0.8)
        panel.set_title(title, fontweight='bold')
        panel.set_ylabel(y_label)
        panel.set_xticks(bar_positions)
        panel.set_xticklabels(tick_labels, fontsize=9)
        panel.set_ylim(0, y_max)

        # add value labels just above each bar
        for bar in bars:
            height = bar.get_height()
            panel.text(bar.get_x() + bar.get_width()/2., height + offset,
                       label_format.format(height), ha='center', va='bottom', fontweight='bold')

    plt.suptitle('Inter-Model Agreement Summary\nAcross All AI Factors',
                 fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(f'{output_dir}model_agreement_summary.png', dpi=300, bbox_inches='tight')
    plt.close()

# figure file names announced in both listings below
agreement_figures = [
    "model_agreement_exact_match.png",
    "model_agreement_within1_grade.png",
    "model_agreement_correlations.png",
    "model_agreement_summary.png",
]

print("agreement analysis visualizations created:")
for figure_name in agreement_figures:
    print(f"   {figure_name}")

print("\ninter-model agreement analysis complete!")
print(f"all files saved to: {output_dir}")

if overall_stats:
    print("\nkey findings:")
    print(f"   average exact match: {overall_df['Exact_Match'].mean():.1f}%")
    print(f"   average within 1 grade: {overall_df['Within_1'].mean():.1f}%")
    print(f"   average correlation: {overall_df['Correlation'].mean():.3f}")

    # identify best and worst agreement (by mean correlation across the model pairs)
    mean_corr_by_factor = overall_df.groupby('Factor')['Correlation'].mean()
    best_factor = mean_corr_by_factor.idxmax()
    worst_factor = mean_corr_by_factor.idxmin()

    print(f"\nfactor with highest agreement: {best_factor}")
    print(f"factor with lowest agreement: {worst_factor}")

print("\nfiles created:")
print("   visualizations:")
for figure_name in agreement_figures:
    print(f"      • {figure_name}")

print("\nperfect for thesis methodology section on measurement robustness!")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import cohen_kappa_score
import os

# Folder that receives every figure written by this cell. The default is the
# original author's iCloud folder; set THESIS_SUMMARY_STATS_DIR to run the
# notebook on another machine without editing the code.
output_dir = os.environ.get(
    "THESIS_SUMMARY_STATS_DIR",
    "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/SummaryStats/",
)
# File names are built below as f'{output_dir}name.png', so the folder must end
# with a path separator; joining with "" appends one only when it is missing.
output_dir = os.path.join(output_dir, "")
os.makedirs(output_dir, exist_ok=True)

print("enhanced inter-model correlation analysis")
print("=" * 60)

# define factor columns with numeric versions
# every factor maps to its three numeric score columns, named
# '<factor>_<model suffix>_Numeric' and ordered gpt4o, flash1_5, flash2_5
_factor_labels = (
    'Strategic Depth',
    'Disclosure Sentiment',
    'Risk - Own Adoption',
    'Risk - External Threats',
    'Risk - Non-Adoption',
    'Forward-Looking',
    'Talent & Investment',
    'AI Washing Index',
)
_model_suffixes = ('gpt4o', 'flash1_5', 'flash2_5')

factor_columns = {}
for _label in _factor_labels:
    factor_columns[_label] = [f'{_label}_{_suffix}_Numeric' for _suffix in _model_suffixes]

# display names of the three graders
# NOTE(review): the score columns carry the suffix `gpt4o` - confirm that
# 'GPT-4o mini' is the model that actually produced them
model_names = ['GPT-4o mini', 'Gemini 1.5 Flash', 'Gemini 2.5 Flash']

# every unordered pairing of the names, in list order: (0, 1), (0, 2), (1, 2)
model_pairs = [
    (first, second)
    for position, first in enumerate(model_names)
    for second in model_names[position + 1:]
]

n_factors, n_models = len(factor_columns), len(model_names)
print(f"analyzing {n_factors} factors across {n_models} models")

# comprehensive correlation and agreement analysis
print("\ncalculating comprehensive correlation and agreement metrics...")

# result rows (one dict per factor and model pair), appended to further down
correlation_stats, agreement_stats, kappa_stats = [], [], []

def _ordinal_kappa(y1, y2, weights):
    """Return Cohen's kappa for two raters' scores with the given weighting.

    Observations where either rater's score is missing are dropped first.

    Args:
        y1, y2: score containers supporting boolean-mask indexing
            (pandas Series in this notebook).
        weights: weighting scheme forwarded to cohen_kappa_score
            ('linear' or 'quadratic').

    Returns:
        The kappa value, or NaN when no complete pair of scores remains or
        when the computation fails. Failures are reported on stdout instead
        of being swallowed silently.
    """
    try:
        mask = ~(pd.isna(y1) | pd.isna(y2))
        y1_clean = y1[mask]
        y2_clean = y2[mask]

        if len(y1_clean) == 0:
            return np.nan

        return cohen_kappa_score(y1_clean, y2_clean, weights=weights)
    except Exception as exc:
        # Best effort: one unscorable factor must not abort the whole analysis.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # through, so the notebook can still be interrupted.
        print(f"   kappa ({weights} weights) could not be computed: {exc!r}")
        return np.nan


def weighted_kappa_ordinal(y1, y2):
    """calculate weighted cohen's kappa for ordinal data - uses quadratic weights"""
    return _ordinal_kappa(y1, y2, 'quadratic')


def linear_kappa_ordinal(y1, y2):
    """calculate linearly weighted cohen's kappa for ordinal data"""
    return _ordinal_kappa(y1, y2, 'linear')

# Positions, within each factor's column list, of the two raters behind every
# entry of model_pairs: gpt vs gemini 1.5, gpt vs gemini 2.5, gemini 1.5 vs 2.5.
pair_positions = [(0, 1), (0, 2), (1, 2)]

for factor, columns in factor_columns.items():
    # factors without exactly three score columns present in df are skipped silently
    if len(columns) != 3 or not all(col in df.columns for col in columns):
        continue

    # get valid data for each pair (pairwise deletion: a row counts for a pair
    # as soon as both of that pair's models scored it)
    pair_frames = [df[[columns[i], columns[j]]].dropna() for i, j in pair_positions]

    # stats.pearsonr raises for fewer than two observations, so a single
    # surviving row is as unusable as none; skip the factor instead of crashing
    if any(len(frame) < 2 for frame in pair_frames):
        print(f"insufficient data for {factor}")
        continue

    factor_corr_rows = []
    factor_agree_rows = []
    factor_kappa_rows = []

    for (model_a, model_b), (pos_a, pos_b), frame in zip(model_pairs, pair_positions, pair_frames):
        pair_str = f"{model_a} vs {model_b}"
        scores_a = frame[columns[pos_a]]
        scores_b = frame[columns[pos_b]]

        # rank-based measures (kendall's tau, spearman's rho) plus pearson's r
        # for comparison; the p-values are not reported
        tau, _tau_p = stats.kendalltau(scores_a, scores_b)
        rho, _rho_p = stats.spearmanr(scores_a, scores_b)
        pearson, _pearson_p = stats.pearsonr(scores_a, scores_b)

        # cohen's kappa (both linear and quadratic weights)
        linear_kappa = linear_kappa_ordinal(scores_a, scores_b)
        quad_kappa = weighted_kappa_ordinal(scores_a, scores_b)

        # agreement percentages: identical grade / grades at most one step apart
        exact = (scores_a == scores_b).mean() * 100
        within1 = (abs(scores_a - scores_b) <= 1).mean() * 100

        factor_corr_rows.append({
            'Factor': factor,
            'Model Pair': pair_str,
            'Kendall Tau': tau,
            'Spearman Rho': rho,
            'Pearson r': pearson
        })
        factor_agree_rows.append({
            'Factor': factor,
            'Model Pair': pair_str,
            'Exact Match %': exact,
            'Within 1 Grade %': within1
        })
        factor_kappa_rows.append({
            'Factor': factor,
            'Model Pair': pair_str,
            'Linear Weighted Kappa': linear_kappa,
            'Quadratic Weighted Kappa': quad_kappa
        })

    # publish the factor's rows only after all three pairs were computed, so a
    # failure half-way through never leaves a partially recorded factor
    correlation_stats.extend(factor_corr_rows)
    agreement_stats.extend(factor_agree_rows)
    kappa_stats.extend(factor_kappa_rows)

# convert to dataframes
# Explicit column lists keep the schema intact when no factor produced results:
# pd.DataFrame([]) has no columns at all, and the 'Factor' filters below would
# then fail with a KeyError instead of simply printing nothing.
correlation_df = pd.DataFrame(
    correlation_stats,
    columns=['Factor', 'Model Pair', 'Kendall Tau', 'Spearman Rho', 'Pearson r'],
)
agreement_df = pd.DataFrame(
    agreement_stats,
    columns=['Factor', 'Model Pair', 'Exact Match %', 'Within 1 Grade %'],
)
kappa_df = pd.DataFrame(
    kappa_stats,
    columns=['Factor', 'Model Pair', 'Linear Weighted Kappa', 'Quadratic Weighted Kappa'],
)

print(f"calculated correlation metrics for {len(correlation_df)} factor-model combinations")

# display results: one section per metric family, one line per model pair
print("\nordinal correlation measures:")
print("-" * 60)

for factor in factor_columns:
    factor_data = correlation_df[correlation_df['Factor'] == factor]
    if not factor_data.empty:
        print(f"\n{factor}:")
        for _, row in factor_data.iterrows():
            print(f"   {row['Model Pair']:30}: tau={row['Kendall Tau']:6.3f}, rho={row['Spearman Rho']:6.3f}, r={row['Pearson r']:6.3f}")

print("\ncohen's kappa (ordinal agreement):")
print("-" * 60)

for factor in factor_columns:
    factor_data = kappa_df[kappa_df['Factor'] == factor]
    if not factor_data.empty:
        print(f"\n{factor}:")
        for _, row in factor_data.iterrows():
            print(f"   {row['Model Pair']:30}: linear κ={row['Linear Weighted Kappa']:6.3f}, quad κ={row['Quadratic Weighted Kappa']:6.3f}")

print("\nagreement percentages:")
print("-" * 60)

for factor in factor_columns:
    factor_data = agreement_df[agreement_df['Factor'] == factor]
    if not factor_data.empty:
        print(f"\n{factor}:")
        for _, row in factor_data.iterrows():
            print(f"   {row['Model Pair']:30}: exact={row['Exact Match %']:5.1f}%, within1={row['Within 1 Grade %']:5.1f}%")

# create visualizations
print("\ncreating enhanced correlation visualizations...")

plt.style.use('default')
sns.set_palette("viridis")


def _draw_metric_heatmap(frame, value_column, ax, cmap, colorbar_label, title):
    """Pivot one metric to factor x model pair and draw it on ax as an annotated heatmap.

    The colour scale is fixed to 0..1, so negative values are shown at the
    bottom colour. Returns the pivoted table.
    """
    pivot = pd.pivot_table(frame, index='Factor', columns='Model Pair', values=value_column)
    sns.heatmap(pivot, annot=True, cmap=cmap, fmt='.3f', vmin=0, vmax=1,
                ax=ax, cbar_kws={'label': colorbar_label})
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel('Model Pair', fontsize=12, fontweight='bold')
    ax.set_ylabel('AI Factor', fontsize=12, fontweight='bold')
    return pivot


# comprehensive correlation heatmaps (3 measures)
# guarded like the other plotting cells: with no results there is nothing to
# pivot and the plotting calls would fail
if correlation_df.empty:
    print("   no correlation results - skipping correlation heatmaps")
else:
    fig, axes = plt.subplots(3, 1, figsize=(14, 16))

    # kendall's tau
    kendall_pivot = _draw_metric_heatmap(
        correlation_df, 'Kendall Tau', axes[0], 'YlGnBu', "Kendall's τ",
        "Kendall's Tau (Ordinal Correlation)\nPreferred for Ordinal Data",
    )
    # spearman's rho
    spearman_pivot = _draw_metric_heatmap(
        correlation_df, 'Spearman Rho', axes[1], 'YlGnBu', "Spearman's ρ",
        "Spearman's Rho (Rank Correlation)\nRobust to Outliers",
    )
    # pearson's r
    pearson_pivot = _draw_metric_heatmap(
        correlation_df, 'Pearson r', axes[2], 'YlGnBu', "Pearson's r",
        "Pearson's r (Linear Correlation)\nFor Comparison",
    )

    plt.tight_layout()
    plt.savefig(f'{output_dir}enhanced_correlation_heatmaps.png', dpi=300, bbox_inches='tight')
    plt.close()

# cohen's kappa heatmaps
if kappa_df.empty:
    print("   no kappa results - skipping kappa heatmaps")
else:
    fig, axes = plt.subplots(2, 1, figsize=(14, 12))

    # linear weighted kappa
    linear_kappa_pivot = _draw_metric_heatmap(
        kappa_df, 'Linear Weighted Kappa', axes[0], 'RdYlGn', 'Linear Weighted κ',
        "Cohen's Kappa - Linear Weights\nOrdinal Agreement Measure",
    )
    # quadratic weighted kappa
    quad_kappa_pivot = _draw_metric_heatmap(
        kappa_df, 'Quadratic Weighted Kappa', axes[1], 'RdYlGn', 'Quadratic Weighted κ',
        "Cohen's Kappa - Quadratic Weights\nPenalizes Larger Disagreements More",
    )

    plt.tight_layout()
    plt.savefig(f'{output_dir}cohens_kappa_heatmaps.png', dpi=300, bbox_inches='tight')
    plt.close()

# (file name, caption) of the two figures announced below
enhanced_figures = [
    ("enhanced_correlation_heatmaps.png", "kendall, spearman, pearson"),
    ("cohens_kappa_heatmaps.png", "linear and quadratic weighted kappa"),
]

print("enhanced correlation visualizations created:")
for figure_name, _caption in enhanced_figures:
    print(f"   {figure_name}")

print("\nenhanced correlation analysis complete!")
print(f"all files saved to: {output_dir}")

if not correlation_df.empty:
    print("\nkey findings:")
    # (label, source frame, column, number format, text after the number)
    headline_metrics = [
        ("average kendall's τ", correlation_df, 'Kendall Tau', '.3f', " (preferred for ordinal)"),
        ("average spearman's ρ", correlation_df, 'Spearman Rho', '.3f', ""),
        ("average pearson's r", correlation_df, 'Pearson r', '.3f', ""),
        ("average linear κ", kappa_df, 'Linear Weighted Kappa', '.3f', ""),
        ("average quadratic κ", kappa_df, 'Quadratic Weighted Kappa', '.3f', ""),
        ("average within 1 grade", agreement_df, 'Within 1 Grade %', '.1f', "%"),
    ]
    for label, frame, column, number_format, suffix in headline_metrics:
        print(f"   {label}: {frame[column].mean():{number_format}}{suffix}")

    # identify best and worst performing factors (mean kendall's tau, highest first)
    factor_performance = correlation_df.groupby('Factor')['Kendall Tau'].mean().sort_values(ascending=False)
    best_factor, worst_factor = factor_performance.index[0], factor_performance.index[-1]
    best_tau, worst_tau = factor_performance.iloc[0], factor_performance.iloc[-1]

    print(f"\nfactor with highest agreement: {best_factor} (τ={best_tau:.3f})")
    print(f"factor with lowest agreement: {worst_factor} (τ={worst_tau:.3f})")

    # model pair performance
    pair_performance = correlation_df.groupby('Model Pair')['Kendall Tau'].mean().sort_values(ascending=False)
    print("\nmodel pair rankings (by kendall's τ):")
    for rank, (pair, score) in enumerate(zip(pair_performance.index, pair_performance.values), start=1):
        print(f"   {rank}. {pair}: {score:.3f}")

print("\nfiles created:")
print("   visualizations:")
for figure_name, caption in enhanced_figures:
    print(f"      • {figure_name} - {caption}")

# fixed closing text, printed regardless of the computed values
closing_lines = [
    "\nresearch implications:",
    "   demonstrates robust measurement across different llm architectures",
    "   provides multiple validation metrics appropriate for ordinal data",
    "   cohen's kappa confirms inter-rater reliability for thesis methodology",
    "   high within-1-grade agreement shows practical robustness",
    "   perfect for thesis robustness and methodology validation sections!",
    # quick reference - agreement interpretation
    "\nquick reference - agreement interpretation:",
    "   cohen's kappa:",
    "      >0.80: excellent agreement",
    "      0.60-0.80: good agreement",
    "      0.40-0.60: moderate agreement",
    "      <0.40: poor agreement",
    "   within 1 grade agreement:",
    "      >90%: excellent robustness",
    "      >80%: good robustness",
    "      >70%: acceptable robustness",
    "   kendall's tau (ordinal correlation):",
    "      >0.70: strong relationship",
    "      >0.50: moderate relationship",
    "      >0.30: weak relationship",
]
for line in closing_lines:
    print(line)

In [None]:
# robust momentum returns calculator - cumulative approach

import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class RobustMomentumCalculator:
    """calculate momentum returns using cumulative daily returns approach"""
    
    def __init__(self, daily_prices_path, final_dataset_path):
        self.daily_prices_path = daily_prices_path
        self.final_dataset_path = final_dataset_path
        
        print("robust momentum calculator initialized")
        print(f"   daily prices: {daily_prices_path}")
        print(f"   final dataset: {final_dataset_path}")
        print("   uses cumulative daily returns to avoid price discontinuities")
    
    def load_and_examine_data(self):
        """load and examine data with focus on quality"""
        print("\nloading and examining data")
        print("=" * 45)
        
        # load daily prices
        print("loading daily prices...")
        daily_cols_needed = ['gvkey', 'datadate', 'prccd', 'ajexdi', 'cshoc']
        daily_sample = pd.read_csv(self.daily_prices_path, nrows=5)
        available_cols = [col for col in daily_cols_needed if col in daily_sample.columns]
        print(f"available columns: {available_cols}")
        
        self.daily_prices = pd.read_csv(self.daily_prices_path, usecols=available_cols, dtype={'gvkey': str})
        
        # standardize gvkey
        self.daily_prices['gvkey'] = self.daily_prices['gvkey'].str.strip().str.zfill(6)
        self.daily_prices['datadate'] = pd.to_datetime(self.daily_prices['datadate'])
        
        # use adjusted prices if available, otherwise raw prices
        if 'ajexdi' in self.daily_prices.columns:
            self.daily_prices['adj_price'] = self.daily_prices['prccd'] / self.daily_prices['ajexdi']
            print("   using split/dividend adjusted prices")
        else:
            self.daily_prices['adj_price'] = self.daily_prices['prccd']
            print("   no adjustment factors - using raw prices")
        
        # clean data
        original_count = len(self.daily_prices)
        self.daily_prices = self.daily_prices[
            (self.daily_prices['adj_price'].notna()) & 
            (self.daily_prices['adj_price'] > 0)
        ].copy()
        
        print(f"   cleaned: {original_count:,} -> {len(self.daily_prices):,} ({len(self.daily_prices)/original_count:.1%})")
        
        self.daily_prices = self.daily_prices.sort_values(['gvkey', 'datadate']).reset_index(drop=True)
        
        print(f"daily prices: {self.daily_prices.shape}")
        print(f"   date range: {self.daily_prices['datadate'].min()} to {self.daily_prices['datadate'].max()}")
        print(f"   unique gvkeys: {self.daily_prices['gvkey'].nunique()}")
        
        # load final dataset
        print("\nloading final dataset...")
        self.final_dataset = pd.read_csv(self.final_dataset_path, dtype={'gvkey': str})
        self.final_dataset['gvkey'] = self.final_dataset['gvkey'].str.strip().str.zfill(6)
        self.final_dataset['filingDate'] = pd.to_datetime(self.final_dataset['filingDate'])
        
        print(f"final dataset: {self.final_dataset.shape}")
        print(f"   unique gvkeys: {self.final_dataset['gvkey'].nunique()}")
        
        # check overlap
        daily_gvkeys = set(self.daily_prices['gvkey'].unique())
        final_gvkeys = set(self.final_dataset['gvkey'].unique())
        overlap = daily_gvkeys.intersection(final_gvkeys)
        
        print(f"\ngvkey overlap: {len(overlap)}/{len(final_gvkeys)} ({len(overlap)/len(final_gvkeys):.1%})")
        
        return self
    
    def calculate_clean_daily_returns(self):
        """calculate clean daily returns for all firms"""
        print("\ncalculating clean daily returns")
        print("=" * 45)
        
        # calculate daily returns with robust cleaning
        self.daily_prices['price_lag'] = self.daily_prices.groupby('gvkey')['adj_price'].shift(1)
        self.daily_prices['daily_return'] = (self.daily_prices['adj_price'] / self.daily_prices['price_lag']) - 1
        
        # remove extreme daily returns and missing values  
        original_count = len(self.daily_prices)
        self.daily_prices = self.daily_prices[
            (self.daily_prices['daily_return'].between(-0.50, 0.50)) &  # ±50% max daily
            (self.daily_prices['daily_return'].notna())
        ].copy()
        
        print(f"   after return filtering: {original_count:,} -> {len(self.daily_prices):,}")
        print(f"   daily return stats:")
        returns = self.daily_prices['daily_return']
        print(f"      mean: {returns.mean():.6f}")
        print(f"      std:  {returns.std():.6f}")
        print(f"      min:  {returns.min():.6f}")
        print(f"      max:  {returns.max():.6f}")
        
        return self
    
    def calculate_momentum_via_cumulative_returns(self, months_back=None):
        """Compute pre-filing cumulative returns for every filing of every firm.

        For each filing and each horizon ``m`` in ``months_back`` the firm's
        daily returns dated between ``filingDate - 35*m`` and
        ``filingDate - 5`` calendar days are selected, the most recent
        ``21*m`` of them are kept and compounded.  Horizons therefore overlap:
        the window of a longer horizon ends on the same day as the shorter
        one.  A horizon with fewer than 10 returns in its window is stored as
        NaN with ``n_days`` 0.  Firms with fewer than 100 daily rows are
        skipped.  Finally each return column is winsorized at its 1st/99th
        percentile and summary statistics are printed.

        Reads ``self.daily_prices`` (needs ``gvkey``, ``datadate``,
        ``daily_return``) and ``self.final_dataset`` (needs ``gvkey``,
        ``filingDate``; ``Year`` is used when present, else the filing year).

        Args:
            months_back: horizons in months; ``None`` (default) means
                ``[1, 2, 3]``.

        Returns:
            self, with ``self.momentum_data`` holding one row per filing.

        Raises:
            ValueError: if no filing row was produced at all.
        """
        # None sentinel instead of a mutable default list shared across calls
        if months_back is None:
            months_back = [1, 2, 3]

        print(f"\ncalculating momentum via cumulative returns")
        print("=" * 55)
        print(f"   months back: {months_back}")
        print(f"   method: cumulative daily returns over ~21 trading days with winsorization")

        momentum_results = []

        # get overlapping firms (missing identifiers can never be matched)
        daily_gvkeys = set(self.daily_prices['gvkey'].dropna().unique())
        final_gvkeys = set(self.final_dataset['gvkey'].dropna().unique())
        processable_gvkeys = daily_gvkeys & final_gvkeys

        print(f"   processing {len(processable_gvkeys)} firms")

        # statistics tracking ('avg_days' accumulates a sum, divided after the loop)
        period_stats = {month: {'attempts': 0, 'successful': 0, 'avg_days': 0} for month in months_back}

        # Split both tables by firm once instead of re-scanning the full price
        # table for every firm; sorted groups also make the row order of the
        # result reproducible between runs.
        relevant_prices = self.daily_prices[self.daily_prices['gvkey'].isin(processable_gvkeys)]
        relevant_filings = self.final_dataset[self.final_dataset['gvkey'].isin(processable_gvkeys)]
        filings_by_firm = {key: rows for key, rows in relevant_filings.groupby('gvkey')}
        price_groups = relevant_prices.groupby('gvkey', sort=True)

        for gvkey, firm_data in tqdm(price_groups, total=price_groups.ngroups, desc="calculating momentum"):
            if len(firm_data) < 100:  # need sufficient history
                continue

            firm_data = firm_data.sort_values('datadate')
            firm_dates = firm_data['datadate']

            # process each filing
            for _, filing_row in filings_by_firm[gvkey].iterrows():
                filing_date = filing_row['filingDate']
                year = filing_row.get('Year', filing_date.year)

                result = {
                    'gvkey': gvkey,
                    'Year': year,
                    'filing_date': filing_date
                }

                # calculate returns for each lookback period
                for months in months_back:
                    period_stats[months]['attempts'] += 1

                    # define lookback period (approximately months * 21 trading days)
                    target_days = months * 21
                    lookback_start = filing_date - pd.DateOffset(days=int(months * 35))  # buffer for weekends
                    lookback_end = filing_date - pd.DateOffset(days=5)  # small buffer before filing

                    in_window = (firm_dates >= lookback_start) & (firm_dates <= lookback_end)
                    window_data = firm_data[in_window]

                    if len(window_data) < 10:  # need minimum observations
                        result[f'return_t_minus_{months}m'] = np.nan
                        result[f'n_days_t_minus_{months}m'] = 0
                        continue

                    try:
                        # take the most recent 'target_days' observations
                        recent_data = window_data.tail(target_days)

                        # calculate cumulative return using simple compounding
                        cumulative_return = np.prod(1 + recent_data['daily_return'].values) - 1

                        # store results (no hard caps - will winsorize later)
                        result[f'return_t_minus_{months}m'] = cumulative_return
                        result[f'n_days_t_minus_{months}m'] = len(recent_data)
                        result[f'start_date_t_minus_{months}m'] = recent_data['datadate'].iloc[0]
                        result[f'end_date_t_minus_{months}m'] = recent_data['datadate'].iloc[-1]

                        # update statistics
                        period_stats[months]['successful'] += 1
                        period_stats[months]['avg_days'] += len(recent_data)

                    except Exception as exc:
                        # best effort: keep going, but say what was lost and why
                        print(f"   warning: t-{months}m failed for gvkey {gvkey} ({filing_date}): {exc}")
                        result[f'return_t_minus_{months}m'] = np.nan
                        result[f'n_days_t_minus_{months}m'] = 0
                        continue

                momentum_results.append(result)

        # calculate final statistics
        for month in period_stats:
            if period_stats[month]['successful'] > 0:
                period_stats[month]['avg_days'] /= period_stats[month]['successful']

        if not momentum_results:
            raise ValueError("no momentum returns calculated!")

        self.momentum_data = pd.DataFrame(momentum_results)

        # apply winsorization to handle remaining extreme values
        print(f"\napplying winsorization to momentum returns:")
        for month in months_back:
            return_col = f'return_t_minus_{month}m'
            if return_col in self.momentum_data.columns:
                original_returns = self.momentum_data[return_col].dropna()
                if len(original_returns) > 0:
                    # winsorize at 1st and 99th percentiles
                    p01 = original_returns.quantile(0.01)
                    p99 = original_returns.quantile(0.99)

                    winsorized_count = ((original_returns < p01) | (original_returns > p99)).sum()

                    # apply winsorization
                    self.momentum_data[return_col] = self.momentum_data[return_col].clip(lower=p01, upper=p99)

                    print(f"   t-{month}m: winsorized {winsorized_count} values at [{p01:.3f}, {p99:.3f}]")

        print(f"\nmomentum calculations completed:")
        print(f"   total observations: {len(self.momentum_data)}")

        # success rates
        print(f"\nsuccess rate by lookback period:")
        for month in months_back:
            total = period_stats[month]['attempts']
            success = period_stats[month]['successful']
            avg_days = period_stats[month]['avg_days']
            rate = success/total*100 if total > 0 else 0
            print(f"   t-{month}m: {success:4d}/{total:4d} ({rate:5.1f}%) | avg days: {avg_days:5.1f}")

        # return statistics after winsorization
        print(f"\nmomentum return statistics (after winsorization):")
        for month in months_back:
            return_col = f'return_t_minus_{month}m'
            if return_col in self.momentum_data.columns:
                returns = self.momentum_data[return_col].dropna()
                if len(returns) > 0:
                    print(f"   t-{month}m: mean={returns.mean():7.4f}, std={returns.std():6.4f}, "
                          f"min={returns.min():7.4f}, max={returns.max():7.4f}")

                    # check for remaining extremes
                    extreme_count = ((returns < -0.5) | (returns > 0.5)).sum()
                    if extreme_count > 0:
                        print(f"           {extreme_count} returns still >50% (after winsorization)")
                    else:
                        print(f"           all returns within reasonable bounds")

        return self
    
    def merge_with_final_dataset(self):
        """merge momentum data with final dataset"""
        print("\nmerging with final dataset")
        print("=" * 40)
        
        # merge on gvkey and year
        momentum_cols = [col for col in self.momentum_data.columns 
                        if col not in ['gvkey', 'Year', 'filing_date']]
        merge_cols = ['gvkey', 'Year'] + momentum_cols
        
        self.final_dataset_with_momentum = pd.merge(
            self.final_dataset,
            self.momentum_data[merge_cols],
            on=['gvkey', 'Year'],
            how='left'
        )
        
        total_obs = len(self.final_dataset)
        
        print(f"merge completed:")
        print(f"   total observations: {total_obs}")
        print(f"   new momentum columns: {len(momentum_cols)}")
        
        # availability check
        print(f"\nfinal momentum availability:")
        for month in [1, 2, 3]:
            return_col = f'return_t_minus_{month}m'
            if return_col in self.final_dataset_with_momentum.columns:
                available = self.final_dataset_with_momentum[return_col].notna().sum()
                print(f"   t-{month}m: {available:4d}/{total_obs:4d} ({available/total_obs:.1%})")
        
        return self
    
    def validate_momentum_quality(self):
        """perform quality checks on momentum returns"""
        print(f"\nmomentum quality validation")
        print("=" * 40)
        
        for month in [1, 2, 3]:
            return_col = f'return_t_minus_{month}m'
            if return_col in self.final_dataset_with_momentum.columns:
                returns = self.final_dataset_with_momentum[return_col].dropna()
                
                if len(returns) > 0:
                    print(f"\nt-{month}m quality check:")
                    print(f"   valid observations: {len(returns)}")
                    print(f"   mean: {returns.mean():8.4f}")
                    print(f"   std:  {returns.std():8.4f}")
                    print(f"   skew: {returns.skew():8.4f}")
                    print(f"   kurt: {returns.kurtosis():8.4f}")
                    
                    # percentile analysis
                    percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
                    pct_values = returns.quantile([p/100 for p in percentiles])
                    print(f"   percentiles: " + " | ".join([f"p{p}={pct_values[p/100]:.3f}" for p in [1, 5, 95, 99]]))
                    
                    # flag suspicious patterns
                    if abs(returns.mean()) > 0.05:
                        print(f"   high average return ({returns.mean():.3f}) - check for data issues")
                    if returns.std() > 0.3:
                        print(f"   high volatility ({returns.std():.3f}) - possible remaining outliers")
                    if abs(returns.skew()) > 3:
                        print(f"   high skewness ({returns.skew():.2f}) - asymmetric distribution")
                    
                    # check extreme values
                    extreme_positive = (returns > 0.5).sum()
                    extreme_negative = (returns < -0.5).sum()
                    if extreme_positive + extreme_negative > 0:
                        print(f"   extreme values: {extreme_positive} >50%, {extreme_negative} <-50%")
                    else:
                        print(f"   no extreme values detected")
        
        return self
    
    def save_enhanced_dataset(self, output_path=None):
        """save enhanced dataset"""
        print("\nsaving enhanced dataset")
        print("=" * 35)
        
        if output_path is None:
            output_path = self.final_dataset_path.replace('.csv', '_with_robust_momentum.csv')
        
        self.final_dataset_with_momentum.to_csv(output_path, index=False)
        
        print(f"dataset saved: {output_path}")
        print(f"   shape: {self.final_dataset_with_momentum.shape}")
        
        return output_path
    
    def run_complete_analysis(self):
        """run complete robust momentum analysis"""
        print("running robust momentum analysis")
        print("=" * 45)
        
        try:
            self.load_and_examine_data()
            self.calculate_clean_daily_returns()
            self.calculate_momentum_via_cumulative_returns()
            self.merge_with_final_dataset()
            self.validate_momentum_quality()
            output_path = self.save_enhanced_dataset()
            
            print(f"\nrobust momentum analysis complete!")
            print(f"   used cumulative daily returns approach")
            print(f"   applied strict outlier filtering")
            print(f"   quality validated")
            print(f"   ready for regression analysis!")
            
            return self.final_dataset_with_momentum, output_path
            
        except Exception as e:
            print(f"\nerror: {str(e)}")
            import traceback
            traceback.print_exc()
            return None, None

# Script entry point: run the whole momentum pipeline on the thesis data
# and report whether it produced an output file.
if __name__ == "__main__":
    print("robust momentum calculator")
    print("=" * 35)

    calculator = RobustMomentumCalculator(
        daily_prices_path="/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Controls/daily.csv",
        final_dataset_path="/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/final_clean_dataset_filtered_with_corrected_factor_loadings.csv",
    )

    # run_complete_analysis signals failure with (None, None)
    enhanced_dataset, output_path = calculator.run_complete_analysis()

    if enhanced_dataset is None:
        print("\nanalysis failed")
    else:
        print("\nsuccess! robust momentum returns calculated!")
        print(f"output: {output_path}")

In [9]:
import pandas as pd

# Location of the momentum-enhanced regression dataset
# (adjacent literals are joined by the parser into one path string).
data_path = (
    "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/"
    "Master Finance/MasterThesis/ThesisData/Regression/"
    "final_clean_dataset_filtered_with_corrected_factor_loadings_with_robust_momentum.csv"
)

# Read the file and show which columns it contains.
df = pd.read_csv(data_path)
print(df.columns.tolist())


['CIK', 'Company Name', 'Sector', 'Ticker', 'Year', 'filingDate', 'form', 'gvkey', 'AI Washing Index_flash1_5', 'AI Washing Index_flash2_5', 'AI Washing Index_gpt4o', 'Disclosure Sentiment_flash1_5', 'Disclosure Sentiment_flash2_5', 'Disclosure Sentiment_gpt4o', 'Forward-Looking_flash1_5', 'Forward-Looking_flash2_5', 'Forward-Looking_gpt4o', 'Key AI Terms_flash1_5', 'Key AI Terms_flash2_5', 'Key AI Terms_gpt4o', 'Overall Summary_flash1_5', 'Overall Summary_flash2_5', 'Overall Summary_gpt4o', 'Risk - External Threats_flash1_5', 'Risk - External Threats_flash2_5', 'Risk - External Threats_gpt4o', 'Risk - Non-Adoption_flash1_5', 'Risk - Non-Adoption_flash2_5', 'Risk - Non-Adoption_gpt4o', 'Risk - Own Adoption_flash1_5', 'Risk - Own Adoption_flash2_5', 'Risk - Own Adoption_gpt4o', 'Strategic Depth_flash1_5', 'Strategic Depth_flash2_5', 'Strategic Depth_gpt4o', 'Talent & Investment_flash1_5', 'Talent & Investment_flash2_5', 'Talent & Investment_gpt4o', 'AI Washing Index_flash1_5_Numeric', '

In [17]:
# ENHANCED ENDOGENEITY TESTING MODULE WITH WORD DOCUMENT EXPORT
# ================================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.stattools import durbin_watson
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
import warnings
import os
warnings.filterwarnings('ignore')

class EndogeneityTesterWithWordExport:
    """
    Enhanced endogeneity testing with professional Word document export
    """
    
    def __init__(self, data_path, output_path):
        self.data_path = data_path
        self.output_path = output_path
        self.df = None
        self.results = {}
        self.diagnostics = {}
        
        # Core variables from your dataset
        self.y_var = 'excess_return_12mo_annualized'
        
        # Standard controls available in your dataset
        self.standard_controls_config = {
            'Log Market Cap': 'calc_log_market_cap',
            'Price-to-Book': 'calc_price_to_book',
            'ROA': 'calc_roa',
            'Market-to-Book': 'calc_market_to_book',
            'Operating Margin': 'calc_operating_margin',
            'Asset Turnover': 'calc_asset_turnover'
        }
        
        # Management quality proxies using your data
        self.management_proxies_config = {
            'ROA Volatility': 'calc_roa_volatility',
            'Return Volatility (t-1)': 'return_volatility_lag1',
            'Return Volatility (t-2)': 'return_volatility_lag2',
            'Past Performance (t-1m)': 'return_t_minus_1m',
            'Past Performance (t-2m)': 'return_t_minus_2m', 
            'Past Performance (t-3m)': 'return_t_minus_3m',
            'Debt to Assets': 'calc_debt_to_assets',
            'Profit Margin': 'calc_profit_margin',
            'ROE': 'calc_roe'
        }
        
        # Lagged variables for reverse causality testing
        self.lagged_performance_vars = {
            'Lagged Return (t-1m)': 'return_t_minus_1m',
            'Lagged Return (t-2m)': 'return_t_minus_2m',
            'Lagged Return (t-3m)': 'return_t_minus_3m',
            'Lagged ROA': 'calc_roa_lag1',
            'Lagged Market Cap': 'calc_log_market_cap_lag1'
        }

    def load_and_prepare_data(self):
        """Load data and create management quality proxies"""
        print("📊 Loading data for endogeneity testing...")
        self.df = pd.read_csv(self.data_path, dtype={'gvkey': str, 'CIK': str})
        
        # Create AI factor averages
        self.ai_factors_config = {
            'Strategic Depth': ['Strategic Depth_flash1_5_Numeric', 'Strategic Depth_flash2_5_Numeric', 'Strategic Depth_gpt4o_Numeric'],
            'AI Sentiment': ['Disclosure Sentiment_flash1_5_Numeric', 'Disclosure Sentiment_flash2_5_Numeric', 'Disclosure Sentiment_gpt4o_Numeric'],
            'Risk Own Adoption': ['Risk - Own Adoption_flash1_5_Numeric', 'Risk - Own Adoption_flash2_5_Numeric', 'Risk - Own Adoption_gpt4o_Numeric'],
            'Risk External Threats': ['Risk - External Threats_flash1_5_Numeric', 'Risk - External Threats_flash2_5_Numeric', 'Risk - External Threats_gpt4o_Numeric'],
            'Risk Non-Adoption': ['Risk - Non-Adoption_flash1_5_Numeric', 'Risk - Non-Adoption_flash2_5_Numeric', 'Risk - Non-Adoption_gpt4o_Numeric'],
            'Forward Looking': ['Forward-Looking_flash1_5_Numeric', 'Forward-Looking_flash2_5_Numeric', 'Forward-Looking_gpt4o_Numeric'],
            'AI Washing': ['AI Washing Index_flash1_5_Numeric', 'AI Washing Index_flash2_5_Numeric', 'AI Washing Index_gpt4o_Numeric'],
            'Talent Investment': ['Talent & Investment_flash1_5_Numeric', 'Talent & Investment_flash2_5_Numeric', 'Talent & Investment_gpt4o_Numeric'],
        }
        
        self.average_ai_factor_cols_map = {}
        for factor_name, cols in self.ai_factors_config.items():
            available_cols = [col for col in cols if col in self.df.columns]
            if available_cols:
                avg_col_name = f"{factor_name.replace(' ', '_')}_Average"
                self.df[avg_col_name] = self.df[available_cols].mean(axis=1, skipna=True)
                self.average_ai_factor_cols_map[factor_name] = avg_col_name
                print(f" Created {avg_col_name}: {self.df[avg_col_name].notna().sum():,} obs")
        
        # Create management quality proxies
        self._create_management_proxies()
        
        # Create additional lagged variables
        self._create_additional_lagged_variables()
        
        # Create fixed effects dummies
        self._prepare_fixed_effects_dummies()
        
        return self

    def _create_management_proxies(self):
        """Create management quality proxy variables"""
        print("🔧 Creating management quality proxies...")
        
        # Sort by firm and year for rolling calculations
        self.df = self.df.sort_values(['gvkey', 'Year'])
        
        # ROA Volatility (3-year rolling standard deviation)
        if 'calc_roa' in self.df.columns:
            self.df['calc_roa_volatility'] = self.df.groupby('gvkey')['calc_roa'].rolling(
                window=3, min_periods=2
            ).std().reset_index(0, drop=True)
            
            # Create lagged ROA
            self.df['calc_roa_lag1'] = self.df.groupby('gvkey')['calc_roa'].shift(1)
        
        # Return Volatility using your existing return columns
        return_cols = ['return_t_minus_1m', 'return_t_minus_2m', 'return_t_minus_3m']
        available_return_cols = [col for col in return_cols if col in self.df.columns]
        
        if len(available_return_cols) >= 2:
            # Calculate return volatility from available lagged returns
            self.df['return_volatility_lag1'] = self.df[available_return_cols].std(axis=1, skipna=True)
            # Create a second measure using just t-2 and t-3
            if len(available_return_cols) >= 3:
                self.df['return_volatility_lag2'] = self.df[available_return_cols[1:]].std(axis=1, skipna=True)
        
        # Create lagged market cap
        if 'calc_log_market_cap' in self.df.columns:
            self.df['calc_log_market_cap_lag1'] = self.df.groupby('gvkey')['calc_log_market_cap'].shift(1)
        
        print(" Management quality proxies created")

    def _create_additional_lagged_variables(self):
        """Create additional lagged variables for robustness"""
        print("🔧 Creating additional lagged variables...")
        
        # Create lagged versions of key financial variables
        vars_to_lag = ['calc_price_to_book', 'calc_market_to_book', 'calc_operating_margin']
        
        for var in vars_to_lag:
            if var in self.df.columns:
                self.df[f'{var}_lag1'] = self.df.groupby('gvkey')[var].shift(1)
        
        print(" Additional lagged variables created")

    def _prepare_fixed_effects_dummies(self):
        """Create fixed effects dummy variables"""
        # Year fixed effects
        self.master_year_fe_cols = []
        if 'Year' in self.df.columns:
            valid_years = sorted(self.df['Year'].dropna().unique())
            if len(valid_years) > 1:
                for year_val in valid_years[1:]:  # Drop first year as base
                    fe_col_name = f'year_{int(year_val)}'
                    self.df[fe_col_name] = (self.df['Year'] == year_val).astype(int)
                    self.master_year_fe_cols.append(fe_col_name)
        
        # Sector fixed effects
        self.master_sector_fe_cols = []
        sector_col = 'Sector'
        
        if sector_col in self.df.columns:
            valid_sectors = self.df[sector_col].dropna().unique()
            if len(valid_sectors) > 1:
                for i, sector_val in enumerate(valid_sectors[1:]):  # Drop first sector as base
                    clean_sector_val = str(sector_val).replace(' ', '_').replace('&', 'and').replace('/', '_').replace('-', '_')
                    fe_col_name = f'sector_{clean_sector_val}_{i}'
                    self.df[fe_col_name] = (self.df[sector_col] == sector_val).astype(int)
                    self.master_sector_fe_cols.append(fe_col_name)

    def _run_regression_with_controls(self, y_var, x_vars, control_vars=None, include_fe=True):
        """Helper function to run regression with specified controls"""
        if control_vars is None:
            control_vars = []
        
        # Combine all variables
        all_x_vars = x_vars.copy()
        all_x_vars.extend(control_vars)
        
        if include_fe:
            all_x_vars.extend(self.master_year_fe_cols)
            all_x_vars.extend(self.master_sector_fe_cols)
        
        # Filter to available columns
        all_x_vars = [var for var in all_x_vars if var in self.df.columns]
        all_x_vars = list(set(all_x_vars))  # Remove duplicates
        
        # Prepare regression data
        required_cols = [y_var] + all_x_vars + ['gvkey']
        reg_data = self.df[required_cols].dropna()
        
        if len(reg_data) < len(all_x_vars) + 20:  # Need sufficient observations
            return None
        
        # Winsorize variables
        vars_to_winsorize = [y_var] + all_x_vars
        for var in vars_to_winsorize:
            if pd.api.types.is_numeric_dtype(reg_data[var]):
                if reg_data[var].notna().sum() > 0 and reg_data[var].nunique() > 1:
                    p1, p99 = reg_data[var].quantile([0.01, 0.99])
                    if pd.notna(p1) and pd.notna(p99) and p1 != p99:
                        reg_data[var] = reg_data[var].clip(lower=p1, upper=p99)
        
        # Run regression
        try:
            X = sm.add_constant(reg_data[all_x_vars])
            y = reg_data[y_var]
            model = OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': reg_data['gvkey']})
            return model
        except:
            return None

    def test_omitted_variable_bias(self):
        """Test for omitted variable bias using management quality proxies"""
        print("\n🔬 Testing for Omitted Variable Bias...")
        
        self.results['omitted_variable_bias'] = {}
        
        # Get available standard controls
        std_controls = [col for col in self.standard_controls_config.values() 
                       if col and col in self.df.columns]
        
        # Get available management proxies
        mgmt_proxies = [col for col in self.management_proxies_config.values() 
                       if col and col in self.df.columns]
        
        print(f"Available standard controls: {len(std_controls)}")
        print(f"Available management proxies: {len(mgmt_proxies)}")
        
        for ai_factor_name, ai_col in self.average_ai_factor_cols_map.items():
            if ai_col not in self.df.columns:
                continue
                
            factor_results = {}
            
            # Model 1: Baseline (AI factor + standard controls + FE)
            model_baseline = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=std_controls,
                include_fe=True
            )
            if model_baseline:
                factor_results['Baseline'] = model_baseline
            
            # Model 2: Add management quality proxies
            model_with_mgmt = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=std_controls + mgmt_proxies,
                include_fe=True
            )
            if model_with_mgmt:
                factor_results['With Management Proxies'] = model_with_mgmt
            
            # Model 3: Only management proxies
            model_mgmt_only = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=mgmt_proxies,
                control_vars=std_controls,
                include_fe=True
            )
            if model_mgmt_only:
                factor_results['Management Proxies Only'] = model_mgmt_only
            
            if factor_results:
                self.results['omitted_variable_bias'][ai_factor_name] = factor_results
                
                # Print coefficient comparison for this factor
                if 'Baseline' in factor_results and 'With Management Proxies' in factor_results:
                    baseline_coef = factor_results['Baseline'].params.get(ai_col, np.nan)
                    mgmt_coef = factor_results['With Management Proxies'].params.get(ai_col, np.nan)
                    
                    if not np.isnan(baseline_coef) and not np.isnan(mgmt_coef) and baseline_coef != 0:
                        pct_change = ((mgmt_coef - baseline_coef) / abs(baseline_coef)) * 100
                        print(f"  {ai_factor_name}: {baseline_coef:.3f} → {mgmt_coef:.3f} ({pct_change:+.1f}%)")
        
        print(" Omitted Variable Bias testing complete")

    def test_reverse_causality(self):
        """Test for reverse causality by regressing AI metrics on lagged performance"""
        print("\n🔬 Testing for Reverse Causality...")
        
        self.results['reverse_causality'] = {}
        
        # Get available lagged performance variables
        lagged_perf_vars = [col for col in self.lagged_performance_vars.values() 
                           if col and col in self.df.columns]
        
        # Get available standard controls
        std_controls = [col for col in self.standard_controls_config.values() 
                       if col and col in self.df.columns]
        
        print(f"Available lagged performance vars: {lagged_perf_vars}")
        
        for ai_factor_name, ai_col in self.average_ai_factor_cols_map.items():
            if ai_col not in self.df.columns:
                continue
                
            factor_results = {}
            
            # Test 1: AI factor regressed on lagged returns
            lagged_return_vars = [col for col in lagged_perf_vars 
                                 if 'return_t_minus' in col or 'calc_roa' in col]
            
            if lagged_return_vars:
                model_reverse_main = self._run_regression_with_controls(
                    y_var=ai_col,
                    x_vars=lagged_return_vars,
                    control_vars=[],
                    include_fe=True
                )
                if model_reverse_main:
                    factor_results['AI ~ Lagged Performance'] = model_reverse_main
            
            # Test 2: AI factor regressed on lagged returns + controls
            if lagged_return_vars:
                model_reverse_controls = self._run_regression_with_controls(
                    y_var=ai_col,
                    x_vars=lagged_return_vars,
                    control_vars=std_controls,
                    include_fe=True
                )
                if model_reverse_controls:
                    factor_results['AI ~ Lagged Performance + Controls'] = model_reverse_controls
            
            # Test 3: Forward-looking test - current performance on lagged AI
            ai_col_lag1 = f'{ai_col}_lag1'
            if ai_col_lag1 not in self.df.columns:
                self.df[ai_col_lag1] = self.df.groupby('gvkey')[ai_col].shift(1)
            
            if ai_col_lag1 in self.df.columns:
                model_forward = self._run_regression_with_controls(
                    y_var=self.y_var,
                    x_vars=[ai_col_lag1],
                    control_vars=std_controls,
                    include_fe=True
                )
                if model_forward:
                    factor_results['Current Return ~ Lagged AI'] = model_forward
            
            if factor_results:
                self.results['reverse_causality'][ai_factor_name] = factor_results
        
        print(" Reverse Causality testing complete")

    def test_coefficient_stability(self):
        """Test coefficient stability across different specifications"""
        print("\n🔬 Testing Coefficient Stability...")
        
        self.results['coefficient_stability'] = {}
        
        std_controls = [col for col in self.standard_controls_config.values() 
                       if col and col in self.df.columns]
        mgmt_proxies = [col for col in self.management_proxies_config.values() 
                       if col and col in self.df.columns]
        
        for ai_factor_name, ai_col in self.average_ai_factor_cols_map.items():
            if ai_col not in self.df.columns:
                continue
                
            factor_results = {}
            
            # Specification 1: AI factor only + FE
            model_fe_only = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=[],
                include_fe=True
            )
            if model_fe_only:
                factor_results['FE Only'] = model_fe_only
            
            # Specification 2: + Standard controls
            model_std = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=std_controls,
                include_fe=True
            )
            if model_std:
                factor_results['+ Standard Controls'] = model_std
            
            # Specification 3: + Management proxies
            model_mgmt = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=std_controls + mgmt_proxies,
                include_fe=True
            )
            if model_mgmt:
                factor_results['+ Management Proxies'] = model_mgmt
            
            # Specification 4: No fixed effects
            model_no_fe = self._run_regression_with_controls(
                y_var=self.y_var,
                x_vars=[ai_col],
                control_vars=std_controls,
                include_fe=False
            )
            if model_no_fe:
                factor_results['No Fixed Effects'] = model_no_fe
            
            if factor_results:
                self.results['coefficient_stability'][ai_factor_name] = factor_results
        
        print(" Coefficient Stability testing complete")

    def run_all_endogeneity_tests(self):
        """Execute the three endogeneity test suites in sequence.

        Runs the omitted-variable-bias, reverse-causality and
        coefficient-stability tests, in that order.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\n🚀 Running comprehensive endogeneity testing...")

        suite = (
            'test_omitted_variable_bias',
            'test_reverse_causality',
            'test_coefficient_stability',
        )
        for method_name in suite:
            # Looked up just before the call, one test at a time
            getattr(self, method_name)()

        print("All endogeneity tests complete")
        return self

    def _get_significance_stars(self, pvalue):
        """Map a p-value to its conventional significance marker.

        Returns ``"***"`` below 0.01, ``"**"`` below 0.05, ``"*"`` below
        0.10, and an empty string otherwise or when the p-value is missing.
        """
        if pd.isna(pvalue):
            return ""

        # Thresholds are checked from strictest to loosest
        for cutoff, marker in ((0.01, "***"), (0.05, "**"), (0.10, "*")):
            if pvalue < cutoff:
                return marker
        return ""

    def _set_table_borders(self, table):
        """Attach single-line black borders to every edge of *table*.

        A ``w:tblBorders`` element with outer and inner (``insideH`` /
        ``insideV``) borders is appended to the table's properties.
        """
        border_attributes = (
            ('w:val', 'single'),
            ('w:sz', '4'),
            ('w:space', '0'),
            ('w:color', '000000'),
        )
        edges = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')

        table_element = table._tbl
        borders = OxmlElement('w:tblBorders')

        for edge in edges:
            edge_element = OxmlElement(f'w:{edge}')
            for attribute, value in border_attributes:
                edge_element.set(qn(attribute), value)
            borders.append(edge_element)

        table_element.tblPr.append(borders)

    def create_word_document(self):
        """Build the Word report that tabulates the endogeneity test results.

        The document is assembled from ``self.results`` and contains, in
        order: a title and summary paragraph, Table 1 (omitted variable
        bias), Table 2 (reverse causality), Table 3 (coefficient stability),
        a summary/interpretation section and a technical-details section.
        A result group missing from ``self.results`` leaves its table out;
        the surrounding headings and notes are still written.

        Returns:
            The populated ``Document``. Nothing is written to disk here.
        """
        print("\n Creating comprehensive Word document...")

        doc = Document()

        # Title
        title = doc.add_heading('Endogeneity Testing Results', 0)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Summary paragraph
        summary = doc.add_paragraph()
        summary.add_run("Summary: ").bold = True
        summary.add_run("This document presents comprehensive endogeneity testing results for AI factors in corporate disclosure analysis. "
                       "Tests include omitted variable bias detection using management quality proxies, reverse causality analysis "
                       "through lagged performance variables, and coefficient stability assessment across multiple specifications.")

        doc.add_page_break()

        self._doc_add_ovb_section(doc)
        doc.add_page_break()

        self._doc_add_reverse_causality_section(doc)
        doc.add_page_break()

        self._doc_add_stability_section(doc)
        doc.add_page_break()

        self._doc_add_interpretation_section(doc)

        doc.add_page_break()
        self._doc_add_technical_section(doc)

        # Footer
        doc.add_paragraph()
        footer_para = doc.add_paragraph()
        footer_para.add_run("Generated by: ").italic = True
        footer_para.add_run("AI Factor Endogeneity Testing Module").italic = True

        return doc

    def _doc_new_table(self, doc, headers):
        """Add a bordered 'Table Grid' table with a bold, centred header row."""
        table = doc.add_table(rows=1, cols=len(headers))
        table.style = 'Table Grid'
        self._set_table_borders(table)

        hdr_cells = table.rows[0].cells
        for i, header in enumerate(headers):
            hdr_cells[i].text = header
            hdr_cells[i].paragraphs[0].runs[0].bold = True
            hdr_cells[i].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
        return table

    def _doc_ovb_estimates(self, ai_factor, results):
        """Return ``(baseline_coef, baseline_pval, mgmt_coef, mgmt_pval)``.

        Returns ``None`` when either model is missing, the factor's column is
        absent from either model, or the baseline coefficient is exactly zero
        (the relative change would be undefined).
        """
        if 'Baseline' not in results or 'With Management Proxies' not in results:
            return None

        baseline_model = results['Baseline']
        mgmt_model = results['With Management Proxies']

        ai_col = self.average_ai_factor_cols_map.get(ai_factor)
        if not ai_col or ai_col not in baseline_model.params or ai_col not in mgmt_model.params:
            return None

        baseline_coef = baseline_model.params[ai_col]
        if baseline_coef == 0:
            return None

        return (
            baseline_coef,
            baseline_model.pvalues[ai_col],
            mgmt_model.params[ai_col],
            mgmt_model.pvalues[ai_col],
        )

    def _doc_significant_lag_params(self, model):
        """Return the lag-type regressors of *model* significant at 5%.

        A regressor counts as lag-type when its name contains
        ``'return_t_minus'`` or ``'lag'`` (case-insensitive).
        """
        return [
            name for name in model.params.index
            if ('return_t_minus' in name or 'lag' in name.lower())
            and model.pvalues[name] < 0.05
        ]

    def _doc_forward_estimate(self, ai_factor, results):
        """Return ``(coef, pval)`` of the lagged AI factor, or ``None``."""
        if 'Current Return ~ Lagged AI' not in results:
            return None

        model = results['Current Return ~ Lagged AI']
        ai_col_lag = f"{self.average_ai_factor_cols_map.get(ai_factor)}_lag1"
        if ai_col_lag not in model.params:
            return None
        return model.params[ai_col_lag], model.pvalues[ai_col_lag]

    def _doc_format_fstat(self, model):
        """Format the model's F-statistic to two decimals, else ``'N/A'``."""
        try:
            # fvalue may come back as a 1-element array rather than a scalar
            # (presumably with robust covariance - depends on the estimator);
            # squeeze it so it can still be formatted.
            return f"{float(np.squeeze(model.fvalue)):.2f}"
        except Exception:
            # Best effort: a missing or unformattable statistic must not
            # abort report generation.
            return "N/A"

    def _doc_add_ovb_section(self, doc):
        """Write Table 1 (omitted variable bias) and its explanatory note."""
        doc.add_heading('Table 1: Omitted Variable Bias Test', level=1)

        ovb_para = doc.add_paragraph()
        ovb_para.add_run("This table tests whether AI factor coefficients remain stable when management quality proxies are added as controls. "
                        "Large coefficient changes suggest potential omitted variable bias.")

        if 'omitted_variable_bias' in self.results:
            table = self._doc_new_table(
                doc,
                ['AI Factor', 'Baseline Coef.', 'Baseline p-val', 'With Mgmt Proxies Coef.', 'With Mgmt p-val', 'Change (%)']
            )

            for ai_factor, results in self.results['omitted_variable_bias'].items():
                estimates = self._doc_ovb_estimates(ai_factor, results)
                if estimates is None:
                    continue
                baseline_coef, baseline_pval, mgmt_coef, mgmt_pval = estimates

                # Signed change relative to the magnitude of the baseline
                pct_change = ((mgmt_coef - baseline_coef) / abs(baseline_coef)) * 100

                row_cells = table.add_row().cells
                row_cells[0].text = ai_factor
                row_cells[1].text = f"{baseline_coef:.3f}{self._get_significance_stars(baseline_pval)}"
                row_cells[2].text = f"{baseline_pval:.3f}"
                row_cells[3].text = f"{mgmt_coef:.3f}{self._get_significance_stars(mgmt_pval)}"
                row_cells[4].text = f"{mgmt_pval:.3f}"
                row_cells[5].text = f"{pct_change:+.1f}%"

                # Center align numeric columns
                for i in range(1, 6):
                    row_cells[i].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

        doc.add_paragraph()
        doc.add_paragraph("Note: ***, **, * indicate significance at 1%, 5%, and 10% levels respectively. "
                         "Changes <25% suggest robust coefficients.")

    def _doc_add_reverse_causality_section(self, doc):
        """Write Table 2 (reverse causality) and its explanatory note."""
        doc.add_heading('Table 2: Reverse Causality Test', level=1)

        rc_para = doc.add_paragraph()
        rc_para.add_run("This table examines whether AI factors are influenced by past performance (reverse causality) "
                       "and tests forward predictive power using lagged AI metrics.")

        if 'reverse_causality' in self.results:
            table = self._doc_new_table(
                doc,
                ['AI Factor', 'Reverse Causality', 'F-statistic', 'Forward Predictive Power', 'Forward p-value']
            )

            for ai_factor, results in self.results['reverse_causality'].items():
                row_cells = table.add_row().cells
                row_cells[0].text = ai_factor

                # Reverse causality: any significant lagged-performance term?
                if 'AI ~ Lagged Performance' in results:
                    model = results['AI ~ Lagged Performance']
                    if self._doc_significant_lag_params(model):
                        row_cells[1].text = " Detected"
                    else:
                        row_cells[1].text = " Limited"
                    row_cells[2].text = self._doc_format_fstat(model)

                # Forward predictive power of the lagged AI factor
                forward = self._doc_forward_estimate(ai_factor, results)
                if forward is not None:
                    coef, pval = forward
                    stars = self._get_significance_stars(pval)
                    row_cells[3].text = f"{coef:.3f}{stars}"
                    row_cells[4].text = f"{pval:.3f}"

                # Center align
                for i in range(1, 5):
                    row_cells[i].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

        doc.add_paragraph()
        doc.add_paragraph("Note: Reverse causality test examines whether lagged performance predicts current AI metrics. "
                         "Forward predictive power tests whether lagged AI metrics predict current returns.")

    def _doc_add_stability_section(self, doc):
        """Write Table 3 (coefficient stability) and its explanatory note."""
        doc.add_heading('Table 3: Coefficient Stability Across Specifications', level=1)

        cs_para = doc.add_paragraph()
        cs_para.add_run("This table shows how AI factor coefficients change across different regression specifications. "
                       "Stable coefficients suggest robust relationships.")

        if 'coefficient_stability' in self.results:
            table = self._doc_new_table(
                doc,
                ['AI Factor', 'FE Only', '+ Standard Controls', '+ Management Proxies', 'No Fixed Effects', 'Stability Score']
            )
            spec_order = ['FE Only', '+ Standard Controls', '+ Management Proxies', 'No Fixed Effects']

            for ai_factor, results in self.results['coefficient_stability'].items():
                row_cells = table.add_row().cells
                row_cells[0].text = ai_factor

                ai_col = self.average_ai_factor_cols_map.get(ai_factor)

                # Formatted coefficient (with stars) per fitted specification
                coefficients = {}
                for spec_name, model in results.items():
                    if ai_col and ai_col in model.params:
                        coef = model.params[ai_col]
                        stars = self._get_significance_stars(model.pvalues[ai_col])
                        coefficients[spec_name] = f"{coef:.3f}{stars}"

                for i, spec in enumerate(spec_order, 1):
                    row_cells[i].text = coefficients.get(spec, "N/A")
                    row_cells[i].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

                # Stability score: 100 minus the absolute % change between
                # the FE-only and the full-controls coefficient.
                score_text = "N/A"
                if 'FE Only' in results and '+ Management Proxies' in results:
                    fe_coef = results['FE Only'].params.get(ai_col, np.nan)
                    mgmt_coef = results['+ Management Proxies'].params.get(ai_col, np.nan)

                    if not np.isnan(fe_coef) and not np.isnan(mgmt_coef) and fe_coef != 0:
                        pct_change = abs((mgmt_coef - fe_coef) / fe_coef) * 100
                        stability = 100 - pct_change

                        # NOTE(review): the low-stability text differs from the
                        # normal one only by a trailing space - presumably a
                        # warning glyph was lost; confirm the intended marker.
                        if stability >= 75:
                            score_text = f"{stability:.1f}%"
                        else:
                            score_text = f"{stability:.1f}% "

                row_cells[5].text = score_text
                row_cells[5].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

        doc.add_paragraph()
        doc.add_paragraph("Note: Stability Score shows percentage stability between FE Only and Full Controls specifications. "
                         "Scores ≥75% indicate robust coefficients.")

    def _doc_add_interpretation_section(self, doc):
        """Write the narrative summary, assessment and methodology notes."""
        doc.add_heading('Summary and Interpretation', level=1)

        # Overall assessment
        doc.add_heading('Overall Assessment', level=2)

        # A factor is "robust" when adding management proxies moves its
        # coefficient by less than 25% in absolute terms.
        robust_factors = []
        concern_factors = []

        if 'omitted_variable_bias' in self.results:
            for ai_factor, results in self.results['omitted_variable_bias'].items():
                estimates = self._doc_ovb_estimates(ai_factor, results)
                if estimates is None:
                    continue
                baseline_coef, _, mgmt_coef, _ = estimates

                pct_change = abs((mgmt_coef - baseline_coef) / baseline_coef) * 100
                if pct_change < 25:
                    robust_factors.append(ai_factor)
                else:
                    concern_factors.append(ai_factor)

        assessment_para = doc.add_paragraph()
        assessment_para.add_run("Robust Factors: ").bold = True
        if robust_factors:
            assessment_para.add_run(f"{len(robust_factors)} out of {len(self.average_ai_factor_cols_map)} AI factors demonstrate robust coefficients "
                                  f"with minimal sensitivity to omitted variable bias: {', '.join(robust_factors)}.")
        else:
            assessment_para.add_run("No factors show complete robustness.")

        if concern_factors:
            concern_para = doc.add_paragraph()
            concern_para.add_run("Factors Requiring Attention: ").bold = True
            concern_para.add_run(f"The following factors show coefficient instability (>25% change): {', '.join(concern_factors)}. "
                               "Consider using the more conservative estimates from full specifications.")

        # Reverse causality assessment
        rc_assess_para = doc.add_paragraph()
        rc_assess_para.add_run("Reverse Causality: ").bold = True

        reverse_causality_detected = False
        forward_predictive_factors = []

        if 'reverse_causality' in self.results:
            for ai_factor, results in self.results['reverse_causality'].items():
                if 'AI ~ Lagged Performance' in results:
                    if self._doc_significant_lag_params(results['AI ~ Lagged Performance']):
                        reverse_causality_detected = True

                forward = self._doc_forward_estimate(ai_factor, results)
                if forward is not None and forward[1] < 0.05:
                    forward_predictive_factors.append(ai_factor)

        if not reverse_causality_detected:
            rc_assess_para.add_run("Limited evidence of reverse causality across AI factors, supporting causal interpretation. ")
        else:
            rc_assess_para.add_run("Some evidence of reverse causality detected. Exercise caution in causal interpretation. ")

        if forward_predictive_factors:
            rc_assess_para.add_run(f"Strong forward predictive power demonstrated by: {', '.join(forward_predictive_factors)}.")

        # Methodological implications
        doc.add_heading('Methodological Implications', level=2)

        method_para = doc.add_paragraph()
        method_para.add_run("1. Specification Choice: ").bold = True
        method_para.add_run("Use full specification results (with management proxies) as primary estimates to address potential omitted variable bias.")

        method_para2 = doc.add_paragraph()
        method_para2.add_run("2. Causal Interpretation: ").bold = True
        if not reverse_causality_detected:
            method_para2.add_run("Limited reverse causality supports treating AI factors as leading indicators of future performance.")
        else:
            method_para2.add_run("Some reverse causality detected; frame results as predictive associations rather than causal effects.")

        method_para3 = doc.add_paragraph()
        method_para3.add_run("3. Robustness: ").bold = True
        # Require a strict majority: a tie, or no evaluated factor at all,
        # must not be reported as "majority stable".
        if len(robust_factors) > len(concern_factors):
            method_para3.add_run("Majority of factors demonstrate coefficient stability, supporting the reliability of main findings.")
        else:
            method_para3.add_run("Mixed coefficient stability suggests need for careful interpretation and conservative estimates.")

    def _doc_add_technical_section(self, doc):
        """Write the control-variable, fixed-effect and sample descriptions."""
        doc.add_heading('Technical Details and Variable Definitions', level=1)

        # Control variables
        doc.add_heading('Control Variables Used', level=2)

        controls_para = doc.add_paragraph()
        controls_para.add_run("Standard Controls: ").bold = True
        controls_para.add_run(f"{', '.join(self.standard_controls_config)}.")

        mgmt_para = doc.add_paragraph()
        mgmt_para.add_run("Management Quality Proxies: ").bold = True
        mgmt_para.add_run(f"{', '.join(self.management_proxies_config)}.")

        # Fixed effects
        fe_para = doc.add_paragraph()
        fe_para.add_run("Fixed Effects: ").bold = True
        fe_para.add_run(f"Year fixed effects ({len(self.master_year_fe_cols)} dummies) and "
                       f"Sector fixed effects ({len(self.master_sector_fe_cols)} dummies) included in all specifications.")

        # Sample information
        doc.add_heading('Sample Information', level=2)

        sample_para = doc.add_paragraph()
        sample_para.add_run("Dataset: ").bold = True
        sample_para.add_run(f"Russell 3000 companies, 2020-2024 panel. Total observations: {len(self.df):,}.")

        winsor_para = doc.add_paragraph()
        winsor_para.add_run("Data Treatment: ").bold = True
        winsor_para.add_run("All continuous variables winsorized at 1st and 99th percentiles. "
                           "Standard errors clustered by firm (gvkey).")

    def save_word_document(self, filename=None):
        """Build the Word report and save it to *filename*.

        Args:
            filename: Destination path. Defaults to
                ``AI_Factor_Endogeneity_Tests_Comprehensive.docx`` inside
                ``self.output_path``.

        Returns:
            The path the document was saved to.
        """
        if filename is None:
            filename = os.path.join(self.output_path, "AI_Factor_Endogeneity_Tests_Comprehensive.docx")

        doc = self.create_word_document()

        # Ensure the directory exists. A bare filename has an empty dirname,
        # and os.makedirs('') raises FileNotFoundError, so only create a
        # directory when the path actually names one.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        doc.save(filename)
        print(f" Word document saved to: {filename}")
        return filename

    def print_summary_results(self):
        """Print a console digest of the stored endogeneity test results.

        Two sections are printed when the corresponding key exists in
        ``self.results``: the omitted-variable-bias comparison (baseline vs.
        management-proxy coefficient, flagged ROBUST when the change is
        below 25% in absolute terms) and the reverse-causality findings
        (significant lagged regressors and forward predictive power).
        """
        banner = "=" * 80
        section_rule = "-" * 40

        print("\n" + banner)
        print(" ENDOGENEITY TESTING SUMMARY")
        print(banner)

        # Section 1: omitted variable bias
        if 'omitted_variable_bias' in self.results:
            print("\n1. OMITTED VARIABLE BIAS TEST:")
            print(section_rule)

            for factor_label, fitted in self.results['omitted_variable_bias'].items():
                if 'Baseline' not in fitted or 'With Management Proxies' not in fitted:
                    continue

                base_fit = fitted['Baseline']
                full_fit = fitted['With Management Proxies']

                column = self.average_ai_factor_cols_map.get(factor_label)
                if not (column and column in base_fit.params and column in full_fit.params):
                    continue

                base_beta = base_fit.params[column]
                full_beta = full_fit.params[column]
                base_p = base_fit.pvalues[column]
                full_p = full_fit.pvalues[column]

                # A zero baseline makes the relative change undefined
                if base_beta == 0:
                    continue

                change = ((full_beta - base_beta) / abs(base_beta)) * 100
                base_marker = self._get_significance_stars(base_p)
                full_marker = self._get_significance_stars(full_p)

                print(f"{factor_label}:")
                print(f"  Baseline: {base_beta:.3f}{base_marker}")
                print(f"  With mgmt proxies: {full_beta:.3f}{full_marker}")
                print(f"  Change: {change:+.1f}%")

                verdict = (
                    "  Assessment: ROBUST (small change)"
                    if abs(change) < 25
                    else "  Assessment:  POTENTIAL BIAS (large change)"
                )
                print(verdict)

        # Section 2: reverse causality
        if 'reverse_causality' in self.results:
            print("\n2. REVERSE CAUSALITY TEST:")
            print(section_rule)

            for factor_label, fitted in self.results['reverse_causality'].items():
                print(f"{factor_label}:")

                if 'AI ~ Lagged Performance' in fitted:
                    reverse_fit = fitted['AI ~ Lagged Performance']
                    flagged = []

                    for term in reverse_fit.params.index:
                        is_lag_term = 'return_t_minus' in term or 'lag' in term.lower()
                        if not is_lag_term or term == 'const':
                            continue
                        term_p = reverse_fit.pvalues[term]
                        if term_p < 0.05:
                            term_beta = reverse_fit.params[term]
                            marker = self._get_significance_stars(term_p)
                            flagged.append(f"{term}: {term_beta:.3f}{marker}")

                    if flagged:
                        print("   REVERSE CAUSALITY CONCERN:")
                        for entry in flagged:
                            print(f"    {entry}")
                    else:
                        print("   LIMITED REVERSE CAUSALITY")

                if 'Current Return ~ Lagged AI' in fitted:
                    forward_fit = fitted['Current Return ~ Lagged AI']
                    lag_column = f"{self.average_ai_factor_cols_map.get(factor_label)}_lag1"

                    if lag_column in forward_fit.params:
                        lag_p = forward_fit.pvalues[lag_column]
                        lag_beta = forward_fit.params[lag_column]
                        marker = self._get_significance_stars(lag_p)

                        if lag_p < 0.05:
                            print(f" FORWARD PREDICTIVE POWER: {lag_beta:.3f}{marker}")
                        else:
                            print(f"  WEAK FORWARD PREDICTION: {lag_beta:.3f} (p={lag_p:.3f})")

        print("\n" + banner)

    def run_complete_endogeneity_analysis(self):
        """Run the whole pipeline: load data, test, summarise, export.

        Returns:
            ``self.results`` on success. Any exception raised along the way
            is reported with its traceback and ``None`` is returned instead.
        """
        print(" RUNNING COMPREHENSIVE ENDOGENEITY TESTING")

        pipeline = (
            'load_and_prepare_data',
            'run_all_endogeneity_tests',
            'print_summary_results',
        )

        try:
            for stage in pipeline:
                getattr(self, stage)()

            # Build the Word report and write it to the default location
            saved_path = self.save_word_document()

            print(f"\n Complete endogeneity analysis saved to: {saved_path}")

            return self.results
        except Exception as exc:
            # Top-level boundary: report the failure instead of propagating
            import traceback
            print(f" Critical error in endogeneity testing: {exc!s}")
            traceback.print_exc()
            return None


# Script entry point: run the full analysis against the paths configured below
if __name__ == "__main__":
    import os

    # Input dataset (CSV)
    data_path = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/final_clean_dataset_filtered_with_corrected_factor_loadings_with_robust_momentum.csv"

    # Directory that receives the generated Word document
    output_dir = "/Users/daniel/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master Finance/MasterThesis/ThesisData/Regression/RegressionTables/EndogeneityTesting/"

    if os.path.exists(data_path):
        # Run endogeneity testing with Word document export
        tester = EndogeneityTesterWithWordExport(data_path=data_path, output_path=output_dir)
        results = tester.run_complete_endogeneity_analysis()

        if results:
            success_lines = (
                "\n ENDOGENEITY TESTING COMPLETED SUCCESSFULLY!",
                f" Word document saved to: {output_dir}",
                " Tables include:",
                "   - Table 1: Omitted Variable Bias Test",
                "   - Table 2: Reverse Causality Test",
                "   - Table 3: Coefficient Stability Test",
                "   - Summary and interpretation section",
                "   - Technical details and variable definitions",
            )
            for line in success_lines:
                print(line)
        else:
            print("\n Endogeneity testing failed.")
    else:
        print(f"❌ERROR: Data file not found at {data_path}")


# TO RUN WITH YOUR DATA:
# 1. Copy this entire script and point data_path / output_dir at your own locations
# 2. Run it - the Word document is created automatically in output_dir
# 3. The Word document will contain publication-ready tables and analysis

🚀 RUNNING COMPREHENSIVE ENDOGENEITY TESTING
📊 Loading data for endogeneity testing...
✅ Created Strategic_Depth_Average: 3,995 obs
✅ Created AI_Sentiment_Average: 3,995 obs
✅ Created Risk_Own_Adoption_Average: 3,995 obs
✅ Created Risk_External_Threats_Average: 3,995 obs
✅ Created Risk_Non-Adoption_Average: 3,995 obs
✅ Created Forward_Looking_Average: 3,995 obs
✅ Created AI_Washing_Average: 3,995 obs
✅ Created Talent_Investment_Average: 3,995 obs
🔧 Creating management quality proxies...
✅ Management quality proxies created
🔧 Creating additional lagged variables...
✅ Additional lagged variables created

🚀 Running comprehensive endogeneity testing...

🔬 Testing for Omitted Variable Bias...
Available standard controls: 6
Available management proxies: 9
  Strategic Depth: -0.032 → -0.030 (+7.0%)
  AI Sentiment: -0.036 → -0.036 (+1.1%)
  Risk Own Adoption: -0.049 → -0.044 (+10.2%)
  Risk External Threats: -0.012 → -0.012 (-1.4%)
  Risk Non-Adoption: -0.051 → -0.067 (-31.8%)
  Forward Looking