# CEI Score Impact on Stock Returns Analysis

This notebook analyzes how Corporate Equality Index (CEI) scores affect stock returns around release dates.

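## Analysis Steps:
1. Load CEI scores and stock price data
2. Aggregate companies by CEI score bins (every 10 points)
3. Analyze returns by year and score bin around release dates
4. Create overall analysis across all years
5. Test whether event-window returns differ between the highest- and lowest-scoring firms
6. Summarize the key findings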

In [None]:
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/matplotlib —
# confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

# Global plotting defaults shared by every figure in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

## 1. Load and Prepare Data

In [None]:
# Load CEI data.
# CUSIPs are identifiers, not numbers: read them as strings so pandas does not
# infer an int/float dtype, which would strip leading zeros (or render values
# as "123456.0") and corrupt the CUSIP6 prefix built from this column.
print("Loading CEI data...")
cei_df = pd.read_csv('data/processed/cei_with_dates.csv', dtype={'cusip': str})
print(f"CEI records: {len(cei_df):,}")
print(f"Years covered: {sorted(cei_df['year'].unique())}")

# Load stock price data.
# 'cusip6' is the merge key against the CEI table, so it must stay a string
# as well; an inferred integer column cannot be joined to string keys.
print("\nLoading stock price data...")
stock_df = pd.read_csv(
    'data/processed/stock_prices_event_window.csv',
    dtype={'cusip6': str},
)
stock_df['date'] = pd.to_datetime(stock_df['date'])
stock_df['cei_release_date'] = pd.to_datetime(stock_df['cei_release_date'])
print(f"Stock price records: {len(stock_df):,}")
print(f"Unique firms: {stock_df['cusip6'].nunique()}")
print(f"Date range: {stock_df['date'].min()} to {stock_df['date'].max()}")

# Display data info
print("\nCEI Data Sample:")
display(cei_df.head())

print("\nStock Data Sample:")
display(stock_df.head())

In [None]:
# Create CUSIP6 from CEI data for matching
def get_cusip6(cusip):
    """Return the first six characters of a CUSIP.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before slicing. ``None`` is returned when the input is missing (as judged
    by ``pd.isna``) or when the stripped text is shorter than six characters.
    """
    if pd.isna(cusip):
        return None
    text = str(cusip).strip()
    return text[:6] if len(text) >= 6 else None

# Attach the 6-character CUSIP prefix, then keep only rows that carry both a
# usable prefix and a CEI score (both are required by the later merge/binning).
cei_df = (
    cei_df
    .assign(cusip6=cei_df['cusip'].apply(get_cusip6))
    .dropna(subset=['cusip6', 'cei_score'])
)

print(f"CEI records with valid CUSIP6 and scores: {len(cei_df):,}")

In [None]:
# Create CEI score bins (every 10 points)
def create_score_bin(score):
    """Map a CEI score to its 10-point bin label.

    Labels are "0-9", "10-19", ..., "80-89", "90-100". The top bin is one
    point wider than the others so that a perfect score of 100 falls into
    "90-100" rather than a bin of its own.

    Parameters
    ----------
    score : number
        CEI score; may be missing.

    Returns
    -------
    str or None
        Bin label of the form "<start>-<end>", or ``None`` when ``score`` is
        missing according to ``pd.isna``.
    """
    if pd.isna(score):
        return None
    # Clamp the lower bound at 90: without this, 100 // 10 * 10 == 100 produces
    # the label "100-100" and scores 90-99 produce "90-99", so the "90-100"
    # label that downstream filters look for never exists.
    bin_start = min(int(score // 10) * 10, 90)
    bin_end = 100 if bin_start == 90 else bin_start + 9
    return f"{bin_start}-{bin_end}"

cei_df['score_bin'] = cei_df['cei_score'].apply(create_score_bin)

# Show score distribution, ordered by each bin's numeric lower bound.
# A plain sort_index() compares the labels as strings, which misplaces any
# label whose lower bound has a different number of digits (e.g. a label
# starting with "100" sorts before one starting with "20").
# The search for '-' starts at position 1 so a leading minus sign is kept.
print("CEI Score Distribution by Bin:")
score_dist = cei_df['score_bin'].value_counts().sort_index(
    key=lambda labels: labels.map(lambda label: int(label[:label.index('-', 1)]))
)
print(score_dist)

plt.figure(figsize=(10, 6))
score_dist.plot(kind='bar')
plt.title('Distribution of CEI Scores by Bin')
plt.xlabel('CEI Score Bin')
plt.ylabel('Number of Company-Year Observations')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 2. Merge CEI and Stock Data

In [None]:
# Join daily stock rows to the CEI score of the matching firm and year.
# The year on the stock side is taken from the release date.
# NOTE(review): this assumes cei_df['year'] equals the calendar year of the
# release date — confirm against how the CEI file defines 'year'.
stock_df['cei_year'] = stock_df['cei_release_date'].dt.year

# Only these CEI columns are carried into the merged frame.
cei_columns = ['cusip6', 'year', 'cei_score', 'score_bin', 'employer']
merged_df = pd.merge(
    stock_df,
    cei_df[cei_columns],
    how='inner',
    left_on=['cusip6', 'cei_year'],
    right_on=['cusip6', 'year'],
)

print(f"Merged records: {len(merged_df):,}")
print(f"Unique firms in merged data: {merged_df['cusip6'].nunique()}")
print(f"Years with data: {sorted(merged_df['year'].unique())}")

# Rows without a return cannot contribute to any average below; drop them.
merged_df = merged_df[merged_df['RET'].notna()]
print(f"Records with valid returns: {len(merged_df):,}")

display(merged_df.head())

## 3. Analyze Returns by Year and Score Bin

In [None]:
# Mean, standard deviation and observation count of RET for every
# (year, score bin, event day) cell.
yearly_analysis = (
    merged_df
    .groupby(['year', 'score_bin', 'days_from_release'])['RET']
    .agg(avg_return='mean', std_return='std', count='count')
    .reset_index()
)

print("Sample of yearly analysis:")
display(yearly_analysis.head(10))

In [None]:
# Plot returns by score bin for each year around release dates.
# The grid has 2 x 3 = 6 panels, so at most the first 6 years are shown.
years_to_plot = sorted(merged_df['year'].unique())[:6]  # Plot first 6 years

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, year in zip(axes, years_to_plot):
    year_data = yearly_analysis[yearly_analysis['year'] == year]

    # Pivot data for plotting: one row per event day, one column per score bin
    pivot_data = year_data.pivot(index='days_from_release', columns='score_bin', values='avg_return')

    # Draw bins in numeric order of their lower bound so the legend reads
    # low -> high. The search for '-' starts at position 1 so a leading minus
    # sign stays part of the number.
    ordered_bins = sorted(
        pivot_data.columns,
        key=lambda label: int(label[:label.index('-', 1)]),
    )

    # Plot each score bin (returns scaled to percent)
    for score_bin in ordered_bins:
        if not pivot_data[score_bin].isna().all():
            ax.plot(pivot_data.index, pivot_data[score_bin] * 100,
                    marker='o', label=f'CEI {score_bin}', linewidth=2)

    ax.axvline(x=0, color='red', linestyle='--', alpha=0.7, label='Release Date')
    ax.set_title(f'Returns Around CEI Release - {year}')
    ax.set_xlabel('Days from Release')
    ax.set_ylabel('Average Return (%)')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

# With fewer than 6 years of data the remaining panels would otherwise be
# rendered as empty framed axes; hide them.
for unused_ax in axes[len(years_to_plot):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Aggregate Analysis Across All Years

In [None]:
# Pool every year together: mean, standard deviation and count of RET per
# (score bin, event day), plus the standard error of the mean.
overall_analysis = (
    merged_df
    .groupby(['score_bin', 'days_from_release'])['RET']
    .agg(avg_return='mean', std_return='std', count='count')
    .reset_index()
    # std / sqrt(n); uses ['count'] because .count is a DataFrame method
    .assign(std_error=lambda frame: frame['std_return'] / np.sqrt(frame['count']))
)

print("Overall analysis sample:")
display(overall_analysis.head(10))

In [None]:
# Create main plot: Returns vs Days from Release by Score Bin
plt.figure(figsize=(14, 10))

# Order bins by their numeric lower bound. A plain sorted() compares the
# labels as strings, which scrambles the low -> high order whenever lower
# bounds differ in digit count and so breaks the red-to-green colour gradient.
# The search for '-' starts at position 1 so a leading minus sign is kept.
score_bins = sorted(
    overall_analysis['score_bin'].unique(),
    key=lambda label: int(label[:label.index('-', 1)]),
)
colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(score_bins)))

for i, score_bin in enumerate(score_bins):
    bin_data = overall_analysis[overall_analysis['score_bin'] == score_bin]

    if not bin_data.empty:
        # Mean daily return, scaled to percent
        plt.plot(bin_data['days_from_release'], bin_data['avg_return'] * 100,
                 marker='o', label=f'CEI {score_bin}', linewidth=2.5,
                 color=colors[i], markersize=6)

        # Shaded band: mean +/- 1.96 standard errors
        plt.fill_between(bin_data['days_from_release'],
                         (bin_data['avg_return'] - 1.96 * bin_data['std_error']) * 100,
                         (bin_data['avg_return'] + 1.96 * bin_data['std_error']) * 100,
                         alpha=0.2, color=colors[i])

# Add vertical line at release date and a zero-return baseline
plt.axvline(x=0, color='red', linestyle='--', linewidth=2, alpha=0.8, label='CEI Release Date')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.5)

plt.title('Stock Returns Around CEI Release Dates by Score (All Years Combined)', fontsize=16, fontweight='bold')
plt.xlabel('Days from CEI Release Date', fontsize=14)
plt.ylabel('Average Daily Return (%)', fontsize=14)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Cumulative return paths: compound each bin's average daily return across
# the event days, in chronological order.
plt.figure(figsize=(14, 10))

for bin_color, score_bin in zip(colors, score_bins):
    is_bin = overall_analysis['score_bin'] == score_bin
    bin_data = overall_analysis[is_bin].sort_values('days_from_release')

    if bin_data.empty:
        continue

    # (1 + r_1)(1 + r_2)...(1 + r_t) - 1
    growth = bin_data['avg_return'].add(1).cumprod()
    cumulative_returns = growth - 1

    plt.plot(
        bin_data['days_from_release'],
        cumulative_returns * 100,
        marker='o',
        label=f'CEI {score_bin}',
        linewidth=2.5,
        color=bin_color,
        markersize=6,
    )

# Reference lines: release day and zero cumulative return
plt.axvline(x=0, color='red', linestyle='--', linewidth=2, alpha=0.8, label='CEI Release Date')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.5)

plt.title('Cumulative Returns Around CEI Release Dates by Score', fontsize=16, fontweight='bold')
plt.xlabel('Days from CEI Release Date', fontsize=14)
plt.ylabel('Cumulative Return (%)', fontsize=14)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Statistical Analysis

In [None]:
# Restrict to the event window: 3 days before through 3 days after release
# (between() is inclusive on both ends).
in_window = merged_df['days_from_release'].between(-3, 3)
event_window = merged_df[in_window]

# One row per firm-year: the sum of its daily returns inside the window.
firm_event_returns = (
    event_window
    .groupby(['cusip6', 'year', 'score_bin'])['RET']
    .sum()
    .reset_index(name='event_return')
)

print("Event Window Returns Summary:")
# Note: 'N_firms' counts firm-year rows, since the grouping above is per
# (cusip6, year, score_bin).
summary_stats = (
    firm_event_returns
    .groupby('score_bin')['event_return']
    .agg(
        N_firms='count',
        Mean_Return='mean',
        Std_Return='std',
        Min_Return='min',
        Max_Return='max',
    )
)
summary_stats['Mean_Return_pct'] = summary_stats['Mean_Return'] * 100
display(summary_stats)

In [None]:
# Box plot of event returns by score bin
plt.figure(figsize=(12, 8))

# Order bins by numeric lower bound (string sorting misorders labels whose
# lower bounds differ in digit count). The search for '-' starts at position 1
# so a leading minus sign stays part of the number.
ordered_bins = sorted(
    firm_event_returns['score_bin'].unique(),
    key=lambda label: int(label[:label.index('-', 1)]),
)

# One array of percent returns per bin; plain NumPy arrays avoid any
# dependence on the pandas index when matplotlib reshapes the input.
box_data = [
    (firm_event_returns.loc[firm_event_returns['score_bin'] == score_bin, 'event_return'] * 100).to_numpy()
    for score_bin in ordered_bins
]
labels = [f'CEI {score_bin}' for score_bin in ordered_bins]

# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 in favour of
# `tick_labels=`. Setting the tick labels explicitly works on both old and new
# releases; boxes are drawn at positions 1..N by default.
plt.boxplot(box_data)
plt.xticks(range(1, len(labels) + 1), labels, rotation=45)
plt.title('Distribution of 7-Day Event Window Returns by CEI Score', fontsize=16, fontweight='bold')
plt.xlabel('CEI Score Bin', fontsize=14)
plt.ylabel('Event Window Return (%)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Test for significant differences between high and low CEI scores.
# `stats` is scipy.stats, imported with the other dependencies at the top.

# Compare highest scoring companies (90-100) vs lowest (0-19)
high_scores = firm_event_returns[firm_event_returns['score_bin'].isin(['90-100'])]['event_return']
low_scores = firm_event_returns[firm_event_returns['score_bin'].isin(['0-9', '10-19'])]['event_return']

# Defined up front so that later cells which print p_value still run when the
# test below is skipped.
t_stat, p_value = np.nan, np.nan

# A t-test needs a variance estimate for each group, i.e. at least two
# observations per group; with fewer the statistic is NaN.
if len(high_scores) >= 2 and len(low_scores) >= 2:
    # Welch's t-test (equal_var=False): does not assume the two groups share a
    # variance, which matters when group sizes are far apart.
    t_stat, p_value = stats.ttest_ind(high_scores, low_scores, equal_var=False)

    print(f"Statistical Test: High CEI (90-100) vs Low CEI (0-19) Scores")
    print(f"High CEI mean return: {high_scores.mean()*100:.3f}%")
    print(f"Low CEI mean return: {low_scores.mean()*100:.3f}%")
    print(f"Difference: {(high_scores.mean() - low_scores.mean())*100:.3f} percentage points")
    print(f"T-statistic: {t_stat:.3f}")
    print(f"P-value: {p_value:.3f}")
    print(f"Significant at 5% level: {'Yes' if p_value < 0.05 else 'No'}")
else:
    print("Insufficient data for statistical test")

## 6. Key Findings Summary

In [None]:
# Plain-text recap of coverage, per-bin event returns and the high-vs-low test.
print("=== CEI STOCK RETURN ANALYSIS SUMMARY ===")

print("\nData Coverage:")
n_observations = len(merged_df)
n_firms_total = merged_df['cusip6'].nunique()
n_years = merged_df['year'].unique().size
n_release_dates = merged_df['cei_release_date'].unique().size
print(f"  • Total observations: {n_observations:,}")
print(f"  • Unique firms: {n_firms_total}")
print(f"  • Years analyzed: {n_years}")
print(f"  • CEI release dates: {n_release_dates}")

print("\nScore Distribution:")
for score_bin in sorted(summary_stats.index):
    n_firms = summary_stats.at[score_bin, 'N_firms']
    mean_ret = summary_stats.at[score_bin, 'Mean_Return_pct']
    print(f"  • CEI {score_bin}: {n_firms} firms, {mean_ret:.3f}% avg return")

print("\nEvent Window Analysis (-3 to +3 days):")
# Only report the comparison when both groups have observations.
if len(high_scores) > 0 and len(low_scores) > 0:
    high_mean = high_scores.mean()
    low_mean = low_scores.mean()
    verdict = 'Yes' if p_value < 0.05 else 'No'
    print(f"  • High CEI firms (90-100): {high_mean*100:.3f}% average return")
    print(f"  • Low CEI firms (0-19): {low_mean*100:.3f}% average return")
    print(f"  • Difference: {(high_mean - low_mean)*100:.3f} percentage points")
    print(f"  • Statistical significance: {verdict} (p = {p_value:.3f})")

print("\n" + "=" * 50)