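Short script to inspect the pickled historical price data: it prints the contents of the nested `QCOM` entry and searches the top-level keys for anything that looks like an S&P 500 benchmark.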

In [None]:
import pandas as pd

file_path = 'data/raw/all_tickers_historical_data.pkl'


def find_matching_keys(keys, terms):
    """Return the keys whose lower-cased string form contains any of `terms`.

    Args:
        keys: Iterable of keys of any type; each is converted with ``str()``.
        terms: Iterable of substrings to look for; matching is case-insensitive.

    Returns:
        A list of the matching keys, in their original order and original type.
    """
    lowered_terms = [str(term).lower() for term in terms]
    return [
        key for key in keys
        if any(term in str(key).lower() for term in lowered_terms)
    ]


try:
    # NOTE(security): unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    data = pd.read_pickle(file_path)

    # --- PART 1: Inspecting the Nested 'QCOM' Dictionary ---
    print("--- 1. Deep Dive into 'QCOM' ---")

    if 'QCOM' in data:
        qcom_val = data['QCOM']

        # The entry is expected to be a dict; list its keys and describe each value.
        if isinstance(qcom_val, dict):
            print(f"Keys found inside 'QCOM': {list(qcom_val.keys())}\n")

            for sub_key, sub_val in qcom_val.items():
                print(f"Key: ['{sub_key}']")
                print(f"   Type: {type(sub_val)}")

                # If it's a DataFrame, show shape, columns and the first two rows
                if isinstance(sub_val, pd.DataFrame):
                    print(f"   Shape: {sub_val.shape}")
                    print(f"   Columns: {list(sub_val.columns)}")
                    print(sub_val.head(2).to_string())

                # If it's a simple value or list, print it directly
                else:
                    print(f"   Value: {sub_val}")
                print("-" * 30)
        else:
            # Report the unexpected case instead of silently printing nothing.
            print(f"'QCOM' is not a dict (type: {type(qcom_val)}); nothing to unpack.")
    else:
        print("Key 'QCOM' not found (weird, since we saw it earlier).")

    # --- PART 2: Searching for S&P 500 Keys ---
    print("\n--- 2. Searching for S&P 500 Keys ---")

    # Common variations for the S&P 500
    search_terms = ['sp500', 's&p', 'spx', 'gspc', 'spy', 'index']

    # Case-insensitive search through all top-level keys
    found_keys = find_matching_keys(data.keys(), search_terms)

    if found_keys:
        print(f"Found {len(found_keys)} potential matches:")
        print(found_keys)
    else:
        # Build the message from search_terms so it cannot drift from the actual search.
        print(f"No keys found matching typical S&P 500 terms ({', '.join(search_terms)}).")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    # Best-effort exploration cell: keep going, but show the exception type so
    # genuine bugs are distinguishable from data problems.
    print(f"Error: {type(e).__name__}: {e}")

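Exploratory analysis of the new multiclass labels: summary statistics (full data vs. trimmed mean) and clipped histograms, on linear and log scales, for every `performance_*` column.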

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from IPython.display import display  # For nice table rendering in Jupyter

# ================= CONFIGURATION =================
# CSV loaded by main(); columns whose names start with 'performance_' are used as labels.
FILE_PATH = 'data/processed/ml_dataset_with_multiclass_labels.csv'
# Fraction removed from EACH tail (bottom and top) for the trimmed mean and the histograms.
CLIP_PCT = 0.03  # 3% top/bottom removal for stats & plotting
# =================================================

def plot_grid(df, cols, title_suffix, log_scale=False, clip_pct=CLIP_PCT):
    """Draw a two-column grid of clipped histograms, one panel per entry in `cols`.

    For each column the values outside the [clip_pct, 1 - clip_pct] quantile
    range are dropped before plotting; `df` itself is not modified.

    Args:
        df: DataFrame containing every column named in `cols`.
        cols: Column names to plot, one subplot each.
        title_suffix: Text used in the printed section header.
        log_scale: If True, use a log y-axis, 100 bins and the 'teal' colour;
            otherwise a linear y-axis, 50 bins and 'skyblue'.
        clip_pct: Fraction removed from each tail before plotting.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    cols = list(cols)
    if not cols:
        # plt.subplots rejects a zero-row grid, so bail out before building a figure.
        print(f"\n--- {title_suffix} Distributions: no columns to plot ---")
        return

    n_cols = 2
    # Ceiling division that stays correct if n_cols is ever changed.
    n_rows = -(-len(cols) // n_cols)

    # squeeze=False guarantees a 2-D array of axes regardless of grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    print(f"\n--- {title_suffix} Distributions ---")

    # Loop-invariant styling, hoisted out of the per-column loop.
    bins = 100 if log_scale else 50
    color = 'teal' if log_scale else 'skyblue'
    scale_label = 'Log Scale' if log_scale else 'Linear'
    clip_label = f'Clipped {clip_pct*100:.0f}%-{100 - clip_pct*100:.0f}%'

    for ax, col in zip(axes, cols):
        series = df[col]

        # Clipping bounds (inclusive) used only for this plot.
        lower = series.quantile(clip_pct)
        upper = series.quantile(1 - clip_pct)
        filtered_data = series[(series >= lower) & (series <= upper)]

        sns.histplot(filtered_data, bins=bins, kde=True, ax=ax, color=color)

        if log_scale:
            ax.set_yscale('log')
            ax.set_ylabel('Frequency (Log Scale)')
        else:
            ax.set_ylabel('Frequency')
        ax.set_title(f'{col} ({scale_label})\n{clip_label}')

        ax.set_xlabel('CAGR Outperformance vs SPY')
        # Vertical reference line at zero.
        ax.axvline(0, color='red', linestyle='--', alpha=0.7, label='Market Neutral (0.0)')
        ax.legend()

    # Hide any unused trailing panels (odd number of columns).
    for unused_ax in axes[len(cols):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

def main():
    """Load the labelled dataset, display summary statistics, and plot the labels.

    Reads FILE_PATH, selects every column whose name starts with 'performance_',
    displays a describe() table extended with a trimmed mean (values outside the
    [CLIP_PCT, 1 - CLIP_PCT] quantile range removed), then calls plot_grid twice:
    once with a linear y-axis and once with a log y-axis.

    Returns:
        None. Prints an error and returns early if the file is missing or no
        'performance_' columns exist.
    """
    if not os.path.exists(FILE_PATH):
        print(f"Error: File not found at {FILE_PATH}")
        return

    print(f"Loading {FILE_PATH}...")
    df = pd.read_csv(FILE_PATH)

    label_cols = [col for col in df.columns if col.startswith('performance_')]
    if not label_cols:
        print("Error: No 'performance_' columns found.")
        return

    # --- PART 1: Enhanced Statistics ---
    print("\n### Summary Statistics (Full Data vs Trimmed)")

    # 1. Standard stats, one row per label column after the transpose.
    stats = df[label_cols].describe(percentiles=[CLIP_PCT, 0.5, 1 - CLIP_PCT]).T

    # Take the percentile labels from pandas' own output. Re-formatting CLIP_PCT
    # with ':.0f' only matches for whole percents (0.025 -> '2.5%' in pandas but
    # '2%' here), which would raise a KeyError when selecting the columns below.
    pct_cols = [name for name in stats.columns if str(name).endswith('%')]

    # 2. Trimmed mean: mean of the values inside the inclusive quantile bounds.
    trimmed_means = []
    for col in label_cols:
        series = df[col]
        q_low = series.quantile(CLIP_PCT)
        q_high = series.quantile(1 - CLIP_PCT)
        trimmed_means.append(series[(series >= q_low) & (series <= q_high)].mean())

    # Add to the stats table
    trim_col_name = f'Trimmed Mean ({CLIP_PCT*100:.0f}-{100-CLIP_PCT*100:.0f}%)'
    stats[trim_col_name] = trimmed_means

    # Reorder columns: central tendency first, then spread, percentiles, extremes.
    display_cols = ['count', 'mean', trim_col_name, 'std', *pct_cols, 'min', 'max']

    # Display nicely in Notebook
    display(stats[display_cols])

    # --- PART 2: Plots ---
    # Linear
    plot_grid(df, label_cols, "Linear", log_scale=False, clip_pct=CLIP_PCT)

    # Logarithmic
    plot_grid(df, label_cols, "Log Scale", log_scale=True, clip_pct=CLIP_PCT)

# Run the analysis when this cell executes: shows the stats table, then both plot grids.
main()

Loading data/processed/ml_dataset_with_multiclass_labels.csv...

--- Summary Statistics (Full Data) ---
                  count          mean           std        3%       50%         97%          min           max
performance_1W  32416.0  1.025525e+19  1.846373e+21 -4.150252  0.000000  382.271776 -4234.366422  3.324291e+23
performance_2W  32416.0  2.157428e+09  3.861010e+11 -2.370280 -0.005316   40.955893   -88.542772  6.951473e+13
performance_1M  32416.0  1.238301e+73  2.229493e+75 -1.497150 -0.011283   10.044790   -13.313260  4.014078e+77
performance_2M  32416.0  3.540711e+01  5.616448e+03 -1.100880 -0.017624    4.149672    -4.843071  1.009511e+06
performance_3M  32416.0  1.943745e+00  1.624496e+02 -0.940112 -0.016628    2.547790    -2.999390  2.311315e+04
performance_6M  32416.0  8.151636e-02  2.275161e+00 -0.742332 -0.015363    1.352727    -1.464731  3.687599e+02
performance_8M  32416.0  1.621908e+05  2.920154e+07 -0.703810 -0.016576    1.095628    -1.427931  5.257575e+09
performa