In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

# Placeholder capitalization used when a ticker's market cap is missing or unusable.
_DEFAULT_MARKET_CAP = 1e9


def _coerce_market_cap(value, default_cap):
    """Return ``value`` as a positive, finite float, or ``default_cap`` if it is unusable."""
    try:
        cap = float(value)
    except (TypeError, ValueError):
        # None (key present but null) or a non-numeric payload.
        return default_cap
    if not np.isfinite(cap) or cap <= 0:
        return default_cap
    return cap


def _fetch_market_caps(tickers, default_cap):
    """Look up the ``marketCap`` field of each ticker through yfinance (best effort)."""
    print("Fetching market caps...")
    tickers_info = yf.Tickers(" ".join(tickers))
    raw_caps = {}
    for t in tickers:
        try:
            raw_caps[t] = tickers_info.tickers[t].info.get("marketCap")
        except Exception as exc:
            # Best effort: a failed lookup must not abort the run, but it should
            # be visible, and Ctrl-C (KeyboardInterrupt) must still propagate.
            print(f"Market cap lookup failed for {t} ({exc!r}); using default {default_cap:g}")
            raw_caps[t] = None
    return raw_caps


def greedy_index_minimize_stocks(
    correlation_threshold=0.88,
    returns_path="data/returns.csv",
    benchmark_path="data/sp100returns.csv",
    market_caps=None,
    max_candidates=50,
):
    """Greedily build a small cap-weighted basket that tracks the benchmark.

    Stocks are ranked by their in-sample correlation with the benchmark return
    series. They are then added one at a time, in that order, and re-weighted by
    market cap until the basket's in-sample correlation with the benchmark
    reaches ``correlation_threshold``. The first 70% of the common dates are
    used in-sample; the remaining 30% are used for the out-of-sample check.

    Parameters
    ----------
    correlation_threshold : float
        Minimum in-sample correlation the basket must reach.
    returns_path : str
        CSV with a ``Date`` column and one return column per ticker.
    benchmark_path : str
        CSV with a ``Date`` column and a ``Portfolio_Return`` column.
    market_caps : mapping or None
        Optional ``{ticker: market_cap}``. When ``None`` the caps are fetched
        with yfinance. Missing, null, non-numeric or non-positive entries fall
        back to a placeholder cap of 1e9.
        NOTE(review): yfinance ``info`` presumably reports the *current* market
        cap, so the weights are not point-in-time -- confirm.
    max_candidates : int or None
        Only the ``max_candidates`` most correlated tickers are considered
        (``None`` means all). The greedy search is limited to the same set that
        market caps are looked up for.

    Returns
    -------
    tuple
        ``(selected, weights, perf)``: the list of chosen tickers, a NumPy
        array of their normalized cap weights, and a DataFrame with the
        out-of-sample ``SP100`` and ``Index`` return columns. If no stock has a
        usable correlation, or the threshold is never reached, the result is
        ``([], [], pd.DataFrame())``.
    """
    # Load data
    returns_df = pd.read_csv(returns_path, parse_dates=["Date"]).set_index("Date")
    sp100_df = pd.read_csv(benchmark_path, parse_dates=["Date"]).set_index("Date")

    # Align on common dates
    common_dates = returns_df.index.intersection(sp100_df.index)
    returns_df = returns_df.loc[common_dates]
    sp100 = sp100_df.loc[common_dates, "Portfolio_Return"]

    # 70/30 split
    split_point = int(len(returns_df) * 0.7)
    is_returns = returns_df.iloc[:split_point]
    sp100_is = sp100.iloc[:split_point].dropna()

    # Keep only dates where the benchmark is known, then drop stocks with NaNs in IS
    is_returns = is_returns.loc[sp100_is.index].dropna(axis=1, how="any")

    print("IS SP100 shape:", sp100_is.shape)
    print("IS returns shape:", is_returns.shape)

    # Rank stocks by in-sample correlation with the benchmark
    stock_corrs = is_returns.corrwith(sp100_is).dropna().sort_values(ascending=False)

    if stock_corrs.empty:
        print("No stocks have non-null correlation with SP100.")
        return [], [], pd.DataFrame()

    print("Top correlations:")
    print(stock_corrs.head(10))

    # Market caps are resolved for exactly the tickers the greedy loop may visit,
    # so the loop can never ask for a cap that was not looked up.
    candidates = list(stock_corrs.index[:max_candidates])
    if market_caps is None:
        raw_caps = _fetch_market_caps(candidates, _DEFAULT_MARKET_CAP)
    else:
        raw_caps = market_caps
    tickers_caps = {
        t: _coerce_market_cap(raw_caps.get(t), _DEFAULT_MARKET_CAP) for t in candidates
    }

    # Greedy ticker selection
    selected = []
    weights = None
    for ticker in candidates:
        selected.append(ticker)
        caps = np.array([tickers_caps[t] for t in selected], dtype=np.float64)
        caps /= caps.sum()

        combined = (is_returns[selected] * caps).sum(axis=1)
        corr = combined.corr(sp100_is)

        if corr >= correlation_threshold:
            weights = caps
            break

    if weights is None or len(weights) == 0:
        print(f"No combination met correlation >= {correlation_threshold}")
        return [], [], pd.DataFrame()

    print(f"Selected {len(selected)} tickers with correlation >= {correlation_threshold}:")
    print(stock_corrs.loc[selected])

    # Final in-sample correlation
    combined_is = (is_returns[selected] * weights).sum(axis=1)
    in_sample_corr = combined_is.corr(sp100_is)
    print(f"Final in-sample correlation: {in_sample_corr:.4f}")

    # OOS evaluation: only dates where every selected stock has a return
    oos_returns = returns_df.iloc[split_point:]
    sp100_oos = sp100.iloc[split_point:]
    oos_subset = oos_returns[selected].dropna()
    sp100_oos = sp100_oos.loc[oos_subset.index]
    oos_combined = (oos_subset * weights).sum(axis=1)

    perf = pd.DataFrame({
        "SP100": sp100_oos,
        "Index": oos_combined
    })

    print(f"Out-of-sample correlation: {perf['SP100'].corr(perf['Index']):.4f}")
    print("Selected tickers for index:", selected)
    print("Final weights:", weights)

    return selected, weights, perf


In [2]:
# Run the greedy selection with a 0.89 in-sample correlation threshold. The call
# is the cell's last expression, so the returned (tickers, weights, out-of-sample
# performance frame) tuple is echoed in the cell output after the printed log.
greedy_index_minimize_stocks(0.89)

IS SP100 shape: (44,)
IS returns shape: (44, 100)
Top correlations:
QCOM     0.857736
AAPL     0.837900
ADBE     0.807286
CRM      0.807009
ACN      0.801877
MSFT     0.786524
NOW      0.763279
NFLX     0.752581
INTU     0.747903
GOOGL    0.715237
dtype: float64
Fetching market caps...
Selected 5 tickers with correlation >= 0.89:
QCOM    0.857736
AAPL    0.837900
ADBE    0.807286
CRM     0.807009
ACN     0.801877
dtype: float64
Final in-sample correlation: 0.8975
Out-of-sample correlation: 0.7963
Selected tickers for index: ['QCOM', 'AAPL', 'ADBE', 'CRM', 'ACN']
Final weights: [0.04256571 0.79178159 0.04361947 0.07056339 0.05146984]


(['QCOM', 'AAPL', 'ADBE', 'CRM', 'ACN'],
 array([0.04256571, 0.79178159, 0.04361947, 0.07056339, 0.05146984]),
                SP100     Index
 Date                          
 2023-08-27  0.014589  0.024520
 2023-09-03  0.030212  0.059052
 2023-09-10 -0.009965 -0.050101
 2023-09-17 -0.000585 -0.018962
 2023-09-24 -0.033827 -0.006472
 2023-10-01 -0.004347 -0.017846
 2023-10-08  0.015488  0.032854
 2023-10-15  0.003644  0.004599
 2023-10-22 -0.024978 -0.028182
 2023-10-29 -0.025904 -0.028527
 2023-11-05  0.063077  0.057742
 2023-11-12  0.024313  0.052183
 2023-11-19  0.021148  0.019993
 2023-11-26  0.007687  0.003800
 2023-12-03 -0.000574  0.017539
 2023-12-10  0.006802  0.016795
 2023-12-17  0.020451  0.013058
 2023-12-24  0.012773 -0.011934
 2023-12-31  0.002193 -0.005556)