In [10]:
import yfinance as yf
import pandas as pd
import time

returns = pd.read_csv('data/returns.csv')
tickers = list(returns.columns)[1:]  

data = []

for ticker in tickers:
    try:
        info = yf.Ticker(ticker).info
        row = {
            'Ticker': ticker,
            'MarketCap': info.get('marketCap', 0),
            'Sector': info.get('sector', 'Unknown'),
            'Country': info.get('country', 'Unknown')
        }
        data.append(row)
        time.sleep(1.5)  
    except Exception as e:
        print(f"Error retrieving {ticker}: {e}")
        data.append({'Ticker': ticker, 'MarketCap': 0, 'Sector': 'Unknown', 'Country': 'Unknown'})


In [34]:
df = pd.DataFrame(data)

# Size classification
df['SmallCap'] = (df['MarketCap'] < 2e15).astype(int)
df['MidCap'] = ((df['MarketCap'] >= 2e15) & (df['MarketCap'] < 1e20)).astype(int)
df['LargeCap'] = (df['MarketCap'] >= 1e20).astype(int)


# Sector classification
df['Tech'] = df['Sector'].str.contains('Technology', case=False, na=False).astype(int)
df['Finance'] = df['Sector'].str.contains('Financial|Bank', case=False, na=False).astype(int)
df['Healthcare'] = df['Sector'].str.contains('Health', case=False, na=False).astype(int)
df['Consumer'] = df['Sector'].str.contains('Consumer', case=False, na=False).astype(int)
df['Energy'] = df['Sector'].str.contains('Energy|Oil|Gas', case=False, na=False).astype(int)
df['Industrial'] = df['Sector'].str.contains('Industrials', case=False, na=False).astype(int)
df['Utilities'] = df['Sector'].str.contains('Utilities', case=False, na=False).astype(int)

# Country classification
df['International'] = (df['Country'] != 'United States').astype(int)
df['Domestic'] = (df['Country'] == 'United States').astype(int)



In [35]:
binary_df = df[['Ticker', 'SmallCap', 'MidCap', 'LargeCap',
                'Tech', 'Finance', 'Healthcare', 'Consumer',
                'Energy', 'Industrial', 'Utilities',
                'International', 'Domestic']]

binary_df.to_csv("data/ticker_attributes.csv", index=False)


In [36]:
import pandas as pd

#generate data file

binary_df = pd.read_csv('data/ticker_attributes.csv')

binary_df = binary_df.set_index("Ticker")
tickers = binary_df.index.tolist()
features = binary_df.columns.tolist()

# Example target vector 
targets = {
    "Tech": 0.3,
    "Finance": 0.2,
    "Healthcare": .05,
    "Consumer": .05,
    "Utilities": .1,
    "Energy": .1,
    "Industrial": .1,
    "SmallCap": 0.25,
    "MidCap": 0.25,
    "LargeCap": 0.5,
    "Domestic": 0.7,
    "International": 0.3
}


#example weights (replace with max_corr weights)
x_orig = {ticker: 1 / len(tickers) for ticker in tickers}

# Write AMPL-compatible .dat file
with open("attributes.dat", "w") as f_out:
    # STOCKS
    f_out.write("set STOCKS := " + " ".join(tickers) + " ;\n\n")

    # FEATURES
    f_out.write("set FEATURES := " + " ".join(features) + " ;\n\n")

    # x_orig
    f_out.write("param x_orig :=\n")
    for t in tickers:
        f_out.write(f"  {t} {x_orig[t]:.6f}\n")
    f_out.write(";\n\n")

    # Feature matrix: param a
    f_out.write("param a : " + " ".join(features) + " :=\n")
    for t in tickers:
        row = " ".join(str(int(binary_df.loc[t, feat])) for feat in features)
        f_out.write(f"{t} {row}\n")
    f_out.write(";\n\n")

    # Feature targets: param f
    f_out.write("param f :=\n")
    for feat in features:
        f_out.write(f"  {feat} {targets[feat]:.6f}\n")
    f_out.write(";\n")


In [72]:
import pandas as pd
import numpy as np
from amplpy import AMPL

def run_attribute_rebalanced_index(ms):
    # Load data
    returns = pd.read_csv("data/returns.csv", index_col=0)
    sp100_returns = pd.read_csv("data/sp100returns.csv")
    combined = sp100_returns[['Portfolio_Return']].rename(columns={'Portfolio_Return': 'SP100'})
    
    # Load feature matrix
    binary_df = pd.read_csv("data/ticker_attributes.csv").set_index("Ticker")
    all_features = binary_df.columns.tolist()

    # Load custom weights from file
    weights_df = pd.read_csv("data/sp100weights.csv").set_index("Ticker")

    for m in ms:
        # Split into IS and OOS
        split_point = int(0.7 * len(returns))
        out_sample = returns.iloc[split_point:].copy()
        n_rows = len(out_sample)
        period_length = n_rows // m
        periods = np.repeat(np.arange(1, m + 1), period_length)
        if n_rows % m != 0:
            periods = np.append(periods, [m] * (n_rows - len(periods)))
        out_sample['period'] = periods

        # Convert OOS to long format
        out_sample_tall = out_sample.reset_index().melt(
            id_vars=["Date", "period"],
            var_name="Ticker",
            value_name="Return"
        )

        # Create IS windows
        is_windows = []
        for i in range(1, m + 1):
            start_date = out_sample[out_sample['period'] == i].index[0]
            is_window = returns.loc[:start_date].iloc[-split_point:]
            is_windows.append(is_window)

        # Run AMPL model for each rebalancing period
        for i in range(m):
            window_returns = is_windows[i]
            tickers = list(window_returns.columns)

            # Use only relevant features
            available = [feat for feat in all_features if binary_df.loc[tickers, feat].sum() > 0]
            if not available:
                print(f"(m={m}) period {i+1}: ❌ No usable features found — skipping.")
                continue

            # Get weights for current tickers from file
            valid_weights = weights_df.loc[tickers]['Weight']
            valid_weights = valid_weights / valid_weights.sum()  # Normalize

            # Compute weighted attribute targets
            sub_df = binary_df.loc[tickers, available]
            target_dict = (sub_df.T @ valid_weights).round(6).to_dict()

            # Write AMPL data file
            with open("attributes.dat", "w") as f:
                f.write("set STOCKS := " + " ".join(tickers) + " ;\n\n")
                f.write("set FEATURES := " + " ".join(available) + " ;\n\n")

                f.write("param x_orig :=\n")
                for t in tickers:
                    f.write(f"  {t} {valid_weights[t]:.6f}\n")
                f.write(";\n\n")

                f.write("param a : " + " ".join(available) + " :=\n")
                for t in tickers:
                    row = " ".join(str(int(binary_df.loc[t, feat])) for feat in available)
                    f.write(f"{t} {row}\n")
                f.write(";\n\n")

                f.write("param f :=\n")
                for feat in available:
                    f.write(f"  {feat} {target_dict[feat]:.6f}\n")
                f.write(";\n")

            # Solve optimization
            ampl = AMPL()
            ampl.setOption("solver", "gurobi")
            ampl.read("attributes.mod.txt")
            ampl.readData("attributes.dat")
            ampl.solve()

            x = ampl.getVariable("x").getValues().to_pandas()
            nonzero = x[x["x.val"] > 0]
            print(f"(m={m}) period {i+1}: ✅ solution found with {len(nonzero)} non-zero weights")

    return combined



In [73]:
combined_returns = run_attribute_rebalanced_index(ms=[2, 4])


Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=2) period 1: ✅ solution found with 100 non-zero weights
Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=2) period 2: ✅ solution found with 100 non-zero weights
Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=4) period 1: ✅ solution found with 100 non-zero weights
Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=4) period 2: ✅ solution found with 100 non-zero weights
Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=4) period 3: ✅ solution found with 100 non-zero weights
Gurobi 12.0.1: optimal solution; objective 0
103 simplex iterations
(m=4) period 4: ✅ solution found with 100 non-zero weights


In [74]:
import pandas as pd
import numpy as np
import yfinance as yf
from amplpy import AMPL

def run_attribute_rebalanced_index(q, ms):
    # Load data
    returns = pd.read_csv("data/returns.csv", index_col=0)
    sp100_returns = pd.read_csv("data/sp100returns.csv")
    combined = sp100_returns[['Portfolio_Return']].rename(columns={'Portfolio_Return': 'SP100'})

    binary_df = pd.read_csv("data/ticker_attributes.csv").set_index("Ticker")
    all_features = binary_df.columns.tolist()

    for m in ms:
        # Split into IS and OOS
        split_point = int(0.7 * len(returns))
        out_sample = returns.iloc[split_point:].copy()
        n_rows = len(out_sample)
        period_length = n_rows // m
        periods = np.repeat(np.arange(1, m + 1), period_length)
        if n_rows % m != 0:
            periods = np.append(periods, [m] * (n_rows - len(periods)))
        out_sample['period'] = periods
        out_sample_tall = out_sample.reset_index().melt(
            id_vars=["Date", "period"],
            var_name="Ticker",
            value_name="Return"
        )

        # Create IS windows
        is_windows = []
        for i in range(1, m + 1):
            start_date = out_sample[out_sample['period'] == i].index[0]
            is_window = returns.loc[:start_date].iloc[-split_point:]
            is_windows.append(is_window)

        # Run selection + attribute optimization per period
        for i in range(m):
            window_returns = is_windows[i]
            correlations = window_returns.corr()
            tickers = list(correlations.columns)

            # Step 1: run max_corr model to select top q tickers
            with open("data.txt", "w") as f:
                f.write("set STOCKS := " + " ".join(tickers) + " ;\n\n")
                f.write("param q := " + str(q) + " ;\n\n")
                f.write("param r:\n    " + " ".join(tickers) + " :=\n")
                for t1 in tickers:
                    row = " ".join(f"{correlations.loc[t1, t2]:.4f}" for t2 in tickers)
                    f.write(f"{t1} {row}\n")
                f.write(";\n")

            ampl = AMPL()
            ampl.setOption("solver", "gurobi")
            ampl.read("max_corr.txt")  # Your max_corr.mod file
            ampl.readData("data.txt")
            ampl.solve()
            y = ampl.getVariable("y").getValues().to_pandas()
            selected = y[y["y.val"] == 1].index.tolist()

            if not selected:
                print(f"(q={q}, m={m}) period {i+1}: ❌ No tickers selected — skipping.")
                continue

            # Step 2: calculate market cap weights
            market_caps = {}
            for ticker in selected:
                try:
                    info = yf.Ticker(ticker).info
                    market_caps[ticker] = info.get('marketCap', 0)
                except:
                    market_caps[ticker] = 0
            total_cap = sum(market_caps.values())
            weights = {t: market_caps[t] / total_cap if total_cap > 0 else 0 for t in selected}

            # Step 3: derive attribute targets
            available = [feat for feat in all_features if binary_df.loc[selected, feat].sum() > 0]
            if not available:
                print(f"(q={q}, m={m}) period {i+1}: ❌ No usable features found — skipping.")
                continue

            sub_df = binary_df.loc[selected, available]
            weight_vec = pd.Series(weights).reindex(sub_df.index)
            target_dict = (sub_df.T @ weight_vec).round(6).to_dict()

            # Step 4: write .dat file
            with open("attributes.dat", "w") as f:
                f.write("set STOCKS := " + " ".join(selected) + " ;\n\n")
                f.write("set FEATURES := " + " ".join(available) + " ;\n\n")

                f.write("param x_orig :=\n")
                for t in selected:
                    f.write(f"  {t} {weights[t]:.6f}\n")
                f.write(";\n\n")

                f.write("param a : " + " ".join(available) + " :=\n")
                for t in selected:
                    row = " ".join(str(int(binary_df.loc[t, feat])) for feat in available)
                    f.write(f"{t} {row}\n")
                f.write(";\n\n")

                f.write("param f :=\n")
                for feat in available:
                    f.write(f"  {feat} {target_dict[feat]:.6f}\n")
                f.write(";\n")

            # Step 5: solve attribute model
            ampl = AMPL()
            ampl.setOption("solver", "gurobi")
            ampl.read("attributes.mod.txt")
            ampl.readData("attributes.dat")
            ampl.solve()

            x = ampl.getVariable("x").getValues().to_pandas()
            nonzero = x[x["x.val"] > 0]
            print(f"(q={q}, m={m}) period {i+1}: ✅ solution found with {len(nonzero)} non-zero weights")

    return combined


In [75]:
combined_returns = run_attribute_rebalanced_index(q=15, ms=[2, 4])

Gurobi 12.0.1: optimal solution; objective 77.2324
581 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
18 simplex iterations
(q=15, m=2) period 1: ✅ solution found with 15 non-zero weights
Gurobi 12.0.1: optimal solution; objective 70.7877
601 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 1e-06
23 simplex iterations
(q=15, m=2) period 2: ✅ solution found with 15 non-zero weights
Gurobi 12.0.1: optimal solution; objective 77.2324
581 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
18 simplex iterations
(q=15, m=4) period 1: ✅ solution found with 15 non-zero weights
Gurobi 12.0.1: optimal solution; objective 71.7706
612 simplex iterations
1 branching node
absmipgap=0.00715, relmipgap=9.9623e-05
Gurobi 12.0.1: optimal solution; objective 9.999999999e-07
19 simplex iterations
(q=15, m=4) period 2: ✅ solution found with 15 non-zero weights
Gurobi 12.0.1: optimal solution; objective 70.6739
599

In [80]:
import pandas as pd
import numpy as np
import yfinance as yf
from amplpy import AMPL

def run_attribute_rebalanced_index(qs, ms):
    returns = pd.read_csv("data/returns.csv", index_col=0)
    sp100_returns = pd.read_csv("data/sp100returns.csv")
    combined = sp100_returns[['Portfolio_Return']].rename(columns={'Portfolio_Return': 'SP100'})

    binary_df = pd.read_csv("data/ticker_attributes.csv").set_index("Ticker")
    all_features = binary_df.columns.tolist()

    for q in qs:
        for m in ms:
            print(f"\n>>> Processing q={q}, m={m}")
            split_point = int(0.7 * len(returns))
            out_sample = returns.iloc[split_point:].copy()
            n_rows = len(out_sample)
            period_length = n_rows // m
            periods = np.repeat(np.arange(1, m + 1), period_length)
            if n_rows % m != 0:
                periods = np.append(periods, [m] * (n_rows - len(periods)))
            out_sample['period'] = periods
            out_sample_tall = out_sample.reset_index().melt(
                id_vars=["Date", "period"], var_name="Ticker", value_name="Return"
            )

            is_windows = []
            for i in range(1, m + 1):
                start_date = out_sample[out_sample['period'] == i].index[0]
                is_window = returns.loc[:start_date].iloc[-split_point:]
                is_windows.append(is_window)

            all_portfolio_returns = []

            for i in range(m):
                window_returns = is_windows[i]
                correlations = window_returns.corr()
                tickers = list(correlations.columns)

                # Step 1: run max_corr to select q tickers
                with open("data.txt", "w") as f:
                    f.write("set STOCKS := " + " ".join(tickers) + " ;\n\n")
                    f.write("param q := " + str(q) + " ;\n\n")
                    f.write("param r:\n    " + " ".join(tickers) + " :=\n")
                    for t1 in tickers:
                        row = " ".join(f"{correlations.loc[t1, t2]:.4f}" for t2 in tickers)
                        f.write(f"{t1} {row}\n")
                    f.write(";\n")

                ampl = AMPL()
                ampl.setOption("solver", "gurobi")
                ampl.read("max_corr.txt")  # Your max_corr.mod file
                ampl.readData("data.txt")
                ampl.solve()

                y = ampl.getVariable("y").getValues().to_pandas()
                selected = y[y["y.val"] == 1].index.tolist()

                if not selected:
                    print(f"(q={q}, m={m}) period {i+1}: No tickers selected — skipping.")
                    continue

                # Step 2: calculate market cap weights
                market_caps = {}
                for ticker in selected:
                    try:
                        info = yf.Ticker(ticker).info
                        market_caps[ticker] = info.get('marketCap', 0)
                    except:
                        market_caps[ticker] = 0
                total_cap = sum(market_caps.values())
                weights = {t: market_caps[t] / total_cap if total_cap > 0 else 0 for t in selected}

                # Step 3: build attribute targets
                available = [feat for feat in all_features if binary_df.loc[selected, feat].sum() > 0]
                if not available:
                    print(f"(q={q}, m={m}) period {i+1}: No usable features — skipping.")
                    continue

                sub_df = binary_df.loc[selected, available]
                weight_vec = pd.Series(weights).reindex(sub_df.index)
                target_dict = (sub_df.T @ weight_vec).round(6).to_dict()

                # Step 4: write attributes.dat for AMPL
                with open("attributes.dat", "w") as f:
                    f.write("set STOCKS := " + " ".join(selected) + " ;\n\n")
                    f.write("set FEATURES := " + " ".join(available) + " ;\n\n")

                    f.write("param x_orig :=\n")
                    for t in selected:
                        f.write(f"  {t} {weights[t]:.6f}\n")
                    f.write(";\n\n")

                    f.write("param a : " + " ".join(available) + " :=\n")
                    for t in selected:
                        row = " ".join(str(int(binary_df.loc[t, feat])) for feat in available)
                        f.write(f"{t} {row}\n")
                    f.write(";\n\n")

                    f.write("param f :=\n")
                    for feat in available:
                        f.write(f"  {feat} {target_dict[feat]:.6f}\n")
                    f.write(";\n")

                # Step 5: solve attribute optimization
                ampl = AMPL()
                ampl.setOption("solver", "gurobi")
                ampl.read("attributes.mod.txt")
                ampl.readData("attributes.dat")
                ampl.solve()

                x = ampl.getVariable("x").getValues().to_pandas()
                nonzero = x[x["x.val"] > 0].reset_index()
                nonzero = nonzero.rename(columns={nonzero.columns[0]: "Ticker", nonzero.columns[1]: "x.val"})

                nonzero["Weight"] = nonzero["x.val"]
                nonzero["period"] = i + 1
                all_portfolio_returns.append(nonzero)

                print(f"(q={q}, m={m}) period {i+1}: solution found with {len(nonzero)} non-zero weights")

            # Step 6: combine results and calculate return
            if all_portfolio_returns:
                weights_df = pd.concat(all_portfolio_returns)
                portfolio = pd.merge(out_sample_tall, weights_df, on=['period', 'Ticker'], how='inner')
                portfolio['Weighted_Return'] = portfolio['Return'] * portfolio['Weight']
                portfolio_return = portfolio.groupby('Date')['Weighted_Return'].sum().reset_index()
                portfolio_return = portfolio_return.rename(columns={'Weighted_Return': 'Portfolio_Return'})
                label = f"(q={q}, m={m})"
                combined[label] = portfolio_return['Portfolio_Return']

    return combined


In [81]:
combined_returns = run_attribute_rebalanced_index(qs=[5,15], ms=[2, 4])


>>> Processing q=5, m=2
Gurobi 12.0.1: optimal solution; objective 70.6261
894 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
2 simplex iterations
(q=5, m=2) period 1: ✅ solution found with 5 non-zero weights
Gurobi 12.0.1: optimal solution; objective 61.6403
1203 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
2 simplex iterations
(q=5, m=2) period 2: ✅ solution found with 5 non-zero weights

>>> Processing q=5, m=4
Gurobi 12.0.1: optimal solution; objective 70.6261
894 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
2 simplex iterations
(q=5, m=4) period 1: ✅ solution found with 5 non-zero weights
Gurobi 12.0.1: optimal solution; objective 63.3561
1110 simplex iterations
1 branching node
Gurobi 12.0.1: optimal solution; objective 0
6 simplex iterations
(q=5, m=4) period 2: ✅ solution found with 5 non-zero weights
Gurobi 12.0.1: optimal solution; objective 61.5819
1281 simplex iteratio