In [1]:
import yfinance as yf
import pandas as pd
import time

# Ticker symbols are taken from the header row of returns.csv. The first
# column is skipped (presumably the date/index column — confirm against the file).
returns = pd.read_csv('returns.csv')
tickers = list(returns.columns)[1:]

# One record per ticker; turned into a DataFrame once, after the loop.
data = []

for ticker in tickers:
    try:
        info = yf.Ticker(ticker).info
        row = {
            'Ticker': ticker,
            # dict.get(key, default) only covers a *missing* key. A key that is
            # present with a None value would otherwise leak None into the
            # table, so fall back explicitly with `or`.
            'MarketCap': info.get('marketCap') or 0,
            'Sector': info.get('sector') or 'Unknown',
            'Country': info.get('country') or 'Unknown',
        }
    except Exception as e:
        # Best effort: report the failure and keep a placeholder row so every
        # ticker still appears exactly once in the output.
        print(f"Error retrieving {ticker}: {e}")
        row = {'Ticker': ticker, 'MarketCap': 0, 'Sector': 'Unknown', 'Country': 'Unknown'}
    data.append(row)
    # Throttle after every request, including failed ones, so an error is not
    # followed immediately by another call to the data source.
    time.sleep(1.5)


In [5]:
df = pd.DataFrame(data)

# --- Size flags ------------------------------------------------------------
# 1 when MarketCap falls inside the bucket, else 0. Buckets are
# [.., 2e9), [2e9, 1e10) and [1e10, ..), so a MarketCap of 0 lands in SmallCap.
cap = df['MarketCap']
below_mid = cap < 2e9
below_large = cap < 1e10
df['SmallCap'] = below_mid.astype(int)
df['MidCap'] = ((cap >= 2e9) & below_large).astype(int)
df['LargeCap'] = (cap >= 1e10).astype(int)

# --- Sector flags ----------------------------------------------------------
# Output column -> case-insensitive regex searched for in the Sector text.
# Missing sector values count as "no match" (na=False).
sector_keywords = {
    'Tech': 'Technology',
    'Finance': 'Financial|Bank',
    'Healthcare': 'Health',
    'Consumer': 'Consumer',
    'Energy': 'Energy|Oil|Gas',
    'Industrial': 'Industrials',
    'Utilities': 'Utilities',
}
for column, keyword in sector_keywords.items():
    df[column] = df['Sector'].str.contains(keyword, case=False, na=False).astype(int)

# --- Country flags ---------------------------------------------------------
# Anything that is not exactly 'United States' is flagged International.
is_domestic = df['Country'] == 'United States'
df['International'] = (~is_domestic).astype(int)
df['Domestic'] = is_domestic.astype(int)



In [2]:
# Export the ticker plus its 0/1 indicator columns, grouped as
# size, sector and country flags (this is also the column order in the CSV).
size_flags = ['SmallCap', 'MidCap', 'LargeCap']
sector_flags = ['Tech', 'Finance', 'Healthcare', 'Consumer',
                'Energy', 'Industrial', 'Utilities']
country_flags = ['International', 'Domestic']

binary_df = df[['Ticker', *size_flags, *sector_flags, *country_flags]]

binary_df.to_csv("ticker_attributes.csv", index=False)


NameError: name 'df' is not defined

In [7]:
import pandas as pd

#generate data file

# Load the 0/1 attribute table and key it by ticker symbol.
binary_df = pd.read_csv('ticker_attributes.csv').set_index("Ticker")
tickers = binary_df.index.tolist()
features = binary_df.columns.tolist()

# Example target value per feature column (written below as AMPL param f).
targets = dict(
    Tech=0.3,
    Finance=0.2,
    Healthcare=.05,
    Consumer=.05,
    Utilities=.1,
    Energy=.1,
    Industrial=.1,
    SmallCap=0.25,
    MidCap=0.25,
    LargeCap=0.5,
    Domestic=0.7,
    International=0.3,
)

# Example starting weights: every ticker gets the same share
# (replace with max_corr weights).
x_orig = {symbol: 1 / len(tickers) for symbol in tickers}

# Emit the AMPL-compatible .dat file, one section at a time.
with open("attributes.dat", "w") as dat:
    # Index sets: STOCKS and FEATURES
    dat.write(f"set STOCKS := {' '.join(tickers)} ;\n\n")
    dat.write(f"set FEATURES := {' '.join(features)} ;\n\n")

    # param x_orig: one "<ticker> <weight>" line per stock
    dat.write("param x_orig :=\n")
    dat.writelines(f"  {symbol} {x_orig[symbol]:.6f}\n" for symbol in tickers)
    dat.write(";\n\n")

    # param a: table with one row per stock and one 0/1 column per feature
    dat.write(f"param a : {' '.join(features)} :=\n")
    for symbol in tickers:
        flags = [str(int(binary_df.loc[symbol, column])) for column in features]
        dat.write(f"{symbol} {' '.join(flags)}\n")
    dat.write(";\n\n")

    # param f: one "<feature> <target>" line per feature
    dat.write("param f :=\n")
    dat.writelines(f"  {column} {targets[column]:.6f}\n" for column in features)
    dat.write(";\n")


In [16]:
import pandas as pd
import numpy as np
import yfinance as yf
import time
from amplpy import AMPL




def run_attribute_rebalanced_index(qs, ms):
    """Write correlation data and solve the AMPL model for each (m, q, period).

    The last 30% of the rows of ``returns.csv`` form the out-of-sample span.
    For every ``m`` in ``ms`` that span is cut into ``m`` consecutive periods
    (the last period absorbs any remainder rows). For every period and every
    ``q`` in ``qs``, the correlation matrix of the in-sample window — the
    ``split_point`` rows that end immediately *before* the period starts — is
    written to ``attributes.dat`` together with ``param q``, and the model in
    ``attributes.mod.txt`` is solved with the ``gurobi`` solver.

    Args:
        qs: Iterable of values written to the data file as AMPL ``param q``.
            It is iterated once per entry of ``ms``.
        ms: Iterable of positive ints: the number of out-of-sample periods.

    Returns:
        None. NOTE(review): the solver's ``y`` values are fetched but not
        aggregated or returned yet, so callers currently receive ``None``.

    Raises:
        ValueError: If an ``m`` is smaller than 1 or larger than the number of
            out-of-sample rows (no non-empty periods could be formed).
    """
    returns = pd.read_csv("returns.csv", index_col=0)

    # NOTE(review): the benchmark series, attribute table, targets and the
    # containers below are set up but not consumed anywhere further down in
    # this function yet.
    sp100_returns = pd.read_csv("sp100returns.csv")
    combined = sp100_returns[['Portfolio_Return']].rename(columns={'Portfolio_Return': 'SP100'})

    binary_df = pd.read_csv("ticker_attributes.csv").set_index("Ticker")
    features = binary_df.columns.tolist()
    attribute_targets = {
        "Tech": 0.3, "Finance": 0.2, "Healthcare": .05, "Consumer": .05,
        "Utilities": .1, "Energy": .1, "Industrial": .1,
        "SmallCap": 0.25, "MidCap": 0.25, "LargeCap": 0.5,
        "Domestic": 0.7, "International": 0.3
    }

    correlations_dict = {}

    # First 70% of the rows are in-sample; this does not depend on m.
    split_point = int(0.7 * len(returns))

    for m in ms:
        out_sample = returns.iloc[split_point:].copy()
        n_rows = len(out_sample)
        if m < 1 or m > n_rows:
            raise ValueError(
                f"m must be between 1 and the number of out-of-sample rows "
                f"({n_rows}); got {m}"
            )

        # Label each out-of-sample row with its 1-based period number; rows
        # left over after the even split are assigned to the last period.
        period_length = n_rows // m
        periods = np.repeat(np.arange(1, m + 1), period_length)
        remainder = n_rows - len(periods)
        if remainder > 0:
            periods = np.append(periods, [m] * remainder)
        out_sample['period'] = periods
        out_sample_tall = out_sample.reset_index().melt(id_vars=["Date", "period"], var_name="Ticker", value_name="Return")

        # One correlation matrix per period, computed once and reused for
        # every q. Period i (0-based) starts at row split_point + i *
        # period_length of `returns`; its window is the split_point rows
        # strictly before that row. Positional slicing is end-exclusive, so
        # the first out-of-sample row is never part of the window (a
        # label-based `.loc[:start_date]` slice would include it).
        window_correlations = []
        for i in range(m):
            start_pos = split_point + i * period_length
            in_sample = returns.iloc[start_pos - split_point:start_pos]
            window_correlations.append(in_sample.corr())

        for q in qs:
            results = []
            for correlations in window_correlations:
                # Step 1: Correlation-based selection
                tickers = list(correlations.columns)

                with open("attributes.dat", "w") as f:
                    f.write("set STOCKS := " + " ".join(tickers) + " ;\n\n")
                    f.write("param q := " + str(q) + " ;\n\n")
                    f.write("param r:\n    " + " ".join(tickers) + " :=\n")
                    for t1 in tickers:
                        row = " ".join(f"{correlations.loc[t1, t2]:.4f}" for t2 in tickers)
                        f.write(f"{t1} {row}\n")
                    f.write(";\n")

                ampl = AMPL()
                try:
                    ampl.setOption("solver", "gurobi")
                    ampl.read("attributes.mod.txt")
                    ampl.readData("attributes.dat")
                    ampl.solve()
                    y = ampl.getVariable("y").getValues().toPandas()
                finally:
                    # Release the AMPL engine even when reading or solving
                    # fails; otherwise one engine per iteration is left open.
                    ampl.close()



In [13]:
# Run the model for every combination of q value and period count m.
q_values = [5, 10]
period_counts = [2, 4]
combined_returns = run_attribute_rebalanced_index(qs=q_values, ms=period_counts)


*******************************************************************************
*                                                                             *
* Please make sure that the AMPL directory is in the system search path, or   *
* add it before instantiating the AMPL object with:                           *
*                                                                             *
*     from amplpy import AMPL, add_to_path                                    *
*     add_to_path(r"full path to the AMPL installation directory")            *
*     ampl = AMPL()                                                           *
*                                                                             *
* Or, if you are using amplpy.modules, please make sure that they are installed: *
*                                                                             *
*     # Install solver modules (e.g., HiGHS, CBC, Gurobi)                     *
*     $ python -m amplpy.modules inst

RuntimeError: AMPL could not be started. Message from process thread:
cannot execute ampl: No such file or directory

