In [10]:
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
import cvxpy as cp
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler


In [11]:
# CONFIG

LOOKBACK = 252            # days for μ, Σ
HORIZON = 21              # holding period (trading days)
STEP = 21                 # form a new, independent portfolio every 21 days
GAMMA = 5.0               # objective: 0.5 w'Σw - GAMMA μ'w
NAME_CAP = 0.10           # upper limit for each stock’s weight; set None to disable 
COST_BPS_ONE_WAY = 5.0    # linear cost per side, in bps of traded notional
MIN_NONMISSING_PCT = 0.50 # require >=50% non-missing in lookback
STARTING_NOTIONAL = 1000000.0
TARGET_COL = "monthly_log_return_future"
FEATURE_COLS = [
    "volume_log", "log_daily_vol_21", "log_monthly_var_3",
    "high_low_spread", "open_close_spread",
    "close_sma_ratio_20", "close_ema_ratio_20",
    "close_sma_ratio_50", "close_ema_ratio_50",
    "close_sma_ratio_100", "close_ema_ratio_100",
    "macd_line_pct", "macd_signal_pct", "macd_hist_pct",
    "daily_log_return", "daily_volume_return", "Dividends"
]

Configurations: 
1) We start every portfolio cycle with 1000000 USD
2) Gamma(RISK aversion parameter in portfolio) = 0.5
3) I limited each stock to take maximum of 10% of the portfolio
4) Every time when i buy or sell, i lose 0.05 % of notional value to trading costs

Benchmark expected return:
Find average daily return r, coumpound it by 21 days to give you return over 21 day horizon 
E.G. Benchmark expected return = (1+r)^21 - 1

Benchmark covariance
From the 252 past data window, find the daily covariance and multiply it by 21 days to find the 21-day covariance. 
This is assuming that daily returns i.i.d. --> no autocorrelation across days --> justifies linear scaling

Evaluation performed
1) Average 21-day Net Return
2) Standard Deviation of 21-day Returns
3) Annualized Return
4) Annualized Volatility
5) Sharpe Ratio

In [5]:
# Helper functions

# Takes in a stock's average daily simple return and returns 21-day expected compounded return 
# This gives you the expected return of a stock after 21 trading days 
def compound_from_daily_mean(mu_d, horizon=HORIZON):
    return (1.0 + mu_d)**horizon - 1.0


# This function gives N×N covariance matrix 
def ledoit_wolf_cov(returns_window):
    lw = LedoitWolf(store_precision=False, assume_centered=False)
    lw.fit(returns_window)
    return lw.covariance_

# Solving Markowitz
def solve_markowitz_long_only(mu_21, Sigma_21, gamma=GAMMA, name_cap=NAME_CAP, mode = "standard", l1 = 0.0, l2 = 0.0, 
                              rp_tau=1e-2): # strength for risk parity
    # mu_21: vector of expected 21-day returns (size N)
    # Sigma_21: 21-day covariance matrix assuming no serial correlation --> 21 * covariance matrix 
    # gamma: risk aversion
    n = len(mu_21)
    w = cp.Variable(n)
    Sigma = cp.atoms.affine.wraps.psd_wrap(Sigma_21)

    # --- constraints ---
    cons = [cp.sum(w) == 1, w >= 0]
    if name_cap is not None:
        cons.append(w <= name_cap)

    # Change objective based on mode
    if mode == "standard":
        objective = 0.5 * cp.quad_form(w, Sigma) - gamma * (mu_21 @ w)

    elif mode == "ridge":
        objective = 0.5 * cp.quad_form(w, Sigma) - gamma * (mu_21 @ w)
        objective += l2 * cp.sum_squares(w)

    elif mode == "lasso":
        objective = 0.5 * cp.quad_form(w, Sigma) - gamma * (mu_21 @ w)
        objective += l1 * cp.norm1(w)

    elif mode == "enet":
        objective = 0.5 * cp.quad_form(w, Sigma) - gamma * (mu_21 @ w)
        objective += l2 * cp.sum_squares(w) + l1 * cp.norm1(w)

    elif mode == "minvar":
        # Global minimum variance (risk-only)
        objective = 0.5 * cp.quad_form(w, Sigma)

    elif mode == "invvol":
        # Target inverse-vol weights and project onto simplex + caps
        diag = np.sqrt(np.clip(np.diag(Sigma_21), 1e-12, None))
        w_target = (1.0 / diag)
        w_target = w_target / w_target.sum()
        objective = cp.sum_squares(w - w_target)   # projection objective

    elif mode == "risk_parity":
        # Convex surrogate for ERC: minimize 0.5 w' Σ w - τ ∑ log(w_i)
        # enforce strictly positive weights for log
        eps = 1e-8
        cons.append(w >= eps)
        if name_cap is not None:
            # ensure feasibility with caps (cap must be > eps)
            pass
        objective = 0.5 * cp.quad_form(w, Sigma) - rp_tau * cp.sum(cp.log(w))

    else:
        raise ValueError(f"Unknown mode: {mode}")

    cons = [cp.sum(w) == 1, w >= 0]
    if name_cap is not None:
        cons.append(w <= name_cap)
    prob = cp.Problem(cp.Minimize(objective), cons)
    # choose solver: OSQP for QP; SCS handles log/|.| terms
    use_scs = (mode in {"risk_parity"} or l1 > 0)
    prob.solve(solver=cp.SCS if use_scs else cp.OSQP, verbose=False)

    if w.value is None:
        Sigma_r = Sigma_21 + 1e-6 * np.eye(n)
        Sigma_wrapped_r = cp.atoms.affine.wraps.psd_wrap(Sigma_r)
        if mode == "minvar":
            objective = 0.5 * cp.quad_form(w, Sigma_wrapped_r)
        elif mode == "risk_parity":
            objective = 0.5 * cp.quad_form(w, Sigma_wrapped_r) - rp_tau * cp.sum(cp.log(w))
        elif mode == "invvol":
            objective = cp.sum_squares(w - w_target)
        else:
            objective = 0.5 * cp.quad_form(w, Sigma_wrapped_r) - gamma * (mu_21 @ w)
            if l2 > 0 and l1 == 0: objective += l2 * cp.sum_squares(w)
            if l1 > 0 and l2 == 0: objective += l1 * cp.norm1(w)
            if l1 > 0 and l2 > 0:  objective += l1 * cp.norm1(w) + l2 * cp.sum_squares(w)
        prob = cp.Problem(cp.Minimize(objective), cons)
        prob.solve(solver=cp.SCS if use_scs else cp.OSQP, verbose=False)
        
    if w.value is None:
        raise RuntimeError("QP failed; check Σ conditioning or constraints.")
    return np.clip(w.value, 0, 1)

# Find the realised 21- days return per ticker
def ticker_period_compound(returns_df, start_idx, horizon=HORIZON):
    r = returns_df.iloc[start_idx+1 : start_idx+horizon+1]
    return (1.0 + r).prod(axis=0) - 1.0  # Series by ticker


In [111]:
# Main backtesting

def backtest_markowitz_independent_legs_from_returns(rets: pd.DataFrame, is_in_sp500: pd.DataFrame, mode: str = "stats", ml_params: dict = None,
                                                     markowitz_version: str = "standard", l1: float = 0.0, l2: float = 0.0, rp_tau: float = 1e-2):
    """
    rets: DataFrame (Date x ticker) of DAILY simple returns (not log), aligned and sorted.
    is_in_sp500: DataFrame (Date x ticker) with {0,1}, aligned to rets index/columns.
    """
    rets = rets.sort_index().dropna(how="all")
    is_in_sp500 = is_in_sp500.reindex_like(rets).fillna(0).astype(int)
    dates = rets.index 

    start = LOOKBACK - 1
    end = len(dates) - HORIZON - 1
    legs = []

    for t in range(start, end + 1, STEP):
        date_t = dates[t] # todays date 

        # Universe on formation day
        tickers = is_in_sp500.columns[is_in_sp500.loc[date_t] == 1].tolist()
        if not tickers:
            continue

        # 252d window of daily returns
        win = rets.iloc[t-LOOKBACK+1 : t+1][tickers]

        # Data quality filter
        non_missing_ratio = 1.0 - win.isna().mean()
        keep = non_missing_ratio[non_missing_ratio >= MIN_NONMISSING_PCT].index.tolist()
        win = win[keep].dropna(how="all", axis=1)
        if win.shape[1] < 2:
            continue
        win = win.dropna(how="any")
        if len(win) < 30:
            continue

        tickers = win.columns.tolist()

        # we will do this no matter the mode used. mode only determines what method is used to get mu_21
        # Σ: Ledoit–Wolf daily -> 21d
        Sigma_d = ledoit_wolf_cov(win.values)
        Sigma_21 = HORIZON * Sigma_d

        if mode == "stats":
            # μ: mean daily -> compounded 21d
            mu_d = win.mean(axis=0).values
            mu_21 = compound_from_daily_mean(mu_d, HORIZON)

        else: # if not stats then prepare data for ML models
            df = ml_params["df_train"]
            df = df[df["Date"] <= date_t] 
            df_ticker= df[df['ticker'].isin(tickers)]
            # Data Preparation and Scaling
            X_train = df_ticker[df_ticker['Date'] < date_t][FEATURE_COLS] # you will not have information of log returns on day t, so the window is only until day t+1
            y_train = df_ticker[df_ticker['Date'] < date_t][TARGET_COL]
            X_test = df_ticker[df_ticker['Date'] == date_t][FEATURE_COLS]
            y_test = df_ticker[df_ticker['Date'] == date_t][TARGET_COL]

            scaler = StandardScaler().fit(X_train)
            X_train_scaled = scaler.transform(X_train)
            X_test_scaled = scaler.transform(X_test)

        if mode == "ridge":
            ridge = Ridge(alpha=ml_params["l2"])  
            ridge.fit(X_train_scaled, y_train)
            mu_21 = ridge.predict(X_test_scaled)

        if mode == "lasso":
            lasso = Lasso(alpha=ml_params["l1"])
            lasso.fit(X_train_scaled, y_train)
            mu_21 = lasso.predict(X_test_scaled)


        if mode == "elasticnet":
            enet = ElasticNet(alpha=ml_params["lambda total"],
                              l1_ratio=ml_params["l1 ratio"]) # I HEAVILY DISAGREE WITH THIS NAMING CONVENTION IT IS SO CONFUSING TO CALL IT ALPHA 
            enet.fit(X_train_scaled, y_train)
            mu_21 = enet.predict(X_test_scaled)
            

        # Optimize
        try:
            w = solve_markowitz_long_only(mu_21, Sigma_21, gamma=GAMMA, name_cap=NAME_CAP, mode = markowitz_version, l1= l1, l2 = l2, rp_tau = rp_tau)
        except RuntimeError:
            continue

        # Realized leg
        comp_rets = ticker_period_compound(rets[tickers], start_idx=t, horizon=HORIZON)
        gross_leg_ret = float(np.dot(w, comp_rets.values))

        # Round-trip linear costs: entry 1 + exit 1 = 2
        rt_turnover = 2.0
        cost_frac = (COST_BPS_ONE_WAY * 1e-4) * rt_turnover
        net_leg_ret = gross_leg_ret - cost_frac

        start_value = STARTING_NOTIONAL
        end_value = start_value * (1.0 + net_leg_ret)

        legs.append({
            "formation_date": date_t,
            "n_names": len(tickers),
            "gross_21d_ret": gross_leg_ret,
            "net_21d_ret": net_leg_ret,
            "roundtrip_cost_frac": cost_frac,
            "start_value": start_value,
            "end_value": end_value,
            "gamma": GAMMA,
            "name_cap": NAME_CAP
        })

    legs_df = pd.DataFrame(legs).set_index("formation_date")

    # Evaluation (your main score = average net 21d return)
    if not legs_df.empty:
        legs_per_year = 252.0 / HORIZON
        avg_net = legs_df["net_21d_ret"].mean()
        std_net = legs_df["net_21d_ret"].std(ddof=1)
        ann_return = (1.0 + avg_net)**legs_per_year - 1.0
        ann_vol = std_net * np.sqrt(legs_per_year)
        sharpe = ann_return / ann_vol if ann_vol > 0 else np.nan

        summary = {
            "legs": int(len(legs_df)),
            "avg_net_21d_return": float(avg_net),
            "std_net_21d_return": float(std_net),
            "annual_return_from_avg_leg": float(ann_return),
            "annual_vol_from_leg_std": float(ann_vol),
            "sharpe_from_legs": float(sharpe),
            "cost_bps_one_way": COST_BPS_ONE_WAY,
            "gamma": GAMMA,
            "name_cap": NAME_CAP
        }
    else:
        summary = {}

    return legs_df, summary

In [107]:
df = pd.read_csv("training_data.csv", parse_dates=["Date"], low_memory=False)
df["daily_simple_return"] = np.expm1(df["daily_log_return"])
df["Dividends"] = (
    df["Dividends"]
    .astype(str)                           # ensure all string
    .str.replace("USD", "", regex=False)   # remove USD
    .str.replace(",", "", regex=False)     # remove commas
    .str.replace(r"[^\d\.-]", "", regex=True) # keep numbers / dots / minus
    .replace("", np.nan)                   # empty → NaN
    .astype(float)
)
df["Date"] = pd.to_datetime(df["Date"]).dt.normalize()

# Building my wide df
rets = df.pivot(index="Date", columns="ticker", values="daily_simple_return").sort_index()
is_in = df.pivot(index="Date", columns="ticker", values="is_in_sp500").sort_index()
is_in = is_in.fillna(0).astype(float).clip(0,1).astype(int)

split_idx = len(rets) - 252*2
split_date = rets.index[split_idx]
test_date_split = split_date - pd.DateOffset(years=1)
print(test_date_split) # 2018-02-16 so when you do trainig data, do up to 2018 



# include the 252-day overlap so the first test formation has a full lookback
df_test = df[df["Date"] >= test_date_split]
rets_test = rets.iloc[split_idx - LOOKBACK:]
is_in_test = is_in.iloc[split_idx - LOOKBACK:]

df_train = df[df["Date"] <= split_date]
rets_train = rets.iloc[:split_idx]
is_in_train = is_in.iloc[:split_idx]



# print(df_train)

2017-02-16 00:00:00


In [81]:
legs_df, summary = backtest_markowitz_independent_legs_from_returns(rets_test, is_in_test, mode = 'stats', markowitz_version = "standard")


(425,)


In [112]:
ridge_params = {"df_train": df_train, "l2": 0.1}
legs_df_ridge, summary_ridge = backtest_markowitz_independent_legs_from_returns(rets_train, is_in_train, mode = 'ridge', ml_params = ridge_params, markowitz_version = "standard")

summary_ridge

{'legs': 49,
 'avg_net_21d_return': 0.01038756239874083,
 'std_net_21d_return': 0.04678985864463267,
 'annual_return_from_avg_leg': 0.13202468803485612,
 'annual_vol_from_leg_std': 0.16208482490293924,
 'sharpe_from_legs': 0.81454070801456,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [113]:
lasso_params = {"df_train": df_train, "l1": 0.1}

legs_df_lasso, summary_lasso = backtest_markowitz_independent_legs_from_returns(
    rets_train,
    is_in_train,
    mode='lasso',
    ml_params=lasso_params,
    markowitz_version='standard'
)

summary_lasso

{'legs': 49,
 'avg_net_21d_return': 0.008472343823215633,
 'std_net_21d_return': 0.025495377542908454,
 'annual_return_from_avg_leg': 0.10654202451411687,
 'annual_vol_from_leg_std': 0.088318578524936,
 'sharpe_from_legs': 1.2063376278642848,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [114]:
enet_params = {"df_train": df_train, "lambda total": 0.05, "l1 ratio": 0.05}

legs_df_enet, summary_enet = backtest_markowitz_independent_legs_from_returns(
    rets_train,
    is_in_train,
    mode='elasticnet',
    ml_params=enet_params,
    markowitz_version='standard'
)

summary_enet

{'legs': 49,
 'avg_net_21d_return': 0.005729461271071971,
 'std_net_21d_return': 0.04931992665180772,
 'annual_return_from_avg_leg': 0.07096201501143695,
 'annual_vol_from_leg_std': 0.1708492375730027,
 'sharpe_from_legs': 0.41534873681315304,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

# Create function to perform parameter tuning for the ml models

In [None]:
def parameter_tuning(rets_train, is_in_train, df_train,  mode = 'stats', markowitz_version = "standard", important_values = None):
    best_params = None
    best_returns = -1.0
    for params in important_values:
        if mode == "ridge":
            ridge_params = {'df_train': df_train, 'l2': params}
            legs_df, summary = backtest_markowitz_independent_legs_from_returns(rets_train, is_in_train, mode = 'ridge', ml_params = ridge_params, markowitz_version = "standard")
        elif mode =="lasso":
            lasso_params = {'df_train': df_train, 'l1':params}
            legs_df, summary = backtest_markowitz_independent_legs_from_returns(rets_train, is_in_train, mode='lasso', ml_params=lasso_params, markowitz_version='standard')
        elif mode == "elasticnet":
            enet_params = {'df_train': df_train, 'lambda total': params[0], 'l1 ratio': params[1]}
            legs_df, summary = backtest_markowitz_independent_legs_from_returns(rets_train, is_in_train, mode='elasticnet', ml_params=enet_params, markowitz_version='standard')
        if summary['avg_net_21d_return'] > best_returns:
            best_returns = summary['avg_net_21d_return']
            best_params = {
                'params' : params,
                'summary': summary
            }

    return best_params


lambda_vals =  np.logspace(-4, 4, num=20) #for ridge and lasso
alpha_vals  = np.linspace(0.0, 1.0, 6) # 0, .2, .4, .6, .8, 1, for enet only
rp_tau_grid = np.logspace(-3, 1, num=10) 


In [None]:
# best_params_ridge = parameter_tuning(rets_train, is_in_train,df_train, mode = 'ridge', markowitz_version = "standard", important_values = lambda_vals)
# print(best_params_ridge) # 206.913808111479

{'params': np.float64(206.913808111479), 'summary': {'legs': 49, 'avg_net_21d_return': 0.014960940136178975, 'std_net_21d_return': 0.047129569910097174, 'annual_return_from_avg_leg': 0.19506616200553073, 'annual_vol_from_leg_std': 0.16326161924631533, 'sharpe_from_legs': 1.1948072235595766, 'cost_bps_one_way': 5.0, 'gamma': 5.0, 'name_cap': 0.1}}


In [124]:
ridge_params = {"df_train": df_test, "l2": 206.913808111479}
legs_df_ridge, summary_ridge = backtest_markowitz_independent_legs_from_returns(rets_test, is_in_test, mode = 'ridge', ml_params = ridge_params, markowitz_version = "standard")

summary_ridge

{'legs': 24,
 'avg_net_21d_return': 0.020128464439864886,
 'std_net_21d_return': 0.050885880725237954,
 'annual_return_from_avg_leg': 0.27015987546427556,
 'annual_vol_from_leg_std': 0.17627386160800393,
 'sharpe_from_legs': 1.532614495421076,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [None]:
# best_params_lasso = parameter_tuning(rets_train, is_in_train,df_train, mode = 'lasso', markowitz_version = "standard", important_values = lambda_vals)
# print(best_params_lasso) # 0.00026366508987303583


{'params': np.float64(0.00026366508987303583), 'summary': {'legs': 49, 'avg_net_21d_return': 0.011830388367440036, 'std_net_21d_return': 0.0468415111668224, 'annual_return_from_avg_leg': 0.15157604344526465, 'annual_vol_from_leg_std': 0.16226375448848263, 'sharpe_from_legs': 0.9341337128743895, 'cost_bps_one_way': 5.0, 'gamma': 5.0, 'name_cap': 0.1}}


In [125]:
lasso_params = {"df_train": df_test, "l1": 0.00026366508987303583}

legs_df_lasso, summary_lasso = backtest_markowitz_independent_legs_from_returns(
    rets_test,
    is_in_test,
    mode='lasso',
    ml_params=lasso_params,
    markowitz_version='standard'
)

summary_lasso

{'legs': 24,
 'avg_net_21d_return': 0.013418194921560121,
 'std_net_21d_return': 0.0511469426682313,
 'annual_return_from_avg_leg': 0.17344940267511633,
 'annual_vol_from_leg_std': 0.17717820670637816,
 'sharpe_from_legs': 0.9789544995370607,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [None]:
# lambda_alpha_pairs = [[lam, a] for lam in lambda_vals for a in alpha_vals]

# best_params_en = parameter_tuning(rets_train, is_in_train,df_train, mode = 'elasticnet', markowitz_version = "standard", important_values = lambda_alpha_pairs)
# print(best_params_en) # [np.float64(0.0006951927961775605), np.float64(0.0)]

Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

{'params': [np.float64(0.0006951927961775605), np.float64(0.0)], 'summary': {'legs': 49, 'avg_net_21d_return': 0.015220096878751505, 'std_net_21d_return': 0.04661762844828838, 'annual_return_from_avg_leg': 0.19873303925559793, 'annual_vol_from_leg_std': 0.16148820200160752, 'sharpe_from_legs': 1.2306350358252156, 'cost_bps_one_way': 5.0, 'gamma': 5.0, 'name_cap': 0.1}}


In [126]:
enet_params = {"df_train": df_test, "lambda total": 0.0006951927961775605, "l1 ratio": 0.0}

legs_df_enet, summary_enet = backtest_markowitz_independent_legs_from_returns(
    rets_test,
    is_in_test,
    mode='elasticnet',
    ml_params=enet_params,
    markowitz_version='standard'
)

summary_enet

Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

{'legs': 24,
 'avg_net_21d_return': 0.020206109053246425,
 'std_net_21d_return': 0.05041493879140152,
 'annual_return_from_avg_leg': 0.27132046303010404,
 'annual_vol_from_leg_std': 0.17464247089436502,
 'sharpe_from_legs': 1.5535766394087271,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

## Stress test evaluation

In [None]:
stress_test = pd.read_csv("covid_stress_test_data.csv",parse_dates=["Date"])
stress_test["daily_simple_return"] = np.expm1(stress_test["daily_log_return"])
stress_test
date_start = stress_test["Date"].min()
add_date = date_start - pd.DateOffset(years=1)
df_add = df[df["Date"] >= add_date]
df_out = pd.concat([df_add, stress_test], axis=0, ignore_index=True)


Unnamed: 0,Date,Dividends,ticker,is_in_sp500,daily_log_return,daily_volume_return,lag_1,lag_2,lag_3,lag_4,...,close_sma_ratio_20,close_ema_ratio_20,close_sma_ratio_50,close_ema_ratio_50,close_sma_ratio_100,close_ema_ratio_100,macd_line_pct,macd_signal_pct,macd_hist_pct,daily_simple_return
0,2019-02-20,0.0,A,1,0.013327,-0.357205,0.090395,0.111624,0.087035,0.102130,...,0.038185,0.040002,0.104132,0.083690,0.135977,0.118439,0.025417,0.025003,0.000413,0.013416
1,2019-02-21,0.0,A,1,-0.008566,0.542106,0.088169,0.090395,0.111624,0.087035,...,0.025104,0.028083,0.092316,0.071320,0.125006,0.106513,0.024922,0.025159,-0.000237,-0.008529
2,2019-02-22,0.0,A,1,0.006781,-0.243992,0.088121,0.088169,0.090395,0.111624,...,0.027859,0.031632,0.097285,0.075295,0.131363,0.111533,0.024455,0.024882,-0.000427,0.006805
3,2019-02-25,0.0,A,1,0.011664,-0.398590,0.090285,0.088121,0.088169,0.090395,...,0.036054,0.039406,0.107535,0.084173,0.143132,0.121806,0.024591,0.024593,-0.000002,0.011732
4,2019-02-26,0.0,A,1,-0.010007,0.059233,0.094579,0.090285,0.088121,0.088169,...,0.022214,0.026217,0.094375,0.070297,0.130533,0.108208,0.024084,0.024689,-0.000605,-0.009957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231400,2020-12-22,0.0,ZTS,1,0.006136,-0.191077,-0.033443,0.000795,-0.016640,-0.028660,...,0.010396,0.004064,-0.003656,0.003359,0.004212,0.024023,-0.001469,-0.004086,0.002617,0.006155
231401,2020-12-23,0.0,ZTS,1,-0.012311,-0.037581,-0.026645,-0.033443,0.000795,-0.016640,...,-0.001590,-0.007443,-0.015208,-0.008570,-0.008415,0.011264,-0.002000,-0.003709,0.001709,-0.012235
231402,2020-12-24,0.0,ZTS,1,0.005428,-1.026088,-0.027702,-0.026645,-0.033443,0.000795,...,0.003678,-0.001847,-0.009514,-0.003050,-0.003344,0.016430,-0.001935,-0.003339,0.001403,0.005443
231403,2020-12-28,0.0,ZTS,1,0.010337,1.293998,-0.002113,-0.027702,-0.026645,-0.033443,...,0.013816,0.007706,0.000826,0.007020,0.006707,0.026443,-0.001031,-0.002850,0.001819,0.010390


In [132]:
# Building my wide df
stress_rets = stress_test.pivot(index="Date", columns="ticker", values="daily_simple_return").sort_index()

is_in_stress = stress_test.pivot(index="Date", columns="ticker", values="is_in_sp500").sort_index()
is_in_stress = is_in_stress.fillna(0).astype(float).clip(0,1).astype(int)

stress_rets

ticker,A,AAL,AAP,AAPL,ABBV,ABT,ACN,ADBE,ADI,ADM,...,XEL,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-20,-0.011833,0.006354,0.016843,-0.010259,0.000956,-0.010072,-0.009356,-0.011271,0.005327,0.005913,...,-0.001695,-0.007955,-0.019628,0.000272,0.007958,-0.012549,-0.010186,0.009231,0.009605,-0.003540
2020-02-21,0.008656,-0.024202,-0.013125,-0.022635,0.007747,-0.011418,-0.009397,-0.015859,-0.014372,-0.007461,...,0.002123,-0.012195,-0.010271,-0.011964,-0.001579,-0.003466,-0.006313,-0.017839,-0.017730,-0.009195
2020-02-24,-0.053721,-0.085190,-0.004978,-0.047500,-0.019166,-0.032247,-0.035537,-0.041668,-0.043988,-0.024601,...,-0.006354,-0.046846,-0.037467,-0.040726,-0.023610,-0.031495,-0.019442,-0.046101,-0.036760,-0.026998
2020-02-25,-0.031677,-0.091552,-0.016511,-0.033872,-0.042517,-0.041357,-0.035526,-0.026888,-0.023006,-0.055815,...,-0.021742,-0.038325,-0.058845,0.045898,-0.024297,-0.022943,-0.029806,-0.032542,-0.031535,-0.025797
2020-02-26,0.002053,-0.035035,-0.022239,0.015864,-0.008634,0.002835,-0.007255,0.010178,0.001570,-0.013851,...,-0.007118,-0.021956,-0.012039,-0.020022,-0.001665,-0.004594,-0.021172,-0.009955,-0.013214,0.009865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-22,-0.003481,-0.038509,-0.006792,0.028465,-0.008774,0.001758,-0.007459,0.011597,0.011852,-0.012963,...,-0.002001,-0.016925,-0.012014,0.000000,-0.013106,-0.011323,0.015007,0.010774,-0.009192,0.006155
2020-12-23,-0.000596,0.026486,-0.000690,-0.006976,0.004669,-0.007665,-0.007824,-0.014400,-0.007347,0.012723,...,-0.008021,0.012852,-0.001934,0.017873,0.003320,-0.005680,-0.012265,-0.000579,0.034967,-0.012235
2020-12-24,0.000085,-0.014475,0.008726,0.007712,-0.000194,0.008376,-0.000544,0.005937,0.008728,0.001824,...,0.004976,-0.004070,0.004069,-0.008780,0.005214,0.007023,0.003429,-0.007505,-0.003907,0.005443
2020-12-28,0.004433,0.025543,-0.011700,0.035766,0.001840,-0.005168,0.009250,-0.001820,0.001454,0.005259,...,0.010676,0.003366,0.009456,0.031001,0.001197,0.020830,-0.002412,0.015124,0.000461,0.010390


In [133]:
rets_past = rets.iloc[-252:]
isin_past = is_in.iloc[-252:]


stress_rets_final = pd.concat([rets_past,stress_rets])
is_in_stress_final = pd.concat([isin_past, is_in_stress], axis=0).sort_index()



In [134]:
is_in_stress_final

ticker,A,AAL,AAP,AAPL,ABBV,ABT,ACN,ADBE,ADI,ADM,...,XEL,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-20,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
2019-02-21,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
2019-02-22,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
2019-02-25,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
2019-02-26,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-22,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-12-23,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-12-24,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2020-12-28,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [135]:
common_cols = stress_rets_final.columns.intersection(is_in_stress_final.columns)
stress_rets_final = stress_rets_final[common_cols]
is_in_stress_final = is_in_stress_final[common_cols]
is_in_stress_final = is_in_stress_final.reindex_like(stress_rets_final).fillna(0).astype(int)


In [43]:
legs_df, summary = backtest_markowitz_independent_legs_from_returns(
    stress_rets_final, is_in_stress_final
)
summary

{'legs': 10,
 'avg_net_21d_return': 0.02032604116050905,
 'std_net_21d_return': 0.14447030630484925,
 'annual_return_from_avg_leg': 0.27311505056056684,
 'annual_vol_from_leg_std': 0.5004598214100744,
 'sharpe_from_legs': 0.5457282260762701,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [136]:
ridge_params = {"df_train": df_out, "l2": 206.913808111479}
legs_df_ridge, summary_ridge = backtest_markowitz_independent_legs_from_returns(stress_rets_final, is_in_stress_final, mode = 'ridge', ml_params = ridge_params, markowitz_version = "standard")

summary_ridge

{'legs': 10,
 'avg_net_21d_return': 0.06473511370790971,
 'std_net_21d_return': 0.23781050673361076,
 'annual_return_from_avg_leg': 1.1227503535064463,
 'annual_vol_from_leg_std': 0.8237997604726288,
 'sharpe_from_legs': 1.362892303904415,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [137]:
lasso_params = {"df_train": df_out, "l1": 0.00026366508987303583}

legs_df_lasso, summary_lasso = backtest_markowitz_independent_legs_from_returns(stress_rets_final, is_in_stress_final,
    mode='lasso',
    ml_params=lasso_params,
    markowitz_version='standard'
)

summary_lasso

{'legs': 10,
 'avg_net_21d_return': 0.06649762207420015,
 'std_net_21d_return': 0.24660221106955285,
 'annual_return_from_avg_leg': 1.165303099177561,
 'annual_vol_from_leg_std': 0.8542551176625794,
 'sharpe_from_legs': 1.3641160293731385,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}

In [138]:
enet_params = {"df_train": df_out, "lambda total": 0.0006951927961775605, "l1 ratio": 0.0}

legs_df_enet, summary_enet = backtest_markowitz_independent_legs_from_returns(stress_rets_final, is_in_stress_final,
    mode='elasticnet',
    ml_params=enet_params,
    markowitz_version='standard'
)

summary_enet

Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

{'legs': 10,
 'avg_net_21d_return': 0.06804220306904782,
 'std_net_21d_return': 0.24122214070551015,
 'annual_return_from_avg_leg': 1.2032357359338528,
 'annual_vol_from_leg_std': 0.8356180072249444,
 'sharpe_from_legs': 1.4399351444444728,
 'cost_bps_one_way': 5.0,
 'gamma': 5.0,
 'name_cap': 0.1}