# Portfolio Construction: Factor Timing with Regime Model

This notebook implements the factor timing strategy based on regime similarity:

1. For each month t, compute similarity of all prior dates to month t
2. Split prior dates into 5 quintiles by similarity (Q1 = most similar)
3. For each factor, compute average return in month t+1 for dates in each quintile
4. Go LONG factor if avg return of Q1 dates was positive, SHORT if negative
5. Equal-weight across the 6 Fama-French factors

**Factors used**: Mkt-RF, SMB, HML, RMW, CMA, Mom (Fama-French 5 factors + Momentum)

In [2]:
import sys
from pathlib import Path
import importlib
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

repo_root = Path.cwd()
if not (repo_root / 'src').exists() and (repo_root.parent / 'src').exists():
    repo_root = repo_root.parent
if (repo_root / 'src').exists():
    sys.path.insert(0, str(repo_root))

import src.regime_model as regime_model
importlib.reload(regime_model)

from src.regime_model import (
    load_feature_matrix,
    compute_similarity_quintiles,
    compute_rolling_similarity_quintiles,
)

## 1. Load Data

Load the feature matrix (transformed macro variables) and factor returns.

In [3]:
# Load feature matrix
feature_path = repo_root / 'data' / 'processed' / 'feature_matrix_clean.csv'
features = load_feature_matrix(feature_path)
print(f"Features shape: {features.shape}")
print(f"Date range: {features.index.min()} to {features.index.max()}")
features.head()

Features shape: (623, 7)
Date range: 1973-12-31 00:00:00 to 2025-10-31 00:00:00


Unnamed: 0_level_0,market_transformed,yield_curve_transformed,oil ($/bbl)_transformed,copper ($/metric ton)_transformed,monetary_policy_transformed,volatility_transformed,stock_bond_corr_transformed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-12-31,-2.249706,-1.388954,3.0,2.531366,1.35073,2.869834,0.64847
1974-01-31,-2.093306,-1.00179,3.0,1.788305,0.976164,1.914132,0.978212
1974-02-28,-1.689956,-0.862877,3.0,2.037113,0.836261,0.439552,0.995444
1974-03-31,-1.835159,-1.023341,3.0,2.369871,1.011982,-0.051967,0.874601
1974-04-30,-1.717254,-1.542605,3.0,2.712843,1.530578,-0.046426,0.396054


In [31]:
# Load Fama-French 5 factors + Momentum
ff_path = repo_root / 'data' / 'raw' / 'F-F_Research_Data_5_Factors_2x3_with_mom.csv'
ff_raw = pd.read_csv(ff_path)

# Parse dates (format: YYYY-MM)
ff_raw['Date'] = pd.to_datetime(ff_raw['Date'], format='%Y-%m')

# Set to end of month to match feature matrix
ff_raw['Date'] = ff_raw['Date'] + pd.offsets.MonthEnd(0)
ff_raw = ff_raw.set_index('Date').sort_index()

# Remove duplicate dates (keep first occurrence)
ff_raw = ff_raw[~ff_raw.index.duplicated(keep='first')]

# Convert returns from percentage to decimal
factor_cols = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']
factors = ff_raw[factor_cols] / 100

print(f"Factors shape: {factors.shape}")
print(f"Date range: {factors.index.min()} to {factors.index.max()}")
print(f"Factors: {factor_cols}")
factors.head()

Factors shape: (746, 6)
Date range: 1963-07-31 00:00:00 to 2025-08-31 00:00:00
Factors: ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']


Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,Mom
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1963-07-31,-0.0039,-0.0048,-0.0081,0.0064,-0.0115,0.0101
1963-08-31,0.0508,-0.008,0.017,0.004,-0.0038,0.01
1963-09-30,-0.0157,-0.0043,0.0,-0.0078,0.0015,0.0012
1963-10-31,0.0254,-0.0134,-0.0004,0.0279,-0.0225,0.0313
1963-11-30,-0.0086,-0.0085,0.0173,-0.0043,0.0227,-0.0078


In [32]:
# Align features and factors to common dates
common_dates = features.index.intersection(factors.index)
print(f"Common dates: {len(common_dates)} months")
print(f"Date range: {common_dates.min()} to {common_dates.max()}")

features_aligned = features.loc[common_dates]
factors_aligned = factors.loc[common_dates]

Common dates: 621 months
Date range: 1973-12-31 00:00:00 to 2025-08-31 00:00:00


## 2. Compute Rolling Similarity Quintiles

For each month t, compute similarity of all prior dates and assign to quintiles.

In [33]:
# Compute rolling similarity quintiles (this may take a few minutes)
print("Computing rolling similarity quintiles...")
similarity_quintiles = compute_rolling_similarity_quintiles(
    features=features_aligned,
    metric='mahalanobis',
    standardize=True,
    n_quintiles=5,
    min_history=60  # Need at least 5 years of history
)
print(f"Computed quintiles for {len(similarity_quintiles)} dates")

Computing rolling similarity quintiles...
Computed quintiles for 561 dates


In [7]:
# Example: Show quintiles for the most recent date
latest_date = max(similarity_quintiles.keys())
print(f"Similarity quintiles for {latest_date.date()}:")
print(f"\nQuintile 1 (most similar) - top 5:")
q1 = similarity_quintiles[latest_date][similarity_quintiles[latest_date]['quintile'] == 1]
print(q1.head())

Similarity quintiles for 2025-08-31:

Quintile 1 (most similar) - top 5:
          date  distance quintile
619 2025-07-31  0.207965        1
618 2025-06-30  0.607156        1
617 2025-05-31  0.930495        1
215 1991-11-30  1.099717        1
79  1980-07-31  1.201225        1


## 3. Generate Factor Timing Signals

For each month t:
- Get dates in quintile 1 (most similar)
- Look up factor returns in month t+1 for those dates
- If average return > 0 â†’ LONG, else SHORT

In [34]:
def generate_factor_signals(similarity_quintiles, factors, quintile=1):
    """
    Generate factor timing signals based on similar historical dates.
    
    For each date t:
    1. Get dates in the specified quintile (most similar if quintile=1)
    2. Look up factor returns in month t+1 for those similar dates
    3. Compute average return
    4. Signal = +1 if avg > 0 (long), -1 if avg < 0 (short)
    
    Returns:
    --------
    signals : pd.DataFrame
        Factor signals (+1 or -1) for each date and factor
    avg_returns : pd.DataFrame
        Average historical returns used to generate signals
    """
    signals = {}
    avg_returns = {}
    
    # Create shifted factor returns (return in month t+1)
    factors_next = factors.shift(-1)
    
    for target_date, quintile_df in similarity_quintiles.items():
        # Get dates in the specified quintile
        q_dates = quintile_df[quintile_df['quintile'] == quintile]['date'].values
        q_dates = pd.DatetimeIndex(q_dates)
        
        # Get factor returns in month t+1 for those similar dates
        valid_dates = q_dates.intersection(factors_next.index)
        
        if len(valid_dates) > 0:
            # Average return across similar dates
            avg_ret = factors_next.loc[valid_dates].mean()
            avg_returns[target_date] = avg_ret
            
            # Signal: +1 if positive, -1 if negative
            signals[target_date] = np.sign(avg_ret)
    
    signals_df = pd.DataFrame(signals).T
    avg_returns_df = pd.DataFrame(avg_returns).T
    
    return signals_df, avg_returns_df

# Generate signals for each quintile
signals_by_quintile = {}
avg_returns_by_quintile = {}

for q in range(1, 6):
    signals, avg_rets = generate_factor_signals(similarity_quintiles, factors_aligned, quintile=q)
    signals_by_quintile[q] = signals
    avg_returns_by_quintile[q] = avg_rets
    print(f"Quintile {q}: {len(signals)} signals generated")

Quintile 1: 561 signals generated
Quintile 2: 561 signals generated
Quintile 3: 561 signals generated
Quintile 4: 561 signals generated
Quintile 5: 561 signals generated


In [9]:
# Show signals for quintile 1 (most similar dates)
print("Factor signals for Quintile 1 (most similar dates):")
print("\nLast 10 signals:")
signals_by_quintile[1].tail(10)

Factor signals for Quintile 1 (most similar dates):

Last 10 signals:


Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA
2024-11-30,1.0,1.0,-1.0,1.0,1.0
2024-12-31,1.0,1.0,1.0,1.0,1.0
2025-01-31,1.0,1.0,1.0,1.0,-1.0
2025-02-28,1.0,-1.0,-1.0,1.0,-1.0
2025-03-31,1.0,-1.0,-1.0,1.0,-1.0
2025-04-30,1.0,-1.0,-1.0,1.0,1.0
2025-05-31,1.0,1.0,1.0,1.0,1.0
2025-06-30,1.0,1.0,1.0,1.0,1.0
2025-07-31,1.0,1.0,1.0,1.0,1.0
2025-08-31,1.0,1.0,1.0,1.0,1.0


## 4. Backtest Factor Timing Strategy

Compute portfolio returns by applying the signals to factor returns.

In [35]:
def backtest_factor_timing(signals, factors, equal_weight=True):
    """
    Backtest factor timing strategy.
    
    Parameters:
    -----------
    signals : pd.DataFrame
        Factor signals (+1 or -1) for each date
    factors : pd.DataFrame
        Factor returns
    equal_weight : bool
        Whether to equal-weight across factors
    
    Returns:
    --------
    portfolio_returns : pd.Series
        Strategy returns
    factor_returns : pd.DataFrame
        Individual timed factor returns
    """
    # Align signals with next month's factor returns
    # Signal at time t is used to trade at t, earning return from t to t+1
    factors_next = factors.shift(-1)
    
    # Align dates
    common_dates = signals.index.intersection(factors_next.index)
    signals_aligned = signals.loc[common_dates]
    factors_aligned = factors_next.loc[common_dates]
    
    # Compute timed factor returns
    timed_returns = signals_aligned * factors_aligned
    
    # Portfolio return (equal-weighted across factors)
    if equal_weight:
        portfolio_returns = timed_returns.mean(axis=1)
    else:
        portfolio_returns = timed_returns.sum(axis=1)
    
    return portfolio_returns, timed_returns

# Backtest each quintile
portfolio_returns_by_quintile = {}
factor_returns_by_quintile = {}

for q in range(1, 6):
    port_ret, factor_ret = backtest_factor_timing(
        signals_by_quintile[q], 
        factors_aligned
    )
    portfolio_returns_by_quintile[q] = port_ret
    factor_returns_by_quintile[q] = factor_ret

print("Backtest complete for all quintiles")

Backtest complete for all quintiles


## 5. Performance Analysis

In [36]:
def compute_performance_metrics(returns):
    """
    Compute performance metrics for a return series.
    """
    # Remove NaN
    returns = returns.dropna()
    
    if len(returns) == 0:
        return {}
    
    # Annualization factor (monthly data)
    ann_factor = 12
    
    # Metrics
    total_return = (1 + returns).prod() - 1
    ann_return = (1 + returns).prod() ** (ann_factor / len(returns)) - 1
    ann_vol = returns.std() * np.sqrt(ann_factor)
    sharpe = ann_return / ann_vol if ann_vol > 0 else 0
    
    # Max drawdown
    cum_returns = (1 + returns).cumprod()
    rolling_max = cum_returns.expanding().max()
    drawdowns = cum_returns / rolling_max - 1
    max_dd = drawdowns.min()
    
    # Hit rate (% positive months)
    hit_rate = (returns > 0).mean()
    
    return {
        'Total Return': f"{total_return:.1%}",
        'Ann. Return': f"{ann_return:.1%}",
        'Ann. Vol': f"{ann_vol:.1%}",
        'Sharpe': f"{sharpe:.2f}",
        'Max DD': f"{max_dd:.1%}",
        'Hit Rate': f"{hit_rate:.1%}",
        'N Months': len(returns),
    }

# Compute metrics for each quintile
performance_summary = []
for q in range(1, 6):
    metrics = compute_performance_metrics(portfolio_returns_by_quintile[q])
    metrics['Quintile'] = q
    performance_summary.append(metrics)

performance_df = pd.DataFrame(performance_summary)
performance_df = performance_df.set_index('Quintile')
print("Performance by Similarity Quintile (Q1 = Most Similar):")
performance_df

Performance by Similarity Quintile (Q1 = Most Similar):


Unnamed: 0_level_0,Total Return,Ann. Return,Ann. Vol,Sharpe,Max DD,Hit Rate,N Months
Quintile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1003.6%,5.3%,5.7%,0.92,-10.6%,66.8%,560
2,666.4%,4.5%,5.6%,0.79,-14.6%,66.1%,560
3,380.3%,3.4%,5.8%,0.59,-17.3%,62.3%,560
4,389.0%,3.5%,5.2%,0.67,-17.6%,65.0%,560
5,370.3%,3.4%,6.2%,0.55,-20.6%,58.9%,560


In [12]:
# Plot cumulative returns by quintile
cum_returns_df = pd.DataFrame()
for q in range(1, 6):
    cum_ret = (1 + portfolio_returns_by_quintile[q]).cumprod()
    cum_returns_df[f'Q{q}'] = cum_ret

fig = px.line(
    cum_returns_df,
    title='Cumulative Returns by Similarity Quintile (Q1=Most Similar, Q5=Least Similar)',
    labels={'value': 'Cumulative Return', 'variable': 'Quintile'}
)
fig.update_layout(height=500, hovermode='x unified')
fig.show()

In [38]:
# Compute Q1 - Q5 spread (long most similar, short least similar)
q1_q5_spread = portfolio_returns_by_quintile[1] - portfolio_returns_by_quintile[5]
q1_q5_spread = q1_q5_spread.dropna()

print("Q1 - Q5 Spread Performance:")
spread_metrics = compute_performance_metrics(q1_q5_spread)
for k, v in spread_metrics.items():
    print(f"  {k}: {v}")

# Plot spread cumulative return
cum_spread = (1 + q1_q5_spread).cumprod()
fig = px.line(
    x=cum_spread.index,
    y=cum_spread.values,
    title='Q1 - Q5 Spread: Long Most Similar, Short Least Similar',
    labels={'x': 'Date', 'y': 'Cumulative Return'}
)
fig.update_layout(height=400)
fig.show()

Q1 - Q5 Spread Performance:
  Total Return: 114.8%
  Ann. Return: 1.7%
  Ann. Vol: 5.8%
  Sharpe: 0.28
  Max DD: -25.1%
  Hit Rate: 42.3%
  N Months: 560


## 6. Individual Factor Performance (Q1 Strategy)

In [14]:
# Performance by factor for Quintile 1 strategy
q1_factor_returns = factor_returns_by_quintile[1]

factor_performance = []
for factor in factor_cols:
    if factor in q1_factor_returns.columns:
        metrics = compute_performance_metrics(q1_factor_returns[factor])
        metrics['Factor'] = factor
        factor_performance.append(metrics)

factor_perf_df = pd.DataFrame(factor_performance).set_index('Factor')
print("Q1 Strategy Performance by Factor:")
factor_perf_df

Q1 Strategy Performance by Factor:


Unnamed: 0_level_0,Total Return,Ann. Return,Ann. Vol,Sharpe,Max DD,Hit Rate,N Months
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mkt-RF,2436.4%,7.2%,19.1%,0.38,-65.1%,61.8%,560
SMB,97.1%,1.5%,12.1%,0.12,-66.1%,50.7%,560
HML,402.1%,3.5%,17.5%,0.2,-58.9%,53.4%,560
RMW,914.5%,5.1%,11.1%,0.46,-48.0%,59.3%,560
CMA,291.4%,3.0%,10.3%,0.29,-43.1%,51.1%,560


In [15]:
# Compare timed vs untimed factor returns
print("\nComparison: Timed (Q1 Strategy) vs Untimed Factor Returns")
print("=" * 60)

comparison_data = []
for factor in factor_cols:
    # Untimed: just hold the factor
    untimed = factors_aligned[factor].dropna()
    untimed_metrics = compute_performance_metrics(untimed)
    
    # Timed: Q1 strategy
    timed = q1_factor_returns[factor].dropna() if factor in q1_factor_returns.columns else pd.Series()
    timed_metrics = compute_performance_metrics(timed)
    
    comparison_data.append({
        'Factor': factor,
        'Untimed Sharpe': untimed_metrics.get('Sharpe', 'N/A'),
        'Timed Sharpe': timed_metrics.get('Sharpe', 'N/A'),
        'Untimed Return': untimed_metrics.get('Ann. Return', 'N/A'),
        'Timed Return': timed_metrics.get('Ann. Return', 'N/A'),
    })

comparison_df = pd.DataFrame(comparison_data).set_index('Factor')
comparison_df


Comparison: Timed (Q1 Strategy) vs Untimed Factor Returns


Unnamed: 0_level_0,Untimed Sharpe,Timed Sharpe,Untimed Return,Timed Return
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mkt-RF,0.52,0.38,10.2%,7.2%
SMB,0.12,0.12,1.5%,1.5%
HML,0.18,0.2,3.1%,3.5%
RMW,0.5,0.46,5.4%,5.1%
CMA,0.43,0.29,4.3%,3.0%


## 7. Save Results

In [48]:
# Save portfolio returns by quintile
output_dir = repo_root / 'data' / 'processed'

# Combine all quintile returns into one DataFrame
all_returns = pd.DataFrame(portfolio_returns_by_quintile)
all_returns.columns = [f'quintile_{q}' for q in range(1, 6)]
all_returns.to_csv(output_dir / 'portfolio_returns_by_quintile.csv')

# Save Q1 factor signals
signals_by_quintile[1].to_csv(output_dir / 'factor_signals_q1.csv')

# Save performance summary
performance_df.to_csv(output_dir / 'portfolio_performance_summary.csv')

print("Results saved to:")
print(f"  - {output_dir / 'portfolio_returns_by_quintile.csv'}")
print(f"  - {output_dir / 'factor_signals_q1.csv'}")
print(f"  - {output_dir / 'portfolio_performance_summary.csv'}")

Results saved to:
  - /Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/portfolio_returns_by_quintile.csv
  - /Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/factor_signals_q1.csv
  - /Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/portfolio_performance_summary.csv


In [37]:
# Compare Q1 Timed vs Untimed Equal-Weighted FF5+Mom Portfolio
print("=" * 60)
print("Q1 TIMED vs UNTIMED EW FF5+MOM PORTFOLIO COMPARISON")
print("=" * 60)

# Untimed: Equal-weighted buy-and-hold of all 6 factors
untimed_ew = factors_aligned.mean(axis=1)  # EW across 6 factors
untimed_ew_metrics = compute_performance_metrics(untimed_ew)

# Q1 Timed: Already computed
q1_timed_metrics = compute_performance_metrics(portfolio_returns_by_quintile[1])

print("\n                      Untimed EW FF5+Mom    Q1 Timed EW FF5+Mom")
print("-" * 60)
for key in ['Ann. Return', 'Ann. Vol', 'Sharpe', 'Max DD', 'Hit Rate']:
    untimed_val = untimed_ew_metrics.get(key, 'N/A')
    timed_val = q1_timed_metrics.get(key, 'N/A')
    print(f"{key:20}  {untimed_val:>18}  {timed_val:>18}")

# Plot cumulative returns comparison
import plotly.graph_objects as go

cum_untimed = (1 + untimed_ew).cumprod()
cum_timed = (1 + portfolio_returns_by_quintile[1]).cumprod()

fig = go.Figure()
fig.add_trace(go.Scatter(x=cum_untimed.index, y=cum_untimed.values, name='Untimed EW FF5+Mom', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=cum_timed.index, y=cum_timed.values, name='Q1 Timed EW FF5+Mom', line=dict(color='green')))
fig.update_layout(
    title='Q1 Timed vs Untimed Equal-Weighted FF5+Mom Portfolio',
    xaxis_title='Date',
    yaxis_title='Cumulative Return',
    height=450,
    hovermode='x unified'
)
fig.show()

Q1 TIMED vs UNTIMED EW FF5+MOM PORTFOLIO COMPARISON

                      Untimed EW FF5+Mom    Q1 Timed EW FF5+Mom
------------------------------------------------------------
Ann. Return                         6.0%                5.3%
Ann. Vol                            5.9%                5.7%
Sharpe                              1.02                0.92
Max DD                            -13.5%              -10.6%
Hit Rate                           69.2%               66.8%


## 8. Optimized Factor Weights

Instead of equal-weighting factors, we optimize weights to maximize the Sharpe ratio on a training set, then evaluate on a held-out test set.

**Methodology:**
- Split data chronologically: 80% train (earlier dates), 20% test (later dates)
- Optimize weights on Q1 timed factor returns to maximize Sharpe ratio
- Weights sum to 1 (can be negative for shorting)
- Evaluate optimized weights on test set and compare to equal-weighted

In [39]:
from scipy.optimize import minimize

# Get Q1 timed factor returns
q1_factor_returns = factor_returns_by_quintile[1]

# Chronological train/test split (80/20)
n_total = len(q1_factor_returns)
n_train = int(n_total * 0.8)

train_returns = q1_factor_returns.iloc[:n_train]
test_returns = q1_factor_returns.iloc[n_train:]

train_split_date = train_returns.index[-1]
test_start_date = test_returns.index[0]

print(f"Total observations: {n_total}")
print(f"Train set: {len(train_returns)} months ({train_returns.index[0].date()} to {train_split_date.date()})")
print(f"Test set: {len(test_returns)} months ({test_start_date.date()} to {test_returns.index[-1].date()})")
print(f"\nFactors: {list(q1_factor_returns.columns)}")

Total observations: 561
Train set: 448 months (1978-12-31 to 2016-03-31)
Test set: 113 months (2016-04-30 to 2025-08-31)

Factors: ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']


In [40]:
def portfolio_sharpe(weights, returns):
    """
    Compute negative Sharpe ratio for optimization (we minimize).
    """
    weights = np.array(weights)
    portfolio_ret = (returns * weights).sum(axis=1)
    
    ann_return = portfolio_ret.mean() * 12
    ann_vol = portfolio_ret.std() * np.sqrt(12)
    
    if ann_vol == 0:
        return 0
    
    sharpe = ann_return / ann_vol
    return -sharpe  # Negative for minimization

def optimize_weights(returns, allow_short=True):
    """
    Optimize factor weights to maximize Sharpe ratio.
    
    Parameters:
    -----------
    returns : pd.DataFrame
        Factor returns
    allow_short : bool
        If True, weights can be negative (shorting allowed)
        If False, weights must be >= 0 (long only)
    
    Returns:
    --------
    optimal_weights : np.ndarray
    optimization_result : scipy.optimize.OptimizeResult
    """
    n_factors = returns.shape[1]
    
    # Initial guess: equal weights
    w0 = np.ones(n_factors) / n_factors
    
    # Constraint: weights sum to 1
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
    
    # Bounds
    if allow_short:
        bounds = [(-1, 2) for _ in range(n_factors)]  # Allow shorts up to -100%
    else:
        bounds = [(0, 1) for _ in range(n_factors)]  # Long only
    
    result = minimize(
        portfolio_sharpe,
        w0,
        args=(returns,),
        method='SLSQP',
        bounds=bounds,
        constraints=constraints,
        options={'maxiter': 1000}
    )
    
    return result.x, result

print("Optimization functions defined")

Optimization functions defined


In [41]:
# Optimize weights on training set
print("Optimizing factor weights on training set...")
print("=" * 60)

# Long-short optimization (allow negative weights)
optimal_weights_ls, result_ls = optimize_weights(train_returns, allow_short=True)

# Long-only optimization
optimal_weights_lo, result_lo = optimize_weights(train_returns, allow_short=False)

# Display results
weights_df = pd.DataFrame({
    'Factor': factor_cols,
    'Equal Weight': [1/len(factor_cols)] * len(factor_cols),
    'Optimal (Long-Short)': optimal_weights_ls,
    'Optimal (Long-Only)': optimal_weights_lo,
}).set_index('Factor')

print("\nOptimal Factor Weights (trained on training set):")
print(weights_df.round(3))

# Verify weights sum to 1
print(f"\nWeight sums - Long-Short: {optimal_weights_ls.sum():.4f}, Long-Only: {optimal_weights_lo.sum():.4f}")

Optimizing factor weights on training set...

Optimal Factor Weights (trained on training set):
        Equal Weight  Optimal (Long-Short)  Optimal (Long-Only)
Factor                                                         
Mkt-RF         0.167                 0.165                0.163
SMB            0.167                 0.163                0.163
HML            0.167                -0.010                0.000
RMW            0.167                 0.247                0.244
CMA            0.167                 0.252                0.247
Mom            0.167                 0.183                0.182

Weight sums - Long-Short: 1.0000, Long-Only: 1.0000


In [42]:
# Compute portfolio returns for each weighting scheme
def compute_weighted_portfolio(returns, weights):
    """Compute portfolio returns given factor returns and weights."""
    return (returns * weights).sum(axis=1)

# Equal-weighted
ew_weights = np.ones(len(factor_cols)) / len(factor_cols)

# Training set performance
train_ew = compute_weighted_portfolio(train_returns, ew_weights)
train_ls = compute_weighted_portfolio(train_returns, optimal_weights_ls)
train_lo = compute_weighted_portfolio(train_returns, optimal_weights_lo)

# Test set performance (out-of-sample)
test_ew = compute_weighted_portfolio(test_returns, ew_weights)
test_ls = compute_weighted_portfolio(test_returns, optimal_weights_ls)
test_lo = compute_weighted_portfolio(test_returns, optimal_weights_lo)

print("Portfolio returns computed for train and test sets")

Portfolio returns computed for train and test sets


In [47]:
# Performance comparison
print("=" * 80)
print("PERFORMANCE COMPARISON: TIMED vs UNTIMED, EQUAL-WEIGHTED vs OPTIMIZED")
print("=" * 80)

# Get untimed factor returns for train/test periods
# Untimed = buy-and-hold the raw factors (not using Q1 timing signals)
untimed_factors_train = factors_aligned.iloc[:n_train]
untimed_factors_test = factors_aligned.iloc[n_train:]

# Untimed equal-weighted portfolio
untimed_train_ew = untimed_factors_train.mean(axis=1)
untimed_test_ew = untimed_factors_test.mean(axis=1)

# Training set metrics
print("\n--- TRAINING SET (In-Sample) ---")
train_metrics = pd.DataFrame({
    'Untimed EW FF5+Mom': compute_performance_metrics(untimed_train_ew),
    'Q1 Timed EW': compute_performance_metrics(train_ew),
    'Q1 Timed Optimal L/S': compute_performance_metrics(train_ls),
    'Q1 Timed Optimal L-Only': compute_performance_metrics(train_lo),
}).T
print(train_metrics[['Ann. Return', 'Ann. Vol', 'Sharpe', 'Max DD', 'Hit Rate']])

# Test set metrics (OUT-OF-SAMPLE - this is what matters!)
print("\n--- TEST SET (Out-of-Sample) ---")
test_metrics = pd.DataFrame({
    'Untimed EW FF5+Mom': compute_performance_metrics(untimed_test_ew),
    'Q1 Timed EW': compute_performance_metrics(test_ew),
    'Q1 Timed Optimal L/S': compute_performance_metrics(test_ls),
    'Q1 Timed Optimal L-Only': compute_performance_metrics(test_lo),
}).T
print(test_metrics[['Ann. Return', 'Ann. Vol', 'Sharpe', 'Max DD', 'Hit Rate']])

PERFORMANCE COMPARISON: TIMED vs UNTIMED, EQUAL-WEIGHTED vs OPTIMIZED

--- TRAINING SET (In-Sample) ---
                        Ann. Return Ann. Vol Sharpe  Max DD Hit Rate
Untimed EW FF5+Mom             7.1%     6.1%   1.16  -12.8%    72.3%
Q1 Timed EW                    5.9%     5.9%   1.00  -10.6%    69.6%
Q1 Timed Optimal L/S           6.4%     5.6%   1.14   -7.7%    69.4%
Q1 Timed Optimal L-Only        6.4%     5.6%   1.14   -7.8%    69.2%

--- TEST SET (Out-of-Sample) ---
                        Ann. Return Ann. Vol Sharpe  Max DD Hit Rate
Untimed EW FF5+Mom             3.0%     5.1%   0.59  -13.5%    61.3%
Q1 Timed EW                    2.8%     4.9%   0.58   -8.8%    54.9%
Q1 Timed Optimal L/S           2.9%     4.1%   0.70   -6.8%    55.8%
Q1 Timed Optimal L-Only        2.9%     4.1%   0.70   -6.9%    55.8%


In [44]:
# Visualize optimal weights
fig = go.Figure()

fig.add_trace(go.Bar(
    name='Equal Weight',
    x=factor_cols,
    y=ew_weights,
    marker_color='gray'
))

fig.add_trace(go.Bar(
    name='Optimal Long-Short',
    x=factor_cols,
    y=optimal_weights_ls,
    marker_color='blue'
))

fig.add_trace(go.Bar(
    name='Optimal Long-Only',
    x=factor_cols,
    y=optimal_weights_lo,
    marker_color='green'
))

fig.update_layout(
    title='Factor Weights: Equal vs Optimized',
    xaxis_title='Factor',
    yaxis_title='Weight',
    barmode='group',
    height=400
)
fig.add_hline(y=0, line_dash="dash", line_color="black")
fig.show()

In [45]:
# Plot cumulative returns on TEST set (out-of-sample)
fig = go.Figure()

cum_test_ew = (1 + test_ew).cumprod()
cum_test_ls = (1 + test_ls).cumprod()
cum_test_lo = (1 + test_lo).cumprod()

fig.add_trace(go.Scatter(
    x=cum_test_ew.index, y=cum_test_ew.values,
    name='Equal Weight', line=dict(color='gray', width=2)
))
fig.add_trace(go.Scatter(
    x=cum_test_ls.index, y=cum_test_ls.values,
    name='Optimal Long-Short', line=dict(color='blue', width=2)
))
fig.add_trace(go.Scatter(
    x=cum_test_lo.index, y=cum_test_lo.values,
    name='Optimal Long-Only', line=dict(color='green', width=2)
))

fig.update_layout(
    title=f'Out-of-Sample Cumulative Returns (Test Set: {test_start_date.date()} onwards)',
    xaxis_title='Date',
    yaxis_title='Cumulative Return',
    height=450,
    hovermode='x unified'
)
fig.show()

In [46]:
# Full sample performance with optimized weights
print("=" * 80)
print("FULL SAMPLE PERFORMANCE (applying train-optimized weights to all data)")
print("=" * 80)

# Apply optimized weights to full sample
full_ew = compute_weighted_portfolio(q1_factor_returns, ew_weights)
full_ls = compute_weighted_portfolio(q1_factor_returns, optimal_weights_ls)
full_lo = compute_weighted_portfolio(q1_factor_returns, optimal_weights_lo)

full_metrics = pd.DataFrame({
    'Equal Weight': compute_performance_metrics(full_ew),
    'Optimal Long-Short': compute_performance_metrics(full_ls),
    'Optimal Long-Only': compute_performance_metrics(full_lo),
}).T

print("\nFull Sample Metrics:")
print(full_metrics[['Ann. Return', 'Ann. Vol', 'Sharpe', 'Max DD', 'Hit Rate', 'N Months']])

# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"\nOptimal Long-Short Weights: {dict(zip(factor_cols, optimal_weights_ls.round(3)))}")
print(f"Optimal Long-Only Weights:  {dict(zip(factor_cols, optimal_weights_lo.round(3)))}")

FULL SAMPLE PERFORMANCE (applying train-optimized weights to all data)

Full Sample Metrics:
                   Ann. Return Ann. Vol Sharpe  Max DD Hit Rate N Months
Equal Weight              5.3%     5.7%   0.92  -10.6%    66.7%      561
Optimal Long-Short        5.7%     5.4%   1.06   -7.7%    66.7%      561
Optimal Long-Only         5.7%     5.3%   1.06   -7.8%    66.5%      561

SUMMARY

Optimal Long-Short Weights: {'Mkt-RF': np.float64(0.165), 'SMB': np.float64(0.163), 'HML': np.float64(-0.01), 'RMW': np.float64(0.247), 'CMA': np.float64(0.252), 'Mom': np.float64(0.183)}
Optimal Long-Only Weights:  {'Mkt-RF': np.float64(0.163), 'SMB': np.float64(0.163), 'HML': np.float64(0.0), 'RMW': np.float64(0.244), 'CMA': np.float64(0.247), 'Mom': np.float64(0.182)}
