In [140]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import sys

sys.path.append("../cmds")
from utils import (
    calc_univariate_regression,
    calc_iterative_regression,
    calc_performance_metrics
)


# Plotting defaults: show the grid and hide the top/right spines.
plt.rcParams.update(
    {
        "axes.grid": True,
        "axes.spines.right": False,
        "axes.spines.top": False,
    }
)

# Render floats with four decimals in pandas output.
pd.set_option("display.float_format", "{:.4f}".format)

# Column groups used for return/risk summaries.
RETURN_COLS = [
    "Annualized Return",
    "Annualized Volatility",
    "Annualized Sharpe Ratio",
]
RISK_COLS = [
    "Skewness",
    "Excess Kurtosis",
    "VaR (0.05)",
    "CVaR (0.05)",
    "Max Drawdown",
    "Bottom",
    "Peak",
    "Recovery",
    "Duration (days)",
]

# Annualization factor (12 periods per year).
ADJ = 12


def calc_return_metrics(data, as_df=False, adj=12):
    """
    Calculate annualized return metrics for a returns dataset:
    - Annualized Return      : mean * adj
    - Annualized Volatility  : std * sqrt(adj)
    - Annualized Sharpe Ratio: annualized return / annualized volatility
    - Annualized Sortino Ratio: annualized return / annualized downside volatility,
      where downside volatility is the std of the strictly negative returns
      (not part of the course, but useful to know)

    Args:
        data : Returns time series (one column per asset).
        as_df (bool, optional): Return a DataFrame instead of a dict. Defaults to False.
        adj (int, optional): Annualization factor (periods per year). Defaults to 12.

    Returns:
        DataFrame or dict: Summary of return metrics. With ``as_df=True`` the
        frame is indexed by ``data.columns``.
    """
    ann_return = data.mean() * adj
    ann_vol = data.std() * np.sqrt(adj)
    # Annualize the downside deviation with sqrt(adj), exactly like the total
    # volatility, so the Sortino ratio is on the same scale as the Sharpe ratio.
    # (Multiplying the already-annualized return by sqrt(adj) would overstate
    # the ratio by a factor of adj.)
    ann_downside_vol = data[data < 0].std() * np.sqrt(adj)

    summary = {
        "Annualized Return": ann_return,
        "Annualized Volatility": ann_vol,
        "Annualized Sharpe Ratio": ann_return / ann_vol,
        "Annualized Sortino Ratio": ann_return / ann_downside_vol,
    }

    return pd.DataFrame(summary, index=data.columns) if as_df else summary

# Mean Variance Portfolio Functions

In [111]:
def tan_portfolio(mean_rets, cov_matrix):
    """
    Tangency portfolio weights: the inverse covariance matrix applied to the
    mean returns, rescaled so that the weights sum to one.

    Args:
        mean_rets: Vector of mean returns.
        cov_matrix: Covariance matrix of returns.

    Returns:
        Vector of tangency portfolio weights.
    """
    precision = np.linalg.inv(cov_matrix)
    unit = np.ones(mean_rets.shape)

    unscaled = precision @ mean_rets
    normalizer = unit.T @ precision @ mean_rets
    return unscaled / normalizer


def gmv_portfolio(cov_matrix):
    """
    Function to calculate the weights of the global minimum variance portfolio:
    the inverse covariance matrix applied to a vector of ones, rescaled so the
    weights sum to one.

    Args:
        cov_matrix : Square covariance matrix of returns. Anything numpy can
            convert to a 2-D float array is accepted (DataFrame, ndarray,
            nested lists).

    Returns:
        numpy.ndarray: Vector of GMV portfolio weights.
    """
    # Convert once up front so the asset count does not rely on a pandas-only
    # attribute such as `.index`.
    cov = np.asarray(cov_matrix, dtype=float)
    cov_inv = np.linalg.inv(cov)

    one_vector = np.ones(cov.shape[0])
    return cov_inv @ one_vector / (one_vector @ cov_inv @ one_vector)


def mv_portfolio(mean_rets, cov_matrix, target=None):
    """
    Mean-variance portfolio weights.

    Without a target this is simply the tangency portfolio. With a target, the
    result is the combination of the tangency and GMV portfolios whose mean
    return equals the target.

    Args:
        mean_rets : Vector of mean returns.
        cov_matrix : Covariance matrix of returns.
        target (optional): Target mean return. Defaults to None. It must be
            expressed at the same frequency as ``mean_rets`` (e.g. a monthly
            target for monthly mean returns).

    Returns:
        Vector of MV portfolio weights.
    """
    tangency = tan_portfolio(mean_rets, cov_matrix)
    if target is None:
        return tangency

    min_var = gmv_portfolio(cov_matrix)
    mu_gmv = mean_rets @ min_var
    mu_tan = mean_rets @ tangency

    # Share placed in the tangency portfolio; the remainder goes to the GMV portfolio.
    delta = (target - mu_gmv) / (mu_tan - mu_gmv)
    return delta * tangency + (1 - delta) * min_var

# OLS Formulas/Functions

In [112]:
def performanceMetrics(returns, annualization=1, quantile=.05):
    """
    Per-column mean, volatility, Sharpe ratio, minimum and maximum of returns.

    Args:
        returns: DataFrame of returns, one column per series.
        annualization: Periods per year. The mean is scaled by it; volatility
            and Sharpe are scaled by its square root. Min/Max are not scaled.
        quantile: Accepted for signature compatibility; not used here.

    Returns:
        DataFrame indexed by ``returns.columns`` with columns
        Mean, Vol, Sharpe, Min, Max.
    """
    avg = returns.mean()
    vol = returns.std()
    root_scale = np.sqrt(annualization)

    return pd.DataFrame(index=returns.columns).assign(
        Mean=avg * annualization,
        Vol=vol * root_scale,
        Sharpe=(avg / vol) * root_scale,
        Min=returns.min(),
        Max=returns.max(),
    )

def tailMetrics(returns, quantile=.05, relative=False, mdd=True):
    """
    Tail-risk summary for each column of a returns DataFrame.

    Args:
        returns: DataFrame of returns, one column per series.
        quantile: Tail probability used for VaR and CVaR. Defaults to 0.05.
        relative: If True, VaR, CVaR (and Max Drawdown when ``mdd`` is set) are
            re-expressed as (value - mean) / std of the returns.
        mdd: If True, join the maximum-drawdown statistics onto the result.

    Returns:
        DataFrame indexed by ``returns.columns`` with Skewness, Kurtosis,
        ``VaR (q)``, ``CVaR (q)`` and, when ``mdd`` is True, Max Drawdown,
        Peak, Bottom, Recover and Duration (to Recover).
    """

    def maximumDrawdown(returns):
        """Max drawdown per column plus its peak, bottom and recovery dates."""
        cum_returns = (1 + returns).cumprod()
        rolling_max = cum_returns.cummax()
        drawdown = (cum_returns - rolling_max) / rolling_max

        max_drawdown = drawdown.min()
        end_date = drawdown.idxmin()
        summary = pd.DataFrame({'Max Drawdown': max_drawdown, 'Bottom': end_date})

        for col in drawdown:
            # Peak: where the running maximum is attained up to the bottom date.
            summary.loc[col,'Peak'] = (rolling_max.loc[:end_date[col],col]).idxmax()
            recovery = (drawdown.loc[end_date[col]:,col])
            try:
                # First date at/after the bottom where the drawdown is back to zero.
                summary.loc[col,'Recover'] = pd.to_datetime(recovery[recovery >= 0].index[0])
            except Exception:
                # Best effort: no recovery inside the sample (or an index that
                # cannot be read as a date) leaves the recovery date missing.
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit still propagate.
                summary.loc[col,'Recover'] = pd.to_datetime(None)

            summary['Peak'] = pd.to_datetime(summary['Peak'])
            try:
                summary['Duration (to Recover)'] = (summary['Recover'] - summary['Peak'])
            except Exception:
                # Best effort: leave the duration empty if the dates cannot be subtracted.
                summary['Duration (to Recover)'] = None

            summary = summary[['Max Drawdown','Peak','Bottom','Recover','Duration (to Recover)']]

        return summary

    metrics = pd.DataFrame(index=returns.columns)
    metrics['Skewness'] = returns.skew()
    metrics['Kurtosis'] = returns.kurtosis()

    VaR = returns.quantile(quantile)
    # CVaR: mean of the returns strictly below the VaR quantile.
    CVaR = (returns[returns < returns.quantile(quantile)]).mean()

    if relative:
        VaR = (VaR - returns.mean())/returns.std()
        CVaR = (CVaR - returns.mean())/returns.std()

    metrics[f'VaR ({quantile})'] = VaR
    metrics[f'CVaR ({quantile})'] = CVaR

    if mdd:
        mdd_stats = maximumDrawdown(returns)
        metrics = metrics.join(mdd_stats)

        if relative:
            # NOTE(review): this standardizes a cumulative drawdown by per-period
            # mean/std; kept as in the original, but confirm it is intended.
            metrics['Max Drawdown'] = (metrics['Max Drawdown'] - returns.mean())/returns.std()

    return metrics


def get_ols_metrics(regressors, targets, annualization=1, ignorenan=True):
    """
    Regress each target column on the regressors (OLS with intercept) and
    collect the estimates in one table.

    Args:
        regressors: Series or DataFrame of explanatory returns.
        targets: Series or DataFrame of returns to explain, one regression per column.
        annualization: Periods per year. Alpha and the Treynor ratio are scaled
            by it, the Info Ratio by its square root.
        ignorenan: If True, each regression uses only the rows where the target
            and all regressors are non-null.

    Returns:
        DataFrame indexed by the target columns with 'alpha', one beta column
        per regressor, 'r-squared', 'Treynor Ratio' (single-regressor case
        only) and 'Info Ratio'.
    """
    # ensure regressors and targets are pandas dataframes, as expected
    if not isinstance(regressors, pd.DataFrame):
        regressors = regressors.to_frame()
    if not isinstance(targets, pd.DataFrame):
        targets = targets.to_frame()

    # align the targets and regressors on the same dates
    df_aligned = targets.join(regressors, how='inner', lsuffix='y ')
    Y = df_aligned[targets.columns]
    Xset = df_aligned[regressors.columns]

    reg = pd.DataFrame(index=targets.columns)
    for col in Y.columns:
        y = Y[col]

        if ignorenan:
            # ensure we use only non-NaN dates
            alldata = Xset.join(y,lsuffix='X')
            mask = alldata.notnull().all(axis=1)
            y = y[mask]
            X = Xset[mask]
        else:
            X = Xset

        model = LinearRegression().fit(X, y)
        reg.loc[col, 'alpha'] = model.intercept_ * annualization
        reg.loc[col, regressors.columns] = model.coef_
        reg.loc[col, 'r-squared'] = model.score(X, y)

        # sklearn does not return the residuals, so we need to build them
        yfit = model.predict(X)
        residuals = y - yfit

        # Treynor Ratio is only defined for univariate regression
        if Xset.shape[1] == 1:
            # coef_ is a length-1 array here; take the scalar beta so a plain
            # float (not an array) is written into the cell.
            reg.loc[col,'Treynor Ratio'] = (y.mean() / model.coef_[0]) * annualization

        # if intercept =0, numerical roundoff will nonetheless show nonzero Info Ratio
        num_roundoff = 1e-12
        if np.abs(model.intercept_) < num_roundoff:
            reg.loc[col, 'Info Ratio'] = None
        else:
            reg.loc[col, 'Info Ratio'] = (model.intercept_ / residuals.std()) * np.sqrt(annualization)

    return reg

def tangency_portfolio(data):
    """
    Tangency portfolio weights estimated from a DataFrame of returns.

    Uses the sample mean and the inverse of the sample covariance matrix of
    ``data``; the weights are rescaled to sum to one.
    """
    inv_cov = np.linalg.inv(data.cov())
    mean_rets = data.mean()
    ones = np.ones(len(data.columns))

    unscaled = inv_cov @ mean_rets
    normalizer = ones @ inv_cov @ mean_rets
    return unscaled / normalizer

def tangency_portfolio_allocation(data, target_return = 0.01):
    """
    Scale factor to apply to the tangency weights so that the resulting
    portfolio has mean return ``target_return``.

    Computed as (1' S^-1 mu) / (mu' S^-1 mu) * target_return, where mu and S
    are the sample mean and sample covariance of ``data``.

    Args:
        data: DataFrame of returns, one column per asset.
        target_return: Target mean return, at the same frequency as ``data``.

    Returns:
        Scalar multiplier for the tangency portfolio weights.
    """
    mu = data.mean()
    inv_cov = np.linalg.inv(data.cov())
    one_vector = np.ones(len(data.columns))
    # The tangency weights themselves are not needed for the scale factor, so
    # they are no longer recomputed here.
    return ((one_vector @ inv_cov @ mu) / (mu @ inv_cov @ mu)) * target_return

##  1 Return Analysis (25pts)
1. (a) (5pts) For each of the 10 assets, report the following annualized excess return statistics:
• mean
• volatility
• Sharpe ratio

In [113]:
# Parse the workbook once and pull the three sheets out by position:
# 0 = factors, 1 = assets, 2 = risk-free rate. Each sheet is indexed by 'Date'.
DATA_PATH = './midterm_A_data-1.xlsx'
sheets = pd.read_excel(DATA_PATH, sheet_name=[0, 1, 2])
factors, assets, risk_free = (sheets[i].set_index('Date') for i in range(3))

In [114]:
# Monthly mean/std per asset, then annualized (x12 for the mean, x sqrt(12) for the std).
mean_vol_table = assets.describe().loc[['mean', 'std']].T
annualized_mean = mean_vol_table['mean'] * 12
annualized_std = mean_vol_table['std'] * (12**0.5)
mean_vol_table['Annualized_Mean'] = annualized_mean
mean_vol_table['Annualized_Std'] = annualized_std
mean_vol_table['Annualized_Sharpe_Ratio'] = (
    mean_vol_table['Annualized_Mean'] / mean_vol_table['Annualized_Std']
)

# Same table ranked from best to worst Sharpe ratio.
mean_vol_table_print = mean_vol_table.sort_values('Annualized_Sharpe_Ratio', ascending=False)

annualized_cols = ['Annualized_Mean', 'Annualized_Std', 'Annualized_Sharpe_Ratio']
mean_vol_table[annualized_cols].style.format('{:,.2%}')


Unnamed: 0,Annualized_Mean,Annualized_Std,Annualized_Sharpe_Ratio
NoDur,8.66%,12.49%,69.28%
Durbl,10.73%,30.07%,35.69%
Manuf,8.71%,17.37%,50.16%
Enrgy,9.98%,24.60%,40.56%
HiTec,7.24%,23.77%,30.45%
Telcm,2.35%,18.19%,12.91%
Shops,8.33%,15.81%,52.72%
Hlth,7.86%,14.23%,55.28%
Utils,9.16%,14.53%,63.03%
Other,6.56%,18.44%,35.58%


  (b) (2pts) Which asset has the
• highest mean return?
• highest Sharpe ratio?


In [115]:
# Look up the extremes by label. Concatenating a str with a pandas Index
# returns an Index, which is why the old output printed as "Index([...])".
sharpe_ratios = mean_vol_table['Annualized_Sharpe_Ratio']
print(f"Asset with Max Mean Return: {mean_vol_table['Annualized_Mean'].idxmax()}")
print(f"Best Sharpe Ratio: {sharpe_ratios.max():.2f} for {sharpe_ratios.idxmax()}")
print(f"Worst Sharpe Ratio: {sharpe_ratios.min():.2f} for {sharpe_ratios.idxmin()}")


Asset with Max Mean Return: Durbl
Index(['Best Sharpe Ratio: 0.69 for NoDur'], dtype='object')
Index(['Worst Sharpe Ratio: 0.13 for Telcm'], dtype='object')


2. (5pts) For each of the 10 assets, report the following statistics (no annualization needed).

• VaR (0.05). That is to say, the 5th quantile of returns.

• CVaR (0.05). That is to say, the average of the returns less than the 5th quantile.

• maximum drawdown. (Though we usually calculate maximum drawdown on total returns,
keep things simple and just continue to use the excess returns we’re already using in all
the other problems.)

In [116]:
# 5% VaR: the 5th percentile of each asset's returns.
var_05 = assets.quantile(0.05, axis=0)
# 5% CVaR: average of the returns at or below that percentile.
cvar_05 = assets[assets <= var_05].mean()

# Cumulative returns on $1000
wealth_index = 1000 * (1 + assets).cumprod()
previous_peaks = wealth_index.cummax()
# Biggest difference between cumulative max and your current wealth
drawdowns = (wealth_index - previous_peaks) / previous_peaks

# Built directly as a DataFrame; the old intermediate dict was named `vars`,
# which shadowed the Python builtin.
summary = pd.DataFrame(
    {
        "VaR (0.05)": var_05,
        "CVaR (0.05)": cvar_05,
        "Max Drawdown": drawdowns.min(),
    }
)

summary.style.format('{:,.2%}')


Unnamed: 0,VaR (0.05),CVaR (0.05),Max Drawdown
NoDur,-5.73%,-7.95%,-34.99%
Durbl,-11.70%,-16.07%,-76.69%
Manuf,-8.50%,-11.57%,-54.91%
Enrgy,-10.71%,-14.72%,-66.39%
HiTec,-12.12%,-15.96%,-81.35%
Telcm,-9.53%,-12.16%,-77.39%
Shops,-7.48%,-9.67%,-40.91%
Hlth,-7.18%,-8.85%,-42.58%
Utils,-6.46%,-9.72%,-39.23%
Other,-8.41%,-12.56%,-69.57%


In [117]:
corrmat = assets.corr()

# Keep each unordered pair exactly once by masking everything except the strict
# lower triangle. This avoids testing floats for equality with 1 and avoids
# relying on every pair appearing twice in the ranking.
below_diagonal = np.tril(np.ones(corrmat.shape, dtype=bool), k=-1)
corr_rank = corrmat.where(below_diagonal).unstack().dropna().sort_values(ascending=False)

# Rank on the signed correlation: with absolute values, a strongly negative
# pair would be reported as the "highest" and could never be the "lowest".
print('Pair of assets with highest correlation: ' + str(corr_rank.index[0]))
print('Pair of assets with lowest correlation: ' + str(corr_rank.index[-1]))

Pair of assets with highest correlation: ('Manuf', 'Other')
Pair of assets with lowest correlation: ('Utils', 'HiTec')


## 2 Mean-Variance Optimization (35pts)

1. (10pts) Calculate the weights of the tangency portfolio formed from the 10 assets.

In [118]:
tangency_portfolio_weights = tangency_portfolio(assets)

# Show the weights from largest to smallest, formatted as percentages.
tangency_label = 'Weights for the Tangency Portfolio'
(
    pd.DataFrame(tangency_portfolio_weights, index=assets.columns, columns=[tangency_label])
    .sort_values(tangency_label, ascending=False)
    .style.format('{:,.2%}')
)

Unnamed: 0,Weights for the Tangency Portfolio
NoDur,74.75%
Shops,42.92%
Utils,29.65%
Hlth,28.27%
HiTec,16.23%
Enrgy,10.91%
Manuf,6.62%
Durbl,1.60%
Other,-49.59%
Telcm,-61.36%


2. (a) (5pts) What are the weights of the optimal portfolio, w∗, with a targeted mean excess
return of 0.01 per month?

In [119]:
# Allocation to tangency portfolio
portfolio_weights = tangency_portfolio_weights * tangency_portfolio_allocation(assets, )
weights = pd.DataFrame(portfolio_weights, index = assets.columns, columns = ['Weights for the Target Portfolio']).sort_values('Weights for the Target Portfolio', ascending = False).style.format('{:,.2%}')
weights

Unnamed: 0,Weights for the Target Portfolio
NoDur,67.44%
Shops,38.72%
Utils,26.74%
Hlth,25.51%
HiTec,14.64%
Enrgy,9.84%
Manuf,5.97%
Durbl,1.45%
Other,-44.74%
Telcm,-55.35%


(b) (5pts) Is the optimal portfolio, w∗, invested in the risk-free rate?

In [120]:
# Whatever is not allocated to the risky assets sits in the risk-free rate:
# roughly 9.79% for the optimal portfolio w*.
risk_free_weight = 1 - portfolio_weights.sum()
risk_free_weight

0.09785803563216389

3. (5pts) Report the mean, volatility, and Sharpe ratio for the optimized portfolio, w∗, (calculated
in the previous question.) Annualize the statistics.

In [121]:
# Return series of the optimal portfolio, then its annualized summary.
optimal_returns = pd.DataFrame(assets @ portfolio_weights, columns=['Optimal Portfolio'])
performanceMetrics(optimal_returns, annualization=12).style.format('{:,.2%}')

Unnamed: 0,Mean,Vol,Sharpe,Min,Max
Optimal Portfolio,12.00%,12.24%,98.01%,-10.19%,13.61%


4. (5pts) Suppose an endowment is optimizing the multi-asset-class ETFs we used in our case study
of Homework 1.
Briefly describe one thing that should be considered in deciding whether to make Bitcoin an
(additional) asset class for the endowment’s allocation.

ANSWER: Bitcoin's correlation with the already existing asset classes

5. (5pts) In Homework 1, we tried estimating the mean-variance solution using data from 2009-2020
and testing it out of sample in 2021-2022.

We found the out-of-sample performance of the MV solution was worse than an equally weighted
portfolio.

Explain from a technical perspective why MV failed out of sample.

ANSWER: The mean-variance solution using data from 2009-2020 was constructed from the attributes (volatility, mean returns, Sharpe ratio, etc.) of the assets during that period. If these attributes change, the delicately constructed mean-variance solution for 2009-2020 is no longer optimal: it drifts away from the actual tangency portfolio.

MV Optimization fails out-of-sample for two reasons:

1. ***Imprecise estimation of the covariance matrix:*** The covariance matrix is poorly estimated when there are many assets or little historical data. Inverting the covariance matrix makes the estimate even more fragile: a matrix with highly correlated assets has a large condition number, so small estimation errors are amplified in the inverse. As a result, our covariance estimates will likely not hold out-of-sample.
2. ***High sensitivity to changes in mean returns:*** The MV optimizer is highly sensitive to small changes in the estimated mean returns of the security pool. Even small changes in mean returns require large swings in portfolio weights to stay optimal. Because of this, the MV optimizer does not perform well on out-of-sample data.

## 3 Pricing (25pts)

1. (10pts) Test a 3-factor pricing model on the 10 assets. All this data is already given in excess
returns, so no further adjustment is needed.
Report the
• annualized alphas
• annualized Information ratios
• r-squared statistics

In [172]:
# Time-series regression of every asset on the three factors; alphas are
# annualized (x12) and Info Ratios are scaled by sqrt(12).
ols_metrics = get_ols_metrics(regressors=factors, targets=assets, annualization=12)
ols_metrics

Unnamed: 0,alpha,MKT,HML,RMW,r-squared,Info Ratio
NoDur,0.0178,0.6646,0.1256,0.4079,0.6382,0.2374
Durbl,-0.0191,1.5866,0.2004,0.3318,0.6384,-0.1055
Manuf,-0.0038,1.0791,0.1853,0.2888,0.8813,-0.0628
Enrgy,0.0085,0.991,0.6511,0.138,0.4757,0.0477
HiTec,0.0307,1.216,-0.4838,-0.4091,0.8918,0.3926
Telcm,-0.0357,0.9356,0.0203,-0.0338,0.6651,-0.3393
Shops,0.0042,0.9406,-0.0527,0.3721,0.761,0.054
Hlth,0.0336,0.6642,-0.086,0.0881,0.5136,0.3391
Utils,0.0373,0.518,0.1534,0.2996,0.3003,0.3072
Other,-0.0211,1.0762,0.5095,0.0309,0.9201,-0.4039


2. (5pts) Which asset does the pricing model fit best?

ANSWER: The pricing model fits the asset category "Other" the best with an r-squared of 92.01%

3. (5pts) Instead of the 3-factor model above, suppose the CAPM is true and fits perfectly in our
sample.
For n assets, what do we know about their...
• time-series r-squared metrics?
• Treynor Ratios?
• Information Ratios?

**time-series r-squared metrics:** Nothing. Time-series r-squared values can be high or low even if the CAPM is perfect, because the CAPM doesn't claim to explain the variation in a security's returns over time. The CAPM says that the long-term expected excess return of a security is a linear function of its beta to the market: beta times the market premium.

**Treynor Ratio:** If CAPM were true, Treynor Ratio $\frac{\mathbb{E}[\tilde{r}]}{\beta}$ for every security would be equal to the expected market premium. 

**Information Ratio:** If CAPM were true, there would be no excess expected return against the market premium. Hence alpha would be zero and Info Ratio would be zero.

4. (5pts) Suppose the CAPM is true and fits perfectly in our sample, yet we estimated the 3-factor
model as above.
Would the betas on the extra regressors (HML and RMW) be zero?


ANSWER: Even if the CAPM is true and fits perfectly in the sample, the time-series betas on the other factors need not be zero. These factors can still help explain the variation in a security's return time series, making the time-series error terms even smaller.

## 4 Forecasting (35pts)

1. (7pts) Forecast the market return, MKT, using the lagged risk-free rate as a signal.
Report the beta and r-squared from the regression.

In [162]:
# Pair each month's MKT return with the previous month's risk-free rate by
# joining on the date index, rather than trusting that dropping the first row
# of each frame leaves them positionally aligned.
forecast_data = factors[['MKT']].join(risk_free.shift(), how='inner').dropna()
lag_risk_free = forecast_data[risk_free.columns]
mkt = forecast_data[['MKT']]

model = LinearRegression().fit(lag_risk_free, mkt)
# With a 2-D target, coef_ has shape (n_targets, n_features); take the scalar beta.
model_params = pd.DataFrame(
    {'Beta': model.coef_[0, 0], 'R-Squared': model.score(lag_risk_free, mkt)},
    index=['Forecast'],
)
display(model_params.style.format({'Beta':'{:,.2f}', 'R-Squared' : '{:,.2%}'}))

Unnamed: 0,Beta,R-Squared
Forecast,-4.63,2.31%


**2. Calculate the fitted values.**

$$\hat{y_t} = \alpha + \beta x_t$$

**Use them to calculate the weights:**
$$w_t = 100\hat{y_t} = 100(\alpha + \beta x_t)$$

**Report the final value in the timeseries of $w_t$**

In [164]:
# Fitted values from the forecasting regression, scaled by 100 to get the weights.
predict_mkt = model.predict(lag_risk_free)
wt = predict_mkt * 100

# Final weight in the time series.
print(wt[-1])

[1.12435602]


**3. Use the weights to calculate the strategy return:**
$$r^x_{t+1} = w_t r^{\text{MKT}}_{t+1}$$

Report the **annualized** mean, volatility and Sharpe ratio of the strategy.

In [165]:
# Strategy return: the forecast-based weight times the realized market return.
rt = wt * mkt
performanceMetrics(rt, annualization=12).style.format('{:,.2%}')

Unnamed: 0,Mean,Vol,Sharpe,Min,Max
MKT,9.41%,13.67%,68.85%,-11.36%,15.77%


**4. Estimate the factor decomposition of the strategy versus the MKT factor:**

$$r^x_t = \alpha + \beta r^{\text{MKT}}_t + \epsilon_t$$

**Report**
* the annualized alpha and Info Ratio. 
* beta
* r-squared.

In [170]:
# Regress the strategy return on MKT to split it into alpha and market beta.
strat_model = LinearRegression().fit(mkt, rt)
alpha = strat_model.intercept_[0]
beta = strat_model.coef_[0]
rsq = strat_model.score(mkt, rt)

# Info Ratio = alpha / residual volatility (annualized below with sqrt(12)).
resid = rt - strat_model.predict(mkt)
info_ratio = alpha / resid.std()

decomp = pd.DataFrame(
    {
        'alpha': alpha * 12,
        'Info Ratio': info_ratio * np.sqrt(12),
        'MKT': beta,
        'r-squared': rsq,
    }
)
display(decomp.style.format('{:,.2%}'))

Unnamed: 0,alpha,Info Ratio,MKT,r-squared
MKT,5.94%,54.13%,51.86%,35.54%


**5. Suppose you wanted to hedge the timing strategy against movements in MKT. Based on the previous calculation, explain how to set up this hedge.**

To hedge the strategy above, we would *short* 0.5186 units of the MKT for every unit of investment in the strategy. The optimal hedge ratio of 0.5186 is given by the beta of the regression