# Homework 5

In [4]:

def stats_dates(df, dates, annual_fac=12):
    """Annualized summary statistics of every column of ``df`` within each date window.

    Parameters
    ----------
    df : pandas.DataFrame
        Periodic returns. Rows are selected with ``df.loc[start:end, col]``,
        so the index must support label slicing with the given bounds.
    dates : iterable of sequences
        Each element supplies a window as ``d[0]`` (start) and ``d[1]`` (end).
    annual_fac : int, default 12
        Periods per year: the mean is scaled by ``annual_fac`` and the
        volatility and Sharpe ratio by ``sqrt(annual_fac)``.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Mean'``, ``'Vol'``, ``'Sharpe'``, ``'VaR (.05)'`` and one
        column per (column, window) pair labelled ``"<col> <start>-<end>"``.
        The VaR row is the 5% quantile of the periodic returns and is *not*
        annualized.
    """
    row_labels = ['Mean', 'Vol', 'Sharpe', 'VaR (.05)']
    scale = np.sqrt(annual_fac)

    # Collect every column first and build the frame once, instead of growing
    # a DataFrame one column at a time.
    columns = {}
    for d in dates:
        start, end = d[0], d[1]
        for col in df.columns:
            window = df.loc[start:end, col]
            mean, vol = window.mean(), window.std()
            # f-string formatting (rather than `+`) also accepts non-string
            # column names and date bounds such as ints or Timestamps.
            columns[f'{col} {start}-{end}'] = [mean * annual_fac,
                                               vol * scale,
                                               (mean / vol) * scale,
                                               window.quantile(.05)]

    return pd.DataFrame(columns, index=row_labels)

def summary_stats(df, annual_fac=12):
    """Annualized mean, volatility and Sharpe ratio for each column of ``df``.

    The mean is scaled by ``annual_fac`` and the standard deviation by
    ``sqrt(annual_fac)``; the Sharpe ratio is their (unrounded) quotient.
    Returns a DataFrame indexed by the columns of ``df`` with columns
    ``'Mean'``, ``'Vol'`` and ``'Sharpe'``, rounded to 4 decimals.
    """
    annual_vol = df.std() * np.sqrt(annual_fac)
    table = (
        (df.mean() * annual_fac)
        .to_frame('Mean')
        .assign(Vol=annual_vol,
                Sharpe=lambda t: t['Mean'] / t['Vol'])
    )
    return table.round(4)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from arch import arch_model
from arch.univariate import GARCH, EWMAVariance 
from sklearn import linear_model
import scipy.stats as stats
from statsmodels.regression.rolling import RollingOLS
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.precision", 4)
sns.set(rc={'figure.figsize':(15, 10)})

# Data

In [6]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# exact file exists. Consider a path relative to the notebook or a configurable data dir.
file_path ='C:/Users/dcste/OneDrive/Portfolio_Theory/Homework_Jupyter/portfolio_theory/factor_pricing_data.xlsx'
# The first sheet (position 0) holds the factor descriptions shown below.
factors_description = pd.read_excel(file_path, sheet_name = 0)
factors_description

Unnamed: 0.1,Unnamed: 0,Name,Unit,Construction,Description
0,MKT,Market,Excess Return,Market-cap-weighted,US Equities
1,SMB,Size,Excess Return,Small Minus Big,Long small stocks and short big stocks
2,HML,Value,Excess Return,High Minus Low,Long value (high book-to-market) stocks and sh...
3,RMW,Profitability,Excess Return,Robust Minus Weak,Long profitability (income statement) and shor...
4,CMA,Investment,Excess Return,Conservative Minus Agressive,Long stocks with low (conservative) investment...
5,UMD,Momentum,Excess Return,Up Minus Down,Long stocks that have high recent returns and ...
6,RF,Risk-free rate,Total Return,Tbills,


In [7]:
# Load the factor excess returns and index them by the "Date" column.
factors = (
    pd.read_excel(file_path, sheet_name = 'factors (excess returns)')
      .set_index("Date")
)
factors.head(4)


Unnamed: 0_level_0,MKT,SMB,HML,RMW,CMA,UMD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-01-31,0.0551,0.0183,0.0175,-0.017,0.0164,0.0755
1980-02-29,-0.0122,-0.0157,0.0061,0.0004,0.0268,0.0788
1980-03-31,-0.129,-0.0693,-0.0101,0.0146,-0.0119,-0.0955
1980-04-30,0.0397,0.0105,0.0106,-0.021,0.0029,-0.0043


In [8]:
# Annualized mean, volatility and Sharpe ratio of each factor over the full sample
# (summary_stats uses its default annual_fac=12, i.e. it treats the rows as monthly).
factor_stats = summary_stats(factors)
factor_stats

Unnamed: 0,Mean,Vol,Sharpe
MKT,0.0831,0.1567,0.5305
SMB,0.0122,0.1005,0.1211
HML,0.0275,0.1088,0.2529
RMW,0.0448,0.0834,0.5376
CMA,0.0333,0.0715,0.4652
UMD,0.0655,0.1545,0.4241


In [9]:
# How has each factor performed from 2015 through 2022?
# (The window end is hard-coded to '2022'; it does not track the end of the data.)

stats_dates(factors, [['2015','2022']])

Unnamed: 0,MKT 2015-2022,SMB 2015-2022,HML 2015-2022,RMW 2015-2022,CMA 2015-2022,UMD 2015-2022
Mean,0.1069,-0.0058,-0.0197,0.0395,0.0022,0.0255
Vol,0.1602,0.0977,0.1325,0.0712,0.0796,0.1368
Sharpe,0.6676,-0.059,-0.1488,0.5553,0.0282,0.1865
VaR (.05),-0.0788,-0.0419,-0.0482,-0.0233,-0.0267,-0.0644


- Over the full sample, every factor earns a positive annualized risk premium (mean excess return).
- Since 2015, all factors have earned a positive premium except the size factor (SMB) and the value factor (HML).

## 3.A Correlation Matrix

 

In [10]:
# Pairwise correlation matrix of the factor excess returns.
factors.corr()

Unnamed: 0,MKT,SMB,HML,RMW,CMA,UMD
MKT,1.0,0.2263,-0.2221,-0.2554,-0.3819,-0.1677
SMB,0.2263,1.0,-0.0721,-0.4143,-0.0642,-0.0304
HML,-0.2221,-0.0721,1.0,0.2295,0.6725,-0.2349
RMW,-0.2554,-0.4143,0.2295,1.0,0.1155,0.0753
CMA,-0.3819,-0.0642,0.6725,0.1155,1.0,-0.0122
UMD,-0.1677,-0.0304,-0.2349,0.0753,-0.0122,1.0


## 3.B

HML is highly correlated with the CMA factor (correlation of 0.67), so one could argue that including both of them is redundant. 

## 4.A Report Tangency Weights for a Portfolio of these 6 Factors

In [11]:
def compute_tangency(df_tilde, diagonalize_Sigma=False):
    """Tangency-portfolio weights for a set of excess-return series.

    Parameters
    ----------
    df_tilde : pandas.DataFrame
        Excess returns, one column per asset.
    diagonalize_Sigma : bool, default False
        If True, every off-diagonal covariance is zeroed before solving, so
        only the variances are used.

    Returns
    -------
    omega_tangency : pandas.Series
        Weights proportional to ``Sigma^{-1} mu``, rescaled to sum to one and
        indexed by asset name.
    mu_tilde : pandas.Series
        Sample mean of each column (not annualized).
    Sigma_adj : pandas.DataFrame
        The covariance matrix actually used (diagonalized if requested).
    """
    Sigma = df_tilde.cov()

    Sigma_adj = Sigma.copy()

    if diagonalize_Sigma:
        # Keep the variances on the diagonal and zero out all covariances.
        Sigma_adj.loc[:,:] = np.diag(np.diag(Sigma_adj))

    mu_tilde = df_tilde.mean()

    # Solve Sigma x = mu directly instead of forming the explicit inverse:
    # same solution, but cheaper and numerically better conditioned.
    unscaled = np.linalg.solve(Sigma_adj.to_numpy(), mu_tilde.to_numpy())

    # Rescale so that the weights sum to one (equivalent to dividing by 1' Sigma^{-1} mu).
    weights = unscaled / unscaled.sum()

    # For convenience, wrap the solution back into a pandas.Series object.
    omega_tangency = pd.Series(weights, index=mu_tilde.index)

    return omega_tangency, mu_tilde, Sigma_adj



# Tangency weights across all six factors, using the full (non-diagonalized) covariance matrix.
omega_tangency, mu_tilde, Sigma = compute_tangency(factors)

omega_tangency.to_frame('Tangency Weights')

Unnamed: 0,Tangency Weights
MKT,0.2011
SMB,0.0816
HML,-0.047
RMW,0.2884
CMA,0.3774
UMD,0.0986


In [12]:
# Reshape the tangency weights into a one-column frame labelled 'Weights'.
# set_axis (instead of renaming column 0) gives the same label whether this cell
# runs for the first time or is re-run on the already-converted frame.
omega_tangency = pd.DataFrame(omega_tangency).set_axis(['Weights'], axis=1)

# Drop any 'Weights' column left by a previous run so that re-running the cell
# does not fail on overlapping column names in the join.
factor_stats = factor_stats.drop(columns='Weights', errors='ignore').join(omega_tangency)
factor_stats

Unnamed: 0,Mean,Vol,Sharpe,Weights
MKT,0.0831,0.1567,0.5305,0.2011
SMB,0.0122,0.1005,0.1211,0.0816
HML,0.0275,0.1088,0.2529,-0.047
RMW,0.0448,0.0834,0.5376,0.2884
CMA,0.0333,0.0715,0.4652,0.3774
UMD,0.0655,0.1545,0.4241,0.0986


In [13]:
# Order factors from lowest to highest mean return to compare against their tangency weights.
factor_stats.sort_values(by = ['Mean'])

Unnamed: 0,Mean,Vol,Sharpe,Weights
SMB,0.0122,0.1005,0.1211,0.0816
HML,0.0275,0.1088,0.2529,-0.047
CMA,0.0333,0.0715,0.4652,0.3774
RMW,0.0448,0.0834,0.5376,0.2884
UMD,0.0655,0.1545,0.4241,0.0986
MKT,0.0831,0.1567,0.5305,0.2011


## 4.a and 4.b 

- The factors that seem most important are CMA (Investment Factor), RMW (Profitability Factor), and the market factor
- Yes, CMA has a low-mean return yet the highest allocation of .37.

## 4.C

- Re-do the tangency weights that only include mkt, smb, hml, and umd

In [14]:
# Tangency weights restricted to the four factors MKT, SMB, HML and UMD (RMW and CMA dropped).
omega_tangency2, mu_tilde_2, sigma_2 = compute_tangency(factors[['MKT','SMB','HML','UMD']])


omega_tangency2.to_frame('Weights')

Unnamed: 0,Weights
MKT,0.3314
SMB,0.0061
HML,0.3622
UMD,0.3003


- The **Value Factor** (HML) now receives the highest tangency weight. This makes sense: **CMA**, the investment factor, is highly correlated with **HML**, so after removing it we would expect the allocation to HML to increase. 
- The importance of these styles is based heavily on their covariances. 

# Section 3

# Testing Modern LPMs

# 3.1 

- Test the AQR 4-factor Model using the time series test. For each regression, report the alpha and r-squared. 

In [15]:
# Load the test portfolios from the sheet at position 2 and index them by the "Date" column.
portfolios = pd.read_excel(file_path, sheet_name = 2).set_index("Date")

In [16]:
# Peek at the first rows to confirm the portfolio returns loaded with a Date index.
portfolios.head(3)

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-01-31,-0.005,0.0283,0.0084,0.1024,-0.0143,0.0999,0.0354,0.0352,0.0048,0.0032,...,0.0159,0.0876,0.0463,-0.0116,0.0458,-0.0279,0.0258,0.0751,0.0299,0.0665
1980-02-29,0.0111,-0.061,-0.0966,-0.0319,-0.0569,-0.0314,-0.0527,-0.0788,-0.0556,-0.014,...,-0.0079,-0.0535,-0.0339,-0.0633,-0.0638,-0.0855,-0.096,-0.0314,-0.0275,-0.0267
1980-03-31,-0.2244,-0.1116,-0.0167,-0.1464,-0.0192,-0.1281,-0.0817,-0.1278,-0.0565,-0.0664,...,-0.0821,-0.1511,-0.1106,-0.0922,-0.1443,-0.0563,-0.0883,-0.2441,-0.1245,-0.1728


In [17]:
# Factor sets (column names in `factors`) that define each linear pricing model.
CAPM =  ['MKT']  # market factor only
FF_3F = ['MKT','SMB','HML']  # Fama-French 3-factor
FF_5F = ['MKT','SMB','HML','RMW','CMA']  # Fama-French 5-factor
AQR = ['MKT','HML','RMW','UMD']  # AQR 4-factor (includes momentum)

In [18]:
def ts_test(df, factor_df, factors, test, annualization=12):
    """Time-series test of a factor model: regress each portfolio on the factors.

    Parameters
    ----------
    df : pandas.DataFrame
        Portfolio returns, one column per portfolio.
    factor_df : pandas.DataFrame
        Factor returns on the same index as ``df``.
    factors : list
        Column names of ``factor_df`` to use as regressors.
    test : str
        Model label, prefixed to the output column names.
    annualization : int, default 12
        Multiplier applied to the estimated intercept.

    Returns
    -------
    pandas.DataFrame
        Indexed by portfolio, with float columns ``test + "Alpha"``
        (annualized intercept) and ``test + "R-squared"``.
    """
    # The regressors are identical for every portfolio, so build them once.
    X = sm.add_constant(factor_df[factors])

    rows = []
    for port in df.columns:
        model = sm.OLS(df[port], X).fit()
        # add_constant prepends the intercept, so it is the first parameter.
        # Use .iloc for positional access: integer keys on a label-indexed
        # Series via [] are deprecated in recent pandas.
        rows.append([model.params.iloc[0] * annualization, model.rsquared])

    return pd.DataFrame(rows,
                        index=df.columns,
                        columns=[test + "Alpha", test + "R-squared"],
                        dtype=float)

In [19]:
# Time-series test of the AQR 4-factor model: annualized alpha and R-squared per portfolio.
AQR_factor = ts_test(portfolios, factors, AQR, "AQR")
AQR_factor

Unnamed: 0,AQRAlpha,AQRR-squared
Agric,0.0156,0.3302
Food,0.0152,0.4681
Soda,0.0238,0.3098
Beer,0.0268,0.4248
Smoke,0.0399,0.2575
Toys,-0.0277,0.5033
Fun,0.0271,0.6156
Books,-0.0292,0.6886
Hshld,-0.0009,0.5681
Clths,-0.0014,0.6185


## 3.b Calculating Mean-Absolute Error of the Estimated Alphas



In [20]:
# Mean absolute (annualized) alpha across the test portfolios, rounded to 4 decimals.
print("AQR MAE", round(AQR_factor['AQRAlpha'].abs().mean(),4))

AQR MAE 0.0235


If the pricing model worked, we would expect the alpha estimates to be close to zero, because in a linear pricing model the factor exposures should fully explain each asset's mean excess return. 

## 3.2 

- Test the CAPM, FF 3-Factor Model and the FF 5-Factor Model. Report the MAE statistic for each of these models and compare it to the AQR model's MAE. Which model fits best?

In [21]:
# Run the time-series test for each remaining model and place the results side by side.
factor_tests = (
    ts_test(portfolios, factors, CAPM, 'CAPM')
    .join(ts_test(portfolios, factors, FF_3F, 'Fama-French 3F'))
    .join(ts_test(portfolios, factors, FF_5F, 'Fama-French 5F'))
)
# Mean absolute alpha per model, taken over the test portfolios.
factors_MAE = (
    factor_tests[['CAPMAlpha', 'Fama-French 3FAlpha', 'Fama-French 5FAlpha']]
    .abs()
    .mean()
    .to_frame('MAE')
)
                                                         

In [22]:
# Add the AQR model's mean absolute alpha as a new row (label spelled 'AQR ALpha', as in the table below).
factors_MAE.loc['AQR ALpha'] = AQR_factor.AQRAlpha.abs().mean()


In [23]:
# Ascending sort: the model with the smallest mean absolute alpha comes first.
factors_MAE.sort_values(by = 'MAE')

Unnamed: 0,MAE
CAPMAlpha,0.0215
AQR ALpha,0.0235
Fama-French 3FAlpha,0.0254
Fama-French 5FAlpha,0.0325


- The linear pricing model that achieves the lowest mean absolute alpha (MAE) is the capital asset pricing model. 

## 3.3

The factor that is most important and drives the returns of all securities is the market factor. Yes, I believe Fama and French ought to include the momentum factor, because the AQR model, which includes momentum, achieves a lower MAE than both the Fama-French 3F model and the Fama-French 5F model. 

## 4

Interpret the r-squareds from the factor models.

In [24]:
# Average R-squared across the test portfolios for each model, as a single-row frame.
factors_r_squared = pd.DataFrame(
    {
        'CAPM-rsquared': factor_tests['CAPMR-squared'].mean(),
        'FF3-r-squared': factor_tests['Fama-French 3FR-squared'].mean(),
        'FF5-rsquared': factor_tests['Fama-French 5FR-squared'].mean(),
        'AQR-rsquared': AQR_factor['AQRR-squared'].mean(),
    },
    index = ['R-squared'],
)

In [25]:
# Transpose so each model is a row, which reads more easily than one wide row.
factors_r_squared.T

Unnamed: 0,R-squared
CAPM-rsquared,0.5275
FF3-r-squared,0.5711
FF5-rsquared,0.5964
AQR-rsquared,0.5757


- On average, the factors explain roughly 53–60% of the variation in the portfolio returns across time. These are not high r-squareds, so the models would generally not give a good linear factor decomposition of the assets. 

## Re-test these models using a cross-sectional test. 

In [26]:

def ts_betas(df, factor_df, factors, intercept=False):
    """Estimate each portfolio's factor loadings with a time-series OLS regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Portfolio returns, one column per portfolio.
    factor_df : pandas.DataFrame
        Factor returns on the same index as ``df``.
    factors : list
        Column names of ``factor_df`` to use as regressors.
    intercept : bool, default False
        If True, the regression includes a constant and the result has a
        leading ``'alpha'`` column holding the estimated intercept.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by portfolio with one column per factor
        (preceded by ``'alpha'`` when ``intercept`` is True).
    """
    # The regressors are identical for every portfolio, so build them once.
    X = factor_df[factors]
    if intercept:
        X = sm.add_constant(X)

    columns = (['alpha'] if intercept else []) + list(factors)
    res = pd.DataFrame(data = None, index = df.columns, columns = columns)

    for port in df.columns:
        params = sm.OLS(df[port], X).fit().params
        # Row assignment aligns on labels. statsmodels calls the intercept
        # 'const', so without this rename the 'alpha' column is never filled
        # and stays NaN.
        res.loc[port] = params.rename({'const': 'alpha'})

    return res.astype(float)

def cross_section(df, factor_df, factors, ts_int=True, annualization=12):
    """Build the per-portfolio inputs for a cross-sectional pricing test.

    For each portfolio this stores its time-series factor loadings (from
    ``ts_betas``), the annualized mean of the loading-weighted factor returns
    ('Predicted'), and the annualized mean of the portfolio's own returns
    ('Actual').

    Returns a DataFrame indexed by portfolio with one column per factor plus
    'Predicted' and 'Actual'.
    """
    loadings = ts_betas(df, factor_df, factors, intercept=ts_int)

    out = pd.DataFrame(data = None, index = loadings.index, columns = factors)
    out['Predicted'] = None
    out['Actual'] = None

    factor_returns = factor_df[factors]
    for name in out.index:
        row = loadings.loc[name]
        out.loc[name, factors] = row
        # Labels in `row` with no matching factor column (e.g. 'alpha') become
        # all-NaN columns in the product and are skipped by the row sum.
        fitted = (row * factor_returns).sum(axis=1)
        predicted = fitted.mean() * annualization
        actual = df[name].mean() * annualization
        out.loc[name, ['Predicted', 'Actual']] = predicted, actual

    return out

def cross_premia(df_cs, factors):
    """Cross-sectional factor premia.

    Regresses the 'Actual' column of ``df_cs`` on its factor-loading columns
    (OLS, no intercept) and returns the slope estimates as a one-column
    DataFrame named 'CS Premia'.
    """
    actual = df_cs['Actual'].astype(float)
    loadings = df_cs[factors].astype(float)

    fit = sm.OLS(actual, loadings).fit()
    return fit.params.to_frame('CS Premia')

def cross_premia_mae(df_cs, factors, model):
    """Print the mean absolute residual of the cross-sectional regression.

    Runs the same no-intercept OLS of 'Actual' on the factor loadings as
    ``cross_premia`` and prints ``"<model> MAE: <value>"`` rounded to 4
    decimals. Nothing is returned.
    """
    actual = df_cs['Actual'].astype(float)
    loadings = df_cs[factors].astype(float)

    residuals = sm.OLS(actual, loadings).fit().resid
    mae = round(residuals.abs().mean(), 4)
    print(model + ' MAE: ' + str(mae))
    return None

In [27]:
# Loadings plus predicted/actual annualized mean returns for each model.
# ts_int=True: the time-series regressions that produce the loadings include an intercept.
CAPM_cs = cross_section(portfolios, factors, CAPM, ts_int=True)
FF_3F_cs = cross_section(portfolios, factors, FF_3F, ts_int=True)
FF_5F_cs = cross_section(portfolios, factors, FF_5F, ts_int=True)
AQR_cs = cross_section(portfolios, factors, AQR, ts_int=True)


## 5.a
- Report the time-series premia of the factors and compare to the cross-sectionally estimated premia of the factor models. Do they differ substantially?

In [28]:
# Time-series premium: each factor's sample mean return, scaled by 12 to annualize
# (the same factor as the default annual_fac/annualization used elsewhere in this notebook).

time_series_premia = (factors.mean()*12).to_frame("TS Premia")

In [29]:
print('3 Fama Factor Model')
# Cross-sectional premia for the Fama-French 3-factor model; only stored here,
# they are displayed later in the combined premia table.
FF_3cs_premia = cross_premia(FF_3F_cs, FF_3F)


3 Fama Factor Model


In [30]:
print('5 Fama Factor Model')
# Cross-sectional premia for the Fama-French 5-factor model; only stored here,
# they are displayed later in the combined premia table.
FF_5cs_premia = cross_premia(FF_5F_cs,FF_5F)

5 Fama Factor Model


In [31]:
print('AQR Factor Model')
# Cross-sectional premia for the AQR 4-factor model; only stored here,
# they are displayed later in the combined premia table.
AQR_cs_premia = cross_premia(AQR_cs, AQR)

AQR Factor Model


In [32]:
print("Capital Asset Pricing Model")
# Cross-sectional premium on the market factor under the CAPM.
CAPM_cs_premia = cross_premia(CAPM_cs, CAPM)
CAPM_cs_premia

Capital Asset Pricing Model


Unnamed: 0,CS Premia
MKT,0.0849


In [33]:
# Side-by-side table: time-series premia next to each model's cross-sectional premia.
# concat aligns rows on factor name; fillna('') blanks the factors a model does not use,
# which leaves strings in those columns — this table is for display only.
cs_premia_df = pd.concat([time_series_premia,FF_3cs_premia, FF_5cs_premia, AQR_cs_premia, CAPM_cs_premia], axis =1).fillna('')
# Positional relabel: the order must match the list passed to concat above.
cs_premia_df.columns = ['TS Premia', 'FF3','FF5','AQR','CAPM Premia']
cs_premia_df

Unnamed: 0,TS Premia,FF3,FF5,AQR,CAPM Premia
MKT,0.0831,0.101,0.0948,0.0866,0.0849
SMB,0.0122,-0.0659,-0.0587,,
HML,0.0275,-0.0173,-0.0354,-0.0409,
RMW,0.0448,,0.0368,0.0455,
CMA,0.0333,,-0.0154,,
UMD,0.0655,,,0.0553,


## 5b

- Report the cross-sectional regression residuals for each of the four models. How do they compare to the MAE of the time-series alphas?

In [34]:
# cross_premia_mae prints its own result and returns None, so call it directly:
# wrapping the calls in print() only adds a trailing "None None None None" line.
for cs_table, cs_factors, label in [(CAPM_cs, CAPM, 'CAPM'),
                                    (AQR_cs, AQR, "AQR"),
                                    (FF_3F_cs, FF_3F, 'FF-3F'),
                                    (FF_5F_cs, FF_5F, 'FF-5F')]:
    cross_premia_mae(cs_table, cs_factors, label)

CAPM MAE: 0.0214
AQR MAE: 0.0172
FF-3F MAE: 0.0161
FF-5F MAE: 0.0136
None None None None


Compared to the time-series alphas, the cross-sectional residuals are smaller: every model's cross-sectional MAE is at or below its time-series regression MAE. 