In [1]:
import pandas as pd
import numpy as np
from sympy import Matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import skew, kurtosis, norm
import seaborn as sns
import matplotlib.pyplot as plt
from Functions import *


def calculate_statistics(df, annualize_factor=12, VaR=0.05, CVaR=0.05):
    '''
    Calculates the mean, volatility, sharpe, skewness, kurtosis, VaR and CVaR of a dataframe
    Returns a dataframe with values for each asset
    '''
    res={}
    res_i={}
    for i in df.columns:
        if df[i].dtype=='<M8[ns]':
            pass
        else:
            res_i.update({'mean':np.mean(df[i])*annualize_factor})
            res_i.update({'volatility':np.std(df[i])*(annualize_factor**(1/2))})
            res_i.update({'sharpe':res_i['mean']/res_i['volatility']})
            res_i.update({'skewness':skew(df[i])})
            res_i.update({'kurtosis':kurtosis(df[i])})
            res_i.update({'VaR':df[i].quantile(VaR)})
            res_i.update({'CVaR':df[i][df[i]<df[i].quantile(CVaR)].mean()})
            # res_i.update({'Max_Drawdown':maxDrawD(df[i])})
            res.update({i:res_i})
            res_i={}
    return pd.DataFrame(res)

def calculate_statistics_array(data, annualize_factor=12, VaR=0.05, CVaR=0.05):
    '''
    Calculates the mean, volatility and sharpe ratio of an array
    Returns a dictionary with the 'mean', 'volatility' and 'sharpe' ratio of the array
    '''
    res_i={}
    res_i.update({'mean':np.mean(data)*annualize_factor})
    res_i.update({'volatility':np.std(data)*(annualize_factor**(1/2))})
    res_i.update({'sharpe':res_i['mean']/res_i['volatility']})
    res_i.update({'skewness':skew(data)})
    res_i.update({'kurtosis':kurtosis(data)})
    df=pd.DataFrame(data)
    res_i.update({'VaR':df.quantile(VaR)})
    res_i.update({'CVaR':df[df<df.quantile(CVaR)].mean()})
    # res_i.update({'Max_Drawdown':maxDrawD(df[i])})
    return res_i

def tangency_portfolio(df):
    '''
    Calculates the weights of the tangency portfolio
    Inputs: dataframe with column 0 being the date and the rest ([1:]) being the assets
    Make sure df's first column (0) is the date, or anything that is not an asset
    '''
    stats=calculate_statistics(df)
    assets=len(df.columns[1:])
    mdf=Matrix(df.iloc[:,1:].cov())
    vect1=Matrix([1]*assets)
    mean=[]
    for i in stats:
        mean.append(stats[i]['mean'])
    vectmean=Matrix(mean)
    sigma_inv=mdf.inv()
    wt=(1/((vect1.T@sigma_inv@vectmean)[0,0]))*(sigma_inv@vectmean)

    tickers=[]
    for i in stats:
        tickers.append(i)
    tan_port=pd.DataFrame()
    tan_port['tickers']=tickers
    tan_port['Tangent Weights']=0.0
    for i in range(len(tan_port)):
        tan_port.loc[i,'Tangent Weights']=float(round(wt[i], 6))
    
    tan_port.set_index('tickers', inplace=True,drop=True)

    return tan_port
    

def correlation_heatmap(df):
    '''
    Plots a heatmap of the correlation matrix of a dataframe [1:]
    '''
    plt.figure(figsize=(16, 6))
    heatmap = sns.heatmap(df.iloc[:,1:].corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
    heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)

def calculate_market_statistics(df,regressor, annualize_factor=12):
    '''
    Calculates the alpha, beta, treynor ratio and information ratio of a dataframe for all non-date columns
    Regressor is benchmark: e.g., SPY, Market
    '''
    res={}
    res_i={}
    for i in df.columns:
        if df[i].dtype=='<M8[ns]':
            pass
        else:
            model=LinearRegression()
            model.fit(np.array(regressor).reshape(-1,1),df[i])
            alpha=model.intercept_*annualize_factor
            beta=model.coef_[0]
            res_i.update({'alpha':alpha})
            res_i.update({'market_beta':beta})
            res_i.update({'treynor_ratio':np.mean(df[i])*annualize_factor/beta})

            residuals=np.array(df[i])-model.predict(np.array(regressor).reshape(-1,1))
            res_i.update({'information_ratio':alpha/(np.std(residuals)*annualize_factor**(1/2))})

            res.update({i:res_i})
            res_i={}
    return pd.DataFrame(res).transpose()

def run_regression(df,regressors,annualize_factor=12):
    '''
    Runs a regression for all non-date columns in a dataframe
    Regressors is a dataframe with the regressors as columns
    Returns a dataframe with the alpha, betas, and r2 for each asset
    '''
    res={}
    res_i={}
    for i in df.columns:
        if df[i].dtype=='<M8[ns]':
            pass
        else:
            model=LinearRegression()
            model.fit(regressors,df[i])
            alpha=model.intercept_*annualize_factor
            betas=model.coef_
            r2=model.score(regressors, df[i])
            res_i.update({'alpha':alpha})
            for ii in range(len(betas)):
                res_i.update({'beta_'+str(regressors.columns[ii]):betas[ii]})
            res_i.update({'r2':r2})
            res.update({i:res_i})
            res_i={}
    
    return pd.DataFrame(res).transpose()

def prob(mu, sigma, h):
    '''
    Returns the probability of underperforming mu
    over a period h. Make sure that mu and sigma are 
    from log returns.
    '''
    return norm.cdf(np.sqrt(h)*-mu/sigma)



# Midterm 2

## FINM 36700 - 2023

### UChicago Financial Mathematics

* Mark Hendricks
* hendricks@uchicago.edu

# Instructions

## Please note the following:

Points
* The exam is 100 points.
* You have 120 minutes to complete the exam.
* For every minute late you submit the exam, you will lose one point.


Submission
* You will upload your solution to the `Midterm 2` assignment on Canvas, where you downloaded this. (Be sure to **submit** on Canvas, not just **save** on Canvas.
* Your submission should be readable, (the graders can understand your answers,) and it should **include all code used in your analysis in a file format that the code can be executed.** 

Rules
* The exam is open-material, closed-communication.
* You do not need to cite material from the course github repo--you are welcome to use the code posted there without citation.

Advice
* If you find any question to be unclear, state your interpretation and proceed. We will only answer questions of interpretation if there is a typo, error, etc.
* The exam will be graded for partial credit.

## Data

**All data files are found in the class github repo, in the `data` folder.**

This exam makes use of the following data files:
* `midterm_2_data.xlsx`

This file has sheets for...
* `info` - names and descriptions of each factor
* `factors (excess returns)` - excess returns on several factors
* `portfolios (excess returns)` - excess returns on industry portfolios
* `risk-free rate` - risk-free rates over time

Note the data is **monthly** so any annualizations should use `12` months in a year.

## Scoring

| Problem | Points |
|---------|--------|
| 1       | 30     |
| 2       | 35     |
| 3       | 20     |
| 4       | 15     |

### Each numbered question is worth 5 points unless otherwise specified.

### Notation
(Hidden LaTeX commands)

$$\newcommand{\betamkt}{\beta^{i,\text{MKT}}}$$
$$\newcommand{\betahml}{\beta^{i,\text{HML}}}$$
$$\newcommand{\betaumd}{\beta^{i,\text{UMD}}}$$
$$\newcommand{\Eri}{E\left[\tilde{r}^{i}\right]}$$
$$\newcommand{\Emkt}{E\left[\tilde{r}^{\text{MKT}}\right]}$$
$$\newcommand{\Ehml}{E\left[\tilde{r}^{\text{HML}}\right]}$$
$$\newcommand{\Eumd}{E\left[\tilde{r}^{\text{UMD}}\right]}$$

# 1. Short Answer

#### No Data Needed

These problems do not require any data file. Rather, analyze them conceptually. 

## 1.

Suppose that we find a set of factors that perfectly hedge any asset. Will these factors work as a linear factor pricing model? 

This would only be the case if the correlations of the residuals were 0 and we had infinite assets. If it perfectly hedged any assets (assuming all errors were = 0) and you had infinite assets, it would also work as a pricing model. Given that infite assets is an unrealistic assumtion (so is perfect/corr of errors = 0; but assuming for a second), the factors would not necessarily work as a pricing model.

## 2.

If the Fama-French 3-factor model fit perfectly, would the Treynor ratio be equal for every asset?

No. If the CAPM fit perfectly, the Treynor would be equal. In the FF 3-factor model there are more factors! Consequently, assuming that the returns for the other factos are not equal to zero, a portion of the returns would be explained by the other 2 factors and the Treynor ratio would not necessarily be constant.

## 3.

Suppose the CAPM fits perfectly. Then assets which have higher time-series r-squared metrics on the market factor will have higher Sharpe ratios.

The CAPM does not say anything about the r2 of the time-series metrics! Assets with a higher beta would have higher sharpe ratios, but capm fitting perfectly means alphas would be zero, nothing about the r2.

## 4.

Based on the case, what are two ways DFA hopes to generate attractive returns for investors?

Through "the value of sound academic research, and the ability of skilled traders to contribute to a fund's profits even when the investment was inherently passive."

Maybe the question meant specifically, in which case it could be by using (1) the value factor, and the (2) size factor, in addition to the market factor. 

## 5.

We analyzed a strategy similar to "AQR's Momentum Funds" (mutual funds.) We found this implementation had much higher returns than the momentum factor of Fama French. What was a major drawback to this construction?

That the correlation with the market was very high! The issue was going long only, as opposed to going long on high momentum, but short the negative momentum, which would mute the correlation with the market.

## 6.

From our analysis, momentum has had a negative mean return since 2009. Is this evidence against momentum as a pricing factor? Explain why this is a problem or why it is not a problem.

Not necessarily a problem! The correlation with other market factors is very small, and in cases negative, which offers significant diversification advantages even if returns are small, or slightly negative.

***

# 2. Linear Factor Pricing Models (LFPMs)

This problem tests the following LFPM:

$$\begin{align}
\Eri = \betamkt \Emkt + \betahml \Ehml + \betaumd \Eumd
\end{align}$$

## 1.

### (8 pts)

Estimate the **time-series (TS)** test of this pricing model. 

For each asset, report the following statistics:
* annualized alpha
* betas
* r-squared

In [2]:
dfer=pd.read_excel('../data/midterm_2_data.xlsx',sheet_name='factors (excess returns)')
dfpor=pd.read_excel('../data/midterm_2_data.xlsx',sheet_name='portfolios (excess returns)')
dfrf=pd.read_excel('../data/midterm_2_data.xlsx',sheet_name='risk-free rate')

In [3]:
reg=run_regression(dfpor,dfer.iloc[:,1:],12)
reg

Unnamed: 0,alpha,beta_MKT,beta_HML,beta_UMD,r2
NoDur,0.029253,0.739522,0.20458,0.049333,0.617919
Durbl,0.010734,1.271865,0.173595,-0.320023,0.613493
Manuf,-0.000996,1.049482,0.197462,-0.036704,0.870268
Enrgy,-0.015117,0.992222,0.637006,0.07517,0.465602
HiTec,0.028207,1.154959,-0.637135,-0.140638,0.829498
Telcm,0.003506,0.837326,0.094363,-0.084518,0.588052
Shops,0.026739,0.946928,-0.042222,-0.015005,0.742161
Hlth,0.031862,0.757605,-0.119928,0.074058,0.580514
Utils,0.01371,0.527879,0.353033,0.108622,0.342654
Other,-0.01978,1.115433,0.426753,-0.048678,0.910098


## 2.

### (7pts)

Estimate the **cross-sectional (CS)** test of the pricing model. 

Include an intercept in your cross-sectional test.

Report the
* annualized intercept
* annualized regression coefficients
* r-squared

In [4]:
temp_df=pd.DataFrame()
model=LinearRegression()
model.fit(reg.iloc[:,1:-1], dfpor.iloc[:,1:].mean())
print('Annualized Intercept', model.intercept_*12)
coefficients = {'MKT': model.coef_[0], 'HML': model.coef_[1], 'UMD': model.coef_[2]}
print('Regression coefficients', coefficients)
r2 = model.score(reg.iloc[:,1:-1], dfpor.iloc[:,1:].mean())
print('r2:', r2)


Annualized Intercept 0.06371608035760146
Regression coefficients {'MKT': 0.002666035066407337, 'HML': -0.0013138850666785042, 'UMD': 0.0025250435970351035}
r2: 0.36619814529491257


## 3.

Report the annualized factor premia (expected excess returns of the three factors) as implied by each of the TS and CS estimations.

Time-series:

In [5]:
dfer.iloc[:,1:].mean()*12

MKT    0.083853
HML    0.025028
UMD    0.061692
dtype: float64

CS

In [6]:
coefficients = {'MKT': model.coef_[0], 'HML': model.coef_[1], 'UMD': model.coef_[2]}
for key, value in coefficients.items():
    print(key, value*12)



MKT 0.03199242079688804
HML -0.015766620800142052
UMD 0.030300523164421243


## 4.

Use the r-squared statistics from the TS and CS tests above to assess whether these factors are effective for decomposition and/or pricing.

Be specific as to how the r-squared statistics from the TS and CS tests impact your conclusions.

In [7]:
print("Average TS r2: ", reg.mean()['r2'])

Average TS r2:  0.6560260105076539


Regarding the **time series approach**, we can see that the average r2 is ~0.65, which means that on average the factors would explain about 65% of the variability in the respective portfolios. This would not be a good decomposition model, which makes sense, because it is not meant to be one. Regarding pricing, the r2 gives us no information regarding the effectivenes of the model. The alphas would. Therefore, we cannot assess TS effectiveness from r-squared statistics.

In [8]:
print('Cross Section r2 = ', r2)

Cross Section r2 =  0.36619814529491257


Regarding the **cross sectional approach**, the r2 gives us direct information about how much expected returns' variability the model can explain; alternatively, how good of a pricing model this is. The r2 is low, which means that the model can only explain a limited portion of the portfolios returns and has therefore limited, although existing ability for pricing purposes.

Regarging factor decomposition, the r2 does not tell us anything about it, since it is only capturing the effect on **expected returns** and not on portfolios' time dependent behavior (other than expected returns).

## 5.

Report the annualized pricing mean absolute error (MAE) implied by each of the TS and CS estimations.

In [9]:
print('Time-Series MAE: ', round(np.mean(np.abs(reg.alpha)),5))

Time-Series MAE:  0.01799


In [10]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(reg.iloc[:,1:-1])
y_true = dfpor.iloc[:,1:].mean()*12
csmae = mean_absolute_error(y_true, y_pred)
print('Cross Section Mean Absolute Error:', csmae)


Cross Section Mean Absolute Error: 0.08315265267175573


## 6.

Which asset has the highest premium as implied by the TS estimation? And as implied by the CS estimation? (For the latter, feel free to include the cross-sectional intercept.)

In [19]:
tdf=pd.DataFrame()
tdf['Portfolio']=dfpor.columns[1:]
for j, i in enumerate(dfpor.columns[1:]):
    corr=[]
    corr.append(reg.iloc[j,1])
    corr.append(reg.iloc[j,2])
    corr.append(reg.iloc[j,3])
    i_=0
    for ii in range(3):
        i_+=corr[ii]*dfer.iloc[:,1:].mean()[ii]
    tdf.loc[j,'Expected Return']=i_
tdf.sort_values(by='Expected Return', ascending=False, inplace=True)
tdf



Unnamed: 0,Portfolio,Expected Return
3,Enrgy,0.008648
9,Other,0.008434
1,Durbl,0.007604
2,Manuf,0.007557
6,Shops,0.006452
4,HiTec,0.006019
0,NoDur,0.005848
5,Telcm,0.005613
7,Hlth,0.005425
8,Utils,0.004983


Time-series: 

Highest: Enrgy

In [12]:
ttdf=pd.DataFrame()
ttdf['Portfolio']=dfpor.columns[1:]
ttdf['Expected Return']=y_pred
ttdf.sort_values(by='Expected Return', ascending=False, inplace=True)
ttdf

Unnamed: 0,Portfolio,Expected Return
4,HiTec,0.008871
6,Shops,0.007852
2,Manuf,0.007756
7,Hlth,0.007674
1,Durbl,0.007664
9,Other,0.0076
3,Enrgy,0.007308
5,Telcm,0.007205
0,NoDur,0.007137
8,Utils,0.006527


Cross_sectional:

Highest: HiTec

***

# 3. Additional Analysis

## 1. 

Consider the three-factor pricing model above. How can we assess whether all three factors are useful in this pricing model? 

Specifically, discuss whether the previously estimated regression betas would be informative. If not, what other statistic could we calculate?

The actual beta values would be useful when using the model. To determine if the factors themselves are useful, we could construct a tangency portfolio with the three of them, and see what weights are assigned to each factor.

We can also see from the cross sectional approach what the estimated premiums are for each factor (indirectly using betas). If some of them are too small, this limits their significance when using the model.

## 2.

Suppose we are testing the 3-factor model above, and now we want to allow for time-varying betas.

How could we test the model while allowing for this?

Be specific about the number of regressions we would run and the nature of these regressions.

For each time period, we could calculate a time-series beta based on only a subset of the data available (a rolling window, for example). Based on this betas, we could run a cross sectional for each time period, and estimate premiums for each period.

In particular, we would run a regression for each period of interes, for each asset (time-series, varying betas), then, for each time period we could also run a cross sectional.

## 3.

State one advantage and one disadvantage of using the CS estimation as opposed to the TS estimation in fitting the LFPM to the data.

Advantage: 

We can infer premiums from the data, as opposed to only calculating observed behavior for factor proxies directly. This gives the model additional degrees of freedom that make it a more lenient test for the model.

Disadvantage:

The fact that the test gives the model additional degrees of freedom, also means that some results may not be realistic, and we could further increase inaccuracies in the model making it even less useful for pricing purposes. For example, some premiums may become negative during the fitting process, when other studies would signal positive premiums.

## 4.

Suppose we are investing in just the assets included in our data set. We want to implement a momentum strategy.

Relative to the momentum strategies we studied, do you expect this strategy would have higher or lower...
* mean
* volatility

Explain.

The selection pool seems greatly reduced. This means that it will be harder to leverage small autocorrelation advantages (if any), and potentially, the turnover for top and bottom performers could be higher if only one asset were selected, for example. The momentum strategy tries to exploit tiny advantages in autocorrelation, which we can scale by selecting multiple assets simultaneously. That would not be an option with such a reduced pool within a realistic timeline. I expect the strategy to perform worse in this dataset, so **lower mean, and higher volatility**.

***

# 4. Returns Over Time

## 1.

If Barnstable’s assumptions hold, (log iid returns, normally distributed,) then in what sense is an investment safer in the long-run? And in what sense is it riskier in the long-run?

It is safer in the sense that our estimates of the period-specific expected returns become more accurate, their variability will decrease. However, the variance of the cumulative expected returns will increase, which may translate into a riskier investment.

In short, point estimate of long term period-specific returns will improve (safer), while the variance of cumulative returns will increase (riskier).

## 2. 

### (10pts)

Data 
* Make use of the `risk-free rate` tab.
* Construct the **total** factor returns by adding the risk-free rate to the excess `MKT` and `HML` factor returns.

Assumptions
* The total returns are lognormally distributed and iid. 

Report the probability that `MKT` will outperform `HML` over the following 5 years.

In [13]:
mdfer=dfer.copy()
mdfer['MKT']+=dfrf['RF']
mdfer['HML']+=dfrf['RF']
mdfer['MKT'] = np.log(1 + mdfer['MKT'])
mdfer['HML'] = np.log(1 + mdfer['HML'])
mmkt=mdfer['MKT'].mean()*12
mhml=mdfer['HML'].mean()*12
mdfer['Diff']=mdfer['HML']-mdfer['MKT']
print('Probability that MKT will outperform HML :', round(100*prob(mu=mdfer['Diff'].mean()*12,sigma=mdfer['Diff'].std()*(12**(1/2)),h=5),2),'%')


Probability that MKT will outperform HML : 71.07 %


***