# Emprical Asset Pricing - Problem Set 5: Yield

In [1]:
# Packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import wrds
from datetime import datetime, timedelta
import warnings
from pandas.tseries.offsets import MonthEnd
warnings.simplefilter('ignore') # 
import statsmodels.api as sm

# Setups
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

### 1. Data Preparation

#### 1.1 Select the first 10 zero-coupon yields

In [9]:
yield_df = pd.read_csv('data/feds200628.csv', skiprows=9)
yield_df = yield_df[yield_df['Date'] > '1971-01']
# yield_df.dropna(inplace=True)
yield_df.head()

Unnamed: 0,Date,BETA0,BETA1,BETA2,BETA3,SVEN1F01,SVEN1F04,SVEN1F09,SVENF01,SVENF02,...,SVENY23,SVENY24,SVENY25,SVENY26,SVENY27,SVENY28,SVENY29,SVENY30,TAU1,TAU2
2492,1971-01-01,,,,,,,,,,...,,,,,,,,,,
2493,1971-01-04,6.467407,-2.566049,0.010459,0.0,6.0954,6.5593,,5.6811,6.2265,...,,,,,,,,,0.848936,-999.99
2494,1971-01-05,6.555818,-2.70662,0.055456,0.0,6.1721,6.651,,5.7403,6.3102,...,,,,,,,,,0.850909,-999.99
2495,1971-01-06,6.548855,-2.732244,0.006502,0.0,6.1928,6.6463,,5.7646,6.3238,...,,,,,,,,,0.803086,-999.99
2496,1971-01-07,6.586611,-2.834778,-0.007784,0.0,6.1827,6.6823,,5.7351,6.3308,...,,,,,,,,,0.829181,-999.99


In [10]:
# Ensure 'Date' is converted to datetime type and set as index
yield_df['Date'] = pd.to_datetime(yield_df['Date'])
yield_df.set_index('Date', inplace=True)

# Filter to get month-end values only
month_end_yields = yield_df.resample('M').last()

# Select columns for zero-coupon yields from 1 to 10 years maturities
columns_needed = [f'SVENY{str(i).zfill(2)}' for i in range(1, 11)]
selected_month_end_yields = month_end_yields[columns_needed]
selected_month_end_yields.head()

Unnamed: 0_level_0,SVENY01,SVENY02,SVENY03,SVENY04,SVENY05,SVENY06,SVENY07,SVENY08,SVENY09,SVENY10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1971-01-31,4.1806,4.7707,5.2395,5.5903,5.8492,6.0417,6.1874,,,
1971-02-28,3.6526,4.3001,4.8217,5.2101,5.4941,5.7034,5.8608,,,
1971-03-31,3.7679,4.2231,4.584,4.8727,5.1056,5.2951,5.4508,,,
1971-04-30,4.7168,5.3321,5.6775,5.8861,6.0211,6.1139,6.1811,,,
1971-05-31,4.9819,5.4942,5.8152,6.0254,6.169,6.2711,6.3464,,,


#### 1.b Construct annual forward rate

In [14]:
# First, let's define a function to calculate the forward rate given n-year and (n-1)-year yields.
def calculate_forward_rate(n, yield_n, yield_n_minus_1):
    return n * yield_n - (n - 1) * yield_n_minus_1

# Now we will apply this function to the DataFrame to calculate the forward rates.
# We will calculate f_t(n) for n = 2 to 10, as f_t(1) is not defined with the given formula.

# Initialize a DataFrame to store forward rates
forward_rates_df = pd.DataFrame(index=selected_month_end_yields.index)

# Calculate forward rates using the provided formula
for n in range(2, 11):
    yield_n = selected_month_end_yields[f'SVENY{str(n).zfill(2)}']
    yield_n_minus_1 = selected_month_end_yields[f'SVENY{str(n-1).zfill(2)}']
    forward_rate = calculate_forward_rate(n, yield_n, yield_n_minus_1)
    forward_rates_df[f'f_t({n})'] = forward_rate

In [15]:
forward_rates_df.head()

Unnamed: 0_level_0,f_t(2),f_t(3),f_t(4),f_t(5),f_t(6),f_t(7),f_t(8),f_t(9),f_t(10)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1971-01-31,5.3608,6.1771,6.6427,6.8848,7.0042,7.0616,,,
1971-02-28,4.9476,5.8649,6.3753,6.6301,6.7499,6.8052,,,
1971-03-31,4.6783,5.3058,5.7388,6.0372,6.2426,6.385,,,
1971-04-30,5.9474,6.3683,6.5119,6.5611,6.5779,6.5843,,,
1971-05-31,6.0065,6.4572,6.656,6.7434,6.7816,6.7982,,,


#### 1.c annual excess return

In [16]:
# Assuming the zero-coupon yields are given as percentages, we need to divide by 100.
log_bond_prices = -np.log(1 + selected_month_end_yields / 100)

# Calculate the log return for each bond, at a monthly frequency
# We will use the shift function to get the previous month's log bond price.
# As per the given formula, we subtract the 1-year yield from the change in log bond prices to get the excess return.
annual_excess_log_returns = log_bond_prices.diff(periods=1).subtract(log_bond_prices['SVENY01'], axis=0)

# Drop the first row as it will contain NaN values due to the diff operation
annual_excess_log_returns = annual_excess_log_returns.dropna()

annual_excess_log_returns.head()

Unnamed: 0_level_0,SVENY01,SVENY02,SVENY03,SVENY04,SVENY05,SVENY06,SVENY07,SVENY08,SVENY09,SVENY10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1971-09-30,0.050216,0.050505,0.050706,0.050845,0.050943,0.051014,0.051067,0.051107,0.051138,0.051164
1971-10-31,0.050316,0.04926,0.048035,0.047225,0.046711,0.046369,0.046127,0.045947,0.045809,0.045698
1971-11-30,0.044248,0.044743,0.045022,0.044985,0.044831,0.044665,0.04452,0.044401,0.044303,0.044225
1971-12-31,0.045489,0.045198,0.045054,0.044872,0.044653,0.044427,0.044213,0.044021,0.043854,0.043709
1972-01-31,0.042621,0.040329,0.039441,0.039177,0.039179,0.039287,0.039426,0.039565,0.039692,0.039806


#### 1.d CP factor

In [52]:
# Recalculate the average excess returns of 2- to 10-year bonds globally
average_excess_returns_global = annual_excess_log_returns.iloc[:, 1:].mean(axis=1)

# Align the lagged forward rates with the average excess returns
# We will use the lagged forward rates as predictors for the regression
lagged_forward_rates_global = forward_rates_df.shift(1)  # Shift the forward rates by one month to get lagged rates
lagged_forward_rates_global.dropna(inplace=True)  # Drop any NaN values resulting from the shift operation

# Ensure alignment of average excess returns with the lagged forward rates
average_excess_returns_aligned = average_excess_returns_global.reindex(lagged_forward_rates_global.index)

# Prepare the predictors with a constant for the intercept
predictors_aligned = sm.add_constant(lagged_forward_rates_global.loc[average_excess_returns_aligned.index])

# Perform the OLS regression model
model = sm.OLS(average_excess_returns_aligned, predictors_aligned)
CP_results = model.fit()

# Define the CP factor as the fitted values from the regression
CP_factor = CP_results.fittedvalues

In [53]:
CP_factor.head()

Date
1971-09-30   0.050720
1971-10-31   0.051854
1971-11-30   0.042877
1971-12-31   0.045784
1972-01-31   0.043220
Freq: ME, dtype: float64

In [54]:
# Now, display the first few values of the CP factor along with the model's summary
CP_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.992
Model:,OLS,Adj. R-squared:,0.992
Method:,Least Squares,F-statistic:,8517.0
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,23:39:23,Log-Likelihood:,2772.9
No. Observations:,632,AIC:,-5526.0
Df Residuals:,622,BIC:,-5481.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0018,0.000,4.541,0.000,0.001,0.003
f_t(2),0.0730,0.003,21.211,0.000,0.066,0.080
f_t(3),-0.3223,0.031,-10.415,0.000,-0.383,-0.262
f_t(4),0.8330,0.123,6.751,0.000,0.591,1.075
f_t(5),-1.2296,0.267,-4.603,0.000,-1.754,-0.705
f_t(6),0.8887,0.334,2.658,0.008,0.232,1.545
f_t(7),0.0363,0.264,0.137,0.891,-0.483,0.555
f_t(8),-0.5852,0.246,-2.375,0.018,-1.069,-0.101
f_t(9),0.4127,0.191,2.162,0.031,0.038,0.788

0,1,2,3
Omnibus:,87.781,Durbin-Watson:,1.301
Prob(Omnibus):,0.0,Jarque-Bera (JB):,929.408
Skew:,-0.019,Prob(JB):,1.52e-202
Kurtosis:,8.941,Cond. No.,82200.0


#### 1.e Regress forward rate on CP factor

In [55]:
common_dates = forward_rates_df.dropna().index.intersection(CP_factor.index)

In [56]:
# Initialize a DataFrame to store the residuals from the regressions
forward_rate_residuals_df = pd.DataFrame(index=common_dates)

# Perform the regression for each forward rate and collect the residuals
for n in range(2, 11):
    # Prepare the forward rate data for regression, only using the dates where the CP factor is available
    forward_rate_data = forward_rates_df.loc[common_dates, f'f_t({n})'].dropna()
    
    # Get the CP factor aligned with the forward rate data
    CP_factor_aligned = CP_factor.reindex(forward_rate_data.index)
    
    # Only keep the dates where both the forward rate data and CP factor are not missing
    combined_non_missing = CP_factor_aligned.notna() & forward_rate_data.notna()
    forward_rate_data_final = forward_rate_data[combined_non_missing]
    CP_factor_final = CP_factor_aligned[combined_non_missing]
    
    # Add a constant to the CP factor for the intercept
    CP_factor_with_intercept = sm.add_constant(CP_factor_final)
    
    # Perform the regression
    model = sm.OLS(forward_rate_data_final, CP_factor_with_intercept)
    results = model.fit()
    
    # Store the residuals
    forward_rate_residuals_df[f'f_t({n})_residual'] = results.resid.loc[combined_non_missing]

In [57]:
results.summary()

0,1,2,3
Dep. Variable:,f_t(10),R-squared:,0.752
Model:,OLS,Adj. R-squared:,0.752
Method:,Least Squares,F-statistic:,1913.0
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,4.8200000000000005e-193
Time:,23:39:24,Log-Likelihood:,-1107.0
No. Observations:,632,AIC:,2218.0
Df Residuals:,630,BIC:,2227.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.2726,0.096,34.102,0.000,3.084,3.461
0,72.7681,1.664,43.736,0.000,69.501,76.035

0,1,2,3
Omnibus:,73.401,Durbin-Watson:,0.114
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.513
Skew:,0.009,Prob(JB):,3.51e-05
Kurtosis:,2.118,Cond. No.,30.0


In [58]:
# Display the residuals DataFrame
forward_rate_residuals_df.dropna(how='all', inplace=True)  # Drop rows where all values are NaN
forward_rate_residuals_df.head()

Unnamed: 0_level_0,f_t(2)_residual,f_t(3)_residual,f_t(4)_residual,f_t(5)_residual,f_t(6)_residual,f_t(7)_residual,f_t(8)_residual,f_t(9)_residual,f_t(10)_residual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1971-09-30,0.203714,0.169703,0.02383,-0.143949,-0.297833,-0.429358,-0.535568,-0.618506,-0.680572
1971-10-31,-0.330622,-0.082369,-0.137647,-0.283986,-0.433809,-0.56342,-0.668099,-0.750841,-0.811508
1971-11-30,0.611568,0.766117,0.73186,0.61008,0.465347,0.328513,0.213004,0.120742,0.051381
1971-12-31,0.074034,0.260807,0.290167,0.240245,0.158348,0.069265,-0.013156,-0.083813,-0.137899
1972-01-31,0.703641,0.91599,0.855791,0.708919,0.549538,0.406738,0.288796,0.195678,0.12608


#### 1.f Construct the first two principle components based on the panel $f^⊥_t(n)$. Call these factors $PC_t = (PC_{1t}, PC_{2t})′$, where $PC_{it}$ corresponds to the i−th principal component.

In [59]:
from sklearn.decomposition import PCA

# We need to first drop any NaN values to prepare the data for PCA
# Note that PCA cannot be performed with missing values
forward_rate_residuals_clean = forward_rate_residuals_df.dropna()

# Define a PCA object to calculate the first two principal components
pca = PCA(n_components=2)

# Fit the PCA on the residuals data and transform the data accordingly
PC = pca.fit_transform(forward_rate_residuals_clean)

# Convert the principal components into a DataFrame for easier handling and naming
PC_df = pd.DataFrame(data=PC, index=forward_rate_residuals_clean.index, columns=['PC_1t', 'PC_2t'])

PC_df.head()

Unnamed: 0_level_0,PC_1t,PC_2t
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-09-30,0.936261,-0.78106
1971-10-31,1.429912,-0.412486
1971-11-30,-1.151818,-0.954922
1971-12-31,-0.238361,-0.391023
1972-01-31,-1.420187,-1.068121


#### 1.g Remove the sample mean from $CP_t$ and the principle component $PC_t$

In [60]:
# Remove the sample mean from CP_t
CP_factor_demeaned = CP_factor - CP_factor.mean()

# Remove the sample mean from each of the principal components
PC_df_demeaned = PC_df - PC_df.mean()

# Display the first few rows of the demeaned CP factor and principal components
CP_factor_demeaned.head()


Date
1971-09-30    0.003696
1971-10-31    0.004831
1971-11-30   -0.004147
1971-12-31   -0.001240
1972-01-31   -0.003804
Freq: ME, dtype: float64

In [61]:
PC_df_demeaned.head()

Unnamed: 0_level_0,PC_1t,PC_2t
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-09-30,0.936261,-0.78106
1971-10-31,1.429912,-0.412486
1971-11-30,-1.151818,-0.954922
1971-12-31,-0.238361,-0.391023
1972-01-31,-1.420187,-1.068121


#### 1.h Report

i. R-Squared of the CP Regression

In [63]:
# i. The R-squared of the CP regression (using the full sample)
R_squared_full = CP_results.rsquared

print(f'The R-Squared of the CP regression using full sample is: {R_squared_full}')

The R-Squared of the CP regression using full sample is: 0.9919510494843413


ii. R-Squared separately when estimating the model using data until 1999.12 and thereafter. Comment on the difference

In [64]:
# ii. R-squared separately for the periods until December 1999 and thereafter
# Split the data based on the date
split_date = pd.Timestamp('1999-12-31')
data_pre_2000 = average_excess_returns_aligned[average_excess_returns_aligned.index <= split_date]
data_post_2000 = average_excess_returns_aligned[average_excess_returns_aligned.index > split_date]

# Perform the regression and compute R-squared for the period until December 1999
predictors_pre_2000 = predictors_aligned.loc[data_pre_2000.index]
model_pre_2000 = sm.OLS(data_pre_2000, predictors_pre_2000)
results_pre_2000 = model_pre_2000.fit()

# Perform the regression and compute R-squared for the period after December 1999
predictors_post_2000 = predictors_aligned.loc[data_post_2000.index]
model_post_2000 = sm.OLS(data_post_2000, predictors_post_2000)
results_post_2000 = model_post_2000.fit()

# R-squared values for the two subperiods
R_squared_pre_2000 = results_pre_2000.rsquared
R_squared_post_2000 = results_post_2000.rsquared


In [65]:
print(f'The R-Squared of the CP regression using pre-2000 data is: {R_squared_pre_2000}')

The R-Squared of the CP regression using pre-2000 data is: 0.9788071513734093


In [49]:
print(f'The R-Squared of the CP regression using post-2000 data is: {R_squared_pre_2000}')

The R-Squared of the CP regression using post-2000 data is: 0.9788071513734093


**Discussion:** The difference in the R-squared values suggests that the CP factor has a stronger relationship with the average excess returns in the more recent period (post-1999) compared to the earlier period. This could indicate that the predictive power of the CP factor has increased over time, or that the behavior of bond excess returns has changed in a way that is more closely captured by the CP factor in recent years.

### 2. Affine model

### 3. Estimate δ0 and δ1 by regressing the short rates on the factors. 

#### Report the estimates for δ0 and δ1 as well as the R-squared of the regression. What should the R-squared be if the affine model is correct?

**Note that we first estimated the VAR model**

In [82]:
# Extract the short rate, which is the one-year zero-coupon yield, from the zero-coupon yield data
short_rate = selected_month_end_yields['SVENY01']

# We have already created the demeaned factors DataFrame named 'factors_df' containing CP_t, PC_1t, and PC_2t
# Now, we regress the short rate on the factors to estimate δ0 and δ1

# Ensure that the short rate and factors have the same dates and no missing values
short_rate_aligned = short_rate.reindex(factors_df.index).dropna()
factors_aligned = factors_df.reindex(short_rate_aligned.index).dropna()

# Add a constant for δ0
X_with_constant = sm.add_constant(factors_aligned)

# Fit the OLS regression model
ols_model = sm.OLS(short_rate_aligned, X_with_constant)
ols_results = ols_model.fit()

# Extract the estimated parameters and R-squared
delta_0_est = ols_results.params['const']
delta_1_est = ols_results.params.drop('const')
r_squared = ols_results.rsquared

In [83]:
delta_0_est

4.871642412378073

In [84]:
delta_1_est

0       105.148033
PC_1t    -0.018983
PC_2t    -0.248467
dtype: float64

In [85]:
r_squared

0.9817911544103596

- Estimate for $\delta_0$: 4.8716
- Estimate for $\delta_1$:
    - Loading on $CP_t$: 105.1480
    - Loading on $PC_{1t}$: -0.0190
    - Loading on $PC_{2t}$: -0.2485
- R-squared of the regression: 0.9818

The R-squared is very close to 1, indicating that the factors explain a large portion of the variance in the short rate. If the affine model is correct, we would expect an R-squared close to 1, as it suggests the factors included in the model are capable of capturing most of the movements in the short rate. The result here supports the notion that the affine model is well-specified, at least in terms of its ability to explain the variability in the short rate. ​

### 4. Estimate the VAR

In [67]:
from statsmodels.tsa.api import VAR

# Assuming that we have a DataFrame 'factors_df' that contains the demeaned factors
# For the purpose of this example, let's construct 'factors_df' using the demeaned CP factor and the demeaned PCs
factors_df = pd.concat([CP_factor_demeaned, PC_df_demeaned], axis=1)

# Fit a VAR model with a 12-month lag
# We need to drop the missing values that may arise from lagging the factors by 12 months
var_model = VAR(factors_df.dropna())
results_var = var_model.fit(12)

# The estimated transition matrix (Gamma) can be obtained from the VAR model coefficients
Gamma_hat = results_var.params[1:]  # Exclude the intercept which should not be present due to demeaning

# The covariance matrix of the residuals (Sigma) is the covariance matrix of the VAR model residuals
Sigma_hat = results_var.resid.cov()

In [68]:
Gamma_hat

Unnamed: 0,0,PC_1t,PC_2t
L1.0,1.778015,164.573703,58.593776
L1.PC_1t,-0.0017,0.576603,-0.097413
L1.PC_2t,-0.006309,-1.194997,0.501631
L2.0,-0.781293,-107.29853,-51.042674
L2.PC_1t,0.001614,0.259016,0.100437
L2.PC_2t,0.004193,0.777106,0.253568
L3.0,0.038628,-57.960601,1.968678
L3.PC_1t,-0.000287,0.060477,-0.034199
L3.PC_2t,0.00086,0.440107,0.190113
L4.0,0.009714,-32.887554,-22.435316


In [69]:
Sigma_hat

Unnamed: 0,0,PC_1t,PC_2t
0,8e-06,0.002027,0.000634
PC_1t,0.002027,1.297577,0.319892
PC_2t,0.000634,0.319892,0.212101


### 5. Two-pass regression to figure out which shock is more important

In [88]:
# Calculate the excess returns using the forward rate for maturity n and the short rate y_t(1)
# The short rate is approximated by the one-year zero-coupon yield (SVENY01)

# Assume we have the forward rates DataFrame forward_rates_df and the one-year zero-coupon yield series
# The yields are expressed in percentage, so we need to divide by 100
short_rate = selected_month_end_yields['SVENY01'] / 100

# Calculate the log excess returns for each maturity
excess_returns = {}
for n in range(2, 11):  # Maturity from 2 to 10 years
    forward_rate = forward_rates_df[f'f_t({n})'] / 100
    excess_return = np.log1p(forward_rate) - np.log1p(short_rate)
    excess_returns[f'excess_return_{n}'] = excess_return

# Convert the excess returns dictionary to a DataFrame
excess_returns_df = pd.DataFrame(excess_returns)

# Compute the residuals (shocks) from the VAR model
# The shocks are the residuals of the VAR model for the demeaned factors
shocks = results_var.resid

# First pass regression: Regress the excess returns on the shocks for each maturity
# We should have one regression for each maturity
betas = {}
for maturity, returns in average_excess_returns_global.items():
    # Align the shocks with the excess returns
    aligned_shocks = shocks.reindex(average_excess_returns_global.dropna().index)
    # Add a constant to the shocks DataFrame
    aligned_shocks_with_const = sm.add_constant(aligned_shocks)
    # Perform the regression
    model = sm.OLS(average_excess_returns_global.dropna(), aligned_shocks_with_const)
    results = model.fit()
    # Collect the beta coefficients
    betas[maturity] = results.params.drop('const')  # Exclude the intercept

# Convert the betas dictionary to a DataFrame
betas_df = pd.DataFrame(betas)

betas_df.head()

MissingDataError: exog contains inf or nans

### 6. Derive expression

### 7. Report θ, the predictive coefficient

### 8. Choose λ0(2) to match the risk premium on the 10-year bond and report the estimate.