### Emprical Asset Pricing - Problem Set 1

## 1. Time-Series Predictability of returns and dividend growth

### 1.a Data Extraction and Cleaning

In [1]:
# Packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import wrds
from datetime import datetime, timedelta
import warnings
from pandas.tseries.offsets import MonthEnd
warnings.simplefilter('ignore') # 

# Setups
pd.set_option("display.max_rows", 100)

In [2]:
# Set Up WRDS connection
db = wrds.Connection(wrds_username='vince_solis') # make sure to change the username. 

Loading library list...
Done


In [3]:
def get_libraries(db, printn = False):
    """
    get list of libraries
    """

    # List all libraries in WRDS
    libs = db.list_libraries()
    libs.__class__  # Notice that libs is a list.
    libs = pd.DataFrame({'libraries': libs})  # Transform libs to a Pandas data frame to have a better display.
    libs = libs.sort_values(['libraries']).reset_index(drop = True)
    
    # https://wrds-www.wharton.upenn.edu/pages/browse-data-concept/
    if printn:
        print(libs.to_string())  # tr_mutualfunds, tr_13f, tr_13f
    else:
        return libs

def get_tables_in_library(db, library):
    """
    List of tables in a library
    """

    return db.list_tables(library=library)

In [99]:
# Get specific columns within a particular range from table dsi (daily stock indices) from library crsp_q_stock
start_date, end_date = dt.date(1945, 1, 1), dt.date(2023, 12, 31)
start_date, end_date = start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

# df = db.raw_sql("SELECT date, vwretd, vwretx  FROM crsp_q_stock.dsi WHERE (date BETWEEN '{}' AND '{}')".format(start_date, end_date))
# df.head()

In [32]:
df = pd.read_csv('./data/crsp_1945-2023.csv', index_col=0)
df.index = pd.to_datetime(df.index).to_period('M')
df.head()

Unnamed: 0_level_0,vwretd,vwretx
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1945-01,0.020218,0.018951
1945-02,0.064477,0.059894
1945-03,-0.039177,-0.043164
1945-04,0.078232,0.076981
1945-05,0.018185,0.012439


In [33]:
ff = pd.read_csv('./data/F-F_Rf.csv', infer_datetime_format=True, index_col=0)
ff.index = pd.to_datetime(ff.index).to_period('M')
ff.head()

Unnamed: 0_level_0,rf
date,Unnamed: 1_level_1
1926-07,0.22
1926-08,0.25
1926-09,0.23
1926-10,0.32
1926-11,0.31


### 1.b Computing monthly dividend

$$ 
\frac{D_t}{P_t} = \frac{P_{t+1} + D_{t+1}}{P_t} -  \frac{P_{t+1}}{P_t} \quad \Rightarrow \quad D_t = \frac{D_t}{P_t} \times P_t 
$$


In [34]:
df['dividend_ret'] = df['vwretd'] - df['vwretx']
df.head()

Unnamed: 0_level_0,vwretd,vwretx,dividend_ret
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1945-01,0.020218,0.018951,0.001267
1945-02,0.064477,0.059894,0.004583
1945-03,-0.039177,-0.043164,0.003987
1945-04,0.078232,0.076981,0.001251
1945-05,0.018185,0.012439,0.005746


Constructing a price index, here we set the intial price as 1

In [35]:
df['price_index'] = (1 + df['vwretd']).cumprod()
# df['price_index'] = df['price_index']*10
df.head()

Unnamed: 0_level_0,vwretd,vwretx,dividend_ret,price_index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1945-01,0.020218,0.018951,0.001267,1.020218
1945-02,0.064477,0.059894,0.004583,1.085999
1945-03,-0.039177,-0.043164,0.003987,1.043452
1945-04,0.078232,0.076981,0.001251,1.125084
1945-05,0.018185,0.012439,0.005746,1.145543


In [36]:
df['dividend'] = df['dividend_ret'] * df['price_index'].shift(1)
df.fillna(method='bfill',inplace=True)

### 1.c Aggregating dividends

We assume dividend being reinvested at risk-free rate and alternatively into aggregate stock market. 

In [83]:
df_temp = df.merge(ff, left_index=True, right_index=True)
df_temp.head()

Unnamed: 0_level_0,vwretd,vwretx,dividend_ret,price_index,dividend,rf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1945-01,0.020218,0.018951,0.001267,1.020218,0.004676,0.03
1945-02,0.064477,0.059894,0.004583,1.085999,0.004676,0.02
1945-03,-0.039177,-0.043164,0.003987,1.043452,0.00433,0.02
1945-04,0.078232,0.076981,0.001251,1.125084,0.001305,0.03
1945-05,0.018185,0.012439,0.005746,1.145543,0.006465,0.03


In [84]:
# transfer the risk free rate to numarical and adjust the scale
df_temp['rf'] = pd.to_numeric(df_temp['rf'], errors='coerce')/100
df_temp['compound_month'] = 12 - df_temp.index.month
df_temp.head()

Unnamed: 0_level_0,vwretd,vwretx,dividend_ret,price_index,dividend,rf,compound_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1945-01,0.020218,0.018951,0.001267,1.020218,0.004676,0.0003,11
1945-02,0.064477,0.059894,0.004583,1.085999,0.004676,0.0002,10
1945-03,-0.039177,-0.043164,0.003987,1.043452,0.00433,0.0002,9
1945-04,0.078232,0.076981,0.001251,1.125084,0.001305,0.0003,8
1945-05,0.018185,0.012439,0.005746,1.145543,0.006465,0.0003,7


In [85]:
def compound_dividend(row, reinvestment_rate):
    """
    calculates the dividend compouned at reinvestment_rate(geometric return)
    """
    rate = row[reinvestment_rate] if reinvestment_rate == 'rf' else row['vwretd']
    months = row['compound_month']
    return row['dividend'] * ((1 + rate)** months )

In [86]:
df_temp['dividend_compounded_cash'] = df_temp.apply(compound_dividend, reinvestment_rate='rf', axis=1)
df_temp['dividend_compounded_stock'] = df_temp.apply(compound_dividend, reinvestment_rate='vwretd', axis=1)

In [87]:
df_temp

Unnamed: 0_level_0,vwretd,vwretx,dividend_ret,price_index,dividend,rf,compound_month,dividend_compounded_cash,dividend_compounded_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1945-01,0.020218,0.018951,0.001267,1.020218,0.004676,0.0003,11,0.004691,0.005827
1945-02,0.064477,0.059894,0.004583,1.085999,0.004676,0.0002,10,0.004685,0.008734
1945-03,-0.039177,-0.043164,0.003987,1.043452,0.004330,0.0002,9,0.004338,0.003022
1945-04,0.078232,0.076981,0.001251,1.125084,0.001305,0.0003,8,0.001308,0.002385
1945-05,0.018185,0.012439,0.005746,1.145543,0.006465,0.0003,7,0.006478,0.007334
...,...,...,...,...,...,...,...,...,...
2023-08,-0.020300,-0.022082,0.001782,3772.627863,6.862124,0.0045,4,6.986478,6.321658
2023-09,-0.048003,-0.049226,0.001223,3591.530408,4.613924,0.0043,3,4.673700,3.980862
2023-10,-0.029225,-0.030265,0.001040,3486.567932,3.735192,0.0047,2,3.770385,3.520060
2023-11,0.093358,0.091189,0.002169,3812.066941,7.562366,0.0044,1,7.595640,8.268373


### 1.d Construct non-overlapping annual returns, annual dividend growth, log price dividend ratio for cash invested and market-invested dividend. 

In [191]:
df_temp['year'] = df_temp.index.year

# Compute Compounded Annual Returns for vwretx
compounded_annual_returns_dv = df_temp.groupby('year')['vwretd'].apply(lambda x: (x + 1).prod() - 1)

# Compute Compounded Annual Returns for vwretx
compounded_annual_returns_xdv = df_temp.groupby('year')['vwretx'].apply(lambda x: (x + 1).prod() - 1)

# Adjusting calculations for annual dividends and their growth
annual_div_zero = df_temp.groupby('year')['dividend'].sum()
annual_div_cash = df_temp.groupby('year')['dividend_compounded_cash'].sum()
annual_div_stock = df_temp.groupby('year')['dividend_compounded_stock'].sum()

# Calculate annual dividend growth as the percentage difference (year-over-year change)
dg_zero = annual_div_zero.pct_change()
dg_cash = annual_div_cash.pct_change()
dg_stock = annual_div_stock.pct_change()

price_index_annual = df_temp.groupby('year')['price_index'].last()

log_price_dividend_ratio_zero = np.log(price_index_annual / annual_div_zero)
log_price_dividend_ratio_cash = np.log(price_index_annual / annual_div_cash)
log_price_dividend_ratio_stock = np.log(price_index_annual / annual_div_cash)

# Creating a new DataFrame to hold all calculated data
annual_df = pd.DataFrame({
    'ret_dv': compounded_annual_returns_dv,
    'ret_xdv': compounded_annual_returns_xdv,
    'g_Zero': dg_zero,
    'g_Cash': dg_cash,
    'g_Stock': dg_stock,
    'Log_PD_Zero': log_price_dividend_ratio_zero,
    'Log_PD_Cash': log_price_dividend_ratio_cash,
    'Log_PD_Stock': log_price_dividend_ratio_stock
})

# Resetting index to have 'year' as a column
annual_df.reset_index(inplace=True)

In [192]:
annual_df.describe().round(2)

Unnamed: 0,year,ret_dv,ret_xdv,g_Zero,g_Cash,g_Stock,Log_PD_Zero,Log_PD_Cash,Log_PD_Stock
count,79.0,79.0,79.0,78.0,78.0,78.0,79.0,79.0,79.0
mean,1984.0,0.13,0.09,0.1,0.1,0.1,3.57,3.55,3.55
std,22.95,0.17,0.17,0.08,0.08,0.15,0.43,0.43,0.43
min,1945.0,-0.38,-0.4,-0.14,-0.15,-0.16,2.77,2.77,2.77
25%,1964.5,0.0,-0.03,0.06,0.06,-0.0,3.26,3.24,3.24
50%,1984.0,0.16,0.12,0.09,0.09,0.09,3.52,3.5,3.5
75%,2003.5,0.25,0.22,0.13,0.13,0.18,3.94,3.94,3.94
max,2023.0,0.5,0.43,0.32,0.32,0.63,4.49,4.46,4.46


In [193]:
# compute mean and volatility
mean_growth_cash = annual_df['g_Cash'].mean()
volatility_growth_cash = annual_df['g_Cash'].std()
mean_growth_market = annual_df['g_Stock'].mean()
volatility_growth_market = annual_df['g_Stock'].std()

print(f"Dividend growth mean_reinvest in cash market: {mean_growth_cash}")
print(f"Dividend growth volatility_reinvest in cash market: {volatility_growth_cash}")
print(f"Dividend growth mean_reinvest in stock market: {mean_growth_market}")
print(f"Dividend growth volatility_reinvest in stock market: {volatility_growth_market}")

Dividend growth mean_reinvest in cash market: 0.09844036884563737
Dividend growth volatility_reinvest in cash market: 0.07833731315087886
Dividend growth mean_reinvest in stock market: 0.10387231928764912
Dividend growth volatility_reinvest in stock market: 0.1457956908109608


Discussion:

### 1.e Predict log returns and log dividend growth using the lagged log price-dividend ratio. 

In [194]:
import statsmodels.api as sm

# Assuming 'annual_df' is prepared as previously described
annual_df['Log_PD_Cash_Lagged'] = annual_df['Log_PD_Cash'].shift(1)
annual_df['Log_Return'] = np.log(1 + annual_df['ret_dv'])
annual_df['Log_Dividend_Growth'] = np.log(1 + annual_df['g_Cash'])

# Dropping NaN values that result from lagging
annual_df.dropna(inplace=True)

In [195]:
def run_regression_sm(data, dependent_var, independent_var):
    # Adding a constant to the independent variable for the intercept
    X = sm.add_constant(data[independent_var])
    y = data[dependent_var]
    
    model = sm.OLS(y, X).fit()
    print(model.summary())

In [196]:
def regress_and_report(data, dependent_var, independent_var, sub_sample=None):
    data_copy = data.copy()
    if sub_sample is not None:
        data_copy = data_copy[(data_copy['Year'] >= sub_sample[0]) & (data_copy['Year'] <= sub_sample[1])]

    lagged_var = 'lagged_' + independent_var
    data_copy[lagged_var] = data_copy[independent_var].shift(1)

    data_copy.dropna(subset=[lagged_var], inplace=True)

    X = sm.add_constant(data_copy[lagged_var])
    y = data_copy[dependent_var]

    model = sm.OLS(y, X).fit()

    return {
        'coefficients': model.params,
        'R_squared': model.rsquared,
        'model_summary': model.summary()
    }

In [197]:
results_return_full = regress_and_report(
    annual_df,
    'Log_Return',
    'Log_PD_Cash_Lagged',
    sub_sample=None
)

# For predicting cash-invested dividend growth using the lagged log price-dividend ratio:
results_div_growth_full = regress_and_report(
    annual_df,
    'Log_Dividend_Growth',
    'Log_PD_Cash_Lagged',
    sub_sample=None
)

In [198]:
print("Full Sample Regression Analysis")
print(f"\nPredicting Returns:{results_return_full['model_summary']}")

Full Sample Regression Analysis

Predicting Returns:                            OLS Regression Results                            
Dep. Variable:             Log_Return   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     4.779
Date:                Mon, 25 Mar 2024   Prob (F-statistic):             0.0319
Time:                        22:22:31   Log-Likelihood:                 32.308
No. Observations:                  77   AIC:                            -60.62
Df Residuals:                      75   BIC:                            -55.93
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------

In [199]:
print("Full Sample Regression Analysis")
print(f"\nPredicting Dividend Growth:{results_div_growth_full['model_summary']}")

Full Sample Regression Analysis

Predicting Dividend Growth:                             OLS Regression Results                            
Dep. Variable:     Log_Dividend_Growth   R-squared:                       0.065
Model:                             OLS   Adj. R-squared:                  0.052
Method:                  Least Squares   F-statistic:                     5.172
Date:                 Mon, 25 Mar 2024   Prob (F-statistic):             0.0258
Time:                         22:22:31   Log-Likelihood:                 97.172
No. Observations:                   77   AIC:                            -190.3
Df Residuals:                       75   BIC:                            -185.7
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------

#### Sub-sample regression

In [200]:
# Perform regression analysis for the sub-samples (1945-1990 and 1990-2021)
for start_year, end_year in [(1945, 1990), (1990, 2021)]:
    sub_df = annual_df[(annual_df['year'] >= start_year) & (annual_df['year'] <= end_year)]
    
    print(f"\nSub-sample: {start_year}-{end_year} Regression Analysis")
    print("\nPredicting Returns:")
    run_regression_sm(sub_df, 'Log_Return', 'Log_PD_Cash_Lagged')

    print("\nPredicting Dividend Growth:")
    run_regression_sm(sub_df, 'Log_Dividend_Growth', 'Log_PD_Cash_Lagged')


Sub-sample: 1945-1990 Regression Analysis

Predicting Returns:
                            OLS Regression Results                            
Dep. Variable:             Log_Return   R-squared:                       0.236
Model:                            OLS   Adj. R-squared:                  0.219
Method:                 Least Squares   F-statistic:                     13.31
Date:                Mon, 25 Mar 2024   Prob (F-statistic):           0.000708
Time:                        22:22:33   Log-Likelihood:                 26.397
No. Observations:                  45   AIC:                            -48.79
Df Residuals:                      43   BIC:                            -45.18
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------

#### Full Report for Full Sample, 1945-1990, 1990-2021

In [201]:
def get_regression_stats_and_r2(data, dependent_var, independent_var):
    X = sm.add_constant(data[independent_var])
    y = data[dependent_var]
    model = sm.OLS(y, X).fit()
    beta = model.params[1]
    t_beta = model.tvalues[1]
    r_squared = model.rsquared
    return beta, t_beta, r_squared

# Initialize a dictionary to hold the results with R^2 values
results = {
    'Sample': ['Full Sample', '1945-1990', '1990-2021'],
    'beta_d': [],
    't_beta_d': [],
    'R2_d': [],
    'beta_r': [],
    't_beta_r': [],
    'R2_r': []
}

# Full Sample
beta_d, t_beta_d, r2_d = get_regression_stats_and_r2(annual_df, 'Log_Dividend_Growth', 'Log_PD_Cash_Lagged')
beta_r, t_beta_r, r2_r = get_regression_stats_and_r2(annual_df, 'Log_Return', 'Log_PD_Cash_Lagged')
results['beta_d'].append(beta_d)
results['t_beta_d'].append(t_beta_d)
results['R2_d'].append(r2_d)
results['beta_r'].append(beta_r)
results['t_beta_r'].append(t_beta_r)
results['R2_r'].append(r2_r)

# Sub-Samples
for start_year, end_year in [(1945, 1990), (1990, 2021)]:
    sub_df = annual_df[(annual_df['year'] >= start_year) & (annual_df['year'] <= end_year)]
    
    beta_d, t_beta_d, r2_d = get_regression_stats_and_r2(sub_df, 'Log_Dividend_Growth', 'Log_PD_Cash_Lagged')
    beta_r, t_beta_r, r2_r = get_regression_stats_and_r2(sub_df, 'Log_Return', 'Log_PD_Cash_Lagged')
    results['beta_d'].append(beta_d)
    results['t_beta_d'].append(t_beta_d)
    results['R2_d'].append(r2_d)
    results['beta_r'].append(beta_r)
    results['t_beta_r'].append(t_beta_r)
    results['R2_r'].append(r2_r)

results_df = pd.DataFrame(results)

In [202]:
results_df

Unnamed: 0,Sample,beta_d,t_beta_d,R2_d,beta_r,t_beta_r,R2_r
0,Full Sample,-0.034232,-1.840875,0.042686,-0.108912,-2.561629,0.079479
1,1945-1990,-0.082343,-2.337006,0.1127,-0.315011,-3.648901,0.236431
2,1990-2021,0.056158,1.034359,0.034435,-0.194368,-1.765157,0.094087


### 1.f Campbell-Shiller Identify

$$
    pd_{t+1} = a_{pd} + \phi pd_t + \epsilon^{pd}_{t+1}
$$

In [209]:
annual_df['Log_PD_Lagged'] = annual_df['Log_PD_Cash'].shift(1).dropna()

X = sm.add_constant(annual_df['Log_PD_Lagged'][1:])
y = annual_df['Log_PD_Cash'][1:]

model_pd = sm.OLS(y, X).fit()
print(F'AR(1) Model for Log Price-Dividend Ratio::{model_pd.summary()}')

AR(1) Model for Log Price-Dividend Ratio::                            OLS Regression Results                            
Dep. Variable:            Log_PD_Cash   R-squared:                       0.841
Model:                            OLS   Adj. R-squared:                  0.839
Method:                 Least Squares   F-statistic:                     396.9
Date:                Mon, 25 Mar 2024   Prob (F-statistic):           1.11e-31
Time:                        22:29:45   Log-Likelihood:                 26.758
No. Observations:                  77   AIC:                            -49.52
Df Residuals:                      75   BIC:                            -44.83
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
cons

In [215]:
#compute pd_mean and rho
pd_mean = annual_df['Log_PD_Cash'].mean()
rho = np.exp(pd_mean)/(1+np.exp(pd_mean))
rho

0.9722687688693198

In [216]:
phi = model_pd.params[1]  # This is the estimated φ from the AR(1) model.

# Placeholder values for b_r and b_d. Replace these with actual values from your regressions.
b_r = -0.108925  # Coefficient from return predictability regression
b_d = -0.005312  # Coefficient from dividend growth predictability regression

discount_rate_news = -b_r / (1 - rho * phi) * annual_df['Log_PD_Cash']
cash_flow_news = b_d / (1 - rho * phi) * annual_df['Log_PD_Cash']

Divide both side by $Var(pd_t)$ to estimate how much variation is due to discount rate news and cash flow news

In [217]:
var_pd = annual_df['Log_PD_Cash'].var()
cov_cash_flow = np.cov(cash_flow_news, annual_df['Log_PD_Cash'])[0, 1]
cov_discount_rate = np.cov(discount_rate_news, annual_df['Log_PD_Cash'])[0, 1]

In [218]:
discount_rate_proportion = cov_discount_rate / var_pd
cash_flow_proportion = cov_cash_flow / var_pd

print("Variance Decomposition:")
print("Proportion due to Discount Rate News:", discount_rate_proportion)
print("Proportion due to Cash Flow News:", cash_flow_proportion)

Variance Decomposition:
Proportion due to Discount Rate News: 1.052006010892177
Proportion due to Cash Flow News: -0.051303703739814036


### 1.g The present-value identity implies restrictions between $b_r$, $b_d$ and $\phi$. Derive the connection between the coefficients. 

#### 1.h Predict cumulative returns using lagged PD for n = {1, ..., 5}. Is there more predictability over longer horizons?

#### 1.i Assume log pd ratio follow AR(1), derive the connection between the coefficients $b_{r,n}$, $b_{d,n}$ and $\phi$.