In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from numpy.linalg import inv, solve
from scipy.stats import f

In [47]:

def load_data():
    fm_factors = pd.read_excel('../data/Problem_Set5.xlsx', sheet_name='Fama-French factors', header=2) 
    
    size_groups = ['Small', '2', '3', '4', 'Big']
    ret212_groups = ['Low', '2', '3', '4', 'High']
    portfolio_names = [f'{s}_{r}' for s in size_groups for r in ret212_groups]
    final_cols = ['Date'] + portfolio_names

    df = pd.read_excel('../data/Problem_Set5.xlsx', sheet_name='25_Size_212_Portfolios', header=[1,2])

    df_returns = df.iloc[:, 0:26].copy()
    df_returns.columns = final_cols

    df_size = df.iloc[:, [0] + list(range(28, 53))].copy()
    df_size.columns = final_cols

    df_ret212 = df.iloc[:, [0] + list(range(55, 80))].copy() 
    df_ret212.columns = final_cols

    return df_returns, df_size, df_ret212, fm_factors

df_returns, df_size, df_ret212, fm_factors = load_data()

In [49]:
def clean_data(df, replacement):
    df = df.copy()
    df.replace([-99.99, -999], replacement, inplace=True)
    df['Date'] = pd.to_datetime(df.iloc[:, 0], format='%Y%m')
    df.set_index('Date', inplace=True)
    df.dropna(inplace=True)
    return df

df_returns = clean_data(df_returns, np.nan)
df_ret212 = clean_data(df_ret212, np.nan)
df_size = clean_data(df_size, np.nan)
fm_factors = clean_data(fm_factors, np.nan)
fm_factors = fm_factors.iloc[:, 1:]

def excess_returns(df, factors):
    # Align indices before subtracting
    aligned_df, aligned_factors = df.align(factors, join='inner', axis=0)
    df = aligned_df.subtract(aligned_factors['RF'], axis=0)
    return df

df_returns = excess_returns(df_returns, fm_factors)

print("Data cleaning complete.")
print("\n--- Returns ---")
print(df_returns.head(10))
print("\n--- Fama-French Factors ---")
print(fm_factors.head(10))
print("\n--- Size ---")
print(df_size.head(10))
print("\n--- Ret212 ---")
print(df_ret212.head(10))

df_returns.info()

Data cleaning complete.

--- Returns ---
           Small_Low Small_2 Small_3 Small_4 Small_High  2_Low    2_2    2_3  \
Date                                                                           
1927-01-01     -2.03    2.48    3.91    2.97       -5.2  -3.38   9.89   -0.7   
1927-02-01      6.31    5.72    6.08    7.18       8.61   4.66   7.28   5.24   
1927-03-01      -4.1    2.67   -4.58    -4.0      -1.46  -4.37  -5.11  -0.65   
1927-05-01      3.37    11.4    1.47   12.56      21.58   -0.2   3.29  12.36   
1927-06-01     -3.72   -4.52    0.49   -0.57      -9.54   -2.8  -0.01   1.72   
1927-07-01      3.12    4.73    4.65    5.27       8.54   5.57   7.29   5.06   
1927-08-01     -1.01   -4.14   -3.47   -1.14        4.1   2.74  -0.34    1.3   
1927-09-01       1.6   -2.92   -1.42     3.3       1.99   2.68  -0.41  -1.54   
1927-10-01     -7.92   -5.87    1.07   -1.28      -8.89  -2.79  -5.15  -3.12   
1927-11-01     20.77     3.8   15.77   11.78       8.06  12.71  11.29   8.64   

In [50]:
'''
Consider the following Fama-MacBeth cross-sectional regressions:
(1) Ri = γ0 + γM βiM + γsize ln(size) + γret212 (ret212) + ηi1,
(2) Ri = γ0 + γM βiM + γsize βiSMB + γUMD βiUMD + ηi2,
(3) Ri = γ0 + γM βiM + γsize ln(size) + γret212 (ret212) + γSMB βiSMB + γUMD βiUMD
+ ηi3,
where γ0, γM, γsize, γret212, γSMB, and γUMD are regression parameters. βiM, βiSMB, and βiUMD are betas with respect to the Fama-French factors SMB and UMD and size and ret212 are the average size and past 2-to-12-month return characteristics of the portfolio.
Estimate equations (1), (2), and (3) using the full sample of data and the Fama- MacBeth procedure. The following is a brief outline of the procedure:


1. Estimate βiM, βiSMB, and βiUMD for each portfolio by running a time-series regression for each of the 25 portfolios on the Fama-French factors RMRF, SMB, and UMD. Assume that the betas do not change over time; hence, you can estimate the betas using full-period OLS regressions.
2. Each month estimate the regressions using the month-by-month cross-section of realized returns on the 25 portfolios on their estimated betas and characteristics according to equations (1), (2), and (3).
3. Compute the time series average of the estimates of γ0, γM, γsize, γret212, γSMB, and γUMD. In addition, compute the standard error and t-stat of the time series averages in the style of Fama and MacBeth (1973).'''

def fama_macbeth_regression(df_returns, df_ret212, df_size, fm_factors):
    ## Step 1 : Estimate betas for each portfolio
    aligned_data = df_returns.join(fm_factors[['Mkt-RF', 'SMB', 'UMD']], how='inner')
    returns_aligned = aligned_data[df_returns.columns]
    factors_aligned = aligned_data[['Mkt-RF', 'SMB', 'UMD']]

    results_1 = []
    factors = sm.add_constant(factors_aligned.values)
    for col in returns_aligned.columns:
        Y = returns_aligned[col].values
        X = factors
        model = sm.OLS(Y, X).fit()
        betas = model.params[1:] 
        results_1.append({'Portfolio': col, 'Beta_Mkt': betas[0], 'Beta_SMB': betas[1], 'Beta_UMD': betas[2]})

    beta_df = pd.DataFrame(results_1)

    ## Step 2: Estimate the cross-sectional regressions
    market_beta_vector = beta_df['Beta_Mkt'].values
    smb_beta_vector = beta_df['Beta_SMB'].values
    umd_beta_vector = beta_df['Beta_UMD'].values
    size_vector = []
    ret212_vector = []
    for col in returns_aligned.columns:
        size_vector.append(df_size[col].mean())
        ret212_vector.append(df_ret212[col].mean())
    size_vector = np.array(size_vector)
    ret212_vector = np.array(ret212_vector)
    
    # (1)
    results_1 = []
    for i in returns_aligned.index:
        R_t = returns_aligned.loc[i].values
        X_t = np.column_stack((np.ones(len(R_t)), market_beta_vector, np.log(size_vector), ret212_vector))
        model_t = sm.OLS(R_t, X_t).fit()
        gammas_1 = model_t.params
        results_1.append({'Date': i, 'Gamma_0': gammas_1[0], 'Gamma_Mkt': gammas_1[1], 'Gamma_Size': gammas_1[2], 'Gamma_Ret212': gammas_1[3]}) 

    # (2)
    results_2 = []
    for i in returns_aligned.index:
        R_t = returns_aligned.loc[i].values
        X_t = np.column_stack((np.ones(len(R_t)), market_beta_vector, smb_beta_vector, umd_beta_vector))
        model_t = sm.OLS(R_t, X_t).fit()
        gammas_2 = model_t.params
        results_2.append({'Date': i, 'Gamma_0': gammas_2[0], 'Gamma_Mkt': gammas_2[1], 'Gamma_SMB': gammas_2[2], 'Gamma_UMD': gammas_2[3]}) 

    # (3) 
    results_3 = []
    for i in returns_aligned.index:
        R_t = returns_aligned.loc[i].values
        X_t = np.column_stack((np.ones(len(R_t)), market_beta_vector, smb_beta_vector, umd_beta_vector, np.log(size_vector), ret212_vector))
        model_t = sm.OLS(R_t, X_t).fit()
        gammas_3 = model_t.params
        results_3.append({'Date': i, 'Gamma_0': gammas_3[0], 'Gamma_Mkt': gammas_3[1], 'Gamma_SMB': gammas_3[2], 'Gamma_UMD': gammas_3[3], 'Gamma_Size': gammas_3[4], 'Gamma_Ret212': gammas_3[5]}) 

    ## Step 3: Compute the time series average of the estimates of γ0, γM, γsize, γret212, γSMB, and γUMD. In addition, compute the standard error and t-stat of the time series averages in the style of Fama and MacBeth (1973).
    
    results_list = [results_1, results_2, results_3]
    summary_dict = {}
    for idx, results in enumerate(results_list, 1):
        gamma_df = pd.DataFrame(results).set_index('Date')
        gamma_means = gamma_df.mean()
        gamma_stds = gamma_df.std()
        gamma_tstats = gamma_means / gamma_stds * np.sqrt(len(gamma_df))
        summary_df = pd.DataFrame({
            'Mean': gamma_means,
            'Std': gamma_stds,
            't-stat': gamma_tstats
        })
        print(f"\n--- Fama-MacBeth Results ({idx}) ---")
        print(summary_df)
        summary_dict[f'reg{idx}'] = summary_df

    summary_df = summary_dict

    factor_means = factors_aligned.mean()
    return summary_df, factor_means 

summary_df, factor_means = fama_macbeth_regression(df_returns, df_ret212, df_size, fm_factors)
print("\n--- Factor Means ---")
print(factor_means)




--- Fama-MacBeth Results (1) ---
                  Mean        Std    t-stat
Gamma_0       2.555974  14.607034  5.737182
Gamma_Mkt    -1.094127  13.491470 -2.658966
Gamma_Size   -0.098488   1.098988 -2.938299
Gamma_Ret212  0.007630   0.050839  4.921008

--- Fama-MacBeth Results (2) ---
               Mean        Std    t-stat
Gamma_0    1.483069  11.665212  4.168433
Gamma_Mkt -0.675128  12.221825 -1.811150
Gamma_SMB  0.422114   3.695548  3.745028
Gamma_UMD  0.676586   4.829186  4.593602

--- Fama-MacBeth Results (3) ---
                  Mean        Std    t-stat
Gamma_0       2.582146  17.732916  4.774247
Gamma_Mkt    -0.740413  12.248696 -1.981930
Gamma_SMB    -0.083800   6.889772 -0.398790
Gamma_UMD     0.346415   6.666129  1.703835
Gamma_Size   -0.135898   1.683716 -2.646361
Gamma_Ret212  0.004034   0.054884  2.410110

--- Factor Means ---
Mkt-RF    0.631023
SMB       0.224474
UMD       0.652623
dtype: float64


In [51]:
## Question f)
 
''' Repeat part f) using only data after January, 1963. 
Does your answer change in terms of whether characteristics or covariances better capture the cross-section of returns? 
Do you now feel more or less strongly about your answer to this question in general?'''

df_returns_post_1963 = df_returns[df_returns.index >= '1963-01-01']
df_size_post_1963 = df_size[df_size.index >= '1963-01-01']
df_ret212_post_1963 = df_ret212[df_ret212.index >= '1963-01-01']
fm_factors_post_1963 = fm_factors[fm_factors.index >= '1963-01-01']
summary_df_post_1963, factor_means_post_1963 = fama_macbeth_regression(df_returns_post_1963, df_ret212_post_1963, df_size_post_1963, fm_factors_post_1963)
print("\n--- Factor Means (Post 1963) ---")
print(factor_means_post_1963)



--- Fama-MacBeth Results (1) ---
                  Mean       Std    t-stat
Gamma_0       1.686146  7.788895  5.506448
Gamma_Mkt    -0.837282  9.792276 -2.174905
Gamma_Size   -0.043133  0.829862 -1.322078
Gamma_Ret212  0.008800  0.045844  4.882343

--- Fama-MacBeth Results (2) ---
               Mean       Std    t-stat
Gamma_0    1.115672  6.556163  4.328514
Gamma_Mkt -0.418084  7.695254 -1.381952
Gamma_SMB  0.253155  3.306210  1.947636
Gamma_UMD  0.736742  4.269977  4.388762

--- Fama-MacBeth Results (3) ---
                  Mean       Std    t-stat
Gamma_0       2.157011  9.188861  5.970941
Gamma_Mkt    -0.699567  9.550219 -1.863236
Gamma_SMB    -0.342131  5.979527 -1.455386
Gamma_UMD    -0.221358  5.947972 -0.946627
Gamma_Size   -0.120745  1.267081 -2.423902
Gamma_Ret212  0.011483  0.049502  5.900678

--- Factor Means (Post 1963) ---
Mkt-RF    0.518794
SMB       0.226105
UMD       0.664869
dtype: float64
