In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from numpy.linalg import inv, solve
from scipy.stats import f

In [23]:

def load_data(path='../data/Problem_Set5.xlsx'):
    """Read the factor sheet and the three 25-portfolio tables from the workbook.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Excel workbook. Defaults to the path that used to be
        hard-coded, so ``load_data()`` keeps working unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_returns, df_size, df_ret212, fm_factors)``. Each of the three
        portfolio frames has a ``'Date'`` column followed by 25 columns named
        ``'<size>_<ret212>'`` (``'Small_Low'`` ... ``'Big_High'``).
        ``fm_factors`` is the 'Fama-French factors' sheet read with
        ``header=2``. No cleaning or re-indexing happens here.
    """
    size_groups = ['Small', '2', '3', '4', 'Big']
    ret212_groups = ['Low', '2', '3', '4', 'High']
    portfolio_names = [f'{s}_{r}' for s in size_groups for r in ret212_groups]
    final_cols = ['Date'] + portfolio_names

    # Open the workbook once and parse both sheets from the same handle.
    with pd.ExcelFile(path) as workbook:
        fm_factors = pd.read_excel(workbook, sheet_name='Fama-French factors', header=2)
        df = pd.read_excel(workbook, sheet_name='25_Size_212_Portfolios', header=[1, 2])

    def _panel(value_columns):
        # Column 0 supplies the date for every table; the two-row header is
        # replaced by the flat '<size>_<ret212>' names.
        panel = df.iloc[:, [0] + list(value_columns)].copy()
        panel.columns = final_cols
        return panel

    # NOTE(review): the offsets assume three side-by-side 25-column tables
    # (returns, size, ret212) separated by two spacer columns -- confirm
    # against the sheet layout.
    df_returns = _panel(range(1, 26))
    df_size = _panel(range(28, 53))
    df_ret212 = _panel(range(55, 80))

    return df_returns, df_size, df_ret212, fm_factors

# Load the raw (uncleaned) tables; the cleaning cell below rebinds these same four names.
df_returns, df_size, df_ret212, fm_factors = load_data()



In [24]:
def clean_data(df, replacement):
    """Return a cleaned copy of ``df`` indexed by month.

    The sentinel values -99.99 and -999 are swapped for ``replacement``, the
    first column is parsed as ``YYYYMM`` into a ``'Date'`` index, and every row
    that still contains a missing value is dropped. The input frame is left
    untouched.
    """
    return (
        df.copy()
        .replace([-99.99, -999], replacement)
        # Parse whatever sits in the first column; a column already called
        # 'Date' is overwritten, otherwise 'Date' is appended as a new column.
        .assign(Date=lambda frame: pd.to_datetime(frame.iloc[:, 0], format='%Y%m'))
        .set_index('Date')
        .dropna()
    )

# Clean each raw table the same way: sentinel values become NaN, the first
# column becomes a monthly 'Date' index, and incomplete rows are dropped.
df_returns = clean_data(df_returns, replacement=np.nan)
df_ret212 = clean_data(df_ret212, replacement=np.nan)
df_size = clean_data(df_size, replacement=np.nan)
# NOTE(review): the leading column dropped here is presumably the raw YYYYMM
# column that clean_data leaves in place when it is not named 'Date' -- confirm.
fm_factors = clean_data(fm_factors, replacement=np.nan).iloc[:, 1:]

# Preview the first ten rows of every cleaned table under its own heading.
print("Data cleaning complete.")
print("\n--- Returns ---", df_returns.head(10), sep="\n")
print("\n--- Fama-French Factors ---", fm_factors.head(10), sep="\n")
print("\n--- Size ---", df_size.head(10), sep="\n")
print("\n--- Ret212 ---", df_ret212.head(10), sep="\n")

df_returns.info()

Data cleaning complete.

--- Returns ---
            Small_Low  Small_2  Small_3  Small_4  Small_High  2_Low    2_2  \
Date                                                                         
1927-01-01      -1.78     2.73     4.16     3.22       -4.95  -3.13  10.14   
1927-02-01       6.57     5.98     6.34     7.44        8.87   4.92   7.54   
1927-03-01      -3.80     2.97    -4.28    -3.70       -1.16  -4.07  -4.81   
1927-05-01       3.67    11.70     1.77    12.86       21.88   0.10   3.59   
1927-06-01      -3.46    -4.26     0.75    -0.31       -9.28  -2.54   0.25   
1927-07-01       3.42     5.03     4.95     5.57        8.84   5.87   7.59   
1927-08-01      -0.73    -3.86    -3.19    -0.86        4.38   3.02  -0.06   
1927-09-01       1.81    -2.71    -1.21     3.51        2.20   2.89  -0.20   
1927-10-01      -7.67    -5.62     1.32    -1.03       -8.64  -2.54  -4.90   
1927-11-01      20.98     4.01    15.98    11.99        8.27  12.92  11.50   

              2_3   2_

In [None]:
'''
Consider the following Fama-MacBeth cross-sectional regressions:
(1) Ri = γ0 + γM βiM + γsize ln(size) + γret212 (ret212) + ηi1,
(2) Ri = γ0 + γM βiM + γSMB βiSMB + γUMD βiUMD + ηi2,
(3) Ri = γ0 + γM βiM + γsize ln(size) + γret212 (ret212) + γSMB βiSMB + γUMD βiUMD
+ ηi3,
where γ0, γM, γsize, γret212, γSMB, and γUMD are regression parameters. βiM, βiSMB, and βiUMD are the betas with respect to the Fama-French factors RMRF, SMB, and UMD, and size and ret212 are the average size and past 2-to-12-month return characteristics of the portfolio.
Estimate equations (1), (2), and (3) using the full sample of data and the Fama-MacBeth procedure. The following is a brief outline of the procedure:


1. Estimate βiM, βiSMB, and βiUMD for each portfolio by running a time-series regression for each of the 25 portfolios on the Fama-French factors RMRF, SMB, and UMD. Assume that the betas do not change over time; hence, you can estimate the betas using full-period OLS regressions.
2. Each month estimate the regressions using the month-by-month cross-section of realized returns on the 25 portfolios on their estimated betas and characteristics according to equations (1), (2), and (3).
3. Compute the time series average of the estimates of γ0, γM, γsize, γret212, γSMB, and γUMD. In addition, compute the standard error and t-stat of the time series averages in the style of Fama and MacBeth (1973).'''

def _ols_coefficients(design, targets):
    """Return the OLS coefficients of every column of ``targets`` on ``design``.

    Computed as ``pinv(design) @ targets``, so all columns of ``targets`` are
    regressed on the same design matrix in one call. The pseudo-inverse is
    also what statsmodels' ``OLS(...).fit()`` uses by default.
    """
    return np.linalg.pinv(design) @ targets


def fama_macbeth_regression(df_returns, df_ret212, df_size, fm_factors, spec=2):
    """Run a two-pass Fama-MacBeth regression on the portfolio returns.

    Step 1 regresses each portfolio's full-sample return series on a constant
    and the factors ``'Mkt-RF'``, ``'SMB'`` and ``'UMD'`` to get one set of
    betas per portfolio. Step 2 regresses, month by month, the cross-section
    of returns on a constant and the regressors of the chosen specification.
    Step 3 averages the monthly coefficients over time.

    Returns are used exactly as passed in; no risk-free rate is subtracted
    inside this function.

    Parameters
    ----------
    df_returns : pandas.DataFrame
        Date-indexed portfolio returns, one column per portfolio.
    df_ret212 : pandas.DataFrame
        Date-indexed ret212 characteristic with the same column names. Only
        read for ``spec`` 1 and 3.
    df_size : pandas.DataFrame
        Date-indexed size characteristic with the same column names. Only
        read for ``spec`` 1 and 3.
    fm_factors : pandas.DataFrame
        Date-indexed factors; must contain ``'Mkt-RF'``, ``'SMB'``, ``'UMD'``.
    spec : {1, 2, 3}, optional
        Cross-sectional specification to summarise:

        * 1 -- market beta, ln(mean size), mean ret212
        * 2 -- market, SMB and UMD betas (default; what this function
          reported before the parameter existed)
        * 3 -- all five regressors

    Returns
    -------
    summary_df : pandas.DataFrame
        Indexed by ``'Gamma_0'`` and the gammas of the chosen spec, with
        columns ``'Mean'`` (time-series average), ``'Std'`` (time-series
        sample standard deviation of the monthly estimates) and ``'t-stat'``
        (``Mean / (Std / sqrt(T))``; the Fama-MacBeth standard error is
        therefore ``Std / sqrt(T)``).
    beta_df : pandas.DataFrame
        One row per portfolio: ``'Portfolio'``, ``'Beta_Mkt'``, ``'Beta_SMB'``,
        ``'Beta_UMD'``.
    factor_means : pandas.Series
        Mean of each factor over the months shared with ``df_returns``.

    Raises
    ------
    ValueError
        If ``spec`` is not 1, 2 or 3, or if returns and factors share no dates.
    """
    spec_regressors = {
        1: ['Gamma_Mkt', 'Gamma_Size', 'Gamma_Ret212'],
        2: ['Gamma_Mkt', 'Gamma_SMB', 'Gamma_UMD'],
        3: ['Gamma_Mkt', 'Gamma_SMB', 'Gamma_UMD', 'Gamma_Size', 'Gamma_Ret212'],
    }
    if spec not in spec_regressors:
        raise ValueError(f"spec must be 1, 2 or 3, got {spec!r}")

    ## Step 1: full-sample time-series betas for each portfolio
    factor_cols = ['Mkt-RF', 'SMB', 'UMD']
    portfolios = list(df_returns.columns)
    aligned_data = df_returns.join(fm_factors[factor_cols], how='inner')
    if aligned_data.empty:
        raise ValueError("df_returns and fm_factors have no dates in common")
    returns_aligned = aligned_data[portfolios]
    factors_aligned = aligned_data[factor_cols]
    n_months = len(aligned_data)

    returns_matrix = returns_aligned.to_numpy(dtype=float)  # one row per month, one column per portfolio
    ts_design = np.column_stack(
        (np.ones(n_months), factors_aligned.to_numpy(dtype=float))
    )
    # Row 0 holds the intercepts, rows 1-3 the factor loadings of every portfolio.
    ts_coefs = _ols_coefficients(ts_design, returns_matrix)
    beta_df = pd.DataFrame({
        'Portfolio': portfolios,
        'Beta_Mkt': ts_coefs[1],
        'Beta_SMB': ts_coefs[2],
        'Beta_UMD': ts_coefs[3],
    })

    ## Step 2: month-by-month cross-sectional regressions
    regressors = {
        'Gamma_Mkt': beta_df['Beta_Mkt'].to_numpy(),
        'Gamma_SMB': beta_df['Beta_SMB'].to_numpy(),
        'Gamma_UMD': beta_df['Beta_UMD'].to_numpy(),
    }
    if spec in (1, 3):
        # Characteristics are each portfolio's time-series mean, taken over the
        # characteristic frame's own rows; size enters as the log of that mean.
        regressors['Gamma_Size'] = np.log(df_size[portfolios].mean().to_numpy())
        regressors['Gamma_Ret212'] = df_ret212[portfolios].mean().to_numpy()

    gamma_names = spec_regressors[spec]
    cs_design = np.column_stack(
        [np.ones(len(portfolios))] + [regressors[name] for name in gamma_names]
    )
    # The design matrix does not change from month to month, so every monthly
    # cross-section is solved in a single call: one column of gammas per month.
    monthly_gammas = _ols_coefficients(cs_design, returns_matrix.T)
    gamma_df = pd.DataFrame(
        monthly_gammas.T,
        index=returns_aligned.index,
        columns=['Gamma_0'] + gamma_names,
    )

    ## Step 3: time-series average, dispersion and Fama-MacBeth t-stat
    gamma_means = gamma_df.mean()
    gamma_stds = gamma_df.std()
    gamma_tstats = gamma_means / gamma_stds * np.sqrt(n_months)

    summary_df = pd.DataFrame({
        'Mean': gamma_means,
        'Std': gamma_stds,
        't-stat': gamma_tstats
    })

    factor_means = factors_aligned.mean()
    return summary_df, beta_df, factor_means

# Run the Fama-MacBeth procedure on the cleaned tables and show each result
# under its own heading.
summary_df, beta_df, factor_means = fama_macbeth_regression(
    df_returns, df_ret212, df_size, fm_factors
)
print("\n--- Fama-MacBeth Results ---", summary_df, sep="\n")
print("\n--- Estimated Betas ---", beta_df, sep="\n")
print("\n--- Factor Means ---", factor_means, sep="\n")




--- Fama-MacBeth Results ---
                  Mean        Std    t-stat
Gamma_0       2.114419  12.716798  5.451519
Gamma_Mkt    -1.076665  12.861325 -2.744726
Gamma_SMB     0.327007   4.288419  2.500136
Gamma_UMD     0.267692   6.716451  1.306774
Gamma_Size   -0.000008   0.000307 -0.870512
Gamma_Ret212  0.004608   0.055798  2.707775

--- Estimated Betas ---
     Portfolio  Beta_Mkt  Beta_SMB  Beta_UMD
0    Small_Low  1.055137  1.482186 -0.763422
1      Small_2  1.012339  1.306697 -0.463016
2      Small_3  0.964647  1.236010 -0.309949
3      Small_4  1.047558  1.249423 -0.048638
4   Small_High  1.088175  1.317917  0.224089
5        2_Low  1.162817  1.005224 -0.712250
6          2_2  1.023332  0.903277 -0.401841
7          2_3  0.986705  0.755271 -0.173890
8          2_4  1.006544  0.884680  0.044657
9       2_High  1.146399  1.008446  0.323850
10       3_Low  1.178233  0.609542 -0.780246
11         3_2  1.062629  0.523649 -0.397244
12         3_3  1.007449  0.495319 -0.198996
13     