# Finance Project III — CAPM & FF3F on Ken French 5×5 (ME × BE/ME) Portfolios (Monthly)

This notebook:
- Downloads **Ken French** 5×5 portfolios (ME × BE/ME, monthly, value-weighted) and **FF3 factors**.
- Lets you set a **date range** and choose whether descriptives use **raw or excess** returns.
- Runs **Time-Series (TS)** tests for **CAPM** and **FF3F**:
  - Per portfolio: α, s.e.(α), t(α), β’s, R²; report **Average R²**; run **GRS** (joint α = 0).
- Runs **Fama–MacBeth (FMB)** cross-sectional pricing for **CAPM** and **FF3F** with rolling betas.
- Shows **clean tables** and **simple Matplotlib visuals** (no seaborn, one chart per figure, no custom colors).

In [15]:
import re
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from numpy.linalg import inv
from scipy.stats import f as f_dist
from io import StringIO

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Clean tables
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)
pd.set_option("display.float_format", lambda x: f"{x:,.6f}")

# ---------- small statistical helpers ----------
def monthly_sharpe(excess_series: pd.Series) -> float:
    s = excess_series.dropna()
    if s.shape[0] < 2:
        return np.nan
    mu = s.mean()
    sd = s.std(ddof=1)
    return np.nan if sd == 0 else mu / sd

def tstat_of_mean(series: pd.Series) -> float:
    s = series.dropna()
    T = s.shape[0]
    if T < 2:
        return np.nan
    mu = s.mean()
    sd = s.std(ddof=1)
    if sd == 0:
        return np.nan
    se = sd / np.sqrt(T)
    if se == 0:
        return np.nan
    return mu / se

def ols_with_const(y: pd.Series, X: pd.DataFrame):
    Xc = sm.add_constant(X)
    return sm.OLS(y, Xc, missing="drop").fit()

def grs_test_with_alphas(alphas, factor_means, factor_cov, residual_cov, T, N, L):
    """
    Gibbons–Ross–Shanken (1989) F-stat for joint α = 0 across N assets, L factors.
    Returns (F, pval).
    """
    a = np.asarray(alphas, dtype=float).reshape(-1, 1)
    m = np.asarray(factor_means, dtype=float).reshape(-1, 1)
    Sigma_f_inv = inv(np.asarray(factor_cov, dtype=float))
    Sigma_e_inv = inv(np.asarray(residual_cov, dtype=float))

    term = float(m.T @ Sigma_f_inv @ m)
    numer = T / N * (T - N - L) / (T - L - 1)
    denom = 1.0 + term

    print(numer, float(a.T @ Sigma_e_inv @ a), denom)
    print("alphas:", a)
    print("factor means:", m)
    print("Sigma_f_inv:", Sigma_f_inv)
    print("Sigma_e_inv:", Sigma_e_inv)
    print("factor cov:", factor_cov)

    F = numer * float(a.T @ Sigma_e_inv @ a) / denom
    df1, df2 = N, T - N - L
    if df2 <= 0:
        return np.nan, np.nan
    pval = 1.0 - f_dist.cdf(F, df1, df2)
    return F, pval

In [38]:
URL_FF3_MONTHLY = 'https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip'
URL_KF_25_MONTHLY = 'https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/25_Portfolios_5x5_CSV.zip'

FF3_MONTHLY_FILE_NAME = 'F-F_Research_Data_Factors.csv'
KF_25_MONTHLY_FILE_NAME = '25_Portfolios_5x5.csv'

FF3_L = 3
CAPM_L = 1

import requests, zipfile, os
from io import BytesIO

def download_and_save_kf_data(url: str, save_dir: str = "."):
    """
    Download Ken French dataset ZIP file, extract all CSV files, and save to the specified directory.

    Args:
        url (str): URL to the Ken French ZIP file.
        save_dir (str): Directory to save extracted CSVs (default: current folder).
    Returns:
        List of saved file paths.
    """
    os.makedirs(save_dir, exist_ok=True)
    
    resp = requests.get(url)
    resp.raise_for_status()

    with zipfile.ZipFile(BytesIO(resp.content)) as z:
        saved_files = []
        for fname in z.namelist():
            if fname.endswith('.csv'):
                out_path = os.path.join(save_dir, os.path.basename(fname))
                with z.open(fname) as f_in, open(out_path, 'wb') as f_out:
                    f_out.write(f_in.read())
                saved_files.append(out_path)
    
    return saved_files

download_and_save_kf_data(URL_FF3_MONTHLY)
download_and_save_kf_data(URL_KF_25_MONTHLY)

['.\\25_Portfolios_5x5.csv']

In [39]:
import pandas as pd
import numpy as np
import re
from io import StringIO

def _first_monthly_index_csv(lines):
    """
    Return the line index where monthly data start (first token YYYYMM), 
    assuming comma-separated rows.
    """
    for i, line in enumerate(lines):
        toks = [t.strip() for t in line.strip().split(",")]
        if len(toks) > 0 and toks[0].isdigit() and len(toks[0]) == 6:
            return i
    return None

def _dedupe(names):
    """
    Make column names unique by appending _1, _2, ... to duplicates.
    """
    out, seen = [], {}
    for n in names:
        n = (n or "").strip()
        if n == "":
            n = "COL"
        if n in seen:
            seen[n] += 1
            out.append(f"{n}_{seen[n]}")
        else:
            seen[n] = 0
            out.append(n)
    return out

def _align_header_to_ncols(header, ncols):
    """
    Ensure header length == ncols; pad or truncate as needed.
    """
    header = list(header)
    if len(header) < ncols:
        pad = [f"COL{j}" for j in range(len(header)+1, ncols+1)]
        header = header + pad
    elif len(header) > ncols:
        header = header[:ncols]
    return header

# ---------- read Ken French 25-portfolios (monthly) from CSV (no Path) ----------
def read_kf_25_csv(filename: str) -> pd.DataFrame:
    with open(filename, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # Find the line index where the row contains 'Average Equal Weighted Returns -- Monthly'
    filter_start = None
    for i, line in enumerate(lines):
        if 'Average Equal Weighted Returns -- Monthly' in line:
            filter_start = i + 1  # start reading after this line
            break

    if filter_start is None:
        raise ValueError("Could not find line containing 'Average Equal Weighted Returns -- Monthly'")

    # Now find first monthly data index after this line
    start_idx = _first_monthly_index_csv(lines[filter_start:])
    if start_idx is None:
        raise ValueError("Could not locate YYYYMM data start in 25_Portfolios_5x5.csv")

    start_idx = start_idx + filter_start  # adjust relative to full lines

    # Find end index for data block: first empty line or where first token is not YYYYMM format
    end_idx = len(lines)
    for j in range(start_idx, len(lines)):
        first_token = lines[j].strip().split(",")[0]
        if not (first_token.isdigit() and len(first_token) == 6):
            end_idx = j
            break

    # Header is previous line; parse by commas (NOT whitespace)
    header_line = lines[start_idx - 1]
    header_raw = [t.strip() for t in header_line.strip().split(",")]

    # Detect ncols from the FIRST data row (comma-split)
    first_data_tokens = [t.strip() for t in lines[start_idx].strip().split(",")]
    ncols = len(first_data_tokens)

    # Force first column to 'YYYYMM'
    if not header_raw:
        header_raw = ["YYYYMM"]
    else:
        header_raw[0] = "YYYYMM"

    # align and dedupe
    header = _align_header_to_ncols(header_raw, ncols)
    header = _dedupe(header)

    # read the data block using comma separator with slice lines
    text = "".join(lines[start_idx:end_idx])
    df = pd.read_csv(StringIO(text), sep=",", engine="python", header=None, names=header)

    # keep only proper YYYYMM rows
    df = df[df["YYYYMM"].astype(str).str.isdigit()].copy()
    df["YYYYMM"] = df["YYYYMM"].astype(int)
    df["date"] = pd.to_datetime(df["YYYYMM"].astype(str) + "01", format="%Y%m%d") + pd.offsets.MonthEnd(0)

    # set index and drop YYYYMM
    df = df.set_index("date").drop(columns=["YYYYMM"])

    # keep first 25 portfolio columns (leftmost 25 after YYYYMM)
    keep_cols = list(df.columns)[:25]
    df = df[keep_cols].apply(pd.to_numeric, errors="coerce") / 100.0
    return df

# ---------- read FF factors (monthly only) from CSV (no Path) ----------
def read_ff3_monthly(filename: str) -> pd.DataFrame:
    with open(filename, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # find where Annual section starts (truncate monthly section there)
    stop_idx = None
    for i, line in enumerate(lines):
        if "Annual" in line or "ANNUAL" in line:
            stop_idx = i
            break
    monthly_lines = lines if stop_idx is None else lines[:stop_idx]

    start_idx = _first_monthly_index_csv(monthly_lines)
    if start_idx is None:
        raise ValueError("Could not locate YYYYMM monthly start in F-F_Research_Data_Factors.csv")

    header_line = monthly_lines[start_idx - 1]
    header_raw = [t.strip() for t in header_line.strip().split(",")]
    first_data_tokens = [t.strip() for t in monthly_lines[start_idx].strip().split(",")]
    ncols = len(first_data_tokens)

    # first column
    if not header_raw:
        header_raw = ["YYYYMM"]
    else:
        header_raw[0] = "YYYYMM"

    header = _align_header_to_ncols(header_raw, ncols)
    header = _dedupe(header)

    text = "".join(monthly_lines[start_idx:])
    df = pd.read_csv(StringIO(text), sep=",", engine="python", header=None, names=header)

    # keep only monthly numeric rows
    df = df[df["YYYYMM"].astype(str).str.isdigit()].copy()
    df["YYYYMM"] = df["YYYYMM"].astype(int)
    df["date"] = pd.to_datetime(df["YYYYMM"].astype(str) + "01", format="%Y%m%d") + pd.offsets.MonthEnd(0)

    # rename factors to RMRF/SMB/HML/RF
    rename_map = {}
    for c in df.columns:
        cu = c.strip().upper()
        if "MKT" in cu: rename_map[c] = "RMRF"
        elif cu == "SMB": rename_map[c] = "SMB"
        elif cu == "HML": rename_map[c] = "HML"
        elif cu == "RF" : rename_map[c] = "RF"
    df = df.rename(columns=rename_map)

    keep = [c for c in ["date","RMRF","SMB","HML","RF"] if c in (["date"] + list(df.columns))]
    df = df[keep].set_index("date").sort_index()

    # numeric & % → decimal
    for c in [col for col in ["RMRF","SMB","HML","RF"] if col in df.columns]:
        df[c] = pd.to_numeric(df[c], errors="coerce") / 100.0
    return df

# ---------- read both files (filenames only) ----------
ret_5x5 = read_kf_25_csv(f'{KF_25_MONTHLY_FILE_NAME}')
ff3     = read_ff3_monthly(f'{FF3_MONTHLY_FILE_NAME}')

# align on common months
data = ret_5x5.join(ff3, how="inner")

print("Portfolios:", ret_5x5.shape, "| Factors:", ff3.shape)
print("Merged data range:", data.index.min().date(), "→", data.index.max().date())
data.head()

Portfolios: (1190, 25) | Factors: (1190, 4)
Merged data range: 1926-07-31 → 2025-08-31


Unnamed: 0_level_0,SMALL LoBM,ME1 BM2,ME1 BM3,ME1 BM4,SMALL HiBM,ME2 BM1,ME2 BM2,ME2 BM3,ME2 BM4,ME2 BM5,ME3 BM1,ME3 BM2,ME3 BM3,ME3 BM4,ME3 BM5,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,BIG LoBM,ME5 BM2,ME5 BM3,ME5 BM4,BIG HiBM,RMRF,SMB,HML,RF
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1926-07-31,0.066093,-0.040865,-0.000429,0.011716,-0.014577,0.016951,0.018125,0.00175,-0.025128,-0.010845,0.014941,0.021933,0.004253,0.0381,-0.012519,0.010331,0.015548,0.020177,0.005859,0.026443,0.047298,0.037848,0.014997,0.039874,0.013568,0.0289,-0.0255,-0.0239,0.0022
1926-08-31,-0.002185,-0.05068,0.006545,0.037326,0.061829,0.018013,-0.011506,0.038679,0.000411,0.054438,-0.008037,0.020918,0.023308,0.039607,0.08178,0.009852,0.042129,0.013765,0.016731,0.057585,0.001742,0.028668,0.025618,0.045299,0.064265,0.0264,-0.0114,0.0381,0.0025
1926-09-30,-0.08418,-0.038457,-0.03856,-0.058837,0.048919,-0.035551,-0.003247,0.013,-0.027563,-0.020148,-0.007254,0.000901,-0.019093,0.026046,-0.037569,0.001808,-0.005257,-0.014875,0.021705,-0.003495,-0.013635,0.012336,0.004175,-0.010864,-0.026329,0.0038,-0.0136,0.0005,0.0023
1926-10-31,-0.083217,-0.042445,-0.069964,0.023415,-0.036554,-0.045372,-0.03392,-0.047092,-0.079607,-0.019664,-0.053439,-0.028176,0.003977,-0.031491,-0.020595,-0.033619,-0.026184,-0.018823,-0.034879,-0.060402,-0.026837,-0.027187,-0.036805,-0.036642,-0.066019,-0.0327,-0.0014,0.0082,0.0032
1926-11-30,0.007153,0.054507,0.015352,-0.036596,0.025351,-0.00458,-0.024352,0.027419,0.058661,0.019119,0.013771,0.043582,5.8e-05,0.05367,0.034893,0.030251,0.024947,0.035038,0.071146,0.016978,0.045427,0.036599,0.033339,0.025422,0.023429,0.0254,-0.0011,-0.0061,0.0031


In [40]:
data.tail()

Unnamed: 0_level_0,SMALL LoBM,ME1 BM2,ME1 BM3,ME1 BM4,SMALL HiBM,ME2 BM1,ME2 BM2,ME2 BM3,ME2 BM4,ME2 BM5,ME3 BM1,ME3 BM2,ME3 BM3,ME3 BM4,ME3 BM5,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,BIG LoBM,ME5 BM2,ME5 BM3,ME5 BM4,BIG HiBM,RMRF,SMB,HML,RF
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2025-04-30,-0.010697,0.027733,0.074539,0.000648,-0.007533,-6.1e-05,-0.012312,-0.032125,-0.042973,-0.070905,-0.004393,-0.029753,-0.043184,-0.061733,-0.039544,-0.012956,-0.012437,-0.023541,-0.048747,-0.064011,0.011799,-0.015817,-0.046797,-0.033215,-0.026231,-0.0084,-0.0059,-0.034,0.0035
2025-05-31,0.060147,0.018938,0.048715,0.08907,0.06293,0.058422,0.046872,0.065158,0.055387,0.077535,0.062496,0.046195,0.038191,0.042902,0.053976,0.05538,0.051684,0.04298,0.072303,0.050438,0.071456,0.03802,0.023924,0.031872,0.058086,0.0606,0.007,-0.0288,0.0038
2025-06-30,0.07571,0.04726,0.080858,0.044965,0.055007,0.059651,0.06392,0.078554,0.057364,0.053739,0.041281,0.03533,0.041829,0.03082,0.051282,0.017996,0.044709,0.027712,0.059609,0.049583,0.042396,0.047116,0.034408,0.031228,0.067347,0.0486,0.0083,-0.016,0.0034
2025-07-31,0.057514,0.047912,0.040334,0.046725,0.044838,0.023074,0.025361,0.027635,0.009883,0.015688,0.018952,0.015167,0.01384,0.009663,0.02954,0.029633,0.023736,0.006161,0.000475,-0.033011,0.00543,0.020283,0.018853,0.01452,-0.028712,0.0198,0.0027,-0.0127,0.0034
2025-08-31,0.053168,0.104251,0.107366,0.088523,0.065124,0.095506,0.064018,0.091247,0.082466,0.096307,0.052868,0.04857,0.054909,0.06015,0.090269,0.026179,0.021138,0.022985,0.062888,0.074024,0.005385,-0.000327,0.035751,0.051216,0.085305,0.0185,0.0387,0.0441,0.0038


In [41]:
def normalize_port_names(cols):
    out = []
    for c in cols:
        s = str(c).strip()
        s = s.replace("SMALL", "ME1").replace("BIG", "ME5")
        s = s.replace("LoBM", "BM1").replace("HiBM", "BM5")
        s = re.sub(r"\s+", " ", s).upper()
        s = s.replace(" ", "_")
        out.append(s)
    return out

factor_cols = [c for c in ["RMRF","SMB","HML","RF"] if c in data.columns]
port_cols   = [c for c in data.columns if c not in factor_cols]

new_port_cols = normalize_port_names(port_cols)
data = data.rename(columns={old:new for old,new in zip(port_cols, new_port_cols)})

factor_cols = [c for c in ["RMRF","SMB","HML","RF"] if c in data.columns]
port_cols   = [c for c in data.columns if c not in factor_cols]
port_cols[:10]

['ME1_BM1',
 'ME1_BM2',
 'ME1_BM3',
 'ME1_BM4',
 'ME1_BM5',
 'ME2_BM1',
 'ME2_BM2',
 'ME2_BM3',
 'ME2_BM4',
 'ME2_BM5']

## User Inputs

- `start`, `end` — inclusive sample window (YYYY-MM).
- `use_raw_for_descriptives` — True ⇒ descriptives on raw returns (TS/FMB always use **excess**).
- `fmb_beta_window` — rolling months for FMB betas.

In [42]:
# import re
# from datetime import datetime

# def prompt_date_range(dates):
#     """
#     Prompt user for a start and end date, normalize to YYYY-MM endpoints, and validate against the given date index.
#     Keeps looping until valid input is given for both.
#     """
#     min_date = dates.min()
#     max_date = dates.max()

#     def normalize_date(s):
#         """Try to convert user input to 'YYYY-MM' or Timestamp."""
#         s = s.strip()
#         patterns = [
#             ("%Y-%m", r"^\d{4}-\d{2}$"),
#             ("%m/%Y", r"^\d{2}/\d{4}$"),
#             ("%Y%m",  r"^\d{6}$"),
#             ("%b-%Y", r"^[A-Za-z]{3}-\d{4}$"),
#             ("%B-%Y", r"^[A-Za-z]+-\d{4}$"),
#         ]
#         for fmt, pat in patterns:
#             if re.match(pat, s):
#                 try:
#                     return pd.to_datetime(datetime.strptime(s, fmt)), None
#                 except Exception:
#                     return None, f"Could not parse '{s}'"
#         try:
#             return pd.to_datetime(s), None
#         except Exception:
#             return None, f"Could not parse '{s}'"

#     while True:
#         start_input = input(f"Enter start date on or after {min_date.strftime('%Y-%m')} (e.g., 2010-01): ").strip()
#         start, err_start = normalize_date(start_input)
#         if err_start:
#             print(err_start)
#             continue  # Invalid start date, retry

#         end_input = input(f"Enter end date on or before {max_date.strftime('%Y-%m')} (e.g., 2023-12): ").strip()
#         end, err_end = normalize_date(end_input)
#         if err_end:
#             print(err_end)
#             continue  # Invalid end date, retry

#         if end < start:
#             print("End date must be after or equal to start date.")
#             continue  # Retry on inverted range

#         if start < min_date or end > max_date:
#             print(f"Date range must be within {min_date.strftime('%Y-%m')} and {max_date.strftime('%Y-%m')}.")
#             continue  # Retry on out of range

#         # Normalize to month end
#         start = pd.to_datetime(start) + pd.offsets.MonthEnd(0)
#         end = pd.to_datetime(end) + pd.offsets.MonthEnd(0)

#         print(f"Validated date range: {start.date()} to {end.date()}")
#         return start, end
        

# def prompt_model_choice():
#     """
#     Prompt user for model choice. Options: CAPM, FF3F, Both (case-insensitive).
#     Returns:
#         One of: 'CAPM', 'FF3F', 'Both'
#     """
#     valid = {"capm": "CAPM", "ff3f": "FF3F", "both": "Both"}
#     while True:
#         inp = input("Choose model (CAPM, FF3F, Both): ").strip().lower()
#         if inp in valid:
#             print(f"Selected model: {valid[inp]}")
#             return valid[inp]
#         print("Invalid option. Please type CAPM, FF3F, or Both.")

# # Usage example:
# # start, end = prompt_date_range(data.index)
# # model = prompt_model_choice()


In [43]:
# sample window
start = "1963-01"
end   = "1993-12"

# descriptive stats mode
use_raw_for_descriptives = False   # True to show Mean/StdDev on raw returns

# FMB rolling window
fmb_beta_window = 60

# subset and rebuild lists
sample = data.loc[
    (data.index >= pd.to_datetime(start) + pd.offsets.MonthEnd(0)) &
    (data.index <= pd.to_datetime(end)   + pd.offsets.MonthEnd(0))
].copy()

factor_cols = [c for c in ["RMRF","SMB","HML","RF"] if c in sample.columns]
port_cols   = [c for c in sample.columns if c not in factor_cols]

# EXCESS returns for 25 portfolios
excess = sample[port_cols].sub(sample["RF"], axis=0)

print("Sample window:", sample.index.min().date(), "→", sample.index.max().date())
print("# portfolios:", len(port_cols), "| factors:", factor_cols)

Sample window: 1963-01-31 → 1993-12-31
# portfolios: 25 | factors: ['RMRF', 'SMB', 'HML', 'RF']


## Descriptive Statistics (per portfolio)

For each portfolio *i* we report:

- Mean return $E[r_i]$  
- Standard deviation $\sigma_i$  
- Sharpe ratio $SR_i = \frac{E[r_i - r_f]}{\sigma_i}$  
- *t-statistic of mean excess return* $t(E[r_i - r_f]) = \frac{E[r_i - r_f]}{s(r_i - r_f)/\sqrt{T}}$

In [44]:
base_for_stats = sample[port_cols] if use_raw_for_descriptives else excess

rows = []
for p in port_cols:
    rows.append({
        "Portfolio": p,
        "Mean": base_for_stats[p].mean(),
        "StdDev": base_for_stats[p].std(ddof=1),
        "Sharpe (monthly)": monthly_sharpe(excess[p]),
        "t(mean excess)": tstat_of_mean(excess[p]),
    })
desc_table = pd.DataFrame(rows).set_index("Portfolio")
desc_table

Unnamed: 0_level_0,Mean,StdDev,Sharpe (monthly),t(mean excess)
Portfolio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ME1_BM1,0.005704,0.078222,0.072924,1.406505
ME1_BM2,0.008908,0.06994,0.127366,2.456539
ME1_BM3,0.009944,0.065207,0.152497,2.941254
ME1_BM4,0.011591,0.062413,0.185707,3.581795
ME1_BM5,0.014047,0.064782,0.21684,4.182261
ME2_BM1,0.003447,0.072583,0.047484,0.915833
ME2_BM2,0.006928,0.062341,0.111133,2.143447
ME2_BM3,0.009076,0.056891,0.159525,3.076813
ME2_BM4,0.009667,0.052839,0.182949,3.528598
ME2_BM5,0.010859,0.060009,0.180963,3.490286


## Time-Series CAPM

We estimate for each portfolio *i*:

$$
r_{i,t} - r_{f,t} = \alpha_i + \beta_i (R_{M,t} - r_{f,t}) + \varepsilon_{i,t}
$$

Reported statistics:

- $\alpha_i$, $SE(\alpha_i)$, $t(\alpha_i)$  
- $\beta_i$, $SE(\beta_i)$, $t(\beta_i)$  
- $R_i^2$, Average $R^2$ across portfolios  
- GRS test for joint $\alpha_i = 0$ across all assets

In [47]:
def cov_div_T_minus_L(X, L):
    T = X.shape[0]
    # Center data
    X_centered = X - X.mean(axis=0)
    # Compute covariance matrix with divisor T-L
    cov_mat = (X_centered.T @ X_centered) / (T - L)
    return cov_mat


In [50]:
capm_rows, alphas_capm, resids_capm = [], [], []
X_capm = sample[["RMRF"]]

T_capm = sample.shape[0]
N_capm = len(port_cols)
L_capm = CAPM_L

for p in port_cols:
    y = excess[p]
    m = ols_with_const(y, X_capm)

    a    = m.params.get("const", np.nan)
    b    = m.params.get("RMRF", np.nan)
    se_a = m.bse.get("const", np.nan)
    se_b = m.bse.get("RMRF", np.nan)
    t_a  = m.tvalues.get("const", np.nan)
    t_b  = m.tvalues.get("RMRF", np.nan)
    r2   = m.rsquared

    capm_rows.append({
        "Portfolio": p,
        "Alpha": a, "SE(Alpha)": se_a, "t(Alpha)": t_a,
        "Beta_MKT": b, "SE(Beta_MKT)": se_b, "t(Beta_MKT)": t_b,
        "R2": r2
    })
    alphas_capm.append(a)
    resids_capm.append(m.resid)

capm_table = pd.DataFrame(capm_rows).set_index("Portfolio")
avg_r2_capm = capm_table["R2"].mean()

# residual covariance (N×N)
resid_mat_capm = pd.DataFrame({p: resids_capm[i] for i, p in enumerate(port_cols)}).dropna()
Sigma_e_capm   = cov_div_T_minus_L(resid_mat_capm, L_capm)

# factor means/cov
f_means_capm = [sample["RMRF"].mean()]
f_cov_capm   = cov_div_T_minus_L(sample[["RMRF"]], L_capm)

GRS_capm, p_capm = grs_test_with_alphas(alphas_capm, f_means_capm, f_cov_capm, Sigma_e_capm, T_capm, N_capm, L_capm)

capm_summary = pd.DataFrame({
    "Average R2": [avg_r2_capm],
    "GRS (CAPM)": [GRS_capm],
    "p-value": [p_capm],
    "T": [T_capm],
    "N": [N_capm]
})

display(capm_table)
display(capm_summary)

13.914810810810812 0.20610211725052538 1.0098756417033512
alphas: [[-0.00028127]
 [ 0.00358032]
 [ 0.00496524]
 [ 0.00691277]
 [ 0.00945833]
 [-0.00292771]
 [ 0.00142777]
 [ 0.00410887]
 [ 0.00508787]
 [ 0.00583985]
 [-0.00190294]
 [ 0.00189078]
 [ 0.00268736]
 [ 0.00498424]
 [ 0.00588794]
 [-0.00091329]
 [-0.00015083]
 [ 0.00205983]
 [ 0.00419079]
 [ 0.0051322 ]
 [-0.00141534]
 [ 0.00020448]
 [ 0.00078877]
 [ 0.00181861]
 [ 0.00332413]]
factor means: [[0.00442177]]
Sigma_f_inv: [[505.09399298]]
Sigma_e_inv: [[ 3.59000665e+03 -1.90936391e+03 -7.47233983e+02 -1.04983871e+03
  -2.25552860e+02 -1.70486728e+03  2.22315864e+02  2.91286720e+02
   6.60146806e+02 -1.15695642e+02  1.45515641e+02  3.58364303e+02
   2.02460582e+02  4.29225250e+02  3.58793831e+02  1.90805984e+02
  -1.23635168e+02  3.62277993e+01 -1.34552224e+02  2.39206937e+02
   4.72840326e+02  3.51821156e+02 -2.25130477e+02  2.65186456e+02
  -9.29471800e+01]
 [-1.90936391e+03  7.28744322e+03 -2.12980579e+03 -1.51343333e+03
  -1.

Unnamed: 0_level_0,Alpha,SE(Alpha),t(Alpha),Beta_MKT,SE(Beta_MKT),t(Beta_MKT),R2
Portfolio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ME1_BM1,-0.000281,0.002604,-0.108016,1.353653,0.058313,23.213497,0.592899
ME1_BM2,0.00358,0.002343,1.527781,1.204871,0.05248,22.958806,0.587563
ME1_BM3,0.004965,0.002178,2.280093,1.125924,0.048766,23.08824,0.590285
ME1_BM4,0.006913,0.002138,3.232875,1.057908,0.047884,22.092966,0.568814
ME1_BM5,0.009458,0.002371,3.98999,1.037801,0.053085,19.549714,0.508104
ME2_BM1,-0.002928,0.001772,-1.651814,1.441553,0.039692,36.318898,0.780944
ME2_BM2,0.001428,0.001497,0.953946,1.243933,0.033517,37.113574,0.788259
ME2_BM3,0.004109,0.001418,2.897499,1.123225,0.031756,35.370169,0.771753
ME2_BM4,0.005088,0.001349,3.770629,1.035545,0.030217,34.27012,0.760431
ME2_BM5,0.00584,0.00169,3.454665,1.1352,0.037855,29.987899,0.708495


Unnamed: 0,Average R2,GRS (CAPM),p-value,T,N
0,0.76888,2.839827,1.2e-05,372,25


## Time-Series FF3F

We estimate:

$$
r_{i,t} - r_{f,t} = \alpha_i + b_{MKT} RMRF_t + b_{SMB} SMB_t + b_{HML} HML_t + \varepsilon_{i,t}
$$

Reported:

- $\alpha_i$, $t(\alpha_i)$, and $R_i^2$  
- Three factor loadings ($b_{MKT}$, $b_{SMB}$, $b_{HML}$) with standard errors and *t*-values  
- Average $R^2$ and GRS for joint $\alpha=0$

In [51]:
ff3_rows, alphas_ff3, resids_ff3 = [], [], []
X_ff3 = sample[["RMRF","SMB","HML"]]

T_ff3 = sample.shape[0]
N_ff3 = len(port_cols)
L_ff3 = FF3_L

for p in port_cols:
    y = excess[p]
    m = ols_with_const(y, X_ff3)

    a     = m.params.get("const", np.nan)
    b_mkt = m.params.get("RMRF", np.nan)
    b_smb = m.params.get("SMB",  np.nan)
    b_hml = m.params.get("HML",  np.nan)

    se_a  = m.bse.get("const", np.nan)
    se_m  = m.bse.get("RMRF",  np.nan)
    se_s  = m.bse.get("SMB",   np.nan)
    se_h  = m.bse.get("HML",   np.nan)

    t_a   = m.tvalues.get("const", np.nan)
    t_m   = m.tvalues.get("RMRF",  np.nan)
    t_s   = m.tvalues.get("SMB",   np.nan)
    t_h   = m.tvalues.get("HML",   np.nan)

    r2    = m.rsquared

    ff3_rows.append({
        "Portfolio": p,
        "Alpha": a, "SE(Alpha)": se_a, "t(Alpha)": t_a,
        "Beta_MKT": b_mkt, "SE(Beta_MKT)": se_m, "t(Beta_MKT)": t_m,
        "Beta_SMB": b_smb, "SE(Beta_SMB)": se_s, "t(Beta_SMB)": t_s,
        "Beta_HML": b_hml, "SE(Beta_HML)": se_h, "t(Beta_HML)": t_h,
        "R2": r2
    })
    alphas_ff3.append(a)
    resids_ff3.append(m.resid)

ff3_table = pd.DataFrame(ff3_rows).set_index("Portfolio")
avg_r2_ff3 = ff3_table["R2"].mean()

# residual covariance
resid_mat_ff3 = pd.DataFrame({p: resids_ff3[i] for i, p in enumerate(port_cols)}).dropna()
Sigma_e_ff3   = cov_div_T_minus_L(resid_mat_ff3, L_ff3)

# factor moments
f_means_ff3 = sample[["RMRF","SMB","HML"]].mean().values
f_cov_ff3   = cov_div_T_minus_L(sample[["RMRF","SMB","HML"]], L_ff3)

GRS_ff3, p_ff3 = grs_test_with_alphas(alphas_ff3, f_means_ff3, f_cov_ff3, Sigma_e_ff3, T_ff3, N_ff3, L_ff3)

ff3_summary = pd.DataFrame({
    "Average R2": [avg_r2_ff3],
    "GRS (FF3F)": [GRS_ff3],
    "p-value": [p_ff3],
    "T": [T_ff3],
    "N": [N_ff3]
})

display(ff3_table)
display(ff3_summary)

13.909565217391306 0.15277394735710959 1.0677987602468255
alphas: [[-2.82210975e-03]
 [-2.51108384e-04]
 [ 4.51914588e-04]
 [ 1.95345727e-03]
 [ 3.25272358e-03]
 [-2.51790130e-03]
 [-5.95532183e-04]
 [ 1.13192084e-03]
 [ 9.75059633e-04]
 [ 1.27772157e-04]
 [-1.08824272e-03]
 [ 4.35364678e-04]
 [-1.17273074e-04]
 [ 1.30817029e-03]
 [ 6.55628353e-04]
 [ 8.35903485e-04]
 [-1.08048837e-03]
 [-8.52491656e-05]
 [ 7.48199487e-04]
 [ 1.77684139e-04]
 [ 1.22133329e-03]
 [ 2.23911446e-04]
 [-4.97052959e-04]
 [-9.90247333e-04]
 [-8.82871505e-04]]
factor means: [[0.00442177]
 [0.00273118]
 [0.00474113]]
Sigma_f_inv: [[ 650.28151751 -334.16172002  358.70567587]
 [-334.16172002 1419.26708069  -53.81732062]
 [ 358.70567587  -53.81732062 1731.55821354]]
Sigma_e_inv: [[ 3.70365552e+03 -1.80690547e+03 -6.74663830e+02 -9.66481274e+02
  -4.29485119e+02 -1.31756839e+03  3.22459824e+02  3.52143738e+02
   6.08788957e+02 -2.57242036e+02  3.95662468e+02  5.99062849e+02
   2.90643046e+02  4.41437706e+02  3.0385

Unnamed: 0_level_0,Alpha,SE(Alpha),t(Alpha),Beta_MKT,SE(Beta_MKT),t(Beta_MKT),Beta_SMB,SE(Beta_SMB),t(Beta_SMB),Beta_HML,SE(Beta_HML),t(Beta_HML),R2
Portfolio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ME1_BM1,-0.002822,0.001423,-1.983509,0.981689,0.035245,27.853639,1.598866,0.052068,30.707106,-0.038221,0.057512,-0.664577,0.885733
ME1_BM2,-0.000251,0.00107,-0.234647,0.907305,0.026509,34.225912,1.500304,0.039163,38.308908,0.22138,0.043258,5.117678,0.919139
ME1_BM3,0.000452,0.000927,0.487613,0.886816,0.022958,38.62785,1.382003,0.033917,40.746886,0.378834,0.037463,10.112245,0.930228
ME1_BM4,0.001953,0.00084,2.325053,0.843302,0.020812,40.51908,1.352124,0.030747,43.975594,0.467263,0.033962,13.758483,0.937412
ME1_BM5,0.003253,0.001054,3.085493,0.853207,0.026114,32.672298,1.401615,0.038579,36.330594,0.673633,0.042613,15.808132,0.908537
ME2_BM1,-0.002518,0.000827,-3.046315,1.11027,0.020475,54.226596,1.095276,0.030248,36.209788,-0.408415,0.033411,-12.224118,0.955212
ME2_BM2,-0.000596,0.000692,-0.861146,1.036998,0.017131,60.533689,0.964492,0.025308,38.109767,0.064144,0.027954,2.294617,0.957498
ME2_BM3,0.001132,0.000651,1.73827,0.974573,0.016131,60.417617,0.880217,0.02383,36.93665,0.259477,0.026322,9.857791,0.95475
ME2_BM4,0.000975,0.000596,1.635471,0.968055,0.014769,65.547956,0.739239,0.021818,33.881512,0.504572,0.0241,20.937022,0.956028
ME2_BM5,0.000128,0.000646,0.197717,1.07992,0.016008,67.460008,0.893708,0.02365,37.789323,0.741518,0.026122,28.386297,0.959945


Unnamed: 0,Average R2,GRS (FF3F),p-value,T,N
0,0.929381,1.990093,0.003736,372,25


## Cross-Sectional Fama–MacBeth (FMB)

1. Estimate rolling time-series betas for each portfolio *i* using a window of *fmb_beta_window* months.

2. At each month *t* after the initial window, run a cross-sectional regression:

**CAPM pricing regression**
$$
r_{i,t}^{excess} = \lambda_{0,t} + \lambda_{MKT,t}\,\beta_{i,MKT} + \epsilon_{i,t}
$$

**FF3F pricing regression**
$$
r_{i,t}^{excess} = \lambda_{0,t} + \lambda_{MKT,t}\,\beta_{i,MKT} + \lambda_{SMB,t}\,\beta_{i,SMB} + \lambda_{HML,t}\,\beta_{i,HML} + \epsilon_{i,t}
$$

3. Average each $\lambda$ over time and report its mean, standard error, and *t-statistic*:

$$
t(\bar{\lambda}_k) = \frac{\bar{\lambda}_k}{s(\lambda_{k,t}) / \sqrt{T}}
$$

In [None]:
# def rolling_betas(excess_df: pd.DataFrame, X_df: pd.DataFrame, window: int):
#     betas = {}
#     T = excess_df.shape[0]
#     for p in excess_df.columns:
#         rows = []
#         for t in range(window, T + 1):
#             y = excess_df[p].iloc[t-window:t]
#             X = X_df.iloc[t-window:t]
#             m = ols_with_const(y, X)
#             params = m.params.drop("const", errors="ignore")
#             rows.append(params)
#         idx = excess_df.index[window-1:]
#         betas[p] = pd.DataFrame(rows, index=idx)
#         print(f"Computed rolling betas for {p}")
#     return betas

# def summarize_lambdas(lambda_df: pd.DataFrame) -> pd.DataFrame:
#     out = []
#     for c in lambda_df.columns:
#         s = lambda_df[c].dropna()
#         n = s.shape[0]
#         if n <= 1:
#             mean = se = tval = np.nan
#         else:
#             mean = s.mean()
#             sd   = s.std(ddof=1)
#             se   = sd / np.sqrt(n) if sd > 0 else np.nan
#             tval = mean / se if (se is not np.nan and se != 0) else np.nan
#         out.append({"Price": c, "Mean": mean, "StdErr": se, "t(Mean)": tval})
#     return pd.DataFrame(out).set_index("Price")

# # ---- CAPM FMB ----
# X_capm = sample[["RMRF"]]
# betas_capm = rolling_betas(excess, X_capm, fmb_beta_window)
# betas_capm

# dates_fmb = excess.index[fmb_beta_window-1:]
# lambda_rows_capm = []

# for dt in dates_fmb:
#     Yi = excess.loc[dt, port_cols].dropna()
#     bmat, names = [], []
#     for p in port_cols:
#         if dt in betas_capm[p].index:
#             bmat.append(betas_capm[p].loc[dt].values)  # one: B_MKT
#             names.append(p)
#     if len(names) == 0:
#         continue
#     Xcs = pd.DataFrame(bmat, index=names, columns=["B_MKT"])
#     Ycs = Yi.reindex(names)
#     mod = ols_with_const(Ycs, Xcs)
#     lambda_rows_capm.append({
#         "date": dt,
#         "lambda_0": mod.params.get("const", np.nan),
#         "lambda_MKT": mod.params.get("B_MKT", np.nan)
#     })

# lambda_capm = pd.DataFrame(lambda_rows_capm).set_index("date")
# lambda_capm_summary = summarize_lambdas(lambda_capm[["lambda_0","lambda_MKT"]])

# display(lambda_capm.head(5))
# display(lambda_capm_summary)

## Visuals (MPL defaults)

- **Alpha bars:** display $\alpha_i$ for CAPM and FF3F.  
- **R² scatter:** compare $R_i^2$(CAPM) vs $R_i^2$(FF3F).  
- **FMB λ plots:** plot $\bar{\lambda}_k$ with ±1 standard error bars.

In [None]:
# # 1) CAPM alpha bars
# plt.figure()
# capm_table["Alpha"].sort_values().plot(kind="barh")
# plt.title("CAPM Alphas by Portfolio (monthly)")
# plt.xlabel("Alpha")
# plt.tight_layout()
# plt.show()

# # 2) FF3F alpha bars
# plt.figure()
# ff3_table["Alpha"].sort_values().plot(kind="barh")
# plt.title("FF3F Alphas by Portfolio (monthly)")
# plt.xlabel("Alpha")
# plt.tight_layout()
# plt.show()

# # 3) R² scatter: CAPM vs FF3F
# plt.figure()
# x = capm_table["R2"]; y = ff3_table["R2"]
# plt.scatter(x, y)
# minv = float(min(x.min(), y.min()))
# maxv = float(max(x.max(), y.max()))
# plt.plot([minv, maxv], [minv, maxv])  # 45-degree line
# plt.title("R²: CAPM vs FF3F (per portfolio)")
# plt.xlabel("R² CAPM")
# plt.ylabel("R² FF3F")
# plt.tight_layout()
# plt.show()

# # 4) FMB CAPM lambdas (±1 s.e.)
# plt.figure()
# s = lambda_capm_summary.loc[["lambda_0","lambda_MKT"]]
# plt.errorbar(range(s.shape[0]), s["Mean"], yerr=s["StdErr"])
# plt.xticks(range(s.shape[0]), s.index, rotation=0)
# plt.title("FMB CAPM — Lambda Means with StdErr")
# plt.tight_layout()
# plt.show()

# # 5) FMB FF3F lambdas (±1 s.e.)
# plt.figure()
# s = lambda_ff3_summary.loc[["lambda_0","lambda_MKT","lambda_SMB","lambda_HML"]]
# plt.errorbar(range(s.shape[0]), s["Mean"], yerr=s["StdErr"])
# plt.xticks(range(s.shape[0]), s.index, rotation=0)
# plt.title("FMB FF3F — Lambda Means with StdErr")
# plt.tight_layout()
# plt.show()