### Homework 4
##### Charles Yan, xy2985

#### Q1 Performance analysis of hedge fund returns using linear regression

##### (a) Download and import data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import statsmodels.api as sm
from scipy import stats

In [None]:
# Monthly Fama-French five-factor research data (2x3 sorts), from 2000 onward.
import pandas_datareader.data as web

ff_tables = web.DataReader("F-F_Research_Data_5_Factors_2x3", "famafrench", start="2000")
# Entry 0 holds the monthly table; values are quoted in percent, so rescale to decimals.
ff = ff_tables[0] / 100
# Switch the index to timestamps so it can later be matched against the fund returns.
ff.index = ff.index.to_timestamp()
ff.head()

In [None]:
# QMNIX monthly returns from Yahoo Finance.
# auto_adjust=False is passed explicitly: newer yfinance releases adjust prices
# by default and then omit the "Adj Close" column this cell relies on.
qmnix = yf.download("QMNIX", start="2000-01-01", end="2026-02-01", auto_adjust=False)
adj_close = qmnix["Adj Close"]
if isinstance(adj_close, pd.DataFrame):
    # Some yfinance versions return (field, ticker) columns even for a single
    # ticker; collapse the one-column frame so the name set below lands on a Series.
    adj_close = adj_close.squeeze("columns")
qmnix_monthly = adj_close.resample("ME").last()  # last adjusted close of each month
qmnix_ret = qmnix_monthly.pct_change().dropna()  # simple month-over-month returns
qmnix_ret.name = "QMNIX"
# Re-stamp the month-end dates as month-start timestamps (same convention the
# alignment step applies to the factor index).
qmnix_ret.index = qmnix_ret.index.to_period("M").to_timestamp()
qmnix_ret.head()

In [None]:
# Put the factor index on month-start timestamps, then keep only the months
# that appear in both the fund series and the factor table.
ff.index = ff.index.to_period("M").to_timestamp()
data = pd.merge(qmnix_ret, ff, how="inner", left_index=True, right_index=True)
# Fund return in excess of the risk-free rate.
data["QMNIX_excess"] = data["QMNIX"].sub(data["RF"])
first_month, last_month = data.index[0], data.index[-1]
print(f"Date range: {first_month} to {last_month}, {len(data)} months")
data.head()

##### (b) Summary statistics and scatter plots

In [None]:
cols = ["QMNIX_excess", "Mkt-RF", "SMB", "HML", "RMW", "CMA"]

# Descriptive statistics for the fund's excess return and the five factors:
# the four basic moments/extremes first, then skewness and excess kurtosis
# appended as extra rows.
selected = data[cols]
summary = selected.agg(["mean", "std", "min", "max"])
summary.loc["skew"] = selected.skew()
summary.loc["kurtosis"] = selected.kurtosis()  # excess kurtosis
print(summary.round(4))

In [None]:
# Pairwise correlations between the fund's excess return and the factors.
print("Correlation Matrix:")
corr_matrix = data[cols].corr()
print(corr_matrix.round(3))

In [None]:
# One scatter panel per factor: factor return on x, fund excess return on y.
factors = ["Mkt-RF", "SMB", "HML", "RMW", "CMA"]
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for ax, factor in zip(axes, factors):
    ax.scatter(data[factor], data["QMNIX_excess"], alpha=0.5, s=10)
    ax.set_xlabel(factor)
    ax.set_ylabel("QMNIX excess")
    ax.set_title(f"QMNIX vs {factor}")
plt.tight_layout()
plt.show()

In [None]:
# Extra credit: augmented Dickey-Fuller test on every series in `cols`.
from statsmodels.tsa.stattools import adfuller

print("ADF Stationarity Tests:")
for c in cols:
    series = data[c].dropna()
    adf_result = adfuller(series)
    # The first two entries of the result are the test statistic and its p-value.
    adf_stat, p_val = adf_result[0], adf_result[1]
    if p_val < 0.05:
        label = "(Stationary)"
    else:
        label = "(Non-stationary)"
    print(f"  {c}: ADF={adf_stat:.3f}, p={p_val:.4f} {label}")

**Discussion:**

- All return series are stationary (as expected for financial returns).
- The correlation matrix reveals QMNIX's relationship with the Fama-French factors. As a market-neutral fund, QMNIX should have low correlation with Mkt-RF.
- Skewness and kurtosis indicate the degree of non-normality in the return distributions. Financial returns typically exhibit negative skewness and excess kurtosis (fat tails).

##### (c) Regression of QMNIX on Fama-French factors

In [None]:
# Regress the fund's excess return on the five factors plus an intercept.
factor_columns = ["Mkt-RF", "SMB", "HML", "RMW", "CMA"]
Y = data["QMNIX_excess"]
X = sm.add_constant(data[factor_columns])
ols_spec = sm.OLS(Y, X)
model = ols_spec.fit()
print(model.summary())

In [None]:
# Report the intercept (alpha) with its t-test, then the overall fit statistics.
alpha = model.params["const"]
alpha_t = model.tvalues["const"]
alpha_p = model.pvalues["const"]
print(f"Intercept (alpha): {alpha:.6f}")
print(f"t-stat: {alpha_t:.3f}, p-value: {alpha_p:.4f}")

significant_msg = "=> Intercept is statistically different from 0 at 5% level."
not_significant_msg = "=> Intercept is NOT statistically different from 0 at 5% level."
print(significant_msg if alpha_p < 0.05 else not_significant_msg)

print(f"\nF-statistic: {model.fvalue:.3f}, p-value: {model.f_pvalue:.4e}")
print(f"R-squared: {model.rsquared:.4f}")

##### (d) Diagnostic plots for regression assumptions

In [None]:
# Four-panel visual check of the OLS assumptions for the fitted `model`.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
residuals = model.resid
fitted = model.fittedvalues

# 1. Residuals vs Fitted (Linearity & Homoscedasticity)
axes[0, 0].scatter(fitted, residuals, alpha=0.5, s=10)
axes[0, 0].axhline(0, color="r", linestyle="--")
axes[0, 0].set_xlabel("Fitted values")
axes[0, 0].set_ylabel("Residuals")
axes[0, 0].set_title("Residuals vs Fitted (Linearity & Homoscedasticity)")

# 2. Q-Q plot (Normality)
# fit=True standardizes the residuals by their fitted location and scale, so the
# 45-degree reference line is on the same scale as the theoretical normal
# quantiles. Without it, raw residuals are compared against N(0, 1) quantiles
# and the 45-degree line is not a meaningful reference.
sm.qqplot(residuals, fit=True, line="45", ax=axes[0, 1])
axes[0, 1].set_title("Q-Q Plot (Normality)")

# 3. Scale-Location (Homoscedasticity)
axes[1, 0].scatter(fitted, np.sqrt(np.abs(residuals)), alpha=0.5, s=10)
axes[1, 0].set_xlabel("Fitted values")
axes[1, 0].set_ylabel("sqrt(|Residuals|)")
axes[1, 0].set_title("Scale-Location (Homoscedasticity)")

# 4. ACF of residuals (Independence)
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(residuals, lags=20, ax=axes[1, 1])
axes[1, 1].set_title("ACF of Residuals (Independence)")

plt.tight_layout()
plt.show()

In [None]:
# Formal tests backing up the diagnostic plots; only the p-values are reported.
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox

# Second element of the Breusch-Pagan result is the p-value used here.
bp_pval = het_breuschpagan(residuals, X)[1]
print(f"Breusch-Pagan (homoscedasticity): p={bp_pval:.4f}")

jb_stat, jb_pval = stats.jarque_bera(residuals)
print(f"Jarque-Bera (normality): p={jb_pval:.4f}")

lb = acorr_ljungbox(residuals, lags=[10], return_df=True)
lb_pval = lb["lb_pvalue"].values[0]
print(f"Ljung-Box lag 10 (no autocorrelation): p={lb_pval:.4f}")

##### (e) Financial / Economic interpretation

The regression coefficients (factor loadings) reveal QMNIX's exposure to systematic risk factors:

- **Mkt-RF (market beta)**: An equity market neutral fund should have near-zero market beta by design. A significant positive/negative loading would indicate imperfect hedging.
- **SMB**: Positive loading means the fund tilts toward small-cap stocks; negative toward large-cap.
- **HML**: Positive loading indicates a value tilt; negative indicates a growth tilt.
- **RMW**: Exposure to profitability factor (robust minus weak).
- **CMA**: Exposure to investment factor (conservative minus aggressive).
- **Intercept (alpha)**: Represents the fund's risk-adjusted excess return, i.e. the average return not explained by the factor exposures. A statistically significant positive alpha is evidence of manager skill (or of exposure to risk factors omitted from the model). If alpha is statistically indistinguishable from 0, the fund's average excess return is fully accounted for by its factor exposures.
- **R-squared**: Shows what fraction of the fund's return variability is explained by the five factors. A low R-squared for a market-neutral fund is expected since it hedges out market exposure.

---
#### Q2 Bias-variance decomposition

##### (a) Derive the bias-variance tradeoff for MSE

Let $\hat{\beta}^* = E[\hat{\beta}]$. Then:

$$\begin{aligned}
E[(\hat{\beta} - \beta)^2] &= E[(\hat{\beta} - \hat{\beta}^* + \hat{\beta}^* - \beta)^2]\\
&= E[(\hat{\beta} - \hat{\beta}^*)^2] + 2E[(\hat{\beta} - \hat{\beta}^*)(\hat{\beta}^* - \beta)] + (\hat{\beta}^* - \beta)^2\\
&= E[(\hat{\beta} - \hat{\beta}^*)^2] + 2(\hat{\beta}^* - \beta)\underbrace{E[\hat{\beta} - \hat{\beta}^*]}_{=0} + (\hat{\beta}^* - \beta)^2\\
&= \text{Var}(\hat{\beta}) + (\hat{\beta}^* - \beta)^2 \quad \blacksquare
\end{aligned}$$

##### (b) Monte Carlo simulation of shrinkage estimator

In [None]:
# Monte Carlo study of the scalar shrinkage estimator y / (1 + lambda):
# one shared sample of B noisy observations of beta_true is reused for every lambda.
np.random.seed(99)
beta_true = 3.0
sigma = 2.0
B = 10000
lambdas = np.arange(0, 5.1, 0.1)

eps = np.random.normal(0, sigma, B)
y = beta_true + eps


def _shrinkage_row(lam):
    """Return empirical bias, variance and MSE of the estimator for one lambda."""
    estimates = y / (1 + lam)
    bias = estimates.mean() - beta_true
    variance = estimates.var()
    mse = np.mean((estimates - beta_true) ** 2)
    # With the population variance (ddof=0) this sum matches mse up to rounding.
    bv_check = variance + bias ** 2
    return {"lambda": lam, "bias": bias, "variance": variance,
            "mse": mse, "bias2+var": bv_check}


results = [_shrinkage_row(lam) for lam in lambdas]

df_bv = pd.DataFrame(results)
report_columns = ["lambda", "bias", "variance", "mse", "bias2+var"]
print(df_bv[report_columns].to_string(index=False))

In [None]:
# Plot MSE together with its variance and squared-bias components against lambda.
plt.figure(figsize=(10, 6))
plt.plot(df_bv["lambda"], df_bv["mse"], label="MSE", linewidth=2)
plt.plot(df_bv["lambda"], df_bv["variance"], label="Variance", linestyle="--")
plt.plot(df_bv["lambda"], df_bv["bias"] ** 2, label="Bias$^2$", linestyle="--")
# Raw string: "\l" is not a valid Python escape sequence, so the backslash
# must be kept literally for the math-text renderer.
plt.xlabel(r"$\lambda$")
plt.ylabel("Error")
plt.title("Bias-Variance Tradeoff")
plt.legend()
plt.grid(True)
plt.show()

##### (c) Shrinkage estimator with orthogonal regressors

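With orthonormal regressor columns, $\boldsymbol{X}^\top\boldsymbol{X} = \boldsymbol{I}_p$, so $\hat{\boldsymbol{\beta}}_{OLS} = (\boldsymbol{X}^\top\boldsymbol{X})^{-1}\boldsymbol{X}^\top\boldsymbol{y} = \boldsymbol{X}^\top\boldsymbol{y}$.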

The shrinkage estimator is $\tilde{\beta}_j = \dfrac{1}{1+\lambda}\hat{\beta}_{OLS,j}$.

**Bias:**

$$E[\tilde{\beta}_j] = \frac{1}{1+\lambda}E[\hat{\beta}_{OLS,j}] = \frac{1}{1+\lambda}\beta_j$$

$$\text{Bias}_j = E[\tilde{\beta}_j] - \beta_j = -\frac{\lambda}{1+\lambda}\beta_j$$

**Variance:**

$$\text{Var}(\tilde{\beta}_j) = \frac{1}{(1+\lambda)^2}\text{Var}(\hat{\beta}_{OLS,j}) = \frac{\sigma^2}{(1+\lambda)^2}$$

since $\text{Var}(\hat{\boldsymbol{\beta}}_{OLS}) = \sigma^2(\boldsymbol{X}^\top\boldsymbol{X})^{-1} = \sigma^2\boldsymbol{I}_p$.

**Total MSE:**

$$\sum_{j=1}^{p}E[(\tilde{\beta}_j - \beta_j)^2] = \sum_{j=1}^{p}\left[\text{Var}(\tilde{\beta}_j) + \text{Bias}_j^2\right] = \sum_{j=1}^{p}\left[\frac{\sigma^2}{(1+\lambda)^2} + \frac{\lambda^2}{(1+\lambda)^2}\beta_j^2\right]$$

$$= p\sigma^2\left(\frac{1}{1+\lambda}\right)^2 + \left(\frac{\lambda}{1+\lambda}\right)^2\sum_{j=1}^{p}\beta_j^2 \quad \blacksquare$$

**Optimal $\lambda^*$:** Let $S = \sum \beta_j^2$. Define $f(\lambda) = \dfrac{p\sigma^2 + \lambda^2 S}{(1+\lambda)^2}$.

$$f'(\lambda) = \frac{2\lambda S(1+\lambda)^2 - 2(1+\lambda)(p\sigma^2 + \lambda^2 S)}{(1+\lambda)^4} = \frac{2(\lambda S - p\sigma^2)}{(1+\lambda)^3}$$

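Setting $f'(\lambda) = 0$ (note that $f'(\lambda) < 0$ for $\lambda S < p\sigma^2$ and $f'(\lambda) > 0$ for $\lambda S > p\sigma^2$, so the stationary point is the global minimum on $\lambda \ge 0$):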

$$\lambda S = p\sigma^2 \implies \lambda^* = \frac{p\sigma^2}{\sum_{j=1}^{p}\beta_j^2} \quad \blacksquare$$

##### (d) Simulation with orthogonal design matrix

In [None]:
# Simulation settings for the multi-coefficient shrinkage experiment.
np.random.seed(99)
n, p = 200, 4
beta = np.array([1.0, 2.0, 3.0, 0.5])
sigma2 = 2.0
B = 10000
lambdas = np.arange(0, 5.1, 0.1)

# Build a design matrix with orthonormal columns via Gram-Schmidt (no QR):
# each Gaussian column has its components along the already-finished columns
# removed one at a time, and is then scaled to unit length.
Z = np.random.normal(0, 1, (n, p))
X = np.zeros_like(Z)
for j, raw_column in enumerate(Z.T):
    v = raw_column.copy()
    for basis in X[:, :j].T:
        v = v - (basis @ v) * basis
    X[:, j] = v / np.linalg.norm(v)

gram_error = np.linalg.norm(X.T @ X - np.eye(p))
print(f"X'X ~ I_p check: ||X'X - I||_F = {gram_error:.2e}")

# Theoretical lambda*
lambda_star = p * sigma2 / np.sum(beta ** 2)
print(f"Theoretical lambda* = {lambda_star:.4f}")

In [None]:
# Monte Carlo estimate of the total MSE of the shrinkage estimator for each lambda.
# The B replications for a given lambda are simulated in one vectorized draw
# instead of a Python loop, and the loop-invariant pieces are computed once.
signal = X @ beta            # noiseless mean of y, identical in every replication
noise_sd = np.sqrt(sigma2)   # normal() takes a standard deviation, not a variance

mse_list = []
for lam in lambdas:
    # One row per replication.
    # NOTE(review): expected to consume the global RNG stream in the same order
    # as B separate draws of size n - confirm if bit-level reproducibility matters.
    eps = np.random.normal(0, noise_sd, (B, n))
    y = signal + eps
    beta_ols = y @ X  # row b equals X.T @ y_b, the OLS estimate since X'X = I
    beta_shrink = beta_ols / (1 + lam)
    # Squared error summed over coefficients, then averaged over replications.
    mse_list.append(np.mean(np.sum((beta_shrink - beta) ** 2, axis=1)))

df_mse = pd.DataFrame({"lambda": lambdas, "mse": mse_list})
min_idx = df_mse["mse"].idxmin()
print(f"Empirical minimizing lambda = {df_mse.loc[min_idx, 'lambda']:.2f}")
print(f"Theoretical lambda* = {lambda_star:.4f}")
print(df_mse.to_string(index=False))

In [None]:
# Plot the empirical MSE curve and mark the theoretical and empirical minimizers.
# Strings containing "\lambda" are raw (r / rf) because "\l" is not a valid
# Python escape sequence; the backslash must reach the math-text renderer as is.
plt.figure(figsize=(10, 6))
plt.plot(df_mse["lambda"], df_mse["mse"], label="Empirical MSE", linewidth=2)
plt.axvline(lambda_star, color="r", linestyle="--", label=rf"$\lambda^*$ = {lambda_star:.3f}")
plt.axvline(df_mse.loc[min_idx, "lambda"], color="g", linestyle=":",
            label=rf"Empirical min $\lambda$ = {df_mse.loc[min_idx, 'lambda']:.2f}")
plt.xlabel(r"$\lambda$")
plt.ylabel("MSE")
plt.title(r"MSE vs $\lambda$ (Shrinkage Estimator)")
plt.legend()
plt.grid(True)
plt.show()