In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [2]:
data_dir = Path.cwd().parent / "data"
df_factors = pd.read_csv(data_dir / "factors.csv")

In [3]:
factor_cols = ["ret_geo", "vol_36m", "value", "investment", "profitability"]

### Factor returns

In [None]:
factor_cols = {
    "V": "value",  # value
    "W": "ret_geo",  # momentum signal
    "C": "investment",  # investment
    "R": "profitability",  # profitability
    "L": "vol_36m",  # low volatility
}

ret_col = "ret_geo"  # return used for portfolio performance
w_col = "market_cap"  # value weights


def value_weighted_return(g):
    """Value weighted return of ret_col using w_col within a group."""
    if g.empty:
        return np.nan
    return np.average(g[ret_col], weights=g[w_col])


def assign_terciles(x):
    """Assign 0 (bottom), 1 (middle), 2 (top) tercile based on ranks within a date."""
    n = len(x)
    if n == 0:
        return pd.Series(np.nan, index=x.index)

    ranks = x.rank(method="first")
    t1 = n / 3.0
    t2 = 2.0 * t1

    out = pd.Series(index=x.index, dtype="int8")
    out[ranks <= t1] = 0
    out[(ranks > t1) & (ranks <= t2)] = 1
    out[ranks > t2] = 2

    return out

  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")


In [None]:
def winsorize(df, cols, lower=0.005, upper=0.995):
    df = df.copy()
    for col in cols:
        # Apply separately for each year (cross-sectional clean)
        df[col] = df.groupby(by="date")[col].transform(
            lambda x: x.clip(lower=x.quantile(lower), upper=x.quantile(upper))
        )
    return df


df_factors = winsorize(df_factors[df_factors], factor_cols).sort_values(
    ["PERMNO", "date"]
)

In [None]:
# Build factor portfolio return series for each factor
factor_ret_list = []

for short_name, score_col in factor_cols.items():
    df = df_factors[["date", ret_col, w_col, score_col]].copy()
    df = df.loc[:, ~df.columns.duplicated()]

    df["tercile"] = df.groupby("date")[score_col].transform(assign_terciles)

    # Top and bottom terciles
    top = df[df["tercile"] == 2]
    bottom = df[df["tercile"] == 0]

    # Value weighted returns by date
    top_ret = (
        top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
    )

    bottom_ret = (
        bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
    )

    # Factor return = top minus bottom
    fr = pd.concat([top_ret, bottom_ret], axis=1)
    fr[short_name] = fr[short_name + "_top"] - fr[short_name + "_bot"]

    # Keep only the factor return series
    factor_ret_list.append(fr[[short_name]])

# 2. Combine all factor return series into one dataframe
factor_returns = pd.concat(factor_ret_list, axis=1)
factor_returns = factor_returns.sort_index()

In [5]:
X = factor_returns[["V", "W", "C", "R", "L"]].copy()

vif_table = pd.DataFrame(
    {
        "factor": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)

print(vif_table)

  factor       VIF
0      V  3.941159
1      W  2.323704
2      C  3.467262
3      R  2.071466
4      L  2.774044


In [6]:
factors_cols = ["V", "C", "W", "R", "L"]
df = factor_returns[factors_cols]

results = pd.DataFrame(index=factors_cols, columns=factors_cols)

for dep in factors_cols:
    y = df[dep]
    for ind in factors_cols:
        if ind == dep:
            results.loc[ind, dep] = "-"  # Diagonal
            continue

        X = sm.add_constant(df[[ind]])
        model = sm.OLS(y, X).fit()
        R2 = model.rsquared
        VIF = 1 / (1 - R2)

        results.loc[ind, dep] = round(VIF, 2)

print(results)

      V     C     W     R     L
V     -  1.89  1.12  1.24  1.13
C  1.89     -   1.0  1.03  1.71
W  1.12   1.0     -  1.02   1.0
R  1.24  1.03  1.02     -  1.14
L  1.13  1.71   1.0  1.14     -
