In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


In [32]:
data_dir = Path.cwd().parent / "data"
df = pd.read_csv(data_dir / "factors.csv", parse_dates=["date"])

In [37]:
factor_cols = ["ret_geo", "vol_36m", "value", "investment", "profitability"]

factor = df.replace([np.inf, -np.inf], np.nan)

### EDA

In [38]:
def winsorize_cross_section(df, cols, lower=0.01, upper=0.99):
    df = df.copy()
    for col in cols:
        # Apply separately for each year (cross-sectional clean)
        df[col] = df.groupby(df["date"].dt.year)[col].transform(
            lambda x: x.clip(lower=x.quantile(lower), upper=x.quantile(upper))
        )
    return df


factor_winsorized = winsorize_cross_section(factor, factor_cols)


### Factor returns

In [39]:
factor_df = factor_winsorized

# 0. Basic prep
factor_df = factor_df.sort_values(["date", "PERMNO"])

# Map short factor names (paper) to your signal columns
factor_cols = {
    "V": "value",  # value
    "W": "ret_geo",  # momentum signal
    "C": "investment",  # investment
    "R": "profitability",  # profitability
    "L": "vol_36m",  # low volatility
}

ret_col = "ret_arith"  # return used for portfolio performance
w_col = "market_cap"  # value weights


def value_weighted_return(group):
    """Value weighted return of ret_col using w_col within a group."""
    g = group.dropna(subset=[ret_col, w_col])
    if g.empty:
        return np.nan
    w = g[w_col]
    r = g[ret_col]
    w_sum = w.sum()
    if w_sum <= 0:
        return np.nan
    return (w * r).sum() / w_sum


def assign_terciles(x):
    """Assign 0 (bottom), 1 (middle), 2 (top) tercile based on ranks within a date."""
    n = x.notna().sum()
    if n < 3:
        return pd.Series(index=x.index, data=np.nan)
    ranks = x.rank(method="first")
    t1 = n / 3.0
    t2 = 2.0 * n / 3.0
    out = pd.Series(index=x.index, dtype="float")
    out[ranks <= t1] = 0
    out[(ranks > t1) & (ranks <= t2)] = 1
    out[ranks > t2] = 2
    return out


# 1. Build factor portfolio return series for each factor
factor_ret_list = []

for short_name, score_col in factor_cols.items():
    df = factor_df[["date", ret_col, w_col, score_col]].copy()
    df = df[~df[score_col].isna()]

    # Assign terciles per date on the factor signal
    df["tercile"] = df.groupby("date")[score_col].transform(assign_terciles)

    # Top and bottom terciles
    top = df[df["tercile"] == 2]
    bottom = df[df["tercile"] == 0]

    # Value weighted returns by date
    top_ret = (
        top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
    )

    bottom_ret = (
        bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
    )

    # Factor return = top minus bottom
    fr = pd.concat([top_ret, bottom_ret], axis=1)
    fr[short_name] = fr[short_name + "_top"] - fr[short_name + "_bot"]

    # Keep only the factor return series
    factor_ret_list.append(fr[[short_name]])

# 2. Combine all factor return series into one dataframe
factor_returns = pd.concat(factor_ret_list, axis=1)
factor_returns = factor_returns.sort_index().dropna(how="any")

# factor_returns now has columns ["V", "W", "C", "R", "L"]
print(factor_returns.head())


  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")
  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
  top.groupby("date").apply(value_weighted_return).rename(short_name + "_top")


                   V         W         C         R         L
date                                                        
1963-01-31  0.018702  0.027356  0.011811  0.025640  0.025640
1963-02-28 -0.002044  0.009215  0.001875 -0.002044 -0.005891
1963-03-29 -0.011790  0.038703 -0.011831 -0.006966  0.009045
1963-04-30 -0.012700  0.035412  0.007204 -0.007406  0.000926
1963-05-31 -0.002727  0.013192  0.000530 -0.017610  0.008107


  bottom.groupby("date").apply(value_weighted_return).rename(short_name + "_bot")
