In [8]:
from pathlib import Path

import numpy as np
import pandas as pd

In [9]:
# Input files are read from the "data" folder that sits beside the current
# working directory.
data_dir = Path.cwd().parent / "data"
factors_path = data_dir / "factors.csv"
df_factors = pd.read_csv(factors_path)

In [10]:
# Map the raw factor column names onto the single-letter labels used by the
# rest of the notebook.
factor_rename = dict(
    ret_geo="W",
    vol_36m="L",
    value="V",
    investment="C",
    profitability="R",
)
df_factors = df_factors.rename(columns=factor_rename)

### Helpers

In [26]:
def _yearly_z_score(df, cols, date_col="date"):
    years = df[date_col]

    for c in cols:
        if c not in df.columns:
            raise KeyError(f"{c} not in DataFrame")

        def _z(s):
            std = s.std(ddof=0)
            if std == 0 or np.isnan(std):
                return pd.Series(0.0, index=s.index)
            return (s - s.mean()) / std

        df["z_" + c] = df.groupby(years)[c].transform(_z)
    return df


def _yearly_rank_score(df, cols, date_col="date"):
    years = df[date_col]

    for c in cols:
        if c not in df.columns:
            raise KeyError(f"{c} not in DataFrame")

        def _rank01(s):
            n = s.notna().sum()
            if n <= 1:
                return pd.Series(0.0, index=s.index)
            r = s.rank(method="average")
            return (r - 1) / (n - 1)

        df["rank_" + c] = df.groupby(years)[c].transform(_rank01)
    return df


# Function to create z- or rank-scored factors
def yearly_score(df, cols, method, date_col="date"):
    """Return a copy of ``df`` with per-date scores appended for ``cols``.

    ``method="z"`` delegates to ``_yearly_z_score`` and ``method="rank"`` to
    ``_yearly_rank_score``; the caller's frame is never modified because the
    helpers operate on a copy.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``df`` (checked first).
    ValueError
        If ``method`` is neither ``"z"`` nor ``"rank"``.
    """
    if date_col not in df.columns:
        raise KeyError(f"{date_col} not in DataFrame")

    if method not in ("z", "rank"):
        raise ValueError("method must be 'z' or 'rank'")

    scored = df.copy()
    if method == "z":
        return _yearly_z_score(scored, cols, date_col=date_col)
    return _yearly_rank_score(scored, cols, date_col=date_col)


# Function to create an average factor for the integrated approach
def append_avg_score(df, cols, method):
    """Return a copy of ``df`` with the row-wise mean of the scored ``cols``.

    The scored columns are looked up as ``z_<col>`` or ``rank_<col>``
    depending on ``method`` and their mean is stored in ``z_int`` or
    ``rank_int`` respectively. The mean uses pandas' default NaN handling,
    so a row is averaged over whichever of its scores are present.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"z"`` nor ``"rank"``.
    KeyError
        If one of the expected score columns is missing from ``df``.
    """
    if method not in ("z", "rank"):
        raise ValueError("method must be 'z' or 'rank'")
    prefix = method + "_"

    score_cols = [prefix + name for name in cols]
    missing = [name for name in score_cols if name not in df.columns]
    if missing:
        # Report the first missing column, in the order given by ``cols``.
        raise KeyError(f"{missing[0]} not in DataFrame")

    result = df.copy()
    result[prefix + "int"] = result[score_cols].mean(axis=1)
    return result


# Function to create TER and DEC portfolio weights for each factor provided
def percentile_portfolio_weights(
    df, cols, method, p, mkt_cap_col="market_cap", date_col="date"
):
    """Market-cap weights of the top-``p`` fraction of rows per date, by score.

    For every name in ``cols`` the scored column ``z_<col>`` / ``rank_<col>``
    (chosen by ``method``) is compared, within each date, against its
    ``1 - p`` quantile. Rows whose score is at or above that threshold (ties
    included) are weighted by market cap relative to the total market cap of
    the selected rows on that date; every other row gets weight 0. Rows with a
    missing score are never selected. If a weight cannot be computed (for
    example the selected market cap on a date totals zero) it is set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored frame. It is only read, never modified.
    cols : iterable of str
        Factor names (without the score prefix).
    method : {"z", "rank"}
        Selects the score prefix.
    p : float
        Fraction of rows to keep per date, between 0 and 1.
    mkt_cap_col, date_col : str
        Names of the market-cap and date columns.

    Returns
    -------
    pandas.DataFrame
        ``date_col`` plus one ``weight_<prefix><col>`` column per factor,
        indexed like ``df``.

    Raises
    ------
    ValueError
        If ``method`` is invalid or ``p`` lies outside ``[0, 1]``.
    KeyError
        If the market-cap, date or any score column is missing.
    """
    if method == "z":
        prefix = "z_"
    elif method == "rank":
        prefix = "rank_"
    else:
        raise ValueError("method must be 'z' or 'rank'")

    if mkt_cap_col not in df.columns:
        raise KeyError(f"{mkt_cap_col} not in DataFrame")

    if date_col not in df.columns:
        raise KeyError(f"{date_col} not in DataFrame")

    # Validate every requested score column before doing any work.
    score_cols = [prefix + c for c in cols]
    for score_col in score_cols:
        if score_col not in df.columns:
            raise KeyError(f"{score_col} not in DataFrame")

    # An out-of-range p would otherwise surface as a pandas quantile error
    # from inside the groupby; report it in terms of the argument instead.
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1")

    # The input is only read below, so no defensive copy of the full frame is
    # needed; the returned frame is built from a fresh copy of the date column.
    years = df[date_col]
    mkt_cap = df[mkt_cap_col]
    df_weights = df[[date_col]].copy()

    for score_col in score_cols:
        scores = df[score_col]

        # Quantile threshold per date, broadcast back to every row
        thresholds = scores.groupby(years).transform(lambda x: x.quantile(1 - p))

        # Rows in the top percentile (missing scores never qualify)
        mask = (scores >= thresholds) & scores.notna()

        # Market cap of selected rows, zero elsewhere, and its per-date total
        masked_mkt_cap = mkt_cap.where(mask, 0.0)
        yearly_total_cap = masked_mkt_cap.groupby(years).transform("sum")

        # Cap weights; undefined ratios (0/0, missing cap) become 0
        weights = masked_mkt_cap / yearly_total_cap
        df_weights[f"weight_{score_col}"] = weights.fillna(0.0)

    return df_weights


# Function to create the BW weights for each factor provided
def bw_portfolio_weights(
    df,
    cols,
    method,
    n_subportfolios,
    high_multiplier,
    increment,
    multiple_power=1,
    mkt_cap_col="market_cap",
    date_col="date",
):
    """Bucketed, multiplier-tilted market-cap weights per date.

    For every name in ``cols`` the scored column ``z_<col>`` / ``rank_<col>``
    (chosen by ``method``) is split, within each date, into up to
    ``n_subportfolios`` quantile buckets. Each bucket receives an equal share
    ``1 / n_buckets`` of the date's weight, distributed inside the bucket by
    market cap. The share is then scaled by
    ``(high_multiplier - k * increment) ** multiple_power``, where ``k`` is
    the number of buckets between the row's bucket and the highest-score
    bucket on that date.

    The result is not renormalised after the multipliers are applied. Rows
    whose bucket or weight cannot be computed (missing score, missing market
    cap, or a date on which the quantile split fails) get weight 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored frame. It is only read, never modified.
    cols : iterable of str
        Factor names (without the score prefix).
    method : {"z", "rank"}
        Selects the score prefix.
    n_subportfolios : int
        Requested number of quantile buckets per date; fewer are produced
        when duplicate bin edges are dropped.
    high_multiplier : float
        Multiplier applied to the highest-score bucket.
    increment : float
        Amount the multiplier decreases per bucket below the highest one.
    multiple_power : float, default 1
        Exponent applied to the multiplier.
    mkt_cap_col, date_col : str
        Names of the market-cap and date columns.

    Returns
    -------
    pandas.DataFrame
        ``date_col`` plus one ``weight_<prefix><col>`` column per factor,
        indexed like ``df``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"z"`` nor ``"rank"``.
    KeyError
        If the market-cap, date or any score column is missing.
    """
    if method == "z":
        prefix = "z_"
    elif method == "rank":
        prefix = "rank_"
    else:
        raise ValueError("method must be 'z' or 'rank'")

    if mkt_cap_col not in df.columns:
        raise KeyError(f"{mkt_cap_col} not in DataFrame")

    if date_col not in df.columns:
        raise KeyError(f"{date_col} not in DataFrame")

    # Validate every requested score column before doing any work.
    score_cols = [prefix + c for c in cols]
    for score_col in score_cols:
        if score_col not in df.columns:
            raise KeyError(f"{score_col} not in DataFrame")

    # Buckets are kept in a local Series, so the input frame is never written
    # to and no copy / temporary column / in-place drop is required.
    years = df[date_col]
    mkt_cap = df[mkt_cap_col]
    df_weights = df[[date_col]].copy()

    # Helper to calculate buckets safely: a date on which qcut cannot form
    # bins is deliberately mapped to "no bucket" instead of aborting the run.
    def _get_buckets(x):
        try:
            return pd.qcut(x, n_subportfolios, labels=False, duplicates="drop")
        except ValueError:
            return pd.Series(np.nan, index=x.index)

    for score_col in score_cols:
        # Bucket index per row, computed separately for each date
        buckets = df[score_col].groupby(years).transform(_get_buckets)

        # Total market cap of the row's bucket on the row's date
        bucket_caps = mkt_cap.groupby([years, buckets]).transform("sum")

        # Number of buckets actually realised on each date
        max_bucket = buckets.groupby(years).transform("max")
        n_buckets = max_bucket + 1

        # Base weights: equal share per bucket, cap-weighted inside the bucket
        base_weights = (mkt_cap / bucket_caps) * (1.0 / n_buckets)

        # high_multiplier for the top bucket, decreasing by increment per step
        multipliers = high_multiplier - (max_bucket - buckets) * increment

        # Final weights; anything undefined becomes 0
        df_weights[f"weight_{score_col}"] = (
            base_weights * (multipliers**multiple_power)
        ).fillna(0.0)

    return df_weights


# Function to create the mixed portfolio weights
def factor_adjusted_weights(df, cols, factor_weights, method):
    """Linear combination of per-factor weight columns.

    For each name in ``cols`` the column ``weight_z_<col>`` or
    ``weight_rank_<col>`` (chosen by ``method``) is multiplied by the
    matching entry of ``factor_weights`` and the products are added up.

    Returns
    -------
    pandas.Series or int
        The combined weights; the integer ``0`` when ``cols`` is empty.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"z"`` nor ``"rank"``, or if ``cols`` and
        ``factor_weights`` differ in length.
    KeyError
        If one of the expected weight columns is missing from ``df``.
    """
    if method not in ("z", "rank"):
        raise ValueError("method must be 'z' or 'rank'")
    prefix = method + "_"

    if len(cols) != len(factor_weights):
        raise ValueError("cols and factor_weights must have the same length")

    weight_cols = [f"weight_{prefix}{name}" for name in cols]
    absent = [name for name in weight_cols if name not in df.columns]
    if absent:
        # Report the first missing column, in the order given by ``cols``.
        raise KeyError(f"{absent[0]} not in DataFrame")

    # sum() starts from 0, matching an explicit accumulator initialised to 0.
    return sum(df[name] * w for name, w in zip(weight_cols, factor_weights))


# TODO: Create the TE portfolio weights

In [27]:
# Factor combinations to build strategies for
factor_combs = [["V", "W"]]

# Each output-column suffix paired with the top fraction of rows it selects;
# the two lists below are derived from it so they cannot drift apart.
percentile_specs = {"_ter": 1 / 3, "_dec": 1 / 10}
percentile_suffixes = list(percentile_specs)
p_list = list(percentile_specs.values())

# Scoring method used by each weighting scheme, and the name of the
# integrated (averaged) factor
method_percentile, method_bw = "rank", "z"
int_factor_name = "int"

# BW settings: number of buckets, multiplier of the top bucket, and the
# amount by which the multiplier drops per bucket
n_subportfolios = 20
high_multiplier = 1.95
increment = 0.1

In [24]:
# Identifier columns copied from df_factors into every strategy weights frame
ID_COLS = ["PERMNO", "date", "tic", "conm"]

In [34]:
# Make sure the output folder exists before any strategy file is written;
# to_csv does not create missing directories.
strategies_dir = data_dir / "strategies"
strategies_dir.mkdir(parents=True, exist_ok=True)

for factor_comb in factor_combs:
    # Label shared by the output columns and the file name, e.g. "V_W"
    comb_label = "_".join(factor_comb)

    # Equal weight on each single-factor portfolio in the mixed approach
    factor_weights = [1 / len(factor_comb)] * len(factor_comb)

    # Start with the ID columns; each weight block is collected here and all
    # of them are concatenated once at the end.
    weight_frames = [df_factors[ID_COLS].copy()]

    ### TER & DEC ###
    # Compute percentile weight scores
    df_score = yearly_score(
        df_factors,
        factor_comb,
        method_percentile,
    )

    # Compute integrated score
    df_score = append_avg_score(df_score, factor_comb, method_percentile)

    # One block of columns per (p, suffix) pair
    for p, suffix in zip(p_list, percentile_suffixes):
        # Compute integrated percentile weights
        df_w = percentile_portfolio_weights(
            df_score,
            [int_factor_name],
            method_percentile,
            p,
        )
        # The helper returns exactly [date, weight]; give the weight its
        # strategy name.
        df_w.columns = ["date", comb_label + "_int"]

        # Compute temporary factor percentile weights for mixed calculation
        temp = percentile_portfolio_weights(
            df_score,
            factor_comb,
            method_percentile,
            p,
        )

        # Compute mixed percentile weights
        df_w[comb_label + "_mix"] = factor_adjusted_weights(
            temp, factor_comb, factor_weights, method_percentile
        )

        # Rename columns with suffix and drop date (already in the ID columns)
        df_w = df_w.drop(columns=["date"]).add_suffix(suffix)
        weight_frames.append(df_w)

    ### BW ###

    # Compute BW weight scores
    df_score = yearly_score(
        df_factors,
        factor_comb,
        method_bw,
    )

    # Compute integrated score
    df_score = append_avg_score(df_score, factor_comb, method_bw)

    # Compute integrated BW weights
    df_w = bw_portfolio_weights(
        df_score,
        [int_factor_name],
        method_bw,
        n_subportfolios,
        high_multiplier,
        increment,
    )
    df_w.columns = ["date", comb_label + "_int"]

    # Compute temporary factor BW weights for mixed calculation
    temp = bw_portfolio_weights(
        df_score,
        factor_comb,
        method_bw,
        n_subportfolios,
        high_multiplier,
        increment,
    )

    # Compute mixed BW weights
    df_w[comb_label + "_mix"] = factor_adjusted_weights(
        temp, factor_comb, factor_weights, method_bw
    )

    # Rename columns with suffix and drop date
    df_w = df_w.drop(columns=["date"]).add_suffix("_bw")
    weight_frames.append(df_w)

    # Assemble ID columns and all weight blocks side by side, then persist.
    # The row index is written as the first (unnamed) CSV column.
    df_weights = pd.concat(weight_frames, axis=1)
    df_weights.to_csv(strategies_dir / f"{comb_label}.csv")

In [35]:
# Display the weights built for the last factor combination processed above
df_weights

Unnamed: 0,PERMNO,date,tic,conm,V_W_int_ter,V_W_mix_ter,V_W_int_dec,V_W_mix_dec,V_W_int_bw,V_W_mix_bw
0,10006,1963,4165A,ACF INDUSTRIES INC,0.000000,0.011439,0.000000,0.060776,0.014937,0.010634
1,10006,1964,4165A,ACF INDUSTRIES INC,0.000000,0.001044,0.000000,0.000000,0.000898,0.000863
2,10006,1965,4165A,ACF INDUSTRIES INC,0.000000,0.000000,0.000000,0.000000,0.000731,0.000871
3,10006,1966,4165A,ACF INDUSTRIES INC,0.000000,0.000000,0.000000,0.000000,0.000204,0.000185
4,10006,1967,4165A,ACF INDUSTRIES INC,0.000000,0.000000,0.000000,0.000000,0.000655,0.000531
...,...,...,...,...,...,...,...,...,...,...
41928,93436,2020,TSLA,TESLA INC,0.019233,0.020667,0.094106,0.099587,0.030863,0.020898
41929,93436,2021,TSLA,TESLA INC,0.037985,0.047103,0.076266,0.051771,0.025850,0.013367
41930,93436,2022,TSLA,TESLA INC,0.000000,0.026820,0.000000,0.096342,0.001672,0.016596
41931,93436,2023,TSLA,TESLA INC,0.046866,0.049915,0.083047,0.159878,0.029480,0.032323


In [36]:
# Verify that weights sum to 1 per year
# The strategy columns are everything except the identifier columns. They are
# named like "V_W_int_ter", so filtering on the substring "weight_" would
# select nothing and make the check below pass vacuously.
weight_cols = [c for c in df_weights.columns if c not in ID_COLS]
assert weight_cols, "No weight columns found in df_weights; nothing to verify."

yearly_sums = df_weights.groupby("date")[weight_cols].sum()

# Check if all sums are approximately 1
if np.allclose(yearly_sums, 1.0):
    print("Success: All portfolio weights sum to 1.0 for each year.")
else:
    print("Failure: Some portfolio weights do not sum to 1.0.")
    # Show first few failures (years where at least one strategy is off)
    failures = yearly_sums[~np.isclose(yearly_sums, 1.0).all(axis=1)]
    print(failures.head())

Success: All portfolio weights sum to 1.0 for each year.
