In [13]:
from pathlib import Path

import numpy as np
import pandas as pd

In [11]:
# Input files live in the "data" folder next to the current working directory.
data_dir = Path.cwd().parent.joinpath("data")
# Load the raw factor table.
df_factors = pd.read_csv(data_dir.joinpath("factors.csv"))

In [12]:
# Map the raw factor column names to the single-letter codes used as column
# names from here on.
factor_rename = dict(
    ret_geo="W",
    vol_36m="L",
    value="V",
    investment="C",
    profitability="R",
)
df_factors = df_factors.rename(columns=factor_rename)

### Helpers

In [None]:
def _yearly_z_score(df, cols, date_col="date"):
    """Add a per-period z-score column ``z_<col>`` for every column in *cols*.

    Rows are grouped by their value in *date_col* (every distinct value forms
    one group; the "yearly" in the name presumably reflects one date per year
    -- confirm against the input data).  Within each group the values are
    standardised with the population standard deviation (``ddof=0``).

    A group whose standard deviation is zero or undefined cannot be
    standardised; its observed values get a score of 0.0.  Missing inputs
    always stay NaN so that they are never mistaken for a valid score.

    Args:
        df: DataFrame to score.  It is modified in place.
        cols: Names of the columns to standardise.
        date_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object, with the ``z_<col>`` columns added.

    Raises:
        KeyError: If *date_col* or any entry of *cols* is not a column of
            ``df``.  Raised before ``df`` is modified.
    """
    cols = list(cols)

    # Validate everything up front so a bad name cannot leave df half-updated.
    for c in cols:
        if c not in df.columns:
            raise KeyError(f"{c} not in DataFrame")

    years = df[date_col]

    def _z(s):
        std = s.std(ddof=0)
        if std == 0 or np.isnan(std):
            # Degenerate group: neutral score for observed values only.
            return pd.Series(0.0, index=s.index).where(s.notna())
        return (s - s.mean()) / std

    # Compute every score before assigning, so a failure mid-way does not
    # leave only some of the new columns behind.
    scored = {c: df.groupby(years)[c].transform(_z) for c in cols}
    for c, values in scored.items():
        df["z_" + c] = values
    return df


def _yearly_rank_score(df, cols, date_col="date"):
    """Add a per-period rank column ``rank_<col>`` scaled to the range [0, 1].

    Rows are grouped by their value in *date_col* (every distinct value forms
    one group).  Within each group the non-missing values are ranked with
    average ranks for ties and mapped linearly so that the smallest value
    scores 0 and the largest scores 1.

    A group with at most one observed value cannot be scaled; its observed
    value (if any) scores 0.0.  Missing inputs always stay NaN so that they
    are never mistaken for a valid score.

    Args:
        df: DataFrame to score.  It is modified in place.
        cols: Names of the columns to rank.
        date_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object, with the ``rank_<col>`` columns added.

    Raises:
        KeyError: If *date_col* or any entry of *cols* is not a column of
            ``df``.  Raised before ``df`` is modified.
    """
    cols = list(cols)

    # Validate everything up front so a bad name cannot leave df half-updated.
    for c in cols:
        if c not in df.columns:
            raise KeyError(f"{c} not in DataFrame")

    years = df[date_col]

    def _rank01(s):
        n = s.notna().sum()
        if n <= 1:
            # Degenerate group: (n - 1) would be zero, so use 0.0 for the
            # observed value and keep missing entries missing.
            return pd.Series(0.0, index=s.index).where(s.notna())
        r = s.rank(method="average")
        return (r - 1) / (n - 1)

    # Compute every score before assigning, so a failure mid-way does not
    # leave only some of the new columns behind.
    ranked = {c: df.groupby(years)[c].transform(_rank01) for c in cols}
    for c, values in ranked.items():
        df["rank_" + c] = values
    return df


# Function to create z- or rank-scored factors
def yearly_score(df, cols, method, date_col="date"):
    """Score the columns in *cols* within each *date_col* group.

    Dispatches to the z-score helper for ``method="z"`` and to the rank helper
    for ``method="rank"``; the helper adds the score columns to ``df`` and
    returns it.

    Args:
        df: DataFrame to score.
        cols: Names of the columns to score.
        method: Either ``"z"`` or ``"rank"``.
        date_col: Name of the column whose values define the groups.

    Returns:
        Whatever the selected helper returns (the scored DataFrame).

    Raises:
        KeyError: If *date_col* is not a column of ``df``.
        ValueError: If *method* is neither ``"z"`` nor ``"rank"``.
    """
    if date_col not in df.columns:
        raise KeyError(f"{date_col} not in DataFrame")

    if method != "z" and method != "rank":
        raise ValueError("method must be 'z' or 'rank'")

    scorer = _yearly_z_score if method == "z" else _yearly_rank_score
    return scorer(df, cols, date_col=date_col)


# Function to create an average factor for the integrated approach
def append_avg_score(df, cols, method):
    """Add ``avg_<method>_score``: the row-wise mean of the scored columns.

    For every name in *cols* the column ``<method>_<name>`` (for example
    ``z_V`` or ``rank_V``) is looked up, and the mean across those columns is
    stored per row.  The mean uses pandas' default NaN handling.

    Args:
        df: DataFrame holding the score columns.  It is modified in place.
        cols: Base factor names (without the score prefix).
        method: Either ``"z"`` or ``"rank"``; selects the score prefix.

    Returns:
        The same ``df`` object, with the average column added.

    Raises:
        ValueError: If *method* is neither ``"z"`` nor ``"rank"``.
        KeyError: If a required score column is not in ``df``.
    """
    if method != "z" and method != "rank":
        raise ValueError("method must be 'z' or 'rank'")
    prefix = method + "_"

    score_names = []
    for base in cols:
        score_names.append(prefix + base)

    # Report the first missing score column, in the order given by cols.
    absent = next((name for name in score_names if name not in df.columns), None)
    if absent is not None:
        raise KeyError(f"{absent} not in DataFrame")

    row_means = df[score_names].mean(axis=1)
    df[f"avg_{method}_score"] = row_means
    return df


# Function to create TER and DEC portfolio weights for each factor provided
def append_portfolio_weights(
    df, cols, method, p, mkt_cap_col="market_cap", date_col="date"
):
    """Add market-cap weights of the top-``p`` scored rows for each factor.

    For every name in *cols*, and separately within each *date_col* group, the
    rows whose score ``<method>_<name>`` is at or above the group's
    ``(1 - p)`` quantile are selected.  Selected rows are weighted by their
    share of the selected rows' total market cap in that group; all other rows
    (including rows with a missing score) get weight 0.  The result is stored
    in ``weight_<method>_<name>``.

    Args:
        df: DataFrame with score and market-cap columns.  Modified in place.
        cols: Base factor names (without the score prefix).
        method: Either ``"z"`` or ``"rank"``; selects the score prefix.
        p: Fraction of each group to select, used as ``quantile(1 - p)``.
        mkt_cap_col: Name of the market-cap column.
        date_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object, with the weight columns added.

    Raises:
        ValueError: If *method* is neither ``"z"`` nor ``"rank"``.
        KeyError: If *mkt_cap_col*, *date_col* or a required score column is
            not in ``df``.
    """
    if method not in ("z", "rank"):
        raise ValueError("method must be 'z' or 'rank'")
    prefix = method + "_"

    for required in (mkt_cap_col, date_col):
        if required not in df.columns:
            raise KeyError(f"{required} not in DataFrame")

    period = df[date_col]

    def _cutoff(group_scores):
        # Score level a row must reach to be in the top p of its group.
        return group_scores.quantile(1 - p)

    for factor in cols:
        score_name = prefix + factor
        if score_name not in df.columns:
            raise KeyError(f"{score_name} not in DataFrame")

        scores = df[score_name]
        cutoffs = scores.groupby(period).transform(_cutoff)

        # Rows with a missing score are never selected.
        selected = (scores >= cutoffs) & scores.notna()

        # Zero out the market cap of everything outside the selection, then
        # normalise by the selected total of the same group.
        selected_cap = df[mkt_cap_col].mask(~selected, 0.0)
        group_total = selected_cap.groupby(period).transform("sum")
        share = selected_cap / group_total

        # 0/0 (nothing selected in a group) becomes weight 0.
        df[f"weight_{score_name}"] = share.fillna(0.0)

    return df


# Function to create the BW weights for each factor provided
def append_bucketed_portfolio_weights(
    df,
    cols,
    method,
    n_subportfolios,
    high_multiplier,
    increment,
    mkt_cap_col="market_cap",
    date_col="date",
):
    """Add multiplier-tilted, bucketed market-cap weights for each factor.

    For every name in *cols*, and separately within each *date_col* group:

    1. Rows are split into up to *n_subportfolios* quantile buckets of the
       score ``<method>_<name>`` (duplicate bin edges are dropped, so fewer
       buckets may result).
    2. Each row gets its share of its bucket's market cap, scaled by
       ``1 / number_of_buckets`` in that group.
    3. That base weight is multiplied by *high_multiplier* for the top bucket,
       and by *increment* less for each bucket below it.

    Rows that cannot be bucketed (for example a missing score) get weight 0.
    The weights are not renormalised afterwards, so per group they sum to the
    average multiplier of the buckets present.  The result is stored in
    ``weight_<method>_<name>``.

    Bucket labels are kept in a local Series rather than a temporary column,
    so no existing column of ``df`` is overwritten or dropped and nothing is
    left behind if a later step raises.

    Args:
        df: DataFrame with score and market-cap columns.  Modified in place.
        cols: Base factor names (without the score prefix).
        method: Either ``"z"`` or ``"rank"``; selects the score prefix.
        n_subportfolios: Number of quantile buckets requested per group.
        high_multiplier: Multiplier applied to the highest bucket.
        increment: Amount the multiplier decreases per bucket below the top.
        mkt_cap_col: Name of the market-cap column.
        date_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object, with the weight columns added.

    Raises:
        ValueError: If *method* is neither ``"z"`` nor ``"rank"``.
        KeyError: If *mkt_cap_col*, *date_col* or a required score column is
            not in ``df``.
    """
    if method == "z":
        prefix = "z_"
    elif method == "rank":
        prefix = "rank_"
    else:
        raise ValueError("method must be 'z' or 'rank'")

    if mkt_cap_col not in df.columns:
        raise KeyError(f"{mkt_cap_col} not in DataFrame")

    if date_col not in df.columns:
        raise KeyError(f"{date_col} not in DataFrame")

    years = df[date_col]

    def _get_buckets(x):
        # qcut raises ValueError when a group cannot be binned; treat such a
        # group as unbucketed so it ends up with zero weight.
        try:
            return pd.qcut(x, n_subportfolios, labels=False, duplicates="drop")
        except ValueError:
            return pd.Series(np.nan, index=x.index)

    for c in cols:
        score_col = prefix + c
        if score_col not in df.columns:
            raise KeyError(f"{score_col} not in DataFrame")

        # Bucket label (0 = lowest scores) of every row within its group.
        buckets = df.groupby(years)[score_col].transform(_get_buckets)

        # Total market cap of the row's bucket within its group.
        bucket_caps = df.groupby([years, buckets])[mkt_cap_col].transform("sum")

        # Buckets actually present in the group; may be fewer than requested
        # because duplicate bin edges are dropped.
        max_bucket = buckets.groupby(years).transform("max")
        n_buckets = max_bucket + 1

        # Cap weight inside the bucket, with every bucket given equal size.
        base_weights = (df[mkt_cap_col] / bucket_caps) * (1.0 / n_buckets)

        # high_multiplier for the top bucket, decreasing by increment below it.
        multipliers = high_multiplier - (max_bucket - buckets) * increment

        # Unbucketed rows produce NaN above and end up with weight 0.
        df[f"weight_{score_col}"] = (base_weights * multipliers).fillna(0.0)

    return df


# Function to create the mixed portfolio weights
def append_factor_weighted_score(df, cols, factor_weights, method):
    """Add ``factor_weight_<method>``: a weighted sum of per-factor weights.

    For every name in *cols* the column ``weight_<method>_<name>`` is
    multiplied by the matching entry of *factor_weights*, and the products
    are added up row by row.

    Args:
        df: DataFrame holding the per-factor weight columns.  Modified in
            place.
        cols: Base factor names (without the ``weight_``/score prefixes).
        factor_weights: One multiplier per entry of *cols*, in the same order.
        method: Either ``"z"`` or ``"rank"``; selects the score prefix.

    Returns:
        The same ``df`` object, with the combined column added.

    Raises:
        ValueError: If *method* is neither ``"z"`` nor ``"rank"``, or if
            *cols* and *factor_weights* differ in length.
        KeyError: If a required weight column is not in ``df``.
    """
    if method not in ("z", "rank"):
        raise ValueError("method must be 'z' or 'rank'")
    prefix = method + "_"

    if len(cols) != len(factor_weights):
        raise ValueError("cols and factor_weights must have the same length")

    weight_names = [f"weight_{prefix}{c}" for c in cols]

    # Report the first missing weight column, in the order given by cols.
    absent = next((name for name in weight_names if name not in df.columns), None)
    if absent is not None:
        raise KeyError(f"{absent} not in DataFrame")

    # The built-in sum starts from 0, so an empty cols list yields the scalar 0.
    combined = sum(
        df[name] * multiplier
        for name, multiplier in zip(weight_names, factor_weights)
    )

    df[f"factor_weight_{method}"] = combined
    return df


# TODO: Create the TE portfolio weights