# Fixed Income Funds Recommendation System

> gpadpoll

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from fif_recsys import hello

# This will print to the notebook output
hello()

In [None]:
import yaml

config_d = yaml.safe_load("""
fetch:
    cda:
        base_url: "https://dados.cvm.gov.br/dados/FI/DOC/CDA/DADOS/"
        periods:
            - "202501"
            - "202502"
            - "202503"
            - "202504"
            - "202505"
            - "202506"
            - "202507"
            - "202508"
            - "202509"
            - "202510"
            - "202511"
            - "202512"
        filename_template: "cda_fi_{period}.zip"

    cotas:
        base_url: "https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/"
        periods:
            - "2023"
            - "2024"
            - "2025"
        filename_template: "inf_diario_fi_{period}.zip"
feature:
    group_keys:
        - CNPJ_FUNDO_CLASSE
        - DENOM_SOCIAL
        - competencia
    feature_registry:
        cda:
            patrimonio_liq:
                description: "Maximum reported net asset value per fund-month."
                method: max
                args:
                    - VL_PATRIM_LIQ
                            
            log_aum:
                description: "Maximum reported net asset value per fund-month."
                method: max
                args:
                    - VL_PATRIM_LIQ
                adjustment:
                    - log

            total_posicao:
                description: "Sum of final market value of all positions in the period."
                method: sum
                args:
                    - VL_MERC_POS_FINAL

            n_ativos:
                description: "Number of unique assets in the fund portfolio."
                method: nunique
                args:
                    - CD_ATIVO

            n_emissores:
                description: "Number of unique issuers in the fund portfolio."
                method: nunique
                args:
                    - CPF_CNPJ_EMISSOR

            credito_share:
                description: "Weighted share of credit-linked assets in the portfolio."
                method: credito_share_feature_fn
                args:
                    - ["Debêntures", "Cédula de Crédito", "CRI", "CRA", "Notas Promissórias"]
                adjustment:
                    - clip

            related_party_share:
                description: "Weighted share of related-party issuers."
                method: related_party_share_feature_fn
                adjustment:
                    - clip

            issuer_hhi:
                description: "Herfindahl-Hirschman index based on issuer weights."
                method: hhi_feature_fn
                adjustment:
                    - clip
                    - coalesce
score:
    size_score:
        type: zscore
        description: >
            Measures the relative size of the fund based on its assets under
            management. Larger funds typically exhibit greater operational
            stability, better liquidity access, and lower idiosyncratic risk.
            Computed using the z-score of the log-transformed AUM (log_aum).
        args:
            feature: log_aum

    diversification_score:
        type: zscore
        description: >
            Evaluates how diversified the fund's portfolio is in terms of
            the number of unique assets held. Higher values indicate broader
            asset diversification, reducing exposure to security-specific risks.
        args:
            feature: n_ativos

    issuer_diversification_score:
        type: zscore
        description: >
            Measures diversification across issuers by counting how many distinct
            counterparties the fund is exposed to. Funds with exposures distributed
            across more issuers typically have lower concentration and reduced
            issuer-specific credit risk.
        args:
            feature: n_emissores

    credit_risk_score:
        type: zscore
        description: >
            Quantifies the fund's exposure to credit-linked instruments such as
            debentures, CRIs/CRAs, and promissory notes. A higher credit share
            typically increases sensitivity to credit events. The score is inverted
            so that higher credit exposure corresponds to a lower (worse) score.
        args:
            feature: credito_share
        adjustment:
            - invert

    governance_risk_score:
        type: zscore
        description: >
            Captures exposure to related-party transactions, which may increase
            governance risk due to potential conflicts of interest and reduced
            market discipline. The score is inverted, so funds with higher
            related-party share receive a lower (worse) score.
        args:
            feature: related_party_share
        adjustment:
            - invert

    concentration_risk_score:
        type: zscore
        description: >
            Measures portfolio concentration using the Herfindahl-Hirschman Index
            (HHI) computed over issuer exposure weights. Higher HHI values indicate
            more concentrated portfolios and elevated idiosyncratic and liquidity
            risks. Score is inverted so higher concentration yields a lower score.
        args:
            feature: issuer_hhi
        adjustment:
            - invert

""")

In [None]:
from pathlib import Path

from fif_recsys.commands.data import fetch_manifest


data_sources_d = fetch_manifest(config_d['fetch'], output_dir=Path("/tmp"))


In [18]:
# temp
import pandas as pd

data_sources_d = {}

data_sources_d['cda'] = pd.read_csv("tmp/df_cda.csv", index_col=0)
data_sources_d['cotas'] = pd.read_csv("tmp/df_cotas.csv", index_col=0)

  data_sources_d['cda'] = pd.read_csv("tmp/df_cda.csv", index_col=0)
  data_sources_d['cotas'] = pd.read_csv("tmp/df_cotas.csv", index_col=0)


In [None]:
from fif_recsys.commands.feature import compute_all_features, FEATURE_ENGINE

# aux_df = compute_features_from_df(data_sources_d['cda'], config_d['feature']['group_keys'], config_d['feature']['feature_registry']['cda'])

aux_df = compute_all_features(data_sources_d, config_d, FEATURE_ENGINE)

aux_df

  def build_feature_engine(feature_engine: Dict, yaml_cfg: Dict):
  def build_feature_engine(feature_engine: Dict, yaml_cfg: Dict):
  result = getattr(ufunc, method)(*inputs, **kwargs)
  def build_feature_engine(feature_engine: Dict, yaml_cfg: Dict):
  def build_feature_engine(feature_engine: Dict, yaml_cfg: Dict):
  def build_feature_engine(feature_engine: Dict, yaml_cfg: Dict):
  )


Unnamed: 0,CNPJ_FUNDO_CLASSE,DENOM_SOCIAL,competencia,patrimonio_liq,log_aum,total_posicao,n_ativos,n_emissores,credito_share,related_party_share,issuer_hhi
0,06.323.688/0001-27,IT NOW PIBB IBRX-50 FUNDO DE ÍNDICE RESPONSABI...,202506,9.630971e+08,20.685665,9.659350e+08,47,0,0.0,0.127924,
1,06.323.688/0001-27,IT NOW PIBB IBRX-50 FUNDO DE ÍNDICE RESPONSABI...,202507,9.206483e+08,20.640589,9.225100e+08,46,1,0.0,0.127067,1.0
2,06.323.688/0001-27,IT NOW PIBB IBRX-50 FUNDO DE ÍNDICE RESPONSABI...,202508,9.333802e+08,20.654323,1.015659e+09,50,0,0.0,0.118529,
3,06.323.688/0001-27,IT NOW PIBB IBRX-50 FUNDO DE ÍNDICE RESPONSABI...,202509,9.502398e+08,20.672225,9.649803e+08,49,0,0.0,0.127355,
4,06.323.688/0001-27,IT NOW PIBB IBRX-50 FUNDO DE ÍNDICE RESPONSABI...,202510,9.650222e+08,20.687662,9.672142e+08,49,0,0.0,0.127783,
...,...,...,...,...,...,...,...,...,...,...,...
5471,63.698.764/0001-12,TREND ETF IDEX B50 CLASSE DE ÍNDICE - RESPONSA...,202512,2.263287e+07,16.934914,2.263701e+07,0,1,0.0,0.001269,1.0
5472,63.698.833/0001-98,TREND ETF IDEX B35 CLASSE DE ÍNDICE - RESPONSA...,202512,2.273170e+07,16.939271,2.273587e+07,0,1,0.0,0.002968,1.0
5473,63.756.772/0001-78,GALAPAGOS BITCOIN CME CF FUNDO DE ÍNDICE,202512,4.695182e+06,15.362048,4.696480e+06,1,0,0.0,0.000000,
5474,63.905.124/0001-36,BTG PACTUAL TEVA AUVP ITBR LIQUIDEZ FUNDO DE Í...,202512,7.532298e+07,18.137296,7.533181e+07,0,0,0.0,0.000000,


In [26]:
from fif_recsys.commands.feature import FEATURE_ENGINE

FEATURE_ENGINE['coalesce']['function']['pandas'](aux_df.issuer_hhi)

0       0.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
5471    1.0
5472    1.0
5473    0.0
5474    0.0
5475    1.0
Name: issuer_hhi, Length: 5476, dtype: float64