# Creation of the dataset

This notebook contains the steps for the extraction and...

In [70]:
import os
import pymssql

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from dotenv import load_dotenv
load_dotenv()

DATASET_CREATION = False

In [3]:
STATUS_PROTESTO = {
    1: "Incluída em lote de remessa",
    2: "Enviada a Protesto",
    3: "Protestada",
    4: "Paga",
    5: "Solicitação de Desistência",
    6: "Solicitação de Cancelamento (Após o Protesto)",
    7: "Solicitação de Autorização de Cancelamento (Dívida Paga ou Parcelada)",
    8: "Cancelada antes do Protesto",
    9: "Cancelada após o Protesto",
    10: "Cancelada por Pagamento",
    11: "Sustada por Ordem Judicial",
    12: "Devolvida por Irregularidade"
}

FINE_CODE_DICT = {
    1: "Ressarcimento",
    2: "Multa",
    3: "Remanejamento",
    4: "Multa Percentual",
    5: "Multa Cominatória"
}

STATUS_DIVIDA_ATIVA = {
    1: "Inscrito em Dívida Ativa",
    2: "Negociado",
    3: "Quitado",
    4: "Exigibilidade Suspensa",
    5: "Cancelado",
    6: "Pagamento em Atraso",
    7: "Remissão",
    8: "Prescrito"
}


In [4]:
def create_raw_dataset():
    queries = {
    "exe_debito": """
        SELECT 
            IdDebito AS exe_debito_iddebito,
            IdProcessoExecucao AS exe_debito_idprocessoexecucao,
            valorOriginalDebito AS exe_debito_valororiginaldebito,
            ValorPago AS exe_debito_valorpago,
            CodigoTipoDebito AS exe_debito_codigotipodebito,
            datainclusao AS exe_debito_datainclusao,
            DataDecisao,
            StatusProtesto AS protesto_status,
            Status_PGE AS pge_status,
            IdProcessoOrigem
        FROM processo.dbo.Exe_Debito
        
    """,
    "exe_debitopessoa": """
        SELECT 
            IDDebitoPessoa AS exe_debitopessoa_iddebitopessoa,
            IDDebito AS exe_debitopessoa_iddebito,
            IDPessoa AS exe_debitopessoa_idpessoa,
            DataInclusao AS exe_debitopessoa_datainclusao
        FROM processo.dbo.Exe_DebitoPessoa
    """,
    "exe_debitoboleto": """
        SELECT 
            IdDebitoBoleto AS exe_debitoboleto_iddebitoboleto,
            IdDebito as exe_debitoboleto_iddebito,
            ValorOriginal AS exe_debitoboleto_valororiginal,
            ValorPago AS exe_debitoboleto_valorpago,
            DataPagamento AS exe_debitoboleto_datapagamento
        FROM processo.dbo.Exe_DebitoBoleto
    """,
    "exe_debito_multadiaria": """
        SELECT 
            IdDebitoMultaDiaria AS exe_debito_multadiaria_iddebitomultadiaria,
            IdDebito as exe_debito_multadiaria_iddebito,
            ValorMultaDiaria AS exe_debito_multadiaria_valormultadiaria,
            DataInicioImputacaoMultaDiaria AS exe_debito_multadiaria_datainicio,
            DataFinalImputacaoMultaDiaria AS exe_debito_multadiaria_datafinal
        FROM processo.dbo.Exe_Debito_MultaDiaria
    """,
    "exe_creditopagamento": """
        SELECT 
            IdCreditoPagamento AS exe_creditopagamento_idcreditopagamento,
            IdDebito as exe_creditopagamento_iddebito,
            ValorCredito AS exe_creditopagamento_valorcredito,
            DataInclusao AS exe_creditopagamento_datainclusao
        FROM processo.dbo.Exe_CreditoPagamento
    """,
    "exe_parcelamento": """
        SELECT 
            IdParcelamento AS exe_parcelamento_idparcelamento,
            IdDebito as exe_parcelamento_iddebito,
            NumeroParcelas AS exe_parcelamento_numeroparcelas,
            SituacaoParcelamento AS exe_parcelamento_situacaoparcelamento,
            DataCancelamentoParcelamento AS exe_parcelamento_datacancelamento,
            DataReabertura AS exe_parcelamento_datareabertura
        FROM processo.dbo.Exe_Parcelamento
    """,
    "protesto_titulosremessa": """
        SELECT 
            IdTituloRemessa AS protesto_titulosremessa_idtituloremessa,
            IdDebito as protesto_titulosremessa_iddebito,
            NumeroProtocoloTitulo AS protesto_titulosremessa_numeroprotocolotitulo,
            DataProtocoloTitulo AS protesto_titulosremessa_dataprotocolotitulo
        FROM processo.dbo.Protesto_TitulosRemessa
    """,
    "pge_processo": """
        SELECT 
            IdProcessoPGE AS pge_processo_idprocessopge,
            IdDebitoExecucao as pge_processo_iddebitoexecucao,
            NumeroProcessoExecucao AS pge_processo_numeroprocessoexecucao,
            AnoProcessoExecucao AS pge_processo_anoprocessoexecucao,
            ValorAtualizadoPGE AS pge_processo_valoratualizadopge,
            ValorPagoPGE AS pge_processo_valorpagopge,
            HomologadoPGE AS pge_processo_homologadopge
        FROM processo.dbo.PGE_Processo
    """,
    "gen_pessoa": """
        SELECT 
            IdPessoa AS pessoa_idpessoa,
            CASE WHEN TipoPessoa = '2' THEN 1 ELSE 0 END AS is_legal_organization
        FROM processo.dbo.GenPessoa
    """,
    "processos": """
        SELECT 
            IdProcesso as processo_idprocesso,
            numero_processo,
            ano_processo
        FROM processo.dbo.Processos
    """,
    "processo_transitojulgado": """
        SELECT 
            numero_processo,
            ano_processo,
            datatransito AS processo_data_transito_julgado
        FROM processo.dbo.Processo_TransitoJulgado
    """,
    "civil_servant_match": """
        SELECT 
    CAST(ano AS INT) AS ano,
    CAST(mes AS INT) AS mes,
    gp.IdPessoa AS pessoa_idpessoa,
    1 AS is_civil_servant
    FROM BdDIP.dbo.vwSiaiPessoalFolhaResumida fr 
    INNER JOIN processo.dbo.GenPessoa gp ON fr.CPF = gp.Documento COLLATE SQL_Latin1_General_CP1_CI_AS
    WHERE ano >= 2023
    GROUP BY ano, mes, gp.IdPessoa

        """
    }

    conn = pymssql.connect(
        server=os.getenv('SQL_SERVER_HOST'),
        port=int(os.getenv('SQL_SERVER_PORT')),
        user=os.getenv('SQL_SERVER_USER'),
        password=os.getenv('SQL_SERVER_PASS'),
        database=os.getenv('SQL_SERVER_DB'),
    )

    exe_debito = pd.read_sql(queries["exe_debito"], conn)
    exe_debitopessoa = pd.read_sql(queries["exe_debitopessoa"], conn)
    exe_debitoboleto = pd.read_sql(queries["exe_debitoboleto"], conn)
    exe_debito_multadiaria = pd.read_sql(queries["exe_debito_multadiaria"], conn)
    exe_creditopagamento = pd.read_sql(queries["exe_creditopagamento"], conn)
    exe_parcelamento = pd.read_sql(queries["exe_parcelamento"], conn)
    protesto_titulosremessa = pd.read_sql(queries["protesto_titulosremessa"], conn)
    pge_processo = pd.read_sql(queries["pge_processo"], conn)
    gen_pessoa = pd.read_sql(queries["gen_pessoa"], conn)
    processos = pd.read_sql(queries["processos"], conn)
    processo_transitojulgado = pd.read_sql(queries["processo_transitojulgado"], conn)
    civil_servant_match = pd.read_sql(queries["civil_servant_match"], conn)


    # Close the connection early
    conn.close()

    # Start with base Exe_Debito
    df = exe_debito.copy()

    # Join with Exe_DebitoPessoa
    df = df.merge(
        exe_debitopessoa,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='exe_debitopessoa_iddebito'
    )

    # Join with Exe_DebitoBoleto
    df = df.merge(
        exe_debitoboleto,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='exe_debitoboleto_iddebito'
    )

    # Join with Exe_Debito_MultaDiaria
    df = df.merge(
        exe_debito_multadiaria,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='exe_debito_multadiaria_iddebito'
    )

    # Join with Exe_CreditoPagamento
    df = df.merge(
        exe_creditopagamento,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='exe_creditopagamento_iddebito'
    )

    # Join with Exe_Parcelamento
    df = df.merge(
        exe_parcelamento,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='exe_parcelamento_iddebito'
    )

    # Join with Protesto_TitulosRemessa
    df = df.merge(
        protesto_titulosremessa,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='protesto_titulosremessa_iddebito'
    )

    # Join with PGE_Processo
    df = df.merge(
        pge_processo,
        how='left',
        left_on='exe_debito_iddebito',
        right_on='pge_processo_iddebitoexecucao'
    )

    # Join with GenPessoa
    df = df.merge(
        gen_pessoa,
        how='left',
        left_on='exe_debitopessoa_idpessoa',
        right_on='pessoa_idpessoa'
    )

    # Join with Processos
    df = df.merge(
        processos,
        how='left',
        left_on='IdProcessoOrigem',
        right_on='processo_idprocesso'
    )

    # Join with Processo_TransitoJulgado
    df = df.merge(
        processo_transitojulgado,
        how='left',
        on=['numero_processo', 'ano_processo']
    )

    # Extract year/month for civil servant matching
    df['ano'] = pd.to_datetime(df['DataDecisao'], errors='coerce').dt.year
    df['mes'] = pd.to_datetime(df['DataDecisao'], errors='coerce').dt.month

    # Join with civil servant match
    df = df.merge(
        civil_servant_match,
        how='left',
        on=['ano', 'mes', 'pessoa_idpessoa']
    )

    # Fill missing with 0
    df['is_civil_servant'] = df['is_civil_servant'].fillna(0).astype(int)

    df.to_csv('../data/private/raw/tcern_debtors_dataset.csv', index=False)

In [5]:
if DATASET_CREATION:
    create_raw_dataset()

In [6]:
id_columns_to_load_as_int = [
    'exe_debito_iddebito',
    'exe_debito_idprocessoexecucao',
    'exe_debitopessoa_iddebitopessoa',
    'exe_debitopessoa_idpessoa',
    'exe_debitoboleto_iddebitoboleto',
    'exe_debito_multadiaria_iddebitomultadiaria',
    'exe_creditopagamento_idcreditopagamento',
    'exe_parcelamento_idparcelamento',
    'protesto_titulosremessa_idtituloremessa',
    'pge_processo_idprocessopge',
    'protesto_titulosremessa_numeroprotocolotitulo'
]
dtype_map = {col: 'Int64' for col in id_columns_to_load_as_int}
df = pd.read_csv('../data/public/raw/tcern_debtors_dataset.csv', dtype=dtype_map)


  df = pd.read_csv('../data/public/raw/tcern_debtors_dataset.csv', dtype=dtype_map)


In [43]:
df.columns


Index(['exe_debito_iddebito', 'exe_debito_idprocessoexecucao',
       'exe_debito_valororiginaldebito', 'exe_debito_valorpago',
       'exe_debito_codigotipodebito', 'exe_debito_datainclusao', 'DataDecisao',
       'protesto_status', 'pge_status', 'IdProcessoOrigem',
       'exe_debitopessoa_iddebitopessoa', 'exe_debitopessoa_iddebito',
       'exe_debitopessoa_idpessoa', 'exe_debitopessoa_datainclusao',
       'exe_debitoboleto_iddebitoboleto', 'exe_debitoboleto_iddebito',
       'exe_debitoboleto_valororiginal', 'exe_debitoboleto_valorpago',
       'exe_debitoboleto_datapagamento',
       'exe_debito_multadiaria_iddebitomultadiaria',
       'exe_debito_multadiaria_iddebito',
       'exe_debito_multadiaria_valormultadiaria',
       'exe_debito_multadiaria_datainicio', 'exe_debito_multadiaria_datafinal',
       'exe_creditopagamento_idcreditopagamento',
       'exe_creditopagamento_iddebito', 'exe_creditopagamento_valorcredito',
       'exe_creditopagamento_datainclusao', 'exe_parcelam

In [8]:
df.head()

Unnamed: 0,exe_debito_iddebito,exe_debito_idprocessoexecucao,exe_debito_valororiginaldebito,exe_debito_valorpago,exe_debito_codigotipodebito,exe_debito_datainclusao,DataDecisao,protesto_status,pge_status,IdProcessoOrigem,...,pge_processo_homologadopge,pessoa_idpessoa,is_legal_organization,processo_idprocesso,numero_processo,ano_processo,processo_data_transito_julgado,ano,mes,is_civil_servant
0,2,419587,13286.0,,1,2014-11-04 08:18:28.330,2013-03-14,,,232568,...,,3678.0,0.0,232568,9767,2007,2013-08-15,2013.0,3.0,0
1,2,419587,13286.0,,1,2014-11-04 08:18:28.330,2013-03-14,,,232568,...,,3678.0,0.0,232568,9767,2007,2013-08-15,2013.0,3.0,0
2,3,419587,2000.0,3393.88,2,2014-11-04 08:19:54.557,2013-03-14,,True,232568,...,,3678.0,0.0,232568,9767,2007,2013-08-15,2013.0,3.0,0
3,3,419587,2000.0,3393.88,2,2014-11-04 08:19:54.557,2013-03-14,,True,232568,...,,3678.0,0.0,232568,9767,2007,2013-08-15,2013.0,3.0,0
4,3,419587,2000.0,3393.88,2,2014-11-04 08:19:54.557,2013-03-14,,True,232568,...,,3678.0,0.0,232568,9767,2007,2013-08-15,2013.0,3.0,0


# Preprocessing

In [9]:
df['exe_parcelamento_situacaoparcelamento'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['exe_parcelamento_situacaoparcelamento'].fillna(0, inplace=True)


In [10]:
df['exe_parcelamento_datacancelamento'].isna().sum()

np.int64(47026)

# Features for Clustering Debtor Profiles

## Financial Features
- `current_outstanding_balance`: Current net debt value (original debt minus paid amount).
- `percent_paid_amount`: Proportion of total original debt that has been paid.
- `num_distinct_debts`: Total count of unique debt obligations for the debtor.
- `total_multa_value`: Total original value of debts specifically identified as fines.
- `total_ressarcimento_value`: Total original value of debts identified as reimbursements.
- `num_boleto_payments`: Number of distinct payments made via boleto.
- `time_since_last_payment_days`: Time elapsed (in days) since the debtor's last recorded payment.

## Installment Features
- `has_ever_made_installment_agreement`: Binary flag (1 if yes, 0 if no) indicating if the debtor ever entered an installment agreement.

## Legal Process Features
- `has_ever_been_protested`: Binary flag (1 if yes, 0 if no) indicating if the debtor was ever subject to a notary protest for any of their debts.

## Debtor Profile Features
- `age_of_oldest_debt_days`: Time elapsed (in days) since the debtor's oldest recorded debt was included.

In [57]:
# Agrupamento principal por pessoa devedora
grouped = df.groupby('exe_debitopessoa_idpessoa')

# Calcula cada feature individualmente como Series
current_outstanding_balance = grouped.apply(
    lambda g: (g['exe_debito_valororiginaldebito'].fillna(0) - g['exe_debito_valorpago'].fillna(0)).sum()
)

percent_paid_amount = grouped.apply(
    lambda g: g['exe_debito_valorpago'].fillna(0).sum() / g['exe_debito_valororiginaldebito'].replace(0, 1).sum()
)

num_distinct_debts = grouped['exe_debito_iddebito'].nunique()

total_multa_value = grouped.apply(
    lambda g: g.loc[g['is_multa'] == 1, 'exe_debito_valororiginaldebito'].fillna(0).sum()
)

total_ressarcimento_value = grouped.apply(
    lambda g: g.loc[g['is_ressarcimento'] == 1, 'exe_debito_valororiginaldebito'].fillna(0).sum()
)

has_ever_made_installment_agreement = grouped['exe_parcelamento_idparcelamento'].apply(
    lambda x: int(x.notna().any())
)

has_ever_been_protested = grouped['protesto_titulosremessa_idtituloremessa'].apply(
    lambda x: int(x.notna().any())
)

protest_paid = grouped['protesto_status'].apply(
    lambda s: s.isin([4, 7, 10]).sum()
)

pge_paid = grouped.apply(
    lambda g: g.loc[g['pge_processo_homologadopge'] == 1, 'pge_processo_valorpagopge'].sum(
    )
)

num_failed_parcelamentos = grouped['exe_parcelamento_datacancelamento'].count()
age_oldest_decision = 2025 - pd.to_datetime(grouped['DataDecisao'].min(), errors='coerce').dt.year
difference_in_years_decisions = pd.to_datetime(grouped['DataDecisao'].max(), errors='coerce').dt.year - pd.to_datetime(grouped['DataDecisao'].min(), errors='coerce').dt.year


# Combina todas as séries em um único DataFrame final
features_df = pd.concat([
    current_outstanding_balance.rename("current_outstanding_balance"),
    percent_paid_amount.rename("percent_paid_amount"),
    num_distinct_debts.rename("num_distinct_debts"),
    total_multa_value.rename("total_multa_value"),
    total_ressarcimento_value.rename("total_ressarcimento_value"),
    has_ever_made_installment_agreement.rename("has_ever_made_installment_agreement"),
    has_ever_been_protested.rename("has_ever_been_protested"),
    protest_paid.rename("protest_paid_sum_all_numeric"), # Added with a more descriptive name due to its nature
    pge_paid.rename("pge_paid_amount"),                 # Added
    num_failed_parcelamentos.rename("num_cancelled_installments"), # Added (renamed for clarity based on calculation)
    age_oldest_decision.rename("age_oldest_decision"),         # Added
    difference_in_years_decisions.rename("difference_in_years_decisions") # Added
], axis=1)


  current_outstanding_balance = grouped.apply(
  percent_paid_amount = grouped.apply(
  total_multa_value = grouped.apply(
  total_ressarcimento_value = grouped.apply(
  pge_paid = grouped.apply(


In [58]:
features_df

Unnamed: 0_level_0,current_outstanding_balance,percent_paid_amount,num_distinct_debts,total_multa_value,total_ressarcimento_value,has_ever_made_installment_agreement,has_ever_been_protested,protest_paid_sum_all_numeric,pge_paid_amount,num_cancelled_installments,age_oldest_decision,difference_in_years_decisions
exe_debitopessoa_idpessoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
164,33809.5900,0.000000,1,0.0000,33809.59,0,0,0,0.0,0,17.0,0.0
247,-36.7200,1.020400,1,1800.0000,0.00,0,0,0,0.0,0,10.0,0.0
288,10000.0000,0.000000,1,10000.0000,0.00,1,0,0,0.0,0,,
701,104.0928,0.854638,5,716.0928,0.00,0,0,0,0.0,0,13.0,2.0
773,-539.1600,1.107832,3,5000.0000,0.00,0,1,9,0.0,0,9.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
76188,415.3500,0.000000,1,0.0000,415.35,0,0,0,0.0,0,2.0,0.0
76662,63887.4800,0.000000,2,31943.7500,31943.73,0,0,0,0.0,0,3.0,0.0
76930,0.0000,0.000000,1,0.0000,0.00,0,0,0,0.0,0,,
79427,106.4100,0.000000,1,106.4100,0.00,0,0,0,0.0,0,,


In [53]:
[c for c in df.columns if 'exe_debito_' in c]

['exe_debito_iddebito',
 'exe_debito_idprocessoexecucao',
 'exe_debito_valororiginaldebito',
 'exe_debito_valorpago',
 'exe_debito_codigotipodebito',
 'exe_debito_datainclusao',
 'exe_debito_multadiaria_iddebitomultadiaria',
 'exe_debito_multadiaria_iddebito',
 'exe_debito_multadiaria_valormultadiaria',
 'exe_debito_multadiaria_datainicio',
 'exe_debito_multadiaria_datafinal',
 'exe_debito_tipodebito']

In [69]:

# --- Hopkins Statistic Function ---
def hopkins_statistic(X_df):
    """
    Calculates the Hopkins statistic for a given DataFrame X_df.
    Assumes X_df contains only numerical features and no NaNs.

    Interpretation:
    - H close to 1.0: Data is highly clusterable.
    - H close to 0.5: Data is random.
    - H close to 0.0: Data is uniform/regularly spaced (anti-clustered).
    """
    if not isinstance(X_df, pd.DataFrame):
        raise TypeError("Input X_df must be a pandas DataFrame.")
    if X_df.isnull().values.any():
        raise ValueError("Input DataFrame X_df contains NaN values. Please handle them first.")
    if not all(X_df.dtypes.apply(pd.api.types.is_numeric_dtype)):
        raise ValueError("All columns in X_df must be numeric.")

    X = X_df.values
    n_samples, n_features = X.shape

    if n_samples <= 1:
        return 0.5 # Not enough points to assess clustering

    # 1. Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Determine the number of points to sample (m)
    # Common practice is 5-15% of the dataset size, or a fixed number like 20 if dataset is small.
    # Let's use 10% but ensure it's at least 1 and not more than n_samples / 2 for stability.
    m = int(0.1 * n_samples)
    if m <= 0: m = 1
    if m >= n_samples / 2 and n_samples > 2 : m = int(n_samples / 2) -1 # ensure m < n-m for sampling
    if m <=0 and n_samples > 1: m = 1
    if n_samples <=2 : m = 1 # handle very small n


    # 3. Fit NearestNeighbors on the entire scaled dataset
    # We need k=2 for u_distances because the point itself is its own 0th neighbor.
    # We need k=1 for w_distances.
    try:
        # For u_distances (real points to real points)
        # We need to find the distance to the *actual* nearest neighbor, not itself.
        # So we query for 2 neighbors and take the second one.
        nbrs_u = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X_scaled)

        # For w_distances (random points to real points)
        # We need to find the distance to the single nearest real point.
        nbrs_w = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_scaled)
    except Exception as e:
        print(f"Error fitting NearestNeighbors (likely due to too few samples or m value): {e}")
        return 0.5 # Default to random if NN fails


    # 4. Select m random data points from X_scaled (for u_distances)
    # Ensure we can sample m points without replacement
    if m >= n_samples:
        idx = np.arange(n_samples) # Use all points if m is too large
    else:
        idx = np.random.choice(n_samples, size=m, replace=False)
    
    sampled_X = X_scaled[idx, :]

    # 5. Generate m random points Y uniformly within the data space of X_scaled
    # The bounds should be based on the scaled data
    min_vals = np.min(X_scaled, axis=0)
    max_vals = np.max(X_scaled, axis=0)
    Y = np.random.uniform(low=min_vals, high=max_vals, size=(m, n_features))

    # 6. Calculate u_distances: sum of distances from each point in sampled_X to its nearest neighbor in X_scaled
    u_distances, _ = nbrs_u.kneighbors(sampled_X, n_neighbors=2)
    sum_u_distances = np.sum(u_distances[:, 1]) # Take the 2nd column (k=1, index 1)

    # 7. Calculate w_distances: sum of distances from each point in Y to its nearest neighbor in X_scaled
    w_distances, _ = nbrs_w.kneighbors(Y, n_neighbors=1)
    sum_w_distances = np.sum(w_distances[:, 0]) # Take the 1st column (k=0, index 0)

    # 8. Calculate Hopkins Statistic
    if (sum_u_distances + sum_w_distances) == 0:
        return 0.5 # Avoid division by zero, implies points are likely identical or overlapping

    # H = sum_u_distances / (sum_u_distances + sum_w_distances) # This is for H close to 0 = clusterable
    H = sum_w_distances / (sum_u_distances + sum_w_distances) # This is for H close to 1 = clusterable

    return H

# --- Applying the Hopkins Statistic Test ---
# First, ensure features_df is ready for the test (all numeric, no NaNs)

# Handle potential NaNs that might arise from calculations (e.g., division by zero in percent_paid_amount if sum is 0)
# or from date conversions if some dates were unparseable.
print("\n--- Pre-Hopkins Check for features_df ---")
print(f"Shape of features_df before NaN handling: {features_df.shape}")
print(f"NaN counts per column in features_df:\n{features_df.isnull().sum()}")

# Simple NaN handling: fill with mean for numeric columns.
# For a more robust approach, consider median or model-based imputation.
for col in features_df.columns:
    if features_df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(features_df[col]):
            features_df[col] = features_df[col].fillna(features_df[col].mean())
            print(f"Filled NaNs in '{col}' with mean.")
        else:
            # If somehow a non-numeric column slipped through or was created with NaNs
            features_df[col] = features_df[col].fillna(features_df[col].mode()[0] if not features_df[col].mode().empty else "Unknown")
            print(f"Filled NaNs in '{col}' with mode (or 'Unknown').")


# Ensure all columns are numeric after NaN handling
# Convert boolean-like (0/1) features to int if they are not already, just in case.
for col in ['has_ever_made_installment_agreement', 'has_ever_been_protested']:
    if col in features_df.columns:
        features_df[col] = features_df[col].astype(int)

# Drop any rows that might still have NaNs if mean/mode filling wasn't perfect (e.g., all values were NaN)
# Or if a column was entirely NaN and couldn't be filled by mean/mode.
# features_df_cleaned = features_df.dropna() # Option 1: Drop rows with any NaNs
# Option 2: Check if any column is ALL NaN after filling, which would cause issues
if features_df.isnull().all().any():
    all_nan_cols = features_df.columns[features_df.isnull().all()].tolist()
    print(f"Warning: Columns {all_nan_cols} are entirely NaN after filling. They will be dropped.")
    features_df = features_df.drop(columns=all_nan_cols)


if features_df.empty:
    print("features_df is empty after NaN handling. Cannot calculate Hopkins statistic.")
else:
    print(f"\nShape of features_df after NaN handling: {features_df.shape}")
    print(f"Cleaned NaN counts per column in features_df:\n{features_df.isnull().sum()}")

    # Select only numeric columns for Hopkins, just to be safe
    numeric_features_for_hopkins = features_df.select_dtypes(include=np.number)

    if numeric_features_for_hopkins.empty:
        print("No numeric columns left in features_df after cleaning. Cannot calculate Hopkins statistic.")
    elif numeric_features_for_hopkins.isnull().values.any():
        print("Still NaNs in numeric features after cleaning. Hopkins cannot be calculated.")
        print(numeric_features_for_hopkins.isnull().sum())
    else:
        try:
            hopkins_score = hopkins_statistic(numeric_features_for_hopkins.copy()) # Pass a copy
            print(f"\nHopkins Statistic for the new features_df: {hopkins_score:.4f}")
            if hopkins_score > 0.75:
                print("This suggests the data is highly clusterable.")
            elif hopkins_score < 0.25:
                print("This suggests the data is uniform or regularly spaced (not clusterable).")
            else:
                print("This suggests the data is somewhat random (no strong clustering tendency).")
        except Exception as e:
            print(f"An error occurred during Hopkins statistic calculation: {e}")
            print("This might be due to too few samples, all identical samples, or issues with data values.")



--- Pre-Hopkins Check for features_df ---
Shape of features_df before NaN handling: (3460, 12)
NaN counts per column in features_df:
current_outstanding_balance            0
percent_paid_amount                    0
num_distinct_debts                     0
total_multa_value                      0
total_ressarcimento_value              0
has_ever_made_installment_agreement    0
has_ever_been_protested                0
protest_paid_sum_all_numeric           0
pge_paid_amount                        0
num_cancelled_installments             0
age_oldest_decision                    0
difference_in_years_decisions          0
dtype: int64

Shape of features_df after NaN handling: (3460, 12)
Cleaned NaN counts per column in features_df:
current_outstanding_balance            0
percent_paid_amount                    0
num_distinct_debts                     0
total_multa_value                      0
total_ressarcimento_value              0
has_ever_made_installment_agreement    0
has_ever_been_pr



In [72]:
# --- Hopkins Statistic Function ---
def hopkins_statistic(X_df):
    """
    Calculates the Hopkins statistic for a given DataFrame X_df.
    Assumes X_df contains only numerical features and no NaNs.

    Interpretation:
    - H close to 1.0: Data is highly clusterable.
    - H close to 0.5: Data is random.
    - H close to 0.0: Data is uniform/regularly spaced (anti-clustered).
    """
    if not isinstance(X_df, pd.DataFrame):
        raise TypeError("Input X_df must be a pandas DataFrame.")
    if X_df.isnull().values.any():
        raise ValueError("Input DataFrame X_df contains NaN values. Please handle them first.")
    if not all(X_df.dtypes.apply(pd.api.types.is_numeric_dtype)):
        raise ValueError("All columns in X_df must be numeric.")

    X = X_df.values
    n_samples, n_features = X.shape

    if n_samples <= 1:
        return 0.5 # Not enough points to assess clustering

    # 1. Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Determine the number of points to sample (m)
    # Common practice is 5-15% of the dataset size, or a fixed number like 20 if dataset is small.
    # Let's use 10% but ensure it's at least 1 and not more than n_samples / 2 for stability.
    m = int(0.1 * n_samples)
    if m <= 0: m = 1
    if m >= n_samples / 2 and n_samples > 2 : m = int(n_samples / 2) -1 # ensure m < n-m for sampling
    if m <=0 and n_samples > 1: m = 1
    if n_samples <=2 : m = 1 # handle very small n


    # 3. Fit NearestNeighbors on the entire scaled dataset
    # We need k=2 for u_distances because the point itself is its own 0th neighbor.
    # We need k=1 for w_distances.
    try:
        # For u_distances (real points to real points)
        # We need to find the distance to the *actual* nearest neighbor, not itself.
        # So we query for 2 neighbors and take the second one.
        nbrs_u = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X_scaled)

        # For w_distances (random points to real points)
        # We need to find the distance to the single nearest real point.
        nbrs_w = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X_scaled)
    except Exception as e:
        print(f"Error fitting NearestNeighbors (likely due to too few samples or m value): {e}")
        return 0.5 # Default to random if NN fails


    # 4. Select m random data points from X_scaled (for u_distances)
    # Ensure we can sample m points without replacement
    if m >= n_samples:
        idx = np.arange(n_samples) # Use all points if m is too large
    else:
        idx = np.random.choice(n_samples, size=m, replace=False)
    
    sampled_X = X_scaled[idx, :]

    # 5. Generate m random points Y uniformly within the data space of X_scaled
    # The bounds should be based on the scaled data
    min_vals = np.min(X_scaled, axis=0)
    max_vals = np.max(X_scaled, axis=0)
    Y = np.random.uniform(low=min_vals, high=max_vals, size=(m, n_features))

    # 6. Calculate u_distances: sum of distances from each point in sampled_X to its nearest neighbor in X_scaled
    u_distances, _ = nbrs_u.kneighbors(sampled_X, n_neighbors=2)
    sum_u_distances = np.sum(u_distances[:, 1]) # Take the 2nd column (k=1, index 1)

    # 7. Calculate w_distances: sum of distances from each point in Y to its nearest neighbor in X_scaled
    w_distances, _ = nbrs_w.kneighbors(Y, n_neighbors=1)
    sum_w_distances = np.sum(w_distances[:, 0]) # Take the 1st column (k=0, index 0)

    # 8. Calculate Hopkins Statistic
    if (sum_u_distances + sum_w_distances) == 0:
        return 0.5 # Avoid division by zero, implies points are likely identical or overlapping

    # H = sum_u_distances / (sum_u_distances + sum_w_distances) # This is for H close to 0 = clusterable
    H = sum_w_distances / (sum_u_distances + sum_w_distances) # This is for H close to 1 = clusterable

    return H

# --- Applying the Hopkins Statistic Test ---
# First, ensure features_df is ready for the test (all numeric, no NaNs)

# Handle potential NaNs that might arise from calculations (e.g., division by zero in percent_paid_amount if sum is 0)
# or from date conversions if some dates were unparseable.
print("\n--- Pre-Hopkins Check for features_df ---")
print(f"Shape of features_df before NaN handling: {features_df.shape}")
print(f"NaN counts per column in features_df:\n{features_df.isnull().sum()}")

# Simple NaN handling: fill with mean for numeric columns.
# For a more robust approach, consider median or model-based imputation.
for col in features_df.columns:
    if features_df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(features_df[col]):
            features_df[col] = features_df[col].fillna(features_df[col].mean())
            print(f"Filled NaNs in '{col}' with mean.")
        else:
            # If somehow a non-numeric column slipped through or was created with NaNs
            features_df[col] = features_df[col].fillna(features_df[col].mode()[0] if not features_df[col].mode().empty else "Unknown")
            print(f"Filled NaNs in '{col}' with mode (or 'Unknown').")


# Ensure all columns are numeric after NaN handling
# Convert boolean-like (0/1) features to int if they are not already, just in case.
for col in ['has_ever_made_installment_agreement', 'has_ever_been_protested']:
    if col in features_df.columns:
        features_df[col] = features_df[col].astype(int)

# Drop any rows that might still have NaNs if mean/mode filling wasn't perfect (e.g., all values were NaN)
# Or if a column was entirely NaN and couldn't be filled by mean/mode.
# features_df_cleaned = features_df.dropna() # Option 1: Drop rows with any NaNs
# Option 2: Check if any column is ALL NaN after filling, which would cause issues
if features_df.isnull().all().any():
    all_nan_cols = features_df.columns[features_df.isnull().all()].tolist()
    print(f"Warning: Columns {all_nan_cols} are entirely NaN after filling. They will be dropped.")
    features_df = features_df.drop(columns=all_nan_cols)


if features_df.empty:
    print("features_df is empty after NaN handling. Cannot calculate Hopkins statistic.")
else:
    print(f"\nShape of features_df after NaN handling: {features_df.shape}")
    print(f"Cleaned NaN counts per column in features_df:\n{features_df.isnull().sum()}")

    # Select only numeric columns for Hopkins, just to be safe
    numeric_features_for_hopkins = features_df.select_dtypes(include=np.number)

    if numeric_features_for_hopkins.empty:
        print("No numeric columns left in features_df after cleaning. Cannot calculate Hopkins statistic.")
    elif numeric_features_for_hopkins.isnull().values.any():
        print("Still NaNs in numeric features after cleaning. Hopkins cannot be calculated.")
        print(numeric_features_for_hopkins.isnull().sum())
    else:
        try:
            hopkins_score = hopkins_statistic(numeric_features_for_hopkins.copy()) # Pass a copy
            print(f"\nHopkins Statistic for the new features_df: {hopkins_score:.4f}")
            if hopkins_score > 0.75:
                print("This suggests the data is highly clusterable.")
            elif hopkins_score < 0.25:
                print("This suggests the data is uniform or regularly spaced (not clusterable).")
            else:
                print("This suggests the data is somewhat random (no strong clustering tendency).")
        except Exception as e:
            print(f"An error occurred during Hopkins statistic calculation: {e}")
            print("This might be due to too few samples, all identical samples, or issues with data values.")




--- Pre-Hopkins Check for features_df ---
Shape of features_df before NaN handling: (3460, 12)
NaN counts per column in features_df:
current_outstanding_balance            0
percent_paid_amount                    0
num_distinct_debts                     0
total_multa_value                      0
total_ressarcimento_value              0
has_ever_made_installment_agreement    0
has_ever_been_protested                0
protest_paid_sum_all_numeric           0
pge_paid_amount                        0
num_cancelled_installments             0
age_oldest_decision                    0
difference_in_years_decisions          0
dtype: int64

Shape of features_df after NaN handling: (3460, 12)
Cleaned NaN counts per column in features_df:
current_outstanding_balance            0
percent_paid_amount                    0
num_distinct_debts                     0
total_multa_value                      0
total_ressarcimento_value              0
has_ever_made_installment_agreement    0
has_ever_been_pr

In [73]:
features_df

Unnamed: 0_level_0,current_outstanding_balance,percent_paid_amount,num_distinct_debts,total_multa_value,total_ressarcimento_value,has_ever_made_installment_agreement,has_ever_been_protested,protest_paid_sum_all_numeric,pge_paid_amount,num_cancelled_installments,age_oldest_decision,difference_in_years_decisions
exe_debitopessoa_idpessoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
164,33809.5900,0.000000,1,0.0000,33809.59,0,0,0,0.0,0,17.000000,0.000000
247,-36.7200,1.020400,1,1800.0000,0.00,0,0,0,0.0,0,10.000000,0.000000
288,10000.0000,0.000000,1,10000.0000,0.00,1,0,0,0.0,0,11.055175,2.748939
701,104.0928,0.854638,5,716.0928,0.00,0,0,0,0.0,0,13.000000,2.000000
773,-539.1600,1.107832,3,5000.0000,0.00,0,1,9,0.0,0,9.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
76188,415.3500,0.000000,1,0.0000,415.35,0,0,0,0.0,0,2.000000,0.000000
76662,63887.4800,0.000000,2,31943.7500,31943.73,0,0,0,0.0,0,3.000000,0.000000
76930,0.0000,0.000000,1,0.0000,0.00,0,0,0,0.0,0,11.055175,2.748939
79427,106.4100,0.000000,1,106.4100,0.00,0,0,0,0.0,0,11.055175,2.748939


In [74]:
features_df.reset_index(inplace=True)
features_df.drop(columns=['exe_debitopessoa_idpessoa'], inplace=True)

In [75]:
features_df.to_csv('../data/public/processed/tcern_debtors_features.csv', index=False)