# Importando dependências

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql import DataFrame

In [None]:
import gcsfs
import pyarrow.parquet as pq

In [None]:
from functools import reduce
import subprocess
import os

In [None]:
from feature_store import FeatureStore, Catalog

In [None]:
import pandas as pd
import numpy as np

# Show up to 150 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 150)

# Utils

In [None]:
import gcsfs
import pyarrow.parquet as pq
import json
def read_parquet_to_pandas(path: str, projectGCS: str = "analytics-k8s-dev-4742") -> pd.DataFrame:
    """Read a directory of Parquet part files from GCS into one pandas DataFrame.

    Useful for training tasks that need the whole dataset in memory.

    Parameters
    ----------
    path : str
        Location of the Parquet dataset directory; every object matching
        ``<path>/part-*`` is read.
    projectGCS : str
        GCP project passed to ``gcsfs.GCSFileSystem``. The file system is
        opened with ``requester_pays=True``.

    Returns
    -------
    pandas.DataFrame
        All matching part files read as a single frame.

    Raises
    ------
    FileNotFoundError
        If no ``part-*`` object is found under ``path``.
    """
    fs = gcsfs.GCSFileSystem(project=projectGCS, requester_pays=True)
    # Normalise a trailing slash so the pattern is always "<path>/part-*".
    pattern = path.rstrip("/") + "/part-*"
    # Re-add the gs:// scheme to each match before handing the list to pyarrow.
    files = ["gs://" + part_file for part_file in fs.glob(pattern)]
    if not files:
        # Fail fast with a clear message instead of passing pyarrow an empty list.
        raise FileNotFoundError(f"No Parquet part files found matching {pattern!r}")
    df_pandas = pq.ParquetDataset(files, filesystem=fs).read().to_pandas()
    return df_pandas

In [None]:
# Base GCS prefix of the perfil_laranja data directory.
BASE_GS = "gs://oculto/loss_prevention/perfil_laranja/data/"

In [None]:
# Load the Parquet dataset into memory; the location is built from BASE_GS
# instead of repeating the bucket prefix.
df = read_parquet_to_pandas(BASE_GS + "perfil_laranja_clean_altaRenda_reduzido.parquet")

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

(5604459, 125)

# Modelagem

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error

In [None]:
# Count columns checked by the filter: a row is dropped only when every one of
# them equals 0.
asset_count_columns = [
    "qtdImoveis",
    "qtdVeiculosFinanciados",
    "quantidadeVeiculosPesados",
    "quantidadePropriedades",
    "quantidadeAeronaves",
]

# AND the per-column "== 0" masks together, left to right.
pfs_sem_patrimonio_cond = reduce(
    lambda combined, mask: combined & mask,
    (df[column] == 0 for column in asset_count_columns),
)

df = df.loc[~pfs_sem_patrimonio_cond]

In [None]:
# (rows, columns) remaining after the all-zero rows were filtered out.
df.shape

(5142696, 125)

In [None]:
# Work on a reproducible 10% random sample of the filtered frame.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 2025
df_sample = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [None]:
# Independent copy of the sample used as model input, so columns added to
# df_sample later on do not show up in X.
X = df_sample.copy()
X.shape

(514270, 125)

In [None]:
# Isolation Forest hyperparameters, passed to the constructor in the next cell.
# Number of trees in the ensemble.
n_estimators = 100
# Passed as `contamination`: the expected share of anomalies in the data.
contamination = 0.001
# Passed as `max_samples`: rows drawn to build each tree.
sample_size = 256

In [None]:
# Collect the constructor arguments first, then build the (still unfitted) model.
iso_forest_params = {
    "n_estimators": n_estimators,
    "contamination": contamination,
    "max_samples": sample_size,
    "random_state": 42,
}
iso_forest = IsolationForest(**iso_forest_params)

In [None]:
%%time
# Fit on every column of X; no column is excluded before training.
# NOTE(review): X still contains flagLaranja (it appears in the feature list
# printed by the permutation loop) -- confirm it is meant to be a model input.
iso_forest.fit(X)

CPU times: user 4.41 s, sys: 219 ms, total: 4.63 s
Wall time: 4.62 s


In [None]:
# Score the sample once and derive the label from that score, instead of
# running a second full pass through the forest with predict().
anomaly_scores = iso_forest.decision_function(X)
df_sample['scoreAnomalia'] = anomaly_scores
# Same rule predict() applies: negative decision score -> -1 (anomaly), else 1.
df_sample['anomalia'] = np.where(anomaly_scores < 0, -1, 1)

In [None]:
# Number of rows per label (1 = inlier, -1 = anomaly).
df_sample['anomalia'].value_counts()

anomalia
 1    513755
-1       515
Name: count, dtype: int64

In [None]:
# Column labels of X, the frame the model was fitted on.
columns = X.columns

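A importância de cada variável é medida pela variação dos scores de anomalia ao embaralhar (permutar) os valores dessa variável: quanto maior o erro quadrático médio (MSE) entre os scores originais e os scores após a permutação, mais importante é a variável.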

In [None]:
%%time
# Permutation importance: shuffle one column at a time and measure how far the
# anomaly scores move away from the baseline scores.
PERMUTATION_SEED = 42
# Seeded generator so the shuffles, and therefore the ranking, are reproducible.
permutation_rng = np.random.default_rng(PERMUTATION_SEED)

original_scores = iso_forest.decision_function(X)


feature_importance = {}

for feature in X.columns:
    print(feature)
    df_permuted = X.copy()
    # Assign a plain ndarray so the shuffled values are placed by position
    # rather than being re-aligned on the index.
    df_permuted[feature] = permutation_rng.permutation(df_permuted[feature].to_numpy())

    permuted_scores = iso_forest.decision_function(df_permuted)
    # MSE between baseline and permuted scores = impact on the anomaly score.
    score_change = mean_squared_error(original_scores, permuted_scores)

    feature_importance[feature] = score_change

flagBeneficiarioBPC
flagVulneravelBPC
flagBeneficiarioSeguroDefeso
flagBeneficiarioGarantiaSafra
flagBeneficiarioNovoBolsaFamilia
flagBeneficiarioProgramasSociais
flagAltaQualificacaoBeneficiarioProgramasSociais
indicioFalecimento
flagFalecidoConfirmado
flagSupercentenario
flagAdolescente
flagIdoso
flagEmpresario
flagAltaQualificacao
flagServidorPublico
flagRegistroAntt
flagProdutorRural
quantidadeCandidatosApoiados
quantidadeEleicoesComoDoador
valorMedioDoado
valorTotalDoado
quantidadeDoacoesDescricaoNaoEspecificada
valorMaximoDoado
quantidadeDoacoesPoliticas
flagContratoPublicoBeneficiarioProgramasSociais
flagLaranja
quantidadeEmails
quantidadeDiferentesprovedoresEmail
quantidadeEmailPadraoSuspeito
quantidadeEmailNaoConsistenteNomeTitular
flagAreaUrbanaBaixaDensidadeEdificios
flagAglomeradoRural
flagAreaRural
flagFavelaOuComunidadeUrbana
flagDividaDauCresceu180Dias
valorTotalDividasDau
qtdVeiculosFinanciados
qtdFinanciamentosVeicularesQuitados
recorrenciaFinanciamentoVeicular
Recorre

In [None]:
# Rescale the importances so the most influential feature scores 100.
# default=0 keeps max() from raising when the dict is empty.
max_importance = max(feature_importance.values(), default=0)

if max_importance == 0:
    # No permutation moved the scores at all: dividing by 0 would give NaN (or
    # raise), so every importance is simply reported as 0.
    feature_importance = dict.fromkeys(feature_importance, 0.0)
else:
    feature_importance = {
        feature: (raw_value / max_importance) * 100
        for feature, raw_value in feature_importance.items()
    }

# One row per feature, most important first, importance rounded to 2 decimals.
df_importance = (
    pd.DataFrame(list(feature_importance.items()), columns=["Feature", "Importance"])
    .sort_values(by="Importance", ascending=False)
    .round({"Importance": 2})
)

In [None]:
# Rebuild the importance table straight from the feature_importance dict,
# sorted from most to least important.
# NOTE(review): this rebinds df_importance without the 2-decimal rounding
# applied in the previous cell -- confirm the unrounded values are the ones to keep.
importance_columns = {
    "Feature": list(feature_importance.keys()),
    "Importance": list(feature_importance.values()),
}
df_importance = pd.DataFrame(importance_columns).sort_values(
    "Importance", ascending=False
)

In [None]:
# First 60 rows of the table, i.e. the 60 highest importances.
df_importance.head(60)

Unnamed: 0,Feature,Importance
122,perfilInvestidorDIVERSIFICADO,100.0
99,flagSegundoGrauProcessoJudicialCobranca,70.836132
109,flagSocioClasseEmpresaFachadaALTA,70.506065
111,flagSocioClasseEmpresaFachadaMEDIO,56.370383
105,quantidadeMediaEmpresasMesmoCnaeAbertas12Meses,55.636098
61,flagProcessoJudicialCobranca,53.525677
44,flagAlterouBancoDeclaracaoIrpfUltimos5Anos,53.35117
114,historicoAtrasoFinanciamentoVeicularSEM_INFORM...,51.206053
115,historicoAtrasoFinanciamentoVeicularATRASA,46.050038
18,quantidadeEleicoesComoDoador,44.714998


# Salvar a Importância

In [None]:
# Map each feature name to its importance so it can be serialised as JSON.
importance_by_feature = df_importance.set_index("Feature")["Importance"]
importance_dict = importance_by_feature.to_dict()

In [None]:
# Persist the importance mapping as a JSON object under the data prefix.
# The client is configured like the one in read_parquet_to_pandas (same project,
# requester_pays=True) instead of the "seu-projeto-gcp" placeholder, and the
# destination is built from BASE_GS rather than repeating the bucket prefix.
fs = gcsfs.GCSFileSystem(project="analytics-k8s-dev-4742", requester_pays=True)

path = BASE_GS + "df_importance_alta_renda.json"

with fs.open(path, 'w') as f:
    json.dump(importance_dict, f)


In [None]:
# Also store the importance table as Parquet through Spark, replacing any
# previous output at the same location.
importance_sdf = spark.createDataFrame(df_importance)
importance_writer = importance_sdf.write.mode("overwrite")
importance_writer.parquet(
    "gs://oculto/loss_prevention/perfil_laranja/data/df_importance_alta_renda.parquet"
)


                                                                                