# Importando dependências

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType
from feature_store import FeatureStore, Catalog
from pyspark.sql import DataFrame
import time
from pyspark.sql.functions import col, when, lit

# Pré-Tratamento dos Dados

In [None]:
# Load the unified "perfil laranja" dataset from a Parquet path on GCS.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# the session is injected by the runtime; confirm.
dataset = spark.read.parquet('gs://oculto/oculto/perfil_laranja/data/perfil_laranja_unificado.parquet')

In [None]:
# Row count of the raw dataset.
dataset.count()

                                                                                

184050329

## Apagar colunas irrelevantes

In [None]:
# Columns treated as irrelevant and removed to form the training DataFrame.
colunas_irrelevantes = [
    "cpf",
    "id",
    "insertDate",
    "nome",
    "idade",
    "dataIndicioFalecimento",
    "situacao",
    "quantidadeRelacionamentos",
    "processingDateB3",
    "referencia",
    "referenciaAnoMes",
]
dt_treino = dataset.drop(*colunas_irrelevantes)

In [None]:
# Also remove flagSituacaoRegular from the training DataFrame.
dt_treino = dt_treino.drop("flagSituacaoRegular")

## Tratar colunas categóricas

In [None]:
# Ordinal code tables for the categorical columns. Codes are kept as strings;
# in every table "SEM INFORMACAO" (no information) maps to "-1".

# Recurrence bands, numbered in order starting at -1.
anos = {
    faixa: str(codigo)
    for codigo, faixa in enumerate(
        (
            "SEM INFORMACAO",
            "SEM RECORRENCIA",
            "ATE 30 DIAS",
            "DE 30 A 90 DIAS",
            "DE 90 A 180 DIAS",
            "DE 180 DIAS A 1 ANO",
            "DE 1 A 2 ANOS",
            "DE 2 A 3 ANOS",
            "DE 3 A 5 ANOS",
            "DE 5 A 10 ANOS",
            "MAIS QUE 10 ANOS",
        ),
        start=-1,
    )
}

# Letter grades: "A".."Z" become "1".."26".
letras = {"SEM INFORMACAO": "-1"}
letras.update(
    (letra, str(posicao))
    for posicao, letra in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ", start=1)
)

# Three-level scales; the two tables differ only in the spelling of the labels.
classe1 = dict(zip(("SEM INFORMACAO", "BAIXO", "MEDIO", "ALTO"), ("-1", "1", "2", "3")))
classe2 = dict(zip(("SEM INFORMACAO", "BAIXA", "MEDIA", "ALTA"), ("-1", "1", "2", "3")))

In [None]:
# String-typed columns of the training DataFrame. `dtypes` yields
# (name, type-string) pairs, so comparing against 'string' is sufficient: the
# former extra "!= 'boolean'" clause (joined with a bitwise `&`) could never
# change the result.
colunas_string = [nome for nome, tipo in dt_treino.dtypes if tipo == 'string']

# Make missing categories explicit so they match the "SEM INFORMACAO" entry of
# the code tables applied below.
dt_treino = dt_treino.fillna("SEM INFORMACAO", subset=colunas_string)

# Categorical column -> code table used to encode it.
mapeamentos = {
    "recorrenciaFinanciamentoVeicular": anos,
    "ativosComCota": letras,
    "ativosSemCota": letras,
    "bancarizacao": letras,
    "bancarizacaoBest": classe1,
    "bancarizacaoFaixa": classe2,
    "diversificacao": letras,
    "diversificacaoBest": classe1,
    "diversificacaoFaixa": classe2,
    "patrimonio": letras
}

# Replace each label with its code, then turn the column into an integer.
# NOTE(review): a label that is absent from its table is left untouched by
# replace() and presumably becomes null on the int cast — confirm that every
# label present in the data is covered by the tables.
for coluna, mapa in mapeamentos.items():
    dt_treino = (
        dt_treino
        .replace(mapa, subset=[coluna])
        .withColumn(coluna, col(coluna).cast("int"))
    )



In [None]:
# One-hot encode the remaining categorical columns. For every (column, category)
# pair a 0.0/1.0 indicator named "<column><CATEGORY>" is added, with blanks in
# the category replaced by "_"; the source columns are dropped afterwards.
categorias_one_hot = {
    "historicoAtrasoFinanciamentoVeicular": ("SEM INFORMACAO", "ATRASA", "NAO ATRASA"),
    "evolucaoPatrimonio": ("SEM INFORMACAO", "AUMENTOU", "DIMINUIU", "MANTEVE"),
    "perfilInvestidor": ("SEM INFORMACAO", "DIVERSIFICADO", "CONSERVADOR", "AGRESSIVO"),
}

for origem, categorias in categorias_one_hot.items():
    for categoria in categorias:
        indicador = origem + categoria.replace(" ", "_")
        dt_treino = dt_treino.withColumn(
            indicador,
            (col(origem) == categoria).cast("double"),
        )

# Iterating the dict yields its keys, i.e. the three source columns.
dt_treino = dt_treino.drop(*categorias_one_hot)

In [None]:
# VALIDATION: show the value distribution of every column that is still a string.
colunas_string = [nome for nome, tipo in dt_treino.dtypes if tipo == 'string']

for nome_coluna in colunas_string:
    print("-------------",nome_coluna,"-------------")
    contagem_por_valor = dt_treino.groupBy(nome_coluna).count()
    contagem_por_valor.show()
    print()

------------- cpf -------------


[Stage 34:>                                                         (0 + 1) / 1]

+-----------+-----+
|        cpf|count|
+-----------+-----+
|00000818330|    1|
|00000948977|    1|
|00001036750|    1|
|00001177656|    1|
|00001179780|    1|
|00001209604|    1|
|00001323652|    1|
|00001420771|    1|
|00001483269|    1|
|00001806025|    1|
|00001949993|    1|
|00002562260|    1|
|00002567571|    1|
|00002584310|    1|
|00002693526|    1|
|00002718294|    1|
|00002814684|    1|
|00003056260|    1|
|00003061000|    1|
|00003197085|    1|
+-----------+-----+
only showing top 20 rows




                                                                                

## Tratar colunas booleanas

In [None]:
# Boolean-typed columns of the training DataFrame.
colunas_boolean = [f.name for f in dt_treino.schema.fields if f.dataType.simpleString() == 'boolean']

# Cast every boolean column to int in ONE projection. Calling withColumn once
# per column stacks a projection for each call, which the PySpark docs warn can
# produce very large plans; a single select keeps the plan flat while preserving
# column names and order.
_booleanas = set(colunas_boolean)
dt_treino = dt_treino.select([
    sf.col(nome).cast("int").alias(nome) if nome in _booleanas else sf.col(nome)
    for nome in dt_treino.columns
])

In [None]:
# VALIDATION: show the value distribution of every column that is still boolean.
colunas_boolean = [nome for nome, tipo in dt_treino.dtypes if tipo == 'boolean']

for nome_coluna in colunas_boolean:
    print("-------------",nome_coluna,"-------------")
    contagem_por_valor = dt_treino.groupBy(nome_coluna).count()
    contagem_por_valor.show()
    print()

## Colunas Nulas

In [None]:
def countMissingValues(dataframe: DataFrame) -> DataFrame:
    """Count the null entries of every column.

    Args:
        dataframe: DataFrame to inspect.

    Returns:
        A DataFrame with the same column names as ``dataframe`` in which each
        value is the number of rows where that column is null.
    """
    contagens_nulos = []
    for nome in dataframe.columns:
        # when() without otherwise() is null for non-matching rows and count()
        # skips nulls, so only the rows where `nome` is null are counted.
        apenas_nulos = sf.when(sf.col(nome).isNull(), nome)
        contagens_nulos.append(sf.count(apenas_nulos).alias(nome))

    return dataframe.select(contagens_nulos)

In [None]:
# Null count per column before filling, printed one column per line.
countMissingValues(dt_treino).show(vertical=True, truncate=False)



-RECORD 0--------------------------------------------------------------
 cpf                                                       | 0         
 flagBeneficiarioBPC                                       | 0         
 flagVulneravelBPC                                         | 0         
 flagBeneficiarioSeguroDefeso                              | 0         
 flagBeneficiarioGarantiaSafra                             | 0         
 flagBeneficiarioNovoBolsaFamilia                          | 0         
 flagBeneficiarioProgramasSociais                          | 0         
 flagAltaQualificacaoBeneficiarioProgramasSociais          | 0         
 indicioFalecimento                                        | 183622104 
 flagFalecidoConfirmado                                    | 0         
 flagSupercentenario                                       | 0         
 flagAdolescente                                           | 0         
 flagIdoso                                                 | 0  

                                                                                

In [None]:
# Replace the remaining nulls with 0.
# NOTE(review): with an integer fill value PySpark presumably fills numeric
# columns only — confirm no nullable string column is expected to be filled here.
dt_treino = dt_treino.fillna(0)

In [None]:
# Null count per column after filling, printed one column per line.
countMissingValues(dt_treino).show(vertical=True, truncate=False)



-RECORD 0--------------------------------------------------------
 cpf                                                       | 0   
 flagBeneficiarioBPC                                       | 0   
 flagVulneravelBPC                                         | 0   
 flagBeneficiarioSeguroDefeso                              | 0   
 flagBeneficiarioGarantiaSafra                             | 0   
 flagBeneficiarioNovoBolsaFamilia                          | 0   
 flagBeneficiarioProgramasSociais                          | 0   
 flagAltaQualificacaoBeneficiarioProgramasSociais          | 0   
 indicioFalecimento                                        | 0   
 flagFalecidoConfirmado                                    | 0   
 flagSupercentenario                                       | 0   
 flagAdolescente                                           | 0   
 flagIdoso                                                 | 0   
 flagEmpresario                                            | 0   
 flagAltaQ

                                                                                

In [None]:
# Print the column names and types of the cleaned DataFrame.
dt_treino.printSchema()

root
 |-- cpf: string (nullable = false)
 |-- flagBeneficiarioBPC: integer (nullable = true)
 |-- flagVulneravelBPC: integer (nullable = true)
 |-- flagBeneficiarioSeguroDefeso: integer (nullable = true)
 |-- flagBeneficiarioGarantiaSafra: integer (nullable = true)
 |-- flagBeneficiarioNovoBolsaFamilia: integer (nullable = true)
 |-- flagBeneficiarioProgramasSociais: integer (nullable = true)
 |-- flagAltaQualificacaoBeneficiarioProgramasSociais: integer (nullable = true)
 |-- indicioFalecimento: integer (nullable = true)
 |-- flagFalecidoConfirmado: integer (nullable = true)
 |-- flagSupercentenario: integer (nullable = true)
 |-- flagAdolescente: integer (nullable = true)
 |-- flagIdoso: integer (nullable = true)
 |-- flagEmpresario: integer (nullable = true)
 |-- flagAltaQualificacao: integer (nullable = true)
 |-- flagServidorPublico: integer (nullable = true)
 |-- flagRegistroAntt: integer (nullable = true)
 |-- flagProdutorRural: integer (nullable = true)
 |-- quantidadeCandidato

In [None]:
# Write one Parquet dataset per income band: keep the rows whose band flag is 1,
# drop the three band flags, and overwrite the destination path.
particoes_renda = [
    (
        "BAIXA RENDA",
        'flagBaixaRenda',
        "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_baixaRenda.parquet",
    ),
    (
        "MEDIA RENDA",
        'flagMediaRenda',
        "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_mediaRenda.parquet",
    ),
    (
        "ALTA RENDA",
        'flagAltaRenda',
        "gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean_altaRenda.parquet",
    ),
]

for titulo, flag_renda, destino in particoes_renda:
    print(titulo)
    somente_faixa = dt_treino.filter(sf.col(flag_renda)==1)
    sem_flags_renda = somente_faixa.drop("flagBaixaRenda","flagMediaRenda","flagAltaRenda")
    sem_flags_renda.write.mode("overwrite").parquet(destino)
    print()

BAIXA RENDA


                                                                                

MEDIA RENDA


                                                                                

ALTA RENDA


                                                                                

In [None]:
import pandas as pd
# Do not truncate the column list when a pandas DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first 5 rows as a pandas DataFrame.
dt_treino.limit(5).toPandas()

Unnamed: 0,flagBeneficiarioBPC,flagVulneravelBPC,flagBeneficiarioSeguroDefeso,flagBeneficiarioGarantiaSafra,flagBeneficiarioNovoBolsaFamilia,flagBeneficiarioProgramasSociais,flagAltaQualificacaoBeneficiarioProgramasSociais,indicioFalecimento,flagFalecidoConfirmado,flagSupercentenario,flagAdolescente,flagIdoso,flagEmpresario,flagAltaQualificacao,flagServidorPublico,flagRegistroAntt,flagProdutorRural,quantidadeCandidatosApoiados,quantidadeEleicoesComoDoador,valorMedioDoado,valorTotalDoado,quantidadeDoacoesDescricaoNaoEspecificada,valorMaximoDoado,quantidadeDoacoesPoliticas,flagContratoPublicoBeneficiarioProgramasSociais,flagLaranja,quantidadeEmails,quantidadeDiferentesprovedoresEmail,quantidadeEmailPadraoSuspeito,quantidadeEmailNaoConsistenteNomeTitular,flagAreaUrbanaBaixaDensidadeEdificios,flagAglomeradoRural,flagAreaRural,flagFavelaOuComunidadeUrbana,flagDividaDauCresceu180Dias,valorTotalDividasDau,qtdVeiculosFinanciados,qtdFinanciamentosVeicularesQuitados,recorrenciaFinanciamentoVeicular,RecorrenciaFinanciamentoMaioridade,flagFinanciamentoImobiliarioAltoValorBeneficiario,flagFinanciamentoImobiliarioAltoValorRenda,quantidadeRestituicaoIrpfObservadas,quantidadeDeclaracoesIrpfObservadas,flagAlterouBancoDeclaracaoIrpfUltimos5Anos,ativosComCota,ativosComCotaPercentual,ativosSemCota,bancarizacao,bancarizacaoBest,bancarizacaoFaixa,diversificacao,diversificacaoBest,diversificacaoFaixa,patrimonio,flagSocioProcessoJudicialLavagemDinheiro,flagProcessoJudicialCorrupcao,flagProcessoJudicialLavagem,flagProcessoJudicialFraude,flagProcessoJudicialRouboFurtos,flagSocioProcessoJudicialFraude,flagProcessoJudicialCobranca,flagSocioProcessoJudicialCorrupcao,flagSocioProcessoJudicialTributario,quantidadeDuplaIdentidadeNomeDataNascimento,flagDuplaIdentidadeNomeDataNascimento,quantidadeDuplaIdentidadeNomeMae,flagDuplaIdentidadeNomeMae,quantidadeDuplaIdentidadeCpfMae,flagDuplaIdentidadeCpfMae,quantidadeIndicioForteDuplaIdentidade,flagIndicioForteDuplaIdentidade,qtdImoveis,valorTotal,flagTo
p10ValorImoveisRenda,flagTop5ValorImoveisRenda,flagTop1ValorImoveisRenda,flagTop10AltoValorImoveisBeneficiario,flagTop5AltoValorImoveisBeneficiario,flagTop1AltoValorImoveisBeneficiario,quantidadeVeiculosPesados,quantidadeVeiculosPesadosAnoFabricacaoEntre0E4AnosAtras,quantidadeVeiculosPesadosAnoFabricacaoEntre5E9AnosAtras,quantidadeVeiculosPesadosAnoFabricacaoEntre10E14AnosAtras,valorAreaTotalPropriedadesRurais,quantidadePropriedades,quantidadePropriedadesAtivas,flagOperadorAeronave,flagProprietarioAeronave,quantidadeAeronaves,flagPrimeiroGrauProcessoJudicialCorrupcao,flagPrimeiroGrauProcessoJudicialLavagem,flagPrimeiroGrauProcessoJudicialFraude,flagPrimeiroGrauProcessoJudicialRouboFurtos,flagPrimeiroGrauProcessoJudicialCobranca,flagSegundoGrauProcessoJudicialCorrupcao,flagSegundoGrauProcessoJudicialLavagem,flagSegundoGrauProcessoJudicialFraude,flagSegundoGrauProcessoJudicialRouboFurtos,flagSegundoGrauProcessoJudicialCobranca,flagRendaSemInformacao,flagBaixaRenda,flagMediaRenda,flagAltaRenda,flagSocioDuplaId,quantidadeMediaEmpresasMesmoCnaeAbertas12Meses,flagSocioEmpresasGrandePorte,flagSocioEmpresasEPP,flagSocioBeneficiarioProgramaSocial,flagSocioClasseEmpresaFachadaALTA,flagSocioClasseEmpresaFachadaMUITOALTA,flagSocioClasseEmpresaFachadaMEDIO,quantidadeTelefones,quantidadeTelefonesEnderecoConsistente,historicoAtrasoFinanciamentoVeicularSEM_INFORMACAO,historicoAtrasoFinanciamentoVeicularATRASA,historicoAtrasoFinanciamentoVeicularNAO_ATRASA,evolucaoPatrimonioSEM_INFORMACAO,evolucaoPatrimonioAUMENTOU,evolucaoPatrimonioDIMINUIU,evolucaoPatrimonioMANTEVE,perfilInvestidorSEM_INFORMACAO,perfilInvestidorDIVERSIFICADO,perfilInvestidorCONSERVADOR,perfilInvestidorAGRESSIVO
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,0,0,1,1,0,1,0,0,0,0,0,0.0,1,1,0,24.0,0,0,0,3,0,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0,0,0,0,2,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,2,3,5,16.5,0,0,0,3,0,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0,0,0,1,1,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,0,0,1,1,1,1,0,0,0,0,0,0.0,0,0,-1,0.0,0,0,0,3,0,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0,0,0,0,2,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,-1,0.0,0,0,0,3,0,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,0,0,1,1,0,1,0,0,0,0,0,0.0,0,0,-1,0.0,0,0,0,3,0,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0,0,0,0,3,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Persist the full cleaned DataFrame (all income bands), overwriting any previous output.
dt_treino.write.mode("overwrite").parquet("gs://oculto/loss_prevention/perfil_laranja/data/perfil_laranja_clean.parquet")

                                                                                

In [None]:
# Row count of the cleaned DataFrame, for comparison with the raw dataset count.
dt_treino.count()

                                                                                

184050329