In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler


In [2]:
# Caminho para a base original
caminho_csv = "../data/credit_data.csv"

df = pd.read_csv(caminho_csv)

df.head()


Unnamed: 0,id_cliente,idade,renda,tempo_emprego_anos,valor_divida,num_atrasos_12m,utilizacao_credito,possui_cartao_credito,score_interno,relacao_divida_renda,inadimplente
0,1,56,2502.64,13.7,4977.61,0,0.47,0,724,1.988,0
1,2,69,1720.57,1.0,608.15,2,0.383,0,629,0.353,0
2,3,46,4551.98,2.3,1483.04,0,0.101,1,610,0.326,0
3,4,32,1058.87,13.2,1703.27,2,0.502,1,647,1.607,1
4,5,60,1000.0,0.3,492.96,1,0.456,0,643,0.492,0


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id_cliente             2000 non-null   int64  
 1   idade                  2000 non-null   int64  
 2   renda                  2000 non-null   float64
 3   tempo_emprego_anos     2000 non-null   float64
 4   valor_divida           2000 non-null   float64
 5   num_atrasos_12m        2000 non-null   int64  
 6   utilizacao_credito     2000 non-null   float64
 7   possui_cartao_credito  2000 non-null   int64  
 8   score_interno          2000 non-null   int64  
 9   relacao_divida_renda   2000 non-null   float64
 10  inadimplente           2000 non-null   int64  
dtypes: float64(5), int64(6)
memory usage: 172.0 KB


In [4]:
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id_cliente,2000.0,1000.5,577.494589,1.0,500.75,1000.5,1500.25,2000.0
idade,2000.0,44.3765,15.226644,18.0,31.0,45.0,57.0,70.0
renda,2000.0,2104.631265,1084.961572,1000.0,1323.515,1827.39,2543.2175,12875.98
tempo_emprego_anos,2000.0,4.8719,4.843831,0.0,1.4,3.3,6.7,33.6
valor_divida,2000.0,2611.99969,2135.848677,0.81,1063.8075,2127.17,3667.5025,18601.83
num_atrasos_12m,2000.0,1.4715,1.178508,0.0,1.0,1.0,2.0,7.0
utilizacao_credito,2000.0,0.285318,0.160472,0.008,0.162,0.266,0.39,0.845
possui_cartao_credito,2000.0,0.6755,0.468305,0.0,0.0,1.0,1.0,1.0
score_interno,2000.0,648.3715,80.877426,341.0,593.75,650.0,703.0,950.0
relacao_divida_renda,2000.0,1.237011,0.719182,0.001,0.61275,1.229,1.8615,2.498


In [5]:
df.isna().sum()


id_cliente               0
idade                    0
renda                    0
tempo_emprego_anos       0
valor_divida             0
num_atrasos_12m          0
utilizacao_credito       0
possui_cartao_credito    0
score_interno            0
relacao_divida_renda     0
inadimplente             0
dtype: int64

In [6]:
df_fe = df.copy()
df_fe.head()


Unnamed: 0,id_cliente,idade,renda,tempo_emprego_anos,valor_divida,num_atrasos_12m,utilizacao_credito,possui_cartao_credito,score_interno,relacao_divida_renda,inadimplente
0,1,56,2502.64,13.7,4977.61,0,0.47,0,724,1.988,0
1,2,69,1720.57,1.0,608.15,2,0.383,0,629,0.353,0
2,3,46,4551.98,2.3,1483.04,0,0.101,1,610,0.326,0
3,4,32,1058.87,13.2,1703.27,2,0.502,1,647,1.607,1
4,5,60,1000.0,0.3,492.96,1,0.456,0,643,0.492,0


In [7]:
bins_idade = [18, 25, 35, 45, 60, 100]
labels_idade = ["18-25", "26-35", "36-45", "46-60", "60+"]

df_fe["faixa_idade"] = pd.cut(df_fe["idade"], bins=bins_idade, labels=labels_idade, right=True)
df_fe["faixa_idade"].value_counts()


faixa_idade
46-60    584
60+      388
36-45    380
26-35    353
18-25    256
Name: count, dtype: int64

In [8]:
bins_renda = [0, 2000, 4000, 8000, 20000, np.inf]
labels_renda = ["até 2k", "2k-4k", "4k-8k", "8k-20k", "20k+"]

df_fe["faixa_renda"] = pd.cut(df_fe["renda"], bins=bins_renda, labels=labels_renda, right=False)
df_fe["faixa_renda"].value_counts()


faixa_renda
até 2k    1132
2k-4k      747
4k-8k      116
8k-20k       5
20k+         0
Name: count, dtype: int64

In [9]:
df_fe["alto_endividamento"] = (df_fe["relacao_divida_renda"] > 0.6).astype(int)
df_fe["alto_endividamento"].value_counts()


alto_endividamento
1    1510
0     490
Name: count, dtype: int64

In [10]:
def categorizar_tempo_emprego(anos):
    if anos < 1:
        return "até 1 ano"
    elif anos < 3:
        return "1-3 anos"
    elif anos < 5:
        return "3-5 anos"
    elif anos < 10:
        return "5-10 anos"
    else:
        return "10+ anos"

df_fe["faixa_tempo_emprego"] = df_fe["tempo_emprego_anos"].apply(categorizar_tempo_emprego)
df_fe["faixa_tempo_emprego"].value_counts()


faixa_tempo_emprego
1-3 anos     575
5-10 anos    446
3-5 anos     359
até 1 ano    355
10+ anos     265
Name: count, dtype: int64

In [11]:
df_fe["cliente_novo"] = ((df_fe["tempo_emprego_anos"] < 1.0) & (df_fe["num_atrasos_12m"] == 0)).astype(int)
df_fe["cliente_novo"].value_counts()


cliente_novo
0    1922
1      78
Name: count, dtype: int64

In [12]:
df_fe.head()


Unnamed: 0,id_cliente,idade,renda,tempo_emprego_anos,valor_divida,num_atrasos_12m,utilizacao_credito,possui_cartao_credito,score_interno,relacao_divida_renda,inadimplente,faixa_idade,faixa_renda,alto_endividamento,faixa_tempo_emprego,cliente_novo
0,1,56,2502.64,13.7,4977.61,0,0.47,0,724,1.988,0,46-60,2k-4k,1,10+ anos,0
1,2,69,1720.57,1.0,608.15,2,0.383,0,629,0.353,0,60+,até 2k,0,1-3 anos,0
2,3,46,4551.98,2.3,1483.04,0,0.101,1,610,0.326,0,46-60,4k-8k,0,1-3 anos,0
3,4,32,1058.87,13.2,1703.27,2,0.502,1,647,1.607,1,26-35,até 2k,1,10+ anos,0
4,5,60,1000.0,0.3,492.96,1,0.456,0,643,0.492,0,46-60,até 2k,0,até 1 ano,0


In [13]:
colunas_numericas = [
    "idade",
    "renda",
    "tempo_emprego_anos",
    "valor_divida",
    "num_atrasos_12m",
    "utilizacao_credito",
    "score_interno",
    "relacao_divida_renda",
]

scaler = StandardScaler()
df_scaled = df_fe.copy()
df_scaled[colunas_numericas] = scaler.fit_transform(df_scaled[colunas_numericas])

df_scaled.head()


Unnamed: 0,id_cliente,idade,renda,tempo_emprego_anos,valor_divida,num_atrasos_12m,utilizacao_credito,possui_cartao_credito,score_interno,relacao_divida_renda,inadimplente,faixa_idade,faixa_renda,alto_endividamento,faixa_tempo_emprego,cliente_novo
0,1,0.763557,0.366933,1.823001,1.107851,-1.248924,1.151151,0,0.935334,1.044488,0,46-60,2k-4k,1,10+ anos,0
1,2,1.617537,-0.354075,-0.799546,-0.938433,0.44856,0.608865,0,-0.239577,-1.229498,0,60+,até 2k,0,1-3 anos,0
2,3,0.106649,2.256265,-0.531097,-0.528709,-1.248924,-1.148888,1,-0.474559,-1.26705,0,46-60,4k-8k,0,1-3 anos,0
3,4,-0.813022,-0.96411,1.719751,-0.425572,0.44856,1.350612,1,-0.016962,0.514587,1,26-35,até 2k,1,10+ anos,0
4,5,1.02632,-1.018384,-0.944096,-0.992378,-0.400182,1.063886,0,-0.066432,-1.036174,0,46-60,até 2k,0,até 1 ano,0


In [14]:
caminho_saida_scaled = "../data/credit_data_fe_scaled.csv"
df_scaled.to_csv(caminho_saida_scaled, index=False)
caminho_saida_scaled


'../data/credit_data_fe_scaled.csv'