In [1]:
#IMPORTANDO BIBLIOTECAS NECESSÁRIAS
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
#TRANSFORMANDO ARQUIVO CSV COM DADOS EM DATAFRAME
df = pd.read_csv('credit_data.csv')
df.head(10)

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1
5,6,24904.06414,57.471607,15.498598,0
6,7,48430.359613,26.809132,5722.581981,0
7,8,24500.141984,32.897548,2971.00331,1
8,9,40654.892537,55.496853,4755.82528,0
9,10,25075.872771,39.776378,1409.230371,0


In [3]:
#RENOMEANDO COLUNAS
df = df.rename(columns={'i#clientid':'id_cliente',
                        'income':'renda',
                        'age':'idade',
                        'loan':'valor_emprestimo',
                        'c#default':'pagou'})
df

Unnamed: 0,id_cliente,renda,idade,valor_emprestimo,pagou
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [4]:
#DESCRICAO GERAL
df.describe()

Unnamed: 0,id_cliente,renda,idade,valor_emprestimo,pagou
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [5]:
#LOCALIZANDO VALORES INCONGRUENTES DE 'idade'
df.loc[df.idade < 0]

Unnamed: 0,id_cliente,renda,idade,valor_emprestimo,pagou
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [6]:
#SUBSTITUINDO VALORES INCONGRUENTES DE 'idade' PELA MÉDIA 
media = df['idade'][df.idade > 0].mean() #obtendo média que desconsidera os valores incongruentes
print(f"Média correta = {media}")

df.loc[df.idade < 0, 'idade'] = media
df.iloc[15:26,:]

Média correta = 40.92770044906149


Unnamed: 0,id_cliente,renda,idade,valor_emprestimo,pagou
15,16,50501.726689,40.9277,3977.287432,0
16,17,43548.654711,39.57453,3935.544453,0
17,18,43378.175194,60.848318,3277.737553,0
18,19,20542.365073,61.690571,3157.44229,0
19,20,58887.357549,26.076093,4965.516066,0
20,21,23000.784002,31.761354,1148.118057,0
21,22,32197.620701,40.9277,4244.057136,0
22,23,23329.319414,48.576975,222.622299,0
23,24,27845.800894,51.970624,4959.921226,0
24,25,65301.984029,48.840922,5465.267886,0


In [7]:
#VERIFICANDO OCORRÊNCIAS DE VALORES NULO
df.isnull().sum()

id_cliente          0
renda               0
idade               3
valor_emprestimo    0
pagou               0
dtype: int64

In [8]:
#LOCALIZANDO VALORES NULOS DE 'idade'
df.loc[df['idade'].isnull()]

Unnamed: 0,id_cliente,renda,idade,valor_emprestimo,pagou
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [9]:
#DEFININDO CAMPOS DE VALORES SIGNIFICATIVOS / PREVISORES E CLASSES
previsores = df.iloc[:, 1:4].values #Coluna 1 -> RENDA / 2-> IDADE / 3-> VALOR EMPRESTIMO
classe = df.iloc[:,4].values
classe

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [10]:
#INSTANCIA DA CLASSE SIMPLEIMPUTER QUE TRATA VALORES NULOS 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 

imputer = imputer.fit(previsores)
# Utilização da média como estratégia de substituição de valores nulos

previsores = imputer.transform(previsores)
np.set_printoptions(formatter={'float': '{:.2f}'.format}) #utilizando padrao de formatacao .2f para numpy array
previsores

array([[66155.93, 59.02, 8106.53],
       [34415.15, 48.12, 6564.75],
       [57317.17, 63.11, 8020.95],
       ...,
       [44311.45, 28.02, 5522.79],
       [43756.06, 63.97, 1622.72],
       [69436.58, 56.15, 7378.83]])

In [11]:
#ESCALONAMENTO/NORMALIZAÇÃO DE ATRIBUTOS (devido a intervalos de valores diferenciados para renda/idade/valor emprestimo)
scaler = StandardScaler() #Instancia da classe StandardScaler para normalização de valores
previsores = scaler.fit_transform (previsores) #Padronização do modelo de dados
classe = classe.astype(int)

previsores, classe


(array([[1.45, 1.37, 1.20],
        [-0.76, 0.54, 0.70],
        [0.84, 1.67, 1.17],
        ...,
        [-0.07, -0.97, 0.35],
        [-0.11, 1.74, -0.93],
        [1.68, 1.15, 0.96]]),
 array([0, 0, 0, ..., 1, 0, 0]))

In [12]:
#SEPARANDO DADOS DE TREINAMENTO E DE TESTE:
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, classe, test_size=0.35)

In [13]:
#UTILIZAÇÃO DO MODELO DECISIONTREE
classificador = DecisionTreeClassifier(criterion='entropy', random_state = 0)
classificador.fit(previsores_treinamento, classe_treinamento)

previsoes = classificador.predict(previsores_teste)

In [14]:
classificador.feature_importances_

array([0.19, 0.42, 0.39])

In [15]:
classificador.score(previsores_teste, classe_teste)

0.9871428571428571

In [16]:
export_graphviz(classificador, 
                out_file='arvore_dados_credito.dot',
                feature_names=['renda', 'idade','valor_emprestimo'],
                class_names=['não pagou', 'pagou'],
                filled=True,
                leaves_parallel=True
               )

In [18]:
np.unique(classe)

array([0, 1])