<a href="https://colab.research.google.com/github/dedemasutti/data-science/blob/main/projeto_credito.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importa as bibliotecas e pacotes


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

importa o dataset

In [3]:
dados = pd.read_excel("dados_credito.xlsx")

Preparando os dados

In [4]:
# usa o shape para contar linhas e colunas, respectivamente.
dados.shape

(3000, 30)

In [5]:
# Chamamos o .head para visualizarmos as cinco primeiras colunas do dataset
dados.head(5)

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1.0,582,3.0,3.0,0.0,4.0,0.0,5.0,117.0,27.0,...,3.0,0.9179,0.2083,2.0,3.0,7.0,0.2083,4.0,4.0,0.0
1,1.0,662,15.0,9.0,0.0,3.0,1.0,3.0,14.0,14.0,...,1.0,0.8,0.0,0.0,0.0,0.0,1.0,12.0,0.0,1.0
2,1.0,805,0.0,0.0,0.0,1.0,5.0,1.0,354.0,7.0,...,5.0,0.3552,0.6538,0.0,1.0,1.0,0.7308,1.0,1.0,0.5263
3,1.0,1175,8.0,5.0,0.0,6.0,1.0,10.0,16.0,4.0,...,3.0,0.9127,0.25,1.0,1.0,1.0,0.75,7.0,1.0,1.3333
4,1.0,1373,3.0,1.0,0.0,9.0,0.0,8.0,130.0,52.0,...,1.0,1.2511,0.0,0.0,1.0,4.0,0.1429,3.0,1.0,0.0


In [6]:
# Utiliza o método drop para tirar a coluna 'ID'
dados = dados.drop('ID', axis=1)
dados.shape

(3000, 29)

In [7]:
# visualizando os valores missing
dados.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [8]:
# Preenche valores faltantes com mean
dados = dados.fillna(dados.mean())

In [9]:
dados.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [10]:
# Separa em 0 e 1 os clientes. Sendo 0 bom cliente e 1 mau cliente
dados['TARGET'].value_counts()

0.0    2500
1.0     500
Name: TARGET, dtype: int64

#Train Test Split

In [11]:
# Usa método iloc para encontrar valores
y = dados.iloc[:, 0].values
x = dados.iloc[:, 1:28].values


In [12]:
# Divide o dataset em treino e teste. Respectivamente, 80% e 20%
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [13]:
# Faz a normalização dos dados com StandardScaler. Dividindo entre 0 e 1 nossas variaveis.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

#Construindo o modelo de risco

In [14]:
classificador = LogisticRegression()
classificador.fit(x_train, y_train)
y_pred = classificador.predict(x_test)

#Acurácia do modelo


In [15]:
print(confusion_matrix(y_test, y_pred))

[[482  16]
 [ 87  15]]


In [16]:
print(accuracy_score(y_test, y_pred))

0.8283333333333334


#Arquivo de saída

In [19]:
prediction = classificador.predict_proba(x_test)
prediction

array([[0.04230096, 0.95769904],
       [0.93683282, 0.06316718],
       [0.70459693, 0.29540307],
       ...,
       [0.9724934 , 0.0275066 ],
       [0.44332485, 0.55667515],
       [0.86810657, 0.13189343]])

writing model output file

In [24]:
df_prediction_prob = pd.DataFrame(prediction, columns = ['prob_0','prob_1'])
df_prediction_target = pd.DataFrame(classificador.predict(x_test), columns = ['predicted_target'])
df_test_dataset = pd.DataFrame(y_test, columns = ['Resultado'])
df_new = pd.concat([df_test_dataset,df_prediction_target,df_prediction_prob], axis = 1)
df_new.head()

Unnamed: 0,Resultado,predicted_target,prob_0,prob_1
0,1.0,1.0,0.042301,0.957699
1,0.0,0.0,0.936833,0.063167
2,0.0,0.0,0.704597,0.295403
3,0.0,0.0,0.908269,0.091731
4,0.0,0.0,0.866217,0.133783


In [27]:
df_new.to_csv("https://drive.google.com/drive/u/0/my-drive/Model_prediction.xlsx", sep =',' , encoding='UTF-8')