In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from yellowbrick.classifier import ConfusionMatrix

from xgboost import XGBClassifier

In [2]:
credito = pd.read_csv("credit.csv")
credito.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


In [3]:
x = credito.iloc[:,0:20].values
x

array([['<0', 6, "'critical/other existing credit'", ..., 1, 'yes',
        'yes'],
       ['0<=X<200', 48, "'existing paid'", ..., 1, 'none', 'yes'],
       ["'no checking'", 12, "'critical/other existing credit'", ..., 2,
        'none', 'yes'],
       ...,
       ["'no checking'", 12, "'existing paid'", ..., 1, 'none', 'yes'],
       ['<0', 45, "'existing paid'", ..., 1, 'yes', 'yes'],
       ['0<=X<200', 45, "'critical/other existing credit'", ..., 1,
        'none', 'yes']], dtype=object)

In [4]:
y = credito.iloc[:,20].values

In [5]:
labelencoder = LabelEncoder()

In [6]:
transform = [0,2,3,5,6,8,9,11,13,14,16,18,19]

In [7]:
for i in transform:
    x[:,i] = labelencoder.fit_transform(x[:,i])

In [8]:
x

array([[2, 6, 1, ..., 1, 1, 1],
       [1, 48, 3, ..., 1, 0, 1],
       [0, 12, 1, ..., 2, 0, 1],
       ...,
       [0, 12, 3, ..., 1, 0, 1],
       [2, 45, 3, ..., 1, 1, 1],
       [1, 45, 1, ..., 1, 0, 1]], dtype=object)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

print("\nDisposição da divisão Treino/Teste\n")

print("X Treino:", x_train.shape[0], "registros x", x_train.shape[1], "variáveis preditoras")
print("Y Treino:", y_train.shape[0], "registros x 1 variável de resposta\n")

print("X Teste:", x_test.shape[0], "registros x", x_test.shape[1], "variáveis preditoras")
print("Y Teste:", y_test.shape[0], "registros x 1 variável de resposta\n")


Disposição da divisão Treino/Teste

X Treino: 700 registros x 20 variáveis preditoras
Y Treino: 700 registros x 1 variável de resposta

X Teste: 300 registros x 20 variáveis preditoras
Y Teste: 300 registros x 1 variável de resposta



In [10]:
print(x_train[0])

[1 24 1 6 11938 3 0 2 3 0 3 3 39 1 1 2 0 2 1 1]


In [11]:
modelo = XGBClassifier(base_score=0.5, 
                       booster='gbtree', 
                       colsample_bylevel=1,
                       colsample_bynode=1,
                       colsample_bytree=0.8,
                       gamma=0,
                       learning_rate=0.2, 
                       max_delta_step=0, 
                       max_depth=5,
                       min_child_weight=1, 
                       missing=None, 
                       n_estimators=1000,
                       n_jobs=1,
                       nthread=None, 
                       objective='binary:logistic', 
                       random_state=0,
                       reg_alpha=0, 
                       reg_lambda=1, 
                       scale_pos_weight=1, 
                       seed=None,
                       silent=None, 
                       subsample=0.8, 
                       verbosity=1)

modelo.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [12]:
predicao_y_test = modelo.predict(x_test)
print(predicao_y_test)

['bad' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'bad' 'bad'
 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'bad' 'good' 'good' 'bad'
 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'bad' 'good' 'good'
 'good' 'bad' 'good' 'bad' 'good' 'good' 'good' 'good' 'good' 'good'
 'good' 'good' 'good' 'bad' 'good' 'bad' 'bad' 'good' 'good' 'bad' 'good'
 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'bad' 'good' 'bad' 'good'
 'good' 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'good' 'bad' 'bad'
 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'good' 'bad' 'bad' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'good' 'good' 'good' 'bad' 'good' 'bad' 'good' 'good' 'good' 'good'
 'good' 'bad' 'bad' 'bad' 'good' 'good' 'bad' 'bad' 'good' 'good' 'good'
 'bad' 'good' 'good' 'good' 'good' 'good' 'good' 'bad' 'good' 'good'
 'good' 'good' 'good' 'good' 'good' 'good' 'bad' 'good' 'good' 'good'
 'good' 'good' 'good' 'bad' 'bad' 'bad' 'good' 'good' 'good' 'good'

In [13]:
taxa_acerto = accuracy_score(y_test, predicao_y_test)
taxa_acerto

0.7833333333333333

In [14]:
matriz = confusion_matrix(y_test, predicao_y_test)
print(matriz)

[[ 48  38]
 [ 27 187]]
