In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("UCI_Credit_Card.csv", names = ["ID","LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE","PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6","BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6","PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","default.payment.next.month"])
df = df.drop([0])
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


# A base possui 30k itens e 25 atributos

In [3]:
print("Default Credit Card Clients data -  rows:",df.shape[0]," columns:", df.shape[1])

Default Credit Card Clients data -  rows: 30000  columns: 25


In [4]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000,30000,30000,30000,30000,30000,30000,30000,30000,30000,...,30000,30000,30000,30000,30000,30000,30000,30000,30000,30000
unique,30000,81,2,7,4,56,11,11,11,11,...,21548,21010,20604,7943,7899,7518,6937,6897,6939,2
top,2921,50000,2,2,2,29,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
freq,1,3365,18112,14030,15964,1605,14737,15730,15764,16455,...,3195,3506,4020,5249,5396,5968,6408,6703,7173,23364


# Não há nenhum valor ausente. Todos os atributos estão preenchidos para todos os itens

In [5]:
total = df.isnull().sum().sort_values(ascending = False)
percent = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()

Unnamed: 0,default.payment.next.month,PAY_6,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,ID
Total,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Corrigindo o atributo Marriage
Trata-se de um atributo categórico em que 0 signfica desconhecido. Para os que não se sabe o status de relacionamento, serão substituídos por 3, que significa "outros".

In [6]:
df['MARRIAGE'] = df['MARRIAGE'].apply(lambda x: 3 if x==0 else x)

# Corrigindo atributo Education
Trata-se de um atributo categórico em que 5 e 6 significam desconhecido. Os casos em que que não se sabe o grau de escolaridade, serão substrituídos por 4 que significa "outros".

In [7]:
df['EDUCATION'] = df['EDUCATION'].apply(lambda x: 4 if (x==5 or x==6) else x)

# Normalizando BILL's e PAY's
Primeiramente, é preciso verificar qual o tamanho máximo para poder dividir todos os elementos da coluna por ele. Dessa forma, os valores da coluna ficariam entre -1 e 1.

In [38]:
max_BILL_AMT1 = df['BILL_AMT1'].astype(float).max()
max_BILL_AMT2 = df['BILL_AMT2'].astype(float).max()
max_BILL_AMT3 = df['BILL_AMT3'].astype(float).max()
max_BILL_AMT4 = df['BILL_AMT4'].astype(float).max()
max_BILL_AMT5 = df['BILL_AMT5'].astype(float).max()
max_BILL_AMT6 = df['BILL_AMT6'].astype(float).max()
max_PAY_AMT1 = df['PAY_AMT1'].astype(float).max()
max_PAY_AMT2 = df['PAY_AMT2'].astype(float).max()
max_PAY_AMT3 = df['PAY_AMT3'].astype(float).max()
max_PAY_AMT4 = df['PAY_AMT4'].astype(float).max()
max_PAY_AMT5 = df['PAY_AMT5'].astype(float).max()
max_PAY_AMT6 = df['PAY_AMT6'].astype(float).max()
max_AGE = df['AGE'].astype(float).max()

In [39]:
df['BILL_AMT1'] = df['BILL_AMT1'].astype(float).apply(lambda x: (x/max_BILL_AMT1))
df['BILL_AMT2'] = df['BILL_AMT2'].astype(float).apply(lambda x: (x/max_BILL_AMT2))
df['BILL_AMT3'] = df['BILL_AMT3'].astype(float).apply(lambda x: (x/max_BILL_AMT3))
df['BILL_AMT4'] = df['BILL_AMT4'].astype(float).apply(lambda x: (x/max_BILL_AMT4))
df['BILL_AMT5'] = df['BILL_AMT5'].astype(float).apply(lambda x: (x/max_BILL_AMT5))
df['BILL_AMT6'] = df['BILL_AMT6'].astype(float).apply(lambda x: (x/max_BILL_AMT6))
df['PAY_AMT1'] = df['PAY_AMT1'].astype(float).apply(lambda x: (x/max_PAY_AMT1))
df['PAY_AMT2'] = df['PAY_AMT2'].astype(float).apply(lambda x: (x/max_PAY_AMT2))
df['PAY_AMT3'] = df['PAY_AMT3'].astype(float).apply(lambda x: (x/max_PAY_AMT3))
df['PAY_AMT4'] = df['PAY_AMT4'].astype(float).apply(lambda x: (x/max_PAY_AMT4))
df['PAY_AMT5'] = df['PAY_AMT5'].astype(float).apply(lambda x: (x/max_PAY_AMT5))
df['PAY_AMT6'] = df['PAY_AMT6'].astype(float).apply(lambda x: (x/max_PAY_AMT6))
df['AGE'] = df['AGE'].astype(int).apply(lambda x: (x/max_AGE))