# 0. Bibliotecas

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# 1. Obtenção dos dados

In [2]:
# Shell command (Kaggle CLI) that downloads the credit-score-classification dataset.
# It is commented out, so running this cell does nothing.
# !kaggle datasets download -d parisrohan/credit-score-classification

# 2. Importação dos dados

In [3]:
# Load the train and test splits from the local dataset folder.
# low_memory=False makes pandas parse each file in a single pass, so a
# column's dtype is inferred from the whole column instead of chunk by chunk.
# Both splits use the same setting so dtype inference is done the same way
# for each of them.
train = pd.read_csv('credit-score-classification/train.csv', low_memory=False)
test = pd.read_csv('credit-score-classification/test.csv', low_memory=False)

# 3. Inspeção geral dos dados

| Variável                  | Descrição                                                                 |
|---------------------------|---------------------------------------------------------------------------|
| ID                        | Identificador único da transação ou registro                               |
| Customer_ID               | Identificador único do cliente                                             |
| Month                     | Mês de referência da análise                                               |
| Name                      | Nome do cliente                                                            |
| Age                       | Idade do cliente                                                           |
| SSN                       | Número de Segurança Social (Social Security Number)                        |
| Occupation                | Ocupação profissional do cliente                                           |
| Annual_Income             | Renda anual do cliente                                                     |
| Monthly_Inhand_Salary      | Salário mensal disponível (em mãos)                                        |
| Num_Bank_Accounts          | Número de contas bancárias que o cliente possui                           |
| Num_Credit_Card            | Número de cartões de crédito que o cliente possui                         |
| Interest_Rate              | Taxa de juros aplicada aos empréstimos ou dívidas                         |
| Num_of_Loan                | Número de empréstimos que o cliente possui                                |
| Type_of_Loan               | Tipo de empréstimo (pessoal, hipotecário, etc.)                           |
| Delay_from_due_date        | Atraso em dias no pagamento após a data de vencimento                     |
| Num_of_Delayed_Payment     | Número de pagamentos atrasados                                             |
| Changed_Credit_Limit       | Alteração no limite de crédito                                             |
| Num_Credit_Inquiries       | Número de consultas de crédito feitas por instituições                    |
| Credit_Mix                 | Mistura de tipos de crédito (cartões, empréstimos, etc.)                  |
| Outstanding_Debt           | Dívida pendente do cliente                                                |
| Credit_Utilization_Ratio   | Proporção da utilização do crédito disponível                             |
| Credit_History_Age         | Tempo total de histórico de crédito do cliente                            |
| Payment_of_Min_Amount      | Se o cliente paga o valor mínimo da fatura (Sim/Não)                      |
| Total_EMI_per_month        | Total de parcelas mensais pagas pelo cliente                              |
| Amount_invested_monthly    | Quantia investida mensalmente pelo cliente                                |
| Payment_Behaviour          | Comportamento de pagamento (padrões e hábitos)                            |
| Monthly_Balance            | Saldo mensal disponível após todas as despesas                            |
| Credit_Score               | Pontuação de crédito do cliente                                           |
| target                     | Variável alvo ou de interesse criada a partir da variável **Credit_Score**|


In [4]:
# Preview the first five rows of the training frame
train.head(n=5)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


Para deixar o problema ainda mais próximo da área de crédito, optou-se por modificar as classes da seguinte maneira:

* Clientes classificados como `Poor` &#x27A1; `Classe 1` (maus pagadores)
* Clientes classificados como `Standard` &#x27A1; `Classe 0` (bons pagadores)
* Clientes classificados como `Good` &#x27A1; `Classe 0` (bons pagadores)

In [5]:
# Before the change: distinct labels present in the Credit_Score column
train.loc[:, 'Credit_Score'].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

In [6]:
# Collapse the credit-score labels into a binary target:
#   'Poor' -> 1 (bad payer); every other value -> 0 (good payer).
# A vectorized comparison replaces the row-by-row .apply(lambda ...); missing
# values compare as False and therefore map to 0, as they did before.
train['target'] = train['Credit_Score'].eq('Poor').astype('int64')

In [7]:
# Check the new classes: number of rows per target value, most frequent first
train.loc[:, 'target'].value_counts(sort=True, ascending=False)

target
0    71002
1    28998
Name: count, dtype: int64

In [9]:
# Preview the first five rows again, now with the target column added
train.head(n=5)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,target
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good,0
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good,0
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good,0
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good,0
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good,0


In [15]:
# Report the size of the training frame: row count first, then column count
print(
    f'Total de linhas: {train.shape[0]}',
    f'Total de colunas: {train.shape[1]}',
    sep='\n',
)

Total de linhas: 100000
Total de colunas: 29


# 4. EDA

In [26]:
# Look at the first 17 rows to spot malformed values across several records
train.head(n=17)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,target
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good,0
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good,0
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good,0
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good,0
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good,0
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,27.262259,22 Years and 6 Months,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good,0
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,22.537593,22 Years and 7 Months,No,49.574949,178.3440674122349,Low_spent_Small_value_payments,244.5653167062043,Good,0
7,0x1609,CUS_0xd40,August,,23,#F%$D@*&8,Scientist,19114.12,1824.843333,3,...,809.98,23.933795,,No,49.574949,24.785216509052056,High_spent_Medium_value_payments,358.12416760938714,Standard,0
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28_,004-07-5839,_______,34847.84,3037.986667,2,...,605.03,24.464031,26 Years and 7 Months,No,18.816215,104.291825168246,Low_spent_Small_value_payments,470.69062692529184,Standard,0
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,605.03,38.550848,26 Years and 8 Months,No,18.816215,40.39123782853101,High_spent_Large_value_payments,484.5912142650067,Good,0


In [11]:
# List every column label in the training frame
train.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score', 'target'],
      dtype='object')

In [18]:
# Distinct raw values found in the Age column
train.loc[:, 'Age'].unique()

array(['23', '-500', '28_', ..., '4808_', '2263', '1342'], dtype=object)

In [20]:
# Distinct raw values found in the Name column
train.loc[:, 'Name'].unique()

array(['Aaron Maashoh', nan, 'Rick Rothackerj', ..., 'Chris Wickhamm',
       'Sarah McBridec', 'Nicks'], dtype=object)

In [27]:
# Validação da base (procurando inconsistências)