# PRÉ-PROCESSAMENTO DE DADOS DE CENSO


Pré-processamento de dados de censo para determinação de quais perfis se enquadram em classificações de renda

Dados retirados do UCI Machine Learning Repository (conjunto de dados "Census Income").

Link: https://archive.ics.uci.edu/dataset/20/census+income

In [29]:
#IMPORTANDO BIBLIOTECAS NECESSÁRIAS
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score

In [13]:
# Load the census CSV into a DataFrame; the bare `df` on the last line makes
# the notebook display it.
df = pd.read_csv(filepath_or_buffer='census.csv')
df

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [14]:
# Rename the original column headers to Portuguese identifiers.
# The keys are the headers exactly as they appear in the loaded table
# (including 'capital-loos' and 'hour-per-week'), so they must not be "fixed".
df = df.rename(
    {
        'age': 'idade',
        'workclass': 'area_de_trabalho',
        'final-weight': 'peso_final',
        'education': 'nivel_educacao',
        'education-num': 'anos_de_estudo',
        'marital-status': 'estado_civil',
        'occupation': 'ocupacao',
        'relationship': 'relacionamento',
        'race': 'raça',
        'sex': 'sexo',
        'capital-gain': 'ganho_capital',
        'capital-loos': 'perda_capital',
        'hour-per-week': 'horas_por_semana',
        'native-country': 'pais_nativo',
        'income': 'renda',
    },
    axis='columns',
)

In [15]:
    df.describe()

Unnamed: 0,idade,peso_final,anos_de_estudo,ganho_capital,perda_capital,horas_por_semana
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [16]:
# Report, per column, whether it contains any missing value.
df.isna().any()

idade               False
area_de_trabalho    False
peso_final          False
nivel_educacao      False
anos_de_estudo      False
estado_civil        False
ocupacao            False
relacionamento      False
raça                False
sexo                False
ganho_capital       False
perda_capital       False
horas_por_semana    False
pais_nativo         False
renda               False
dtype: bool

In [17]:
# Report, per column, whether any entry equals 0.
# NOTE(review): the original intent appears to be spotting zeros used as
# "no value" placeholders; confirm which columns treat 0 that way.
df.eq(0).any()

idade               False
area_de_trabalho    False
peso_final          False
nivel_educacao      False
anos_de_estudo      False
estado_civil        False
ocupacao            False
relacionamento      False
raça                False
sexo                False
ganho_capital        True
perda_capital        True
horas_por_semana    False
pais_nativo         False
renda               False
dtype: bool

In [18]:
# Split the table into predictors and target by column position:
# the first 14 columns become the predictors, the column at position 14
# becomes the class to be predicted.
previsores = df.iloc[:, :14].values
classe = df.iloc[:, 14].values

In [19]:
# Turn the categorical predictor columns into integer codes.
# Positions (within `previsores`) of the text-valued columns: work class,
# education level, marital status, occupation, relationship, race, sex and
# native country.
indices_categoricas = [1, 3, 5, 6, 7, 8, 9, 13]
label_encoder_previsores = LabelEncoder()

# The same encoder object is re-fitted on every column, so after the loop it
# only remembers the mapping learned for the last column.
for indice in indices_categoricas:
    previsores[:, indice] = label_encoder_previsores.fit_transform(previsores[:, indice])

previsores

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [20]:
# One-hot ("dummy") encoding of the categorical predictor columns.
onehotencoder = OneHotEncoder()

# Apply the encoder only to the categorical positions; every other column is
# kept untouched (remainder='passthrough').
column_transformer = ColumnTransformer(
    [('onehot', onehotencoder, indices_categoricas)],
    remainder='passthrough',
)

# .toarray() densifies the result; this relies on fit_transform returning a
# sparse matrix here.
previsores = column_transformer.fit_transform(previsores)
previsores = previsores.toarray()
previsores

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1740e+03, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

In [21]:
# Encode the target labels as integers; the fitted encoder is kept so the
# codes can be mapped back to the original labels later.
label_encoder_classe = LabelEncoder().fit(classe)
classe = label_encoder_classe.transform(classe)
classe

array([0, 0, 0, ..., 0, 0, 1])

In [22]:
# Standardise every predictor column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so statistics from the test rows influence the scaling.
scaler = StandardScaler().fit(previsores)
previsores = scaler.transform(previsores)
previsores

array([[-0.2444502 , -0.17429511, -0.26209736, ...,  0.1484529 ,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -2.22215312],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       ...,
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -1.65522476],
       [-0.2444502 , -0.17429511, -0.26209736, ...,  1.88842434,
        -0.21665953, -0.03542945]])

In [68]:
# Hold out 40% of the rows for testing.
# random_state is pinned (to the same seed the classifier uses) so the split,
# and therefore the confusion matrix and accuracy computed from it, are
# reproducible from one run to the next.
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.4, random_state=0
)

In [69]:
# Train a decision tree that selects splits by entropy, with a fixed seed.
classificador = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
)
classificador.fit(X=previsores_treinamento, y=classe_treinamento)

In [70]:
# Predict the class of every held-out test row.
previsoes = classificador.predict(X=previsores_teste)

In [71]:
# Evaluate the predictions against the true test labels.
# `matriz` is the confusion matrix; `precisao` holds the accuracy (fraction of
# correct predictions), despite the variable's name.
matriz = confusion_matrix(y_true=classe_teste, y_pred=previsoes)
precisao = accuracy_score(y_true=classe_teste, y_pred=previsoes)

matriz, precisao

(array([[8679, 1217],
        [1167, 1962]], dtype=int64),
 0.8169673704414587)

In [63]:
# List the distinct encoded class labels.
np.unique(ar=classe)

array([0, 1])