In [113]:
import pandas as pd
import pandera as pa

In [114]:
#Criação do dataframe
path = r'/home/vegh/Desktop/Estudos/python-dev-test/data/Adult.data'

df_sensus = pd.read_csv(path, sep=',')

In [115]:
#Verificação do número de linhas e colunas
df_sensus.shape

(32560, 15)

In [116]:
df_sensus.sample()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
26601,25,Private,64860,Some-college,10,Married-spouse-absent,Adm-clerical,Unmarried,White,Female,0,0,22,United-States,<=50K


In [117]:
#Normalização das colunas
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'class']

df_sensus.columns = columns

In [118]:
#Validação do schema baseado nos tipos de dados das colunas
schema = pa.DataFrameSchema(
    columns = {
        'age':pa.Column(pa.Int),
        'workclass':pa.Column(pa.String),
        'fnlwgt':pa.Column(pa.Int),
        'education':pa.Column(pa.String),
        'education_num':pa.Column(pa.Int),
        'marital_status':pa.Column(pa.String),
        'occupation':pa.Column(pa.String),
        'relationship':pa.Column(pa.String),
        'race':pa.Column(pa.String),
        'sex':pa.Column(pa.String),
        'capital_gain':pa.Column(pa.Int),
        'capital_loss':pa.Column(pa.Int),
        'hours_per_week':pa.Column(pa.Int),
        'native_country':pa.Column(pa.String),
        'class':pa.Column(pa.String)


    }
)

try:
    schema.validate(df_sensus, lazy=True)
except Exception as err:
    print(err.failure_cases) 


        
    

  schema_context        column           check check_number failure_case index
0         Column           age  dtype('int64')         None       object  None
1         Column        fnlwgt  dtype('int64')         None       object  None
2         Column  capital_gain  dtype('int64')         None       object  None


In [119]:
#Validação de caracteres não numéricos das colunas do tipo inteiro
df_sensus.loc[df_sensus['age'].apply(lambda x: not x.isnumeric())]
#df_sensus.loc[df_sensus['fnlwgt'].apply(lambda x: not x.isnumeric())]
#df_sensus.loc[df_sensus['capital_gain'].apply(lambda x: not x.isnumeric())]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
22196,B,Self-emp-not-inc,182771,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,48,South,>50K
32539,D,State-gov,252208,HS-grad,9,Separated,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K


In [120]:
#Retirada de caracteres não númericos nas colunas numéricas de tipo inteiro
df_sensus.age = df_sensus.age.replace('B', 0).replace('D', 0)
df_sensus.fnlwgt = df_sensus.fnlwgt.replace(' C', 0)
df_sensus.capital_gain = df_sensus.capital_gain.replace(' A', 0)
df_sensus.occupation = df_sensus.occupation.replace('?', 'unidentified')

In [121]:
#Conversão das colunas de string para inteiro
int_columns = ['age', 'fnlwgt', 'capital_gain']

for col in int_columns:
    if col in df_sensus.columns:
        df_sensus[col] = df_sensus[col].astype('int64')

df_sensus.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
class             object
dtype: object

In [128]:
#Criação de uma nova coluna de class_type utilizando lambda, facilitando a distinção entre as faixas de salário
udf_class = lambda row: row.replace('<=50K', 'B') if row != '>50K' else row.replace('>50K	', 'A')

df_sensus['class_type'] = df_sensus['class'].apply(udf_class)

In [129]:
#Troca do nome da coluna index
df_sensus.index.name = 'id'
df_sensus.sample()


Unnamed: 0_level_0,adult_id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,class_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
27768,27768,27,Private,305647,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K,>50K


In [124]:
#Criação de uma coluna de id para integração com o banco de dados utilizando o index
df_sensus['adult_id'] = df_sensus.index
df_sensus.adult_id.astype('int64')

id
0            0
1            1
2            2
3            3
4            4
         ...  
32555    32555
32556    32556
32557    32557
32558    32558
32559    32559
Name: adult_id, Length: 32560, dtype: int64

In [125]:
#Reorganização das colunas
temp_cols = df_sensus.columns.tolist()
new_cols = temp_cols[-1:] + temp_cols[:-1]
df_sensus = df_sensus[new_cols]
df_sensus.dtypes

adult_id           int64
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
class             object
class_type        object
dtype: object

In [126]:
df_sensus.sample()

Unnamed: 0_level_0,adult_id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,class_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
7871,7871,81,?,162882,HS-grad,9,Divorced,?,Not-in-family,White,Female,0,0,35,United-States,<=50K,B


In [127]:
# Cria um csv com os dados transformados 
#Achei mais interessante não automatizar a transformação por preferir analisar e ter resposta visual das alterações realizadas
to_path = r'/home/vegh/Desktop/Estudos/python-dev-test/data/adult.csv'
df_sensus.to_csv(to_path, sep=',', index=False, header=False)