TensorFlow Data Validation (TFDV): https://www.tensorflow.org/tfx/tutorials/data_validation/tfdv_basic

TensorFlow Extended (TFX): https://www.tensorflow.org/tfx

Census Income Data Set: http://archive.ics.uci.edu/ml/datasets/Census+Income

In [None]:
#!pip install tensorflow_data_validation

In [None]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

from tensorflow_metadata.proto.v0 import schema_pb2

# Report the installed library versions, one per line.
print(
    'TFDV Version: {}'.format(tfdv.__version__),
    'Tensorflow Version: {}'.format(tf.__version__),
    sep='\n',
)

In [None]:
# Load the raw Census Income (adult) file. It has no header row, so the
# columns get integer labels; whitespace after each delimiter is skipped.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    skipinitialspace=True,
)

In [None]:
# Hold out 20% of the rows for evaluation; shuffle=False keeps the
# original row order instead of sampling at random.
train_df, eval_df = train_test_split(
    df,
    shuffle=False,
    test_size=0.2,
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the evaluation split.
eval_df.head()

In [None]:
# Print the column summary of the evaluation split before extra rows are added.
eval_df.info()

In [None]:
# Hand-written rows added to the evaluation split so that validation has
# something to flag: ages of 0 and 1000, the literal string 'NaN' and '?'
# as workclass, and category values such as 'Plumber', 'gamer', 'Asian'
# and 'Mongolia' (presumably absent from the training split -- confirm
# against the inferred schema).
novos_dados = [
    [46, 'NaN', 257473, 'Bachelors', 8, 'Married-civ-spouse', 'Plumber', 'Husband', 'Other', 'Male', 1000, 0, 41, 'Australia', '>50K'],
    [0, 'Private', 257473, 'Masters', 8, 'Married-civ-spouse', 'Adm-clerical', 'Wife', 'Asian', 'Female', 0, 0, 40, 'Pakistan', '>50K'],
    [1000, 'Private', 257473, 'Masters', 8, 'Married-civ-spouse', 'Prof-specialty', 'Husband', 'Black', 'Male', 0, 0, 20, 'Cameroon', '<=50K'],
    [25, '?', 257473, 'Masters', 8, 'Married-civ-spouse', 'gamer', 'Husband', 'Asian', 'Female', 0, 0, 50, 'Mongolia', '<=50K'],
]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are wrapped in a frame that reuses eval_df's column labels and then
# concatenated. ignore_index=True renumbers the combined rows from 0.
eval_df2 = pd.concat(
    [eval_df, pd.DataFrame(novos_dados, columns=eval_df.columns)],
    ignore_index=True,
)


In [None]:
# Print the column summary again, now including the appended rows.
eval_df2.info()

In [None]:
# Show the last five rows, which include the rows appended above.
eval_df2.tail(5)

## Geração e visualização do dataset de treino

In [None]:
# Compute TFDV descriptive statistics for the training split.
estatistica_treino = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df,
)

In [None]:
# Render the training statistics in the notebook.
tfdv.visualize_statistics(estatistica_treino)

In [None]:
# Infer a schema from the training statistics, then render it for inspection.
schema = tfdv.infer_schema(estatistica_treino)
tfdv.display_schema(schema)

## Geração e visualização do dataset de score

In [None]:
# Statistics for the evaluation data with the appended rows, shown side by
# side with the training statistics (left: SCORE, right: TREINO).
estatistica_score = tfdv.generate_statistics_from_dataframe(eval_df2)
tfdv.visualize_statistics(
    lhs_statistics=estatistica_score, lhs_name='SCORE',
    rhs_statistics=estatistica_treino, rhs_name='TREINO',
)

In [None]:
# Keep only rows whose column 0 lies strictly between 16 and 91, then
# discard any row that still contains a missing value.
in_range = (eval_df2[0] > 16) & (eval_df2[0] < 91)
eval_df2 = eval_df2[in_range].dropna()

In [None]:
# Recompute statistics on the filtered evaluation data and compare them
# with the training statistics again (left: SCORE, right: TREINO).
estatistica_score2 = tfdv.generate_statistics_from_dataframe(eval_df2)
tfdv.visualize_statistics(
    lhs_statistics=estatistica_score2, lhs_name='SCORE',
    rhs_statistics=estatistica_treino, rhs_name='TREINO',
)

## Encontrar anomalias

In [None]:
# Check the filtered evaluation statistics against the schema inferred
# from training, then render whatever anomalies were reported.
anomalia = tfdv.validate_statistics(estatistica_score2, schema)
tfdv.display_anomalies(anomalia)

In [None]:
# Relax the country ('13') and occupation ('6') features: set the minimum
# fraction of values that must fall inside the schema domain to 0.9.
country_feature = tfdv.get_feature(schema, '13')
occupation_feature = tfdv.get_feature(schema, '6')
for relaxed_feature in (country_feature, occupation_feature):
    relaxed_feature.distribution_constraints.min_domain_mass = 0.9

In [None]:
# Accept 'Asian' as a valid value in the domain of feature '8'.
race_domain = tfdv.get_domain(schema, '8')
race_domain.value.extend(['Asian'])

In [None]:
# Constrain feature '0' to the integer range 17..90 (the same bounds as the
# row filter applied to eval_df2), then render the updated schema.
int_domain = schema_pb2.IntDomain(name='0', min=17, max=90)
tfdv.set_domain(schema, '0', int_domain)
tfdv.display_schema(schema)

In [None]:
# Re-run validation now that the schema has been adjusted.
anomalias_atualizadas = tfdv.validate_statistics(
    statistics=estatistica_score2,
    schema=schema,
)


In [None]:
# Render the anomalies reported by the latest validation run.
tfdv.display_anomalies(anomalias_atualizadas)

In [None]:
# Accept the literal string 'NaN' as a valid value in the domain of feature '1'.
workclass_domain = tfdv.get_domain(schema, '1')
workclass_domain.value.extend(['NaN'])

In [None]:
# Validate once more after extending the domain of feature '1'.
anomalias_atualizadas = tfdv.validate_statistics(
    statistics=estatistica_score2,
    schema=schema,
)

In [None]:
# Render the anomalies reported by the latest validation run.
tfdv.display_anomalies(anomalias_atualizadas)