# Installing packages

In [152]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd
from sklearn.model_selection import train_test_split

print('TFDV version', tfdv.__version__)
print('Tf version', tf.__version__)


TFDV version 1.1.0
Tf version 2.5.0


# Data 


    The dataset used for this demo is the Census income dataset used to predict if a someone earns more or less than 50k based on their following personal and professional attributes.

        1. age: age of the individual (continuous)
        2. workclass: To which working category does an individual belong (working class categories are: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked)
        3. fnlwgt: weight assigned by the census bureau. If two individuals have the same weight that come from the same class of same race, similar educational and social background, etc. these weights are not standardised across different states, therefore use this variable judiciously(continuous).
        4. education:Category representing highest education level (Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool).
        5. education-num: numerical label representing the education class, Bachelors=13,HS_grad=9, etc. (continuous).
        6. marital-status: Category representing the marital status of a person(categories are: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse).
        7. occupation: Category of occupation of each individual (Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces).
        8. relationship: Categories representing what this individual is relative to others (Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried).
        9. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
        10. sex: Female, Male.
        11. capital-gain: Capital gains of an individual (continuous).
        12. capital-loss: Capital loss of an individual (continuous).
        13 hours-per-week: Number of hours of work per week (continuous).
        14 native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [200]:
df=pd.read_csv('Census_income_data.csv', skipinitialspace=True)
train_df, test_df=train_test_split(df, test_size=0.2, shuffle=False)
print("Length of training data:", len(train_df))
print("Length of test data: ",len(test_df))

Length of training data: 26052
Length of test data:  6513


In [201]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [202]:
test_df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K
32561,46,,257473,Bachelors,8,Married-civ-spouse,Plumber,Husband,Other,Male,1000,0,41,Australia,>50K
32562,0,Private,257473,Masters,8,Married-civ-spouse,Adm-clerical,Wife,Asian,Female,0,0,40,Pakistan,>50K
32563,1000,Private,257473,Masters,8,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,20,Cameroon,<=50K
32564,25,?,257473,Masters,8,Married-civ-spouse,gamer,Husband,Asian,Female,0,0,50,Mongolia,<=50K


# Drop Irrelevant Features


    Before begining with the analysis, it is important to select only relevant features. 
    To do so, TFDV provides with tfdv.StatsOptions class
    
    Since the variable "fnlwgt" is assigned by census bureau and is not standard across different states, it may not be very informative and thus could be dropped

In [242]:
features_to_remove={"fnlwgt"}

approved_columns=[col for col in df.columns if col not in features_to_remove]
stats_options=tfdv.StatsOptions(feature_allowlist=approved_columns)
print(stats_options.feature_allowlist)

['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label']


# Generate and visualize datasets


    The function used to generate statistics is "generate_statistics_from_dataframe()". Behind the scene TFDV distributes analysis using Apache Beam allowing it to scale over large datasets. 
    TFDV takes input in three forms: 
        1. csv, 
        2. Pandas DataFrame and,
        3. TensorFlow's TFRecord.
    To visualise statistics the function visualize_statistics() is used. The result opens a Facets interface with the following features:
        1. Sort the results by feature name (ascending or descending order)
        2. Search a specific feature by name
        3. Exclusively visualize only numerical and/or string
        4. Available stats: Count, missing (%), stdev, number of values set to zero(%), minimum, median and maximum
        5. For visualisation we have : standard histograms, quantile plots and value list length. We can choose log option for features that have a large range.
    

In [203]:
train_stats=tfdv.generate_statistics_from_dataframe(train_df)
type(train_stats)

tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList

In [204]:
tfdv.visualize_statistics(train_stats)


### Observations for numerical variables:
    1. The variables "capital gain" and "capital loss" have a lot of zeros (>90%). 
    2. For the variable "hours per week", majority of values are clustered around 40 hours (70th percentile). There is one individual that works 1 hour a week. This value could be an anomaly which needs to verified. 
    3. The variable "age" cannot be clearly analyzed using a histogram and therefore we need to change the chart to "Quantiles". In addition, if you select "log" and "expand", you can see that age follows a symmetric distribution with median=37 years.
    4. The variables "education_num", though numeric indicates a category. Therefore, should not be analysed as a numerical variables. To analyse this variable we will use the categorical variable "education"
    
### Observations for categorical variables:
    1. The distribution for each variable is pretty self explanatory form the distribution plots. There are two options for visualization: chart or raw data.
    2. None of the categorical variables have missing values.
    3. From the distribution of the variable "label" we can see that our dataset is unbalanced. The ration of ≤50k:>50k is almost 3:1.

In [250]:

print(f"Number of features: {len(train_stats.datasets[0].features)}")

print(f"Number of examples: {train_stats.datasets[0].num_examples}")


print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")

Number of features: 15
Number of examples: 26052
First feature: age
Last feature: label


# Infer Schema


    Schema refers to the following characteristics about each attribute/feature of a dataset:
    
        1. The expected type of each feature.
        2. The expected presence of each feature, in terms of a minimum count and fraction of examples that must contain the feature.
        3. The expected valency of the feature in each example, i.e., minimum and maximum number of values.
        4. The expected domain of a feature, i.e., the small universe of values for a string feature, or range for an integer feature.


    To get the schema of dataset we use the combination of two functions:

    1. infer_schema(): This function returns a schema protocol buffer. Protocol buffer are used to serialize structured data in a faster and simpler fashion. We first define our data structure using a .proto file. This file is then compiled to generate classes for each variable in the data. The compiled .proto file is then used to read and write your structured data and to form a variety of data streams. It's a language neutral platform neutral extensible mechanism for serializing structured data. It used output from generate_statistics_from_dataframe() as input

    2. display_schema(): which uses the result of infer_schema to generate a dataframe describing the above mentioned characteristics of each feature.

In [243]:
schema=tfdv.infer_schema(statistics=test_stats)
tfdv.display_schema(schema)


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education-num',INT,required,,-
'marital-status',STRING,required,,'marital-status'
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'marital-status',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving', 'gamer'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian', 'Asian-Pac-Islander', 'Black', 'Other', 'White'"
'sex',"'Female', 'Male'"
'native-country',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Mongolia', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'label',"'<=50K', '>50K'"


In [249]:
print("Number of features in the schema: ",len(schema.feature))
print("First feature of the schema: ",schema.feature[0].name )

Number of features in the schema:  15
First feature of the schema:  age



    To save schema to a directory we use "io" utility from tensorflow and write_schema_text() from TFDV

In [253]:
from tensorflow.python.lib.io import file_io
import os
OUTPUT_DIR = "output"
file_io.recursive_create_dir(OUTPUT_DIR)


schema_file = os.path.join(OUTPUT_DIR, 'schema_test.pbtxt')


tfdv.write_schema_text(schema, schema_file) 

# Compare train and test datasets

In [206]:
test_stats=tfdv.generate_statistics_from_dataframe(test_df)
tfdv.visualize_statistics(lhs_statistics=test_stats,\
                         rhs_statistics=train_stats,\
                         lhs_name='TEST DATASET',\
                         rhs_name='TRAIN_DATASET')

# Clean datasets

In [207]:
# filter the age range
test_df = test_df[test_df['age'] > 16]
test_df = test_df[test_df['age'] < 91]

train_df = train_df[train_df['age'] > 16]
train_df = train_df[train_df['age'] < 91]
# drop missing values
test_df.dropna(inplace=True)

In [208]:
test_stats=tfdv.generate_statistics_from_dataframe(test_df)
tfdv.visualize_statistics(lhs_statistics=test_stats,\
                         rhs_statistics=train_stats,\
                         lhs_name='TEST DATASET',\
                         rhs_name='TRAIN_DATASET')

# Anomalies

In [210]:
anomalies=tfdv.validate_statistics(statistics=train_stats, schema=schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'native-country',Unexpected string values,Examples contain values missing from the schema: Holand-Netherlands (<1%).


In [211]:
country_feature=tfdv.get_feature(schema, 'native-country')
country_feature.distribution_constraints.min_domain_mass=0.9


In [212]:
race_feature=tfdv.get_domain(schema, 'race')
race_feature.value.append('Asian')

In [213]:
anomalies=tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

  pd.set_option('max_colwidth', -1)


# Update domain with a range of values

In [214]:
tfdv.set_domain(schema, 'age', schema_pb2.IntDomain(name='age',min=17, max=90))
tfdv.display_schema(schema)


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,min: 17; max: 90
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education-num',INT,required,,-
'marital-status',STRING,required,,'marital-status'
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'marital-status',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving', 'gamer'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'Asian'"
'sex',"'Female', 'Male'"
'native-country',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Mongolia', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'label',"'<=50K', '>50K'"


In [217]:
updated_anomalies=tfdv.validate_statistics(test_stats, schema)
tfdv.display_anomalies(updated_anomalies)

# Analyse data slices

In [226]:
from tensorflow_data_validation.utils import slicing_util

slice_fn = slicing_util.get_feature_value_slicer(features={'sex':None})

In [227]:
# Declare stats options
slice_stats_options = tfdv.StatsOptions(schema=schema,
                                        slice_functions=[slice_fn],
                                        infer_type_from_schema=True)


In [237]:
# Convert dataframe to CSV since `slice_functions` works only with `tfdv.generate_statistics_from_csv`
CSV_PATH = 'slice_sample.csv'

# Calculate statistics for the sliced dataset

sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options=slice_stats_options)


AttributeError: type object 'DataFrame' has no attribute 'backfill'

In [235]:
sliced_stats

NameError: name 'sliced_stats' is not defined

In [None]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# Convert `Male` statistics (index=1) to the correct type and get the dataset name
male_stats_list = DatasetFeatureStatisticsList()
male_stats_list.datasets.extend([sliced_stats.datasets[1]])
male_stats_name = sliced_stats.datasets[1].name

# Convert `Female` statistics (index=2) to the correct type and get the dataset name
female_stats_list = DatasetFeatureStatisticsList()
female_stats_list.datasets.extend([sliced_stats.datasets[2]])
female_stats_name = sliced_stats.datasets[2].name

# Visualize the two slices side by side
tfdv.visualize_statistics(
    lhs_statistics=male_stats_list,
    rhs_statistics=female_stats_list,
    lhs_name=male_stats_name,
    rhs_name=female_stats_name
)