This notebook walks through a TensorFlow Data Validation (TFDV) example.

In [3]:
import tensorflow as tf
import zipfile

In [4]:
# Extract the dataset archive into the current working directory.
# The context manager closes the archive even if extraction raises, and
# zip_ref stays bound (in its closed state) for later inspection.
with zipfile.ZipFile("/content/dataset_diabetes.zip","r") as zip_ref:
  zip_ref.extractall()

In [5]:
# Display the archive handle; its repr shows whether the archive is closed
zip_ref

<zipfile.ZipFile [closed]>

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the extracted CSV file into a pandas DataFrame

df=pd.read_csv("/content/dataset_diabetes/diabetic_data.csv")

In [8]:
# List the column names of the loaded DataFrame
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [9]:
def prepare_data_splits_from_dataframe(df,train_frac=0.7,eval_frac=0.15,
                                       label_col='readmitted',random_state=48):

  '''
  Splits a dataframe into training, evaluation and serving datasets.

  The split is positional: the first rows go to training, the next rows to
  evaluation and all remaining rows to serving. Each split is then shuffled
  independently with a fixed seed and re-indexed from 0.

  Parameters:
            df: pandas dataframe
            train_frac: fraction of the rows used for training (default 0.7)
            eval_frac: fraction of the rows used for evaluation (default 0.15)
            label_col: column dropped from the serving dataframe; pass None
                       to keep every column (default 'readmitted')
            random_state: seed used when shuffling each split (default 48)
  Returns:
        train_df: training dataframe (train_frac of the rows)
        eval_df: evaluation dataframe (eval_frac of the rows)
        serv_df: serving dataframe (the remaining rows, without label_col)
  Raises:
        ValueError: if a fraction is negative or the fractions sum to more than 1
        KeyError: if label_col is given but is not a column of df
  '''

  if train_frac<0 or eval_frac<0 or train_frac+eval_frac>1:
    raise ValueError("train_frac and eval_frac must be non-negative and sum to at most 1")

  train_len=int(len(df)*train_frac)
  eval_end=train_len+int(len(df)*eval_frac)

  def _shuffle(part):
    # shuffle the rows reproducibly and renumber the index from 0
    return part.sample(frac=1,random_state=random_state).reset_index(drop=True)

  train_df=_shuffle(df.iloc[:train_len])
  eval_df=_shuffle(df.iloc[train_len:eval_end])
  # the serving split takes whatever is left, so no row is lost to rounding
  serv_df=_shuffle(df.iloc[eval_end:])

  # the label is not available at serving time, so drop it from that split
  if label_col is not None:
    serv_df=serv_df.drop([label_col],axis=1)

  return train_df,eval_df,serv_df



In [10]:
# Split the full dataframe into training, evaluation and serving dataframes
train_df,eval_df,serv_df=prepare_data_splits_from_dataframe(df)

In [11]:
# Preview the first rows of the serving dataframe
serv_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,389024180,181005341,Caucasian,Male,[80-90),?,1,6,7,2,...,No,No,No,No,No,No,No,No,Ch,Yes
1,363799016,138873029,Caucasian,Male,[60-70),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Ch,Yes
2,277967352,40621032,Caucasian,Female,[40-50),?,1,6,7,6,...,No,No,Down,No,No,No,No,No,Ch,Yes
3,374841218,138051635,Caucasian,Male,[60-70),?,3,3,1,5,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,400257644,183715484,Caucasian,Female,[80-90),?,1,1,1,1,...,No,No,No,No,No,No,No,No,Ch,Yes


Performing exploratory data analysis (Should only be done on the training dataset)

In [12]:
# first we will do the required imports

In [13]:
import os
import tempfile,urllib

In [14]:
# import tensorflow data validation
!pip install --upgrade 'tensorflow_data_validation[visualization]<2'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_data_validation[visualization]<2
  Downloading tensorflow_data_validation-1.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 4.1 MB/s 
[?25hCollecting joblib<0.15,>=0.12
  Downloading joblib-0.14.1-py2.py3-none-any.whl (294 kB)
[K     |████████████████████████████████| 294 kB 59.3 MB/s 
[?25hCollecting pyarrow<6,>=1
  Downloading pyarrow-5.0.0-cp37-cp37m-manylinux2014_x86_64.whl (23.6 MB)
[K     |████████████████████████████████| 23.6 MB 1.3 MB/s 
Collecting apache-beam[gcp]<3,>=2.38
  Downloading apache_beam-2.40.0-cp37-cp37m-manylinux2010_x86_64.whl (10.9 MB)
[K     |████████████████████████████████| 10.9 MB 30.8 MB/s 
[?25hCollecting pyfarmhash<0.4,>=0.2
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[K     |████████████████████████████████| 99 kB 11.0 MB/s 
[?25hCollecting tensorflow!=

In [15]:
# Reload pkg_resources in the running kernel.
# NOTE(review): presumably this makes the distributions installed by the pip
# cell above visible without restarting the runtime — confirm.
import pkg_resources
import importlib
importlib.reload(pkg_resources)

<module 'pkg_resources' from '/usr/local/lib/python3.7/dist-packages/pkg_resources/__init__.py'>

In [16]:
import tensorflow_data_validation as tfdv

In [17]:
# Show which TFDV version was imported
tfdv.__version__

'1.9.0'

In [18]:
from tensorflow.python.lib.io import file_io
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList, DatasetFeatureStatistics


In [19]:
# Columns that must be excluded from the statistics
features_to_remove={'encounter_id','patient_nbr'}

# Keep every column of df whose name is not listed in features_to_remove
approved_col=[name for name in df.columns if name not in features_to_remove]

# Sanity check: neither removed column may remain in approved_col
tuple(removed in approved_col for removed in ('patient_nbr','encounter_id'))

(False, False)

In [20]:
# Instantiate a StatsOptions object whose feature_allowlist is the list of
# approved columns

stats_options=tfdv.StatsOptions(feature_allowlist=approved_col)


In [21]:
# Print the built-in help for the StatsOptions instance
help(stats_options)

Help on StatsOptions in module tensorflow_data_validation.statistics.stats_options object:

class StatsOptions(builtins.object)
 |  StatsOptions(generators: Union[List[tensorflow_data_validation.statistics.generators.stats_generator.StatsGenerator], NoneType] = None, schema: Union[tensorflow_metadata.proto.v0.schema_pb2.Schema, NoneType] = None, label_feature: Union[str, NoneType] = None, weight_feature: Union[str, NoneType] = None, slice_functions: Union[List[Callable[[pyarrow.lib.RecordBatch], Iterable[Tuple[Union[str, NoneType], pyarrow.lib.RecordBatch]]]], NoneType] = None, sample_rate: Union[float, NoneType] = None, num_top_values: int = 20, frequency_threshold: int = 1, weighted_frequency_threshold: float = 1.0, num_rank_histogram_buckets: int = 1000, num_values_histogram_buckets: int = 10, num_histogram_buckets: int = 10, num_quantiles_histogram_buckets: int = 10, epsilon: float = 0.01, infer_type_from_schema: bool = False, desired_batch_size: Union[int, NoneType] = None, enable

In [22]:
# Print every feature name held in the feature allowlist, one per line
for feature in stats_options.feature_allowlist:
  print(feature)

race
gender
age
weight
admission_type_id
discharge_disposition_id
admission_source_id
time_in_hospital
payer_code
medical_specialty
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
diag_1
diag_2
diag_3
number_diagnoses
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
examide
citoglipton
insulin
glyburide-metformin
glipizide-metformin
glimepiride-pioglitazone
metformin-rosiglitazone
metformin-pioglitazone
change
diabetesMed
readmitted


Now, we will generate the training statistics

Since the data is stored in a dataframe, we will use the function
tfdv.generate_statistics_from_dataframe(dataframe,stats_options),
which returns a DatasetFeatureStatisticsList object.



In [23]:
# Compute statistics for the training split using the allowlist options
train_stats=tfdv.generate_statistics_from_dataframe(dataframe=train_df,
                                                    stats_options=stats_options)

In [24]:
# Check how many features the statistics cover and how many examples were
# used to compute them

len(train_stats.datasets[0].features),train_stats.datasets[0].num_examples


(48, 71236)

In [25]:
# Visualize the training statistics with tfdv.visualize_statistics

tfdv.visualize_statistics(train_stats)

The next step is to infer schema.

Schema contains properties about the data such as 

1- values of features.

2- the presence of features in the training examples.

3- The expected domain of the features

In [26]:
# Infer the schema from the training statistics with tfdv.infer_schema(stats);
# it can then be displayed with tfdv.display_schema(schema)

schema=tfdv.infer_schema(train_stats)

In [27]:
# Check the type of the inferred schema object
type(schema)

tensorflow_metadata.proto.v0.schema_pb2.Schema

In [28]:
# Display the schema inferred from the training statistics
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'race',STRING,required,,'race'
'gender',STRING,required,,'gender'
'age',STRING,required,,'age'
'weight',STRING,required,,'weight'
'admission_type_id',INT,required,,-
'discharge_disposition_id',INT,required,,-
'admission_source_id',INT,required,,-
'time_in_hospital',INT,required,,-
'payer_code',STRING,required,,'payer_code'
'medical_specialty',STRING,required,,'medical_specialty'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'?', 'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic', 'Other'"
'gender',"'Female', 'Male', 'Unknown/Invalid'"
'age',"'[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'"
'weight',"'>200', '?', '[0-25)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '[25-50)', '[50-75)', '[75-100)'"
'payer_code',"'?', 'BC', 'CH', 'CM', 'CP', 'DM', 'HM', 'MC', 'MD', 'MP', 'OG', 'OT', 'PO', 'SI', 'SP', 'UN', 'WC'"
'medical_specialty',"'?', 'AllergyandImmunology', 'Anesthesiology', 'Anesthesiology-Pediatric', 'Cardiology', 'Cardiology-Pediatric', 'Dentistry', 'Dermatology', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'Hospitalist', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'OutreachServices', 'Pathology', 'Pediatrics', 'Pediatrics-AllergyandImmunology', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-InfectiousDiseases', 'Pediatrics-Neurology', 'Pediatrics-Pulmonology', 'Perinatology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Proctology', 'Psychiatry', 'Psychiatry-Addictive', 'Psychiatry-Child/Adolescent', 'Psychology', 'Pulmonology', 'Radiologist', 'Radiology', 'Rheumatology', 'Speech', 'SportsMedicine', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic', 'Surgery-Vascular', 'SurgicalSpecialty', 'Urology'"
'max_glu_serum',"'>200', '>300', 'None', 'Norm'"
'A1Cresult',"'>7', '>8', 'None', 'Norm'"
'metformin',"'Down', 'No', 'Steady', 'Up'"
'repaglinide',"'Down', 'No', 'Steady', 'Up'"


Now we ensure that the schema of the training data is consistent with the schema
of the evaluation data 

In [29]:
# Compute statistics for the evaluation split with the same options that were
# used for the training split
eval_stats=tfdv.generate_statistics_from_dataframe(dataframe=eval_df,
                                                   stats_options=stats_options)

In [30]:
# Visualize the training and evaluation statistics in one view, with the
# training statistics as the left-hand side and the evaluation statistics as
# the right-hand side

tfdv.visualize_statistics(lhs_statistics=train_stats,rhs_statistics=eval_stats,
                          lhs_name="Train Statistics",rhs_name="Eval Statistics")
                          

In [31]:
# now we will create a function that detects anomalies

In [32]:
def calculate_and_display_anomalies(statistics,schema):
  """
  Validate dataset statistics against a schema and display the anomalies found.

  Parameters:
          statistics: statistics to validate (e.g. eval_stats or serving_stats)
          schema: schema the statistics are validated against
  Return:
      None; the anomalies are rendered by tfdv.display_anomalies
  """
  # validate first, then hand the result straight to the display helper
  tfdv.display_anomalies(tfdv.validate_statistics(statistics=statistics,schema=schema))

  

In [33]:
# Validate the evaluation statistics against the schema inferred from training
calculate_and_display_anomalies(eval_stats,schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'glimepiride-pioglitazone',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).
'medical_specialty',Unexpected string values,Examples contain values missing from the schema: Neurophysiology (<1%).


The output above shows that the evaluation data contains feature values
that were not present in the schema inferred from the training data.

The solution is to append the missing values to the corresponding domains in the schema.

In [34]:
# Add the value reported as missing to the 'glimepiride-pioglitazone' domain.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to the schema.
glimepiride_pioglitazone_domain=tfdv.get_domain(schema,'glimepiride-pioglitazone')
if "Steady" not in glimepiride_pioglitazone_domain.value:
  glimepiride_pioglitazone_domain.value.append("Steady")

In [35]:
# Add the value reported as missing to the 'medical_specialty' domain.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to the schema.
medical_specialty_domain=tfdv.get_domain(schema,'medical_specialty')
if "Neurophysiology" not in medical_specialty_domain.value:
  medical_specialty_domain.value.append("Neurophysiology")

In [36]:
# Re-validate the evaluation statistics against the updated schema
calculate_and_display_anomalies(eval_stats,schema)

Schema Environments

In [38]:
# To check the serving set for anomalies, create a StatsOptions instance that
# carries the schema, sets infer_type_from_schema=True and keeps the same
# feature allowlist

options=tfdv.StatsOptions(schema=schema,infer_type_from_schema=True,
                          feature_allowlist=approved_col)




In [None]:
# Generate the serving statistics from the serving dataframe using the
# schema-aware options
serving_stats=tfdv.generate_statistics_from_dataframe(serv_df,options)

# Validate the serving statistics against the schema and display the anomalies
calculate_and_display_anomalies(serving_stats,schema)

In [41]:
# Relax the domain constraints of 'payer_code' and 'medical_specialty' by
# setting min_domain_mass to 0.9.
# NOTE(review): presumably this tolerates up to 10% of values falling outside
# the schema domain — confirm against the TFDV schema documentation.

payer_code=tfdv.get_feature(schema,'payer_code')
payer_code.distribution_constraints.min_domain_mass=0.9

medical_specialty=tfdv.get_feature(schema,'medical_specialty')
medical_specialty.distribution_constraints.min_domain_mass=0.9

In [42]:
# Re-check the serving statistics for anomalies after relaxing the constraints
calculate_and_display_anomalies(serving_stats,schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'readmitted',Column dropped,Column is completely missing
'metformin-pioglitazone',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).
'metformin-rosiglitazone',Unexpected string values,Examples contain values missing from the schema: Steady (<1%).


In [None]:
# Before modifying the domains, display the current schema
tfdv.display_schema(schema)

We can see from the above that the values reported as missing in the serving
anomalies are absent from the domains that were inferred for those features
from the training dataset.

In [45]:
def modify_domain_of_features(features_list,schema,to_domain_name):
  """
  Point each of the given features at the domain named to_domain_name.

  The schema is modified in place; the same object is returned for convenience.

  Parameters:
          features_list: names of the features whose domain is to be changed;
                         a single name passed as a plain string is treated
                         as a one-element list
          schema: inferred schema (mutated in place)
          to_domain_name: domain that the features are mapped to

  returns:
        The same schema object, with the domains updated
  """

  # a bare string would otherwise be iterated character by character
  if isinstance(features_list,str):
    features_list=[features_list]

  for feature in features_list:
    tfdv.set_domain(schema=schema,feature_path=feature,domain=to_domain_name)

  return schema

In [None]:
# Features whose domain will be replaced by the 'metformin' domain
domain_change_features = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
                          'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 
                          'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
                          'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 
                          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']


# Update the schema with modify_domain_of_features so that every feature in
# domain_change_features uses the 'metformin' domain
schema = modify_domain_of_features(domain_change_features, schema, 'metformin')

# Display the updated schema
tfdv.display_schema(schema)

In [47]:
# Re-check the serving statistics for anomalies against the updated schema
calculate_and_display_anomalies(serving_stats,schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'readmitted',Column dropped,Column is completely missing


In [48]:
# To ignore the anomaly about the dropped "readmitted" column, the schema
# needs environments.

# By default, all features are in both the training and serving environments.
# The membership check keeps the cell idempotent: re-running it does not
# register the same environment twice.
for env_name in ('TRAINING','SERVING'):
  if env_name not in schema.default_environment:
    schema.default_environment.append(env_name)

In [49]:
# Remove the 'readmitted' feature from the SERVING environment.
# The membership check keeps the cell idempotent: re-running it does not
# add 'SERVING' to not_in_environment a second time.
readmitted_feature=tfdv.get_feature(schema,feature_path="readmitted")
if 'SERVING' not in readmitted_feature.not_in_environment:
  readmitted_feature.not_in_environment.append('SERVING')

# Validate the serving statistics in the SERVING environment
serving_anomalies_with_env=tfdv.validate_statistics(serving_stats,schema,
                                                    environment='SERVING')

In [50]:
# Display the anomalies found when validating in the SERVING environment
tfdv.display_anomalies(serving_anomalies_with_env)