In [None]:
import os
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas
from sklearn.preprocessing import StandardScaler

# Dataset Validation

TensorFlow Data Validation (TFDV) is a library for exploring and validating the datasets used for training, evaluation, and serving.

It supports:

- computing descriptive statistics
- inferring a schema
- checking for and fixing anomalies
- checking for drift and skew


## Download and Load Dataset

In [None]:
# Relative path to the local copy of the Iris training data.
train_dataset_fp = "../00-datasets/iris.data.csv"

# Echo the path so the notebook output records which file is used.
print(
    "Local copy of the dataset file: {}".format(train_dataset_fp)
)

In [None]:
# Column names are supplied explicitly because the CSV is read with
# header=None, i.e. its first row is treated as data rather than a header.
COLUMN_NAMES = [
    'sepal_len',
    'sepal_width',
    'petal_len',
    'petal_width',
    'label',
]

# Load the training data into a DataFrame using the names above.
iris_raw = pandas.read_csv(
    train_dataset_fp,
    sep=',',
    header=None,
    names=COLUMN_NAMES,
)

## Generate Statistics for training data

In [None]:
# Compute descriptive statistics over the raw (unscaled) DataFrame ...
stats_raw = tfdv.generate_statistics_from_dataframe(dataframe=iris_raw)

# ... and render them in the notebook.
tfdv.visualize_statistics(lhs_statistics=stats_raw)

## We apply our pre-processing

In [None]:
# Standardise the four numeric feature columns (centred on the mean and scaled
# by the standard deviation); the label column is not passed to the scaler.
scaler = StandardScaler(with_mean=True, with_std=True)
iris_scaled = scaler.fit_transform(iris_raw[COLUMN_NAMES[:4]])

# fit_transform returns a plain array, so rebuild a DataFrame with the feature
# names and re-attach the untouched label column.
iris = pandas.DataFrame(iris_scaled, columns=COLUMN_NAMES[:4]).assign(
    label=iris_raw["label"],
)

# Statistics of the pre-processed data, which the schema is inferred from.
stats_iris = tfdv.generate_statistics_from_dataframe(dataframe=iris)
tfdv.visualize_statistics(lhs_statistics=stats_iris)

## Infer a schema from statistics

In [None]:
# Derive a schema from the statistics of the pre-processed training data.
schema = tfdv.infer_schema(statistics=stats_iris)

# Show the inferred schema in the notebook.
tfdv.display_schema(schema=schema)

## Download and Load a different dataset with missing data

In [None]:
# Relative path to the Iris variant that contains errors.
error_dataset_fp = "../00-datasets/iris.data.errors.csv"

# Echo the path so the notebook output records which file is used.
print(
    "Local copy of the dataset file: {}".format(error_dataset_fp)
)

In [None]:
# Compute statistics directly from the CSV file, supplying the same column
# names that were used for the training data.
error_stats = tfdv.generate_statistics_from_csv(
    data_location=error_dataset_fp,
    column_names=COLUMN_NAMES,
)

# Render the statistics in the notebook.
tfdv.visualize_statistics(lhs_statistics=error_stats)

In [None]:
# Check the statistics of the error dataset against the inferred schema.
anomalies = tfdv.validate_statistics(error_stats, schema)

# List whatever anomalies the validation reported.
tfdv.display_anomalies(anomalies=anomalies)

## Download and Load the dataset that got served in production

In [None]:
# Relative path to the data that was served in production.
serving_dataset_fp = "../00-datasets/iris.data.serving.csv"

# Echo the path so the notebook output records which file is used.
print(
    "Local copy of the dataset file: {}".format(serving_dataset_fp)
)

In [None]:
# The serving file is read with the four feature names plus a fifth column
# named "SYNTH" in place of the training data's "label" column.
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=serving_dataset_fp,
    column_names=[*COLUMN_NAMES[:4], "SYNTH"],
)

# Render the statistics in the notebook.
tfdv.visualize_statistics(lhs_statistics=serving_stats)

## Make Serving Dataset and Training Dataset comparable

In [None]:
# Register the two environments on the schema. The membership checks keep this
# cell idempotent: re-running it in the notebook must not append duplicate
# entries to the schema's repeated fields.
for environment_name in ('TRAINING', 'SERVING'):
    if environment_name not in schema.default_environment:
        schema.default_environment.append(environment_name)

# Mark 'label' as not expected in the SERVING environment, so its absence from
# the serving data is not reported as an anomaly.
label_feature = tfdv.get_feature(schema, 'label')
if 'SERVING' not in label_feature.not_in_environment:
    label_feature.not_in_environment.append('SERVING')

# Validate the serving statistics against the schema as seen from SERVING.
serving_anomalies_with_env = tfdv.validate_statistics(
    serving_stats,
    schema,
    environment='SERVING',
)
tfdv.display_anomalies(serving_anomalies_with_env)

## Really, that's all?

## Skew Detection

### Schema Skew
Schema skew occurs when the training and serving data do not conform to the same schema. 

### Feature Skew
Feature skew occurs when the feature values seen during serving differ from those seen during training. Typical causes:

- modified data sources
- different preprocessing


### Distribution Skew
Distribution skew occurs when the distribution of the training dataset is significantly different from the distribution of the serving dataset. Typical causes:

- highly imbalanced datasets and heavy up/downsampling

## Drift

Drift occurs when the statistical properties of the features used for prediction change over time. As a result, the model's predictions gradually become less accurate.