Copyright 2021 Google LLC.

SPDX-License-Identifier: Apache-2.0

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Assessing the veracity of semantic markup for dataset pages

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google-research/google-research/blob/master/dataset_or_not/dataset_or_not.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/google-research/google-research/tree/master/dataset_or_not/dataset_or_not.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## About this Colab
This is a companion Colab for the paper 

[Dataset or Not? A study on the veracity of semantic markup for dataset pages]()\
*Tarfah Alrashed, Dimitris Paparas, Omar Benjelloun, Ying Sheng, and Natasha Noy*

It contains Python code for training the two main models from the paper, using the [Veracity of schema.org for datasets (labeled data)](https://www.kaggle.com/googleai/veracity-of-schemaorg-for-datasets-labeled-data) dataset.

## Prerequisites

Before continuing, download and unzip the [Veracity of schema.org for datasets (labeled data)](https://www.kaggle.com/googleai/veracity-of-schemaorg-for-datasets-labeled-data) dataset to your computer.

## Note regarding *prominent terms*

The released dataset and the code in this notebook do not contain the *prominent terms* feature mentioned in the paper. This is because that feature is extracted using proprietary code that cannot be released. The interested reader can replicate this feature extraction using the model proposed in [this paper](https://arxiv.org/abs/1805.01334).

# Install required packages

In [None]:
# Install AdaNet (imported in the next code cell as `adanet`) and upgrade
# TensorFlow Probability.
# NOTE(review): neither package is version-pinned, so the installed versions
# can differ between runs -- consider pinning once known-good versions are
# confirmed.
!pip install adanet
!pip install --user --upgrade tensorflow-probability

# Import Modules

In [None]:
from google.colab import files
import math
import tensorflow.compat.v2 as tf
import adanet
import pandas as pd
import io

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text

# Upload Dataset

Run the following cell and, when prompted, upload the files *testing_set.csv*, *training_set.csv*, and *validation_set.csv* (that you downloaded as part of the prerequisites).

In [None]:
# Ask the user to upload the three dataset CSV files through Colab's upload
# helper. The following cell reads them by file name; the `uploaded` return
# value itself is not read by any cell visible in this notebook.
uploaded = files.upload()

# Load the dataset into pandas DataFrames

In [None]:
# Read the training, validation, and testing partitions, in that order.
# keep_default_na=False stops pandas from turning empty fields into NaN.
training_set, eval_set, test_set = (
    pd.read_csv(csv_name, keep_default_na=False)
    for csv_name in ('training_set.csv', 'validation_set.csv',
                     'testing_set.csv'))

# Select Model

In the next cell you can select which model to train. Remember to run the cell after making a selection. The features each model uses are:

Column Name|Type|Contents|Lightweight Model|Full Model
-----------|----|-|:-:|:--------:
source_url| string |url of a webpage that contains schema.org/Dataset markup| |+
name| string |The name of the dataset| +|+
description| string |Description of the dataset|+|+
has_distribution| bool|True if the dataset contains distribution metadata, false otherwise| |+
has_encoding_or_file_format| bool |True if the dataset contains encoding or file format metadata, false otherwise| |+
provider_or_publisher| string |The name of the provider or publisher of the dataset| |+
author_or_creator| string |The author(s) or creator(s) of the dataset| |+
doi| string|The Digital Object Identifier of the dataset| |+
has_catalog| bool |True if the dataset is included in a data catalog, false otherwise| |+
has_dateCreated| bool |True if a creation date is provided, false otherwise| |+
has_dateModified| bool |True if a modification date is provided, false otherwise| |+
has_datePublished| bool |True if a publication date is provided, false otherwise| |+

In [None]:
# Name of the model to train. It is used as the lookup key into the
# preprocessing-parameter and hyperparameter dictionaries defined in later cells.
SELECTED_MODEL = 'lightweight_model'  #@param {type:'string'} ["lightweight_model", "full_model"]

# Preprocessing Parameters

Dictionary with, for each of the models, the sizes of the feature vocabularies to generate during preprocessing and the maximum number of tokens kept per text feature.

In [None]:
# Preprocessing parameters for each selectable model, keyed by model name.
P_PARAMS_BY_MODEL = {
    'lightweight_model': {
        # For every text feature: the `num_words` limit handed to its
        # Tokenizer. The same number doubles as the token id that stands in
        # for a missing (empty) value, which is why the feature columns are
        # built with vocab_size + 1 buckets.
        'vocab_size_by_feature': {
            'description': 110211,
            'name': 18720
        },
        # Every tokenized text feature is padded or truncated to this length.
        'MAX_TOKENS': 400
    },
    'full_model': {
        # Same meaning as above; features not listed here are treated as
        # booleans by the preprocessing code.
        'vocab_size_by_feature': {
            'description': 104383,
            'name': 17495,
            'author_or_creator': 1602,
            'doi': 193,
            'provider_or_publisher': 773,
            'source_url': 17749
        },
        # Every tokenized text feature is padded or truncated to this length.
        'MAX_TOKENS': 400
    }
}

# Preprocessing parameters of the model chosen in SELECTED_MODEL.
MODEL_P_PARAMS = P_PARAMS_BY_MODEL[SELECTED_MODEL]

# Data Preprocessing

Analyze training dataset and generate tokenizers with custom vocabularies for each text feature

In [None]:
# One tokenizer per text feature, capped at that feature's vocabulary size.
tokenizers = {
    feature_name: text.Tokenizer(num_words=vocab_size)
    for feature_name, vocab_size in
    MODEL_P_PARAMS['vocab_size_by_feature'].items()
}

# Build each vocabulary from the matching column of the training set only.
for feature_name, tokenizer in tokenizers.items():
  tokenizer.fit_on_texts(training_set[feature_name])

# Hyperparameters

Dictionary with the training hyperparameters for each of the models

In [None]:
# Training hyperparameters for each selectable model, keyed by model name.
H_PARAMS_BY_MODEL = {
    'lightweight_model': {
        # Input columns, in the order in which the generated feature tuples
        # are built and later mapped back to names.
        'features': ['description', 'name'],
        # Learning rate and gradient-norm clipping value for the Adam optimizer.
        'LEARNING_RATE': 0.00677,
        # NOTE(review): TRAIN_STEPS is not read by any cell visible in this
        # notebook -- confirm whether it can be dropped.
        'TRAIN_STEPS': 500,
        # Multiplied by the batch size to obtain the shuffle buffer size.
        'SHUFFLE_BUFFER_SIZE': 2048,
        'BATCH_SIZE': 128,
        'CLIP_NORM': 0.00037,
        # DNN candidate settings: layer sizes, dropout, activation, batch norm.
        'HIDDEN_UNITS': [186],
        'DROPOUT': 0.28673,
        'ACTIVATION_FN': tf.nn.selu,
        # Passed as max_iteration_steps to the AdaNet ensemble estimator.
        'MAX_ITERATION_STEPS': 333333,
        'DO_BATCH_NORM': True,
        # Passed as max_steps to the TrainSpec used for training.
        'MAX_TRAIN_STEPS': 1000
    },
    'full_model': {
        # Input columns, in the order in which the generated feature tuples
        # are built and later mapped back to names.
        # NOTE(review): the feature table in the "Select Model" section spells
        # the date columns has_dateCreated / has_dateModified /
        # has_datePublished and marks has_catalog as a full-model feature,
        # while this list uses has_date_* and omits has_catalog -- confirm
        # against the CSV headers.
        'features': [
            'author_or_creator', 'description', 'doi', 'has_date_created',
            'has_date_modified', 'has_date_published', 'has_distribution',
            'has_encoding_or_file_format', 'name', 'provider_or_publisher',
            'source_url'
        ],
        # Learning rate and gradient-norm clipping value for the Adam optimizer.
        'LEARNING_RATE': 0.00076,
        # NOTE(review): TRAIN_STEPS is not read by any cell visible in this
        # notebook -- confirm whether it can be dropped.
        'TRAIN_STEPS': 500,
        # Multiplied by the batch size to obtain the shuffle buffer size.
        'SHUFFLE_BUFFER_SIZE': 2048,
        'BATCH_SIZE': 128,
        'CLIP_NORM': 0.25035,
        # DNN candidate settings: layer sizes, dropout, activation, batch norm.
        'HIDDEN_UNITS': [329, 351, 292],
        'DROPOUT': 0.08277,
        'ACTIVATION_FN': tf.nn.selu,
        # Passed as max_iteration_steps to the AdaNet ensemble estimator.
        'MAX_ITERATION_STEPS': 333333,
        'DO_BATCH_NORM': False,
        # Passed as max_steps to the TrainSpec used for training.
        'MAX_TRAIN_STEPS': 1000
    }
}

# Hyperparameters of the model chosen in SELECTED_MODEL.
MODEL_H_PARAMS = H_PARAMS_BY_MODEL[SELECTED_MODEL]

# Utility functions

Methods used to preprocess and create the input for training the model

In [None]:
def tokenize_and_pad(features):
  """Converts one labeled sample into the model's numeric representation.

  Text features (those with an entry in
  MODEL_P_PARAMS['vocab_size_by_feature']) are turned into token ids with the
  matching tokenizer and padded/truncated to MODEL_P_PARAMS['MAX_TOKENS'].
  Every other feature is treated as a boolean and encoded as [1] or [0].

  Args:
    features: A mapping from feature name to feature value. It must also
      contain the sample's 'label'.

  Returns:
    A tuple holding one processed entry per name in
    MODEL_H_PARAMS['features'] (in that order), followed by the label.
  """
  vocab_sizes = MODEL_P_PARAMS['vocab_size_by_feature']
  processed = []

  for name in MODEL_H_PARAMS['features']:
    value = features[name]

    # Features without a vocabulary are booleans: encode them as 0/1.
    if name not in vocab_sizes:
      processed.append([1] if value else [0])
      continue

    if value:
      token_ids = tokenizers[name].texts_to_sequences([value])
    else:
      # A missing text value is represented by the single id `vocab_size`.
      token_ids = [[vocab_sizes[name]]]

    # Pad and truncate at the end so every sequence has MAX_TOKENS entries.
    padded = sequence.pad_sequences(
        token_ids,
        maxlen=MODEL_P_PARAMS['MAX_TOKENS'],
        padding='post',
        truncating='post')
    processed.append([padded])

  processed.append(features['label'])
  return tuple(processed)


def generator(dataset):
  """Wraps a dataset in a zero-argument generator function.

  Args:
    dataset: An object exposing iterrows(); this notebook passes the
      DataFrames loaded from the CSV files.

  Returns:
    A callable that, when invoked, yields the tokenize_and_pad() result for
    each row of the dataset.
  """

  def _gen():
    # iterrows() yields (index, row) pairs; only the row is needed.
    for _, row in dataset.iterrows():
      yield tokenize_and_pad(row)

  return _gen


def preprocess(*args):
  """Regroups a flat (feature1, ..., featureK, label) tuple for the Estimator.

  Args:
    *args: The feature values followed by the label. The features must be in
      the same order as MODEL_H_PARAMS['features'].

  Returns:
    A pair of
      1. A dictionary mapping each feature name to its value
      2. A single-element list holding the label
  """
  feature_names = MODEL_H_PARAMS['features']
  # Everything except the last positional argument is a feature value.
  named_features = dict(zip(feature_names, args[:-1]))
  return named_features, [args[-1]]


def generate_output_types():
  """Returns the output types matching the tuples produced by the generator.

  Returns:
    A tuple with one tf.int32 entry per name in MODEL_H_PARAMS['features'],
    followed by a single tf.bool entry for the label.
  """
  # Every feature (token ids or 0/1 flags) is emitted as an integer.
  feature_types = (tf.int32,) * len(MODEL_H_PARAMS['features'])
  # The label comes last and is boolean.
  return feature_types + (tf.bool,)


def input_fn(partition, training, batch_size):
  """Generates an input_fn for the Estimator.

  Args:
    partition: One of 'train', 'test', and 'eval' for training, testing, and
      validation sets respectively
    training: If true, then shuffle the dataset and repeat it indefinitely, to
      add randomness between epochs
    batch_size: Number of elements to combine in a single batch

  Returns:
    The input function. When called, it returns a (features, labels) pair
    taken from a one-shot iterator over the batched dataset.

  Raises:
    ValueError: If partition is not one of 'train', 'test', or 'eval'.
  """
  known_partitions = ('train', 'test', 'eval')
  # Fail fast with a clear message instead of handing the Estimator an
  # input function that returns None.
  if partition not in known_partitions:
    raise ValueError('Unknown partition {!r}; expected one of {}'.format(
        partition, known_partitions))

  def _input_fn():
    # The DataFrames are looked up only when the input function is called.
    if partition == 'train':
      source = training_set
    elif partition == 'test':
      source = test_set
    else:
      source = eval_set

    dataset = tf.data.Dataset.from_generator(
        generator(source), generate_output_types())

    if training:
      # Shuffle before repeating so each pass over the data is reshuffled.
      dataset = dataset.shuffle(MODEL_H_PARAMS['SHUFFLE_BUFFER_SIZE'] *
                                batch_size).repeat()

    dataset = dataset.map(preprocess).batch(batch_size)
    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    features, labels = iterator.get_next()
    return features, labels

  return _input_fn


def generate_feature_columns(embed):
  """Builds one feature column per entry of MODEL_H_PARAMS['features'].

  Args:
    embed: If true, each categorical column is wrapped in an embedding column.

  Returns:
    A list with the feature columns, in the same order as the feature names.
  """
  vocab_sizes = MODEL_P_PARAMS['vocab_size_by_feature']

  def _build_column(name):
    """Returns the (optionally embedded) identity column for one feature."""
    # Text features get vocab_size + 1 buckets so that the extra id used for
    # missing values fits; all non-text features are booleans, so 2 buckets
    # are enough.
    bucket_count = vocab_sizes[name] + 1 if name in vocab_sizes else 2
    categorical = tf.feature_column.categorical_column_with_identity(
        key=name, num_buckets=bucket_count)
    if not embed:
      return categorical
    # The embedding width grows logarithmically with the bucket count.
    return tf.feature_column.embedding_column(
        categorical, dimension=math.ceil(math.log2(bucket_count)))

  return [_build_column(name) for name in MODEL_H_PARAMS['features']]

# Build Model

Set up an ensemble estimator combining a Linear estimator and a DNN

In [None]:
# Binary classification head; the same object is given to the ensemble and to
# both candidate estimators.
head = tf.estimator.BinaryClassHead()

# Zero-argument factory returning an Adam optimizer configured with the
# selected model's learning rate and gradient-norm clipping value. It is passed
# as a callable rather than as an optimizer instance -- presumably so each
# candidate builds its own optimizer; confirm against the Estimator docs.
adam = lambda: tf.keras.optimizers.Adam(
    learning_rate=MODEL_H_PARAMS['LEARNING_RATE'],
    clipnorm=MODEL_H_PARAMS['CLIP_NORM'])

# AdaNet ensemble over two candidates: a linear model on the raw categorical
# columns (embed=False) and a DNN on the embedded columns (embed=True).
estimator = adanet.AutoEnsembleEstimator(
    head=head,
    candidate_pool={
        'linear':
            tf.estimator.LinearEstimator(
                head=head,
                feature_columns=generate_feature_columns(False),
                optimizer=adam),
        'dnn':
            tf.estimator.DNNEstimator(
                head=head,
                hidden_units=MODEL_H_PARAMS['HIDDEN_UNITS'],
                feature_columns=generate_feature_columns(True),
                optimizer=adam,
                activation_fn=MODEL_H_PARAMS['ACTIVATION_FN'],
                dropout=MODEL_H_PARAMS['DROPOUT'],
                batch_norm=MODEL_H_PARAMS['DO_BATCH_NORM'])
    },
    max_iteration_steps=MODEL_H_PARAMS['MAX_ITERATION_STEPS'])

# Train Model

For demonstration purposes, we set *max_steps* to a small value so that the
training finishes fast. This is enough to achieve good results. Alternatively, you can remove the *max_steps* argument and let the estimator train to convergence.

In [None]:
# Train on the (shuffled, repeated) training partition for at most
# MAX_TRAIN_STEPS steps and evaluate on the validation partition.
# Only element [0] of the returned value is kept; the next code cell indexes it
# by metric name, so it is presumably the evaluation-metrics dictionary --
# confirm against the tf.estimator.train_and_evaluate documentation.
result = tf.estimator.train_and_evaluate(
    estimator,
    train_spec=tf.estimator.TrainSpec(
        input_fn=input_fn(
            'train', training=True, batch_size=MODEL_H_PARAMS['BATCH_SIZE']),
        max_steps=MODEL_H_PARAMS['MAX_TRAIN_STEPS']),
    eval_spec=tf.estimator.EvalSpec(
        input_fn=input_fn(
            'eval', training=False, batch_size=MODEL_H_PARAMS['BATCH_SIZE']),
        # steps=None places no step limit on the evaluation pass.
        steps=None,
        start_delay_secs=1,
        throttle_secs=1,
    ))[0]

# Model performance on validation set

In [None]:
# Report the validation metrics stored in `result` by the training cell.
print('AUC:', result['auc'], 'AUC_PR:', result['auc_precision_recall'],
      'Recall:', result['recall'], 'Precision:', result['precision'])

# Model performance on testing set

In [None]:
# Evaluate the trained estimator on the testing partition (no shuffling or
# repeating) and report the same four metrics as for the validation set.
ret = estimator.evaluate(
    input_fn('test', training=False, batch_size=MODEL_H_PARAMS['BATCH_SIZE']))
print('AUC:', ret['auc'], 'AUC_PR:', ret['auc_precision_recall'], 'Recall:',
      ret['recall'], 'Precision:', ret['precision'])