# Initialize Fiddler Client

In [1]:
import fiddler as fdl
# Create the Fiddler API client used by the later cells. No arguments are
# passed, so connection settings are not specified in this notebook --
# presumably FiddlerApi resolves them from its own configuration; TODO confirm.
client = fdl.FiddlerApi()

# Load dataset

In [2]:
import pandas as pd
# Read the IMDB sample and let Fiddler infer a dataset schema from it.
# NOTE(review): the path is absolute; adjust it if the samples live elsewhere.
df = pd.read_csv(
    '/app/fiddler_samples/samples/datasets/imdb_rnn/imdb_rnn.csv'
)
df_schema = fdl.DatasetInfo.from_dataframe(
    df,
    max_inferred_cardinality=1000,
)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,sentence,polarity
0,A real blow-up of the film literally. This Bri...,False
1,"I only wish that Return of the Jedi, have been...",True
2,"""I like cheap perfume better; it doesn't last ...",True
3,On the eighth day God created Georges. But the...,True
4,"No, this is not no Alice fairy tale my friends...",True


# Upload dataset

In [4]:
# Upload the frame as the 'train' split, but only when no dataset with this
# id is listed yet (upload_result stays undefined when the upload is skipped).
if not ('imdb_rnn' in client.list_datasets()):
    upload_result = client.upload_dataset(
        dataset_id='imdb_rnn',
        dataset={'train': df},
    )

# Create model schema

In [5]:
# Column roles: a single free-text feature and a boolean label.
target = 'polarity'
feature_columns = ['sentence']
train_input, train_target = df[feature_columns], df[target]

# Derive the model schema from the dataset schema stored in Fiddler.
model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=client.get_dataset_info('imdb_rnn'),
    target=target,
    features=feature_columns,
    display_name='Text IG',
    description=(
        'this is a tensorflow model using text data and IG enabled from tutorial'
    ),
    input_type=fdl.ModelInputType.TEXT,
)

# Train model

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Encode directly from the source column instead of from `train_target`
# itself. The previous form reassigned `train_target` in place, so re-running
# this cell fed the already-encoded (n, 1) array back into fit_transform and
# replaced le.classes_ with the encoded integers. Reading df[target] keeps the
# cell idempotent and yields the same (n, 1) array on the first run.
train_target = le.fit_transform(df[target]).reshape(-1, 1)

In [7]:
import tensorflow_datasets as tfds

# Requested (target) size for the subword vocabulary built from the corpus.
vocab_size = 2000
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    train_input['sentence'],
    target_vocab_size=vocab_size,
)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [8]:
from unidecode import unidecode

def strip_accents_and_special_characters(s):
    """Return ``s`` after passing it through ``unidecode``.

    :param s: Text to transliterate.
    :return: Whatever ``unidecode`` returns for ``s``.
    """
    transliterated = unidecode(s)
    return transliterated

In [9]:
from tensorflow.keras.preprocessing import sequence

max_seq_length = 300

# Transliterate each sentence, then map it to subword ids.
input_tokens = train_input['sentence'].apply(
    lambda text: encoder.encode(strip_accents_and_special_characters(text))
)

# Pad at the end up to max_seq_length. `truncating` is not passed, so longer
# sequences are cut according to the Keras default --
# NOTE(review): confirm this matches the inference-time preprocessing.
input_tokens_padd = sequence.pad_sequences(
    input_tokens, maxlen=max_seq_length, padding="post")

In [10]:
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model

def RNN():
    """Build the (uncompiled) Keras sentiment model.

    Architecture: token-id input of length ``max_seq_length`` -> 64-d
    embedding -> LSTM(64) -> Dense(256) + ReLU -> Dropout(0.2) -> Dense(1)
    + sigmoid.

    Reads the notebook globals ``max_seq_length`` and ``vocab_size``.

    NOTE(review): the embedding table has ``vocab_size`` rows, which is the
    *target* size given to the subword encoder -- confirm that
    ``encoder.vocab_size`` never exceeds it, otherwise token ids could fall
    outside the table.

    :return: A ``tensorflow.keras.models.Model`` instance.
    """
    token_ids = Input(name='inputs', shape=[max_seq_length])
    embedded = Embedding(vocab_size, 64, input_length=max_seq_length)(token_ids)
    sequence_state = LSTM(64)(embedded)
    hidden = Dense(256, name='FC1')(sequence_state)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)
    return Model(inputs=token_ids, outputs=probability)

In [11]:
from tensorflow.keras.optimizers import RMSprop

# Instantiate the network, print its layer table, then attach the training
# configuration (binary cross-entropy, RMSprop with default settings).
model = RNN()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 64)           128000    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257   

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
from tensorflow.keras.callbacks import EarlyStopping
# Hold out 10% of the rows for validation and monitor val_loss with an
# EarlyStopping callback (min_delta=0.001, other settings left at defaults).
model.fit(
    input_tokens_padd,
    train_target,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.001)],
)

Train on 22500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f861db7bd50>

# Save model and schema

In [13]:
import pathlib
import shutil
import pickle
import yaml
import tensorflow as tf

project_id = 'tutorial'
model_id = 'imdb_rnn_model'

# Start from an empty package directory named after the model id.
model_dir = pathlib.Path(model_id)
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# Export the trained Keras model in SavedModel format.
saved_model_dir = model_dir / 'saved_model'
tf.keras.experimental.export_saved_model(model, str(saved_model_dir))

# Write the model schema next to it.
with open(model_dir / 'model.yaml', 'w') as yaml_file:
    yaml.dump({'model': model_info.to_dict()}, yaml_file)

# Pickle the subword encoder so the package can tokenise at inference time.
with open(model_dir / 'tokenizer.pkl', 'wb') as tok_file:
    pickle.dump(encoder, tok_file)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Train: ['train']


INFO:tensorflow:Signatures INCLUDED in export for Train: ['train']


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None






INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: ['eval']


INFO:tensorflow:Signatures INCLUDED in export for Eval: ['eval']






INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: imdb_rnn_model/saved_model/saved_model.pb


INFO:tensorflow:SavedModel written to: imdb_rnn_model/saved_model/saved_model.pb


# Write package.py and related wrappers

In [14]:
%%writefile imdb_rnn_model/tf_saved_model_wrapper.py

import numpy as np
import pandas as pd
import tensorflow as tf
import logging


class TFSavedModelWrapper:
    """
    Loads a TensorFlow SavedModel into a session and runs batched
    predictions on pandas DataFrames.

    Subclasses must override transform_input.
    """

    def __init__(self, saved_model_path, sig_def_key, output_columns,
                 is_binary_classification=False, output_key=None,
                 batch_size=8):
        """
        Wrapper to load and run a TF model from a saved_model path.
        Models must extend this class in their package.py, and override the
        transform_input method.

        Args:
        :param saved_model_path: Path to the directory containing the TF
            model in SavedModel format.
            See: https://www.tensorflow.org/guide/saved_model#build_and_load_a_savedmodel

        :param sig_def_key: Key for the specific SignatureDef to be used for
            executing the model.
            See: https://www.tensorflow.org/tfx/serving/signature_defs#signaturedef_structure

        :param output_columns: List containing the names of the output
            column(s) that corresponds to the output of the model. If the
            model is a binary classification model then the number of output
            columns is one, otherwise, the number of columns must match the
            shape of the output tensor corresponding to the output key
            specified.

        :param is_binary_classification [optional]: Boolean specifying if the
            model is a binary classification model. If True, the number of
            output columns is one. The default is False.

        :param output_key [optional]: Key for the specific output tensor (
            specified in the SignatureDef) whose predictions must be explained.
            The output tensor must specify a differentiable output of the
            model. Thus, output tensors that are generated as a result of
            discrete operations (e.g., argmax) are disallowed. The default is
            None, in which case the first output listed in the SignatureDef is
            used. The 'saved_model_cli' can be used to view the output tensor
            keys available in the signature_def.
            See: https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel

        :param batch_size [optional]: the batch size for input into the model.
            Depends on model and instance config.
        """

        self.saved_model_path = saved_model_path
        self.sig_def_key = sig_def_key
        self.output_key = output_key
        self.output_columns = output_columns
        # The attributes below are populated by load_model().
        self.input_tensors = None
        self.output_tensor = None
        self.sess = None
        self.saved_model = None
        self.is_binary_classification = is_binary_classification
        self.batch_size = batch_size

    def load_model(self):
        """
        Loads the model and creates a session from the saved_model_path
        provided at initialization, then resolves the input tensors and the
        output tensor from the configured SignatureDef.

        Raises:
        - ValueError: if the model is flagged as binary classification but
            output_columns does not hold exactly one name, or if the output
            tensor cannot be retrieved from the graph.
        """
        # load the model
        self.sess = tf.Session()
        self.saved_model = tf.saved_model.loader.load(
            sess=self.sess, tags=['serve'],
            export_dir=str(self.saved_model_path))

        # Extract input and output tensors from the signature.
        sig = self.saved_model.signature_def[self.sig_def_key]
        self.input_tensors = sig.inputs

        # Fall back to the first output listed in the signature.
        if self.output_key is None:
            self.output_key = list(sig.outputs)[0]

        self.output_tensor = self.get_tensor(sig.outputs[self.output_key].name)
        if self.is_binary_classification:
            if len(self.output_columns) != 1:
                raise ValueError(f'Number of output columns should be one '
                                 f'for a binary classification model, '
                                 f'but length is {len(self.output_columns)} ')
            # output_tensor should either be of shape <batch, > or <batch, 2>
            output_tensor_shape = self.output_tensor.shape.as_list()
            logging.info(f'Output tensor shape is {output_tensor_shape}')
            # For a <batch, 2> output keep only column 1 so that a single
            # value per row remains.
            if len(output_tensor_shape) == 2:
                if output_tensor_shape[1] == 2:
                    self.output_tensor = self.output_tensor[:, 1]

    def transform_input(self, input_df):
        """
        Hook that converts the raw input DataFrame into the DataFrame whose
        columns are keyed by the SignatureDef input names. Must be overridden.
        """
        raise NotImplementedError('Please implement transform_input in package.py')

    def predict(self, input_df):
        """
        Returns predictions for the provided inputs.

        Args:
        :param input_df: DataFrame corresponding to the dataset yaml
            associated with the project. Specifically, the columns in the
            DataFrame must correspond to the feature names mentioned in the
            yaml.

        Returns:
        - predictions_df: Pandas DataFrame with predictions for the provided
            inputs. The columns of the DataFrame are the provided set of output
            columns.
        """

        transformed_input_df = self.transform_input(input_df)
        predictions = []
        # Feed the transformed rows through the session batch_size at a time.
        for ind in range(0, len(transformed_input_df), self.batch_size):
            df_chunk = transformed_input_df.iloc[ind: ind + self.batch_size]
            feed = self.get_feed_dict(df_chunk)

            with self.sess.as_default():
                predictions.extend(
                    self.sess.run(self.output_tensor, feed).tolist())
        return pd.DataFrame(predictions, columns=self.output_columns)

    def get_tensor(self, name):
        """
        Returns the tensor called `name` from the session graph.

        Raises:
        - ValueError: if the lookup fails. The message lists the node names
            present in the graph and the original error is chained as the
            cause.
        """
        try:
            return self.sess.graph.get_tensor_by_name(name)
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit propagate untouched. The node list is only built on
            # this failure path because serialising the graph is not needed
            # for a successful lookup.
            names = [t.name for t in self.sess.graph.as_graph_def().node]
            raise ValueError(
                f'name: {name} not in graph: {names}') from err

    def get_feed_dict(self, input_df):
        """
        Returns the input dictionary to be fed to the TensorFlow graph given
        input_df which is a pandas DataFrame. The input_df DataFrame is
        obtained after applying transform_input on the raw input. The
        transform_input function is extended in package.py.

        Raises:
        - RuntimeError: if input_df lacks a column for one of the signature's
            input keys, or if a column's values do not match the shape
            declared for that input tensor.
        """

        feed = {}
        for key, tensor_info in self.input_tensors.items():
            if key not in input_df.columns:
                raise RuntimeError(f'Transformed input does not have a '
                                   f'column corresponding to the input tensor '
                                   f'key: {key} specified in the SignatureDef. Input col: {input_df.columns}')
            feed_inp = input_df[key].tolist()
            feed_inp_shape = np.array(feed_inp).shape
            expected_shape = self.get_shape(tensor_info.tensor_shape)
            if not self.match_shape(feed_inp_shape, expected_shape):
                raise RuntimeError(f'Shape mismatch for input tensor {key}. '
                                   f'Got: {feed_inp_shape}, Want '
                                   f'{expected_shape}')
            # The feed is keyed by the graph tensor name, not the signature key.
            feed[tensor_info.name] = feed_inp
        return feed

    @staticmethod
    def get_shape(tensor_shape):
        """
        Returns shape of tensor having tensor shape in the format returned by
        the SignatureDef
        """
        return [d.size for d in tensor_shape.dim]

    @staticmethod
    def get_shape_tensor(tensor_shape):
        """
        Returns shape of tensor having tensor shape in the format of the
        tf.TensorShape class. Dimensions whose value is None are reported
        as -1.
        """
        return [d.value if d.value is not None else -1 for d in
                tensor_shape.dims]

    @staticmethod
    def match_shape(got, want):
        """
        Returns True when `got` and `want` have the same number of dimensions
        and every dimension agrees; a -1 on either side matches anything.
        """
        if len(got) != len(want):
            return False
        for i, v in enumerate(got):
            if want[i] != -1 and want[i] != v and v != -1:
                return False
        return True


Writing imdb_rnn_model/tf_saved_model_wrapper.py


In [15]:
%%writefile imdb_rnn_model/tf_saved_model_wrapper_ig.py

from .tf_saved_model_wrapper import TFSavedModelWrapper
import tensorflow as tf
import logging


class TFSavedModelWrapperIg(TFSavedModelWrapper):
    def __init__(self, saved_model_path, sig_def_key, output_columns,
                 is_binary_classification=False,
                 output_key=None,
                 batch_size=8,
                 input_tensor_to_differentiable_layer_mapping=None,
                 max_allowed_error=None):
        """
        Wrapper to support Integrated Gradients (IG) computation for a TF
        model loaded from a saved_model path.

        See: https://github.com/ankurtaly/Integrated-Gradients

        Models must extend this class in their  package.py, and override the
        transform_input and the project_attributions methods.

        Args:
        :param input_tensor_to_differentiable_layer_mapping [optional]:
            Dictionary that maps input tensors to the first differentiable
            layer/tensor in the graph they are attached to. For instance,
            in a text model, an input tensor containing token ids
            may not be differentiable but may feed into an embedding tensor.
            Such an input tensor must be mapped to the corresponding
            embedding tensor in this dictionary.

            An input tensor that is directly differentiable may be mapped to
            itself. Input tensors that are absent from the dictionary are
            skipped by load_model: no differentiable tensor and no gradient
            tensor is registered for them.

            For each differentiable tensor, the first dimension must be the
            batch dimension. If <k1, …, kn> is the shape of the input then the
            differentiable tensor must either have the same shape or the shape
            <k1, …, kn, d>.

            The default is None, which is treated as an empty dictionary.

        :param max_allowed_error: Float specifying a percentage value
            for the maximum allowed integral approximation error for IG
            computation. If None then IG will be  calculated for a
            pre-determined number of steps. Otherwise, the number of steps
            will be increased till the error is within the specified limit.
            (This class only stores the value.)

        The remaining parameters are forwarded unchanged to
        TFSavedModelWrapper.__init__.
        """

        super().__init__(saved_model_path, sig_def_key,
                         output_columns=output_columns,
                         is_binary_classification=is_binary_classification,
                         output_key=output_key,
                         batch_size=batch_size)

        # A None default (rather than a `{}` literal in the signature) avoids
        # sharing one dict object between every instance built with defaults.
        if input_tensor_to_differentiable_layer_mapping is None:
            input_tensor_to_differentiable_layer_mapping = {}
        self.input_tensor_to_differentiable_layer_mapping = \
            input_tensor_to_differentiable_layer_mapping

        # mapping from each input tensor to its differentiable version
        self.differentiable_tensors = {}

        # mapping each output column to a dictionary of gradients tensors.
        self.gradient_tensors = {}
        self.steps = 10  # no of steps for ig calculation
        self.ig_enabled = True
        self.max_allowed_error = max_allowed_error

    def load_model(self):
        """
        Extends load model defined in the TFSavedModelWrapper class.

        After the base class has loaded the graph, resolves the
        differentiable tensor for every mapped input key, validates its shape
        against the input tensor, and builds one gradient tensor per
        (output column, input key) pair.

        Raises:
        - ValueError: if a mapped tensor cannot be retrieved from the graph
            or its shape is incompatible with the input tensor's shape.
        """
        super().load_model()

        mapping = self.input_tensor_to_differentiable_layer_mapping
        for key, tensor_info in self.input_tensors.items():
            if key not in mapping:
                # Unmapped inputs get no differentiable tensor.
                continue

            differentiable_tensor = self.get_tensor(mapping[key])
            # shape check
            diff_tensor_shape = \
                self.get_shape_tensor(differentiable_tensor.shape)
            input_tensor_shape = self.get_shape(tensor_info.tensor_shape)

            logging.info(f'For key {key} differentiable tensor shape is '
                         f'{diff_tensor_shape} input tensor shape is '
                         f'{input_tensor_shape}')
            if not self._validate_differentiable_tensor_shape(
                    diff_tensor_shape, input_tensor_shape):
                raise ValueError(f'Shape of differentiable tensor '
                                 f'{diff_tensor_shape} doesnt follow rule '
                                 f'"If <k1, …, kn> is the shape of the '
                                 f'input then the differentiable tensor '
                                 f'must either have the same shape or the '
                                 f'shape <k1, …, kn, d>". Shape of input '
                                 f'tensor is {input_tensor_shape}')
            self.differentiable_tensors[key] = differentiable_tensor

        if self.is_binary_classification:
            # Single output column: differentiate the whole output tensor.
            self.gradient_tensors[self.output_columns[0]] = {}
            for key, tensor in self.differentiable_tensors.items():
                self.gradient_tensors[self.output_columns[0]][key] = \
                    tf.gradients(self.output_tensor, tensor)
        else:
            # One output column per index along the output's second axis.
            for index, column in enumerate(self.output_columns):
                self.gradient_tensors[column] = {}
                for key, tensor in self.differentiable_tensors.items():
                    self.gradient_tensors[column][key] = \
                        tf.gradients(self.output_tensor[:, index], tensor)

    def generate_baseline(self, input_df):
        """Hook for producing the IG baseline input. Must be overridden."""
        raise NotImplementedError('Please implement generate_baseline in '
                                  'package.py')

    def project_attributions(self, input_df, transformed_input_df,
                             attributions):
        """Hook for mapping attributions back onto the raw input. Must be
        overridden."""
        raise NotImplementedError('Please implement project_attributions in '
                                  'package.py')

    def _validate_differentiable_tensor_shape(self,
                                              differentiable_tensor_shape,
                                              input_tensor_shape):
        """
        Returns True when the differentiable tensor's shape equals the input
        tensor's shape, or equals it with exactly one extra trailing
        dimension (dimension comparison follows match_shape).
        """
        diff_len = len(differentiable_tensor_shape)
        input_len = len(input_tensor_shape)
        if diff_len == input_len:
            return self.match_shape(differentiable_tensor_shape,
                                    input_tensor_shape)
        elif diff_len - input_len == 1:
            # Ignore the trailing (e.g. embedding) dimension.
            return self.match_shape(differentiable_tensor_shape[:-1],
                                    input_tensor_shape)

        return False


Writing imdb_rnn_model/tf_saved_model_wrapper_ig.py


In [16]:
%%writefile imdb_rnn_model/cover_tokens.py

import numpy as np
from typing import Callable
from unidecode import unidecode


def strip_accents_and_special_characters(s):
    """Return ``s`` after passing it through ``unidecode``.

    :param s: Text to transliterate.
    :return: Whatever ``unidecode`` returns for ``s``.
    """
    transliterated = unidecode(s)
    return transliterated


def one_split(in_strings: list,
              split_string: str,
              strip_whitespace: bool) -> list:
    """Split every string in a list on one separator, keeping the separator.

    Each input string is cut at every occurrence of ``split_string``. The
    separator itself is emitted as its own token between consecutive pieces,
    unless it is empty (or, with ``strip_whitespace``, whitespace-only).
    Empty pieces are dropped.

    :param in_strings: List of strings to be broken into substrings.
    :param split_string: The separator string to divide on.
    :param strip_whitespace: If true, strip leading/trailing whitespace from
        each piece before it is kept.
    :return: A flat list of pieces and separators, in original order.
    """
    # A separator that is blank after (optional) stripping is not emitted.
    delimiter_probe = split_string.strip() if strip_whitespace else split_string
    emit_delimiter = bool(delimiter_probe)

    tokens = []
    for text in in_strings:
        fragments = text.split(split_string)
        final_index = len(fragments) - 1
        for position, fragment in enumerate(fragments):
            token = fragment.strip() if strip_whitespace else fragment
            if token:
                tokens.append(token)
            # The separator goes between fragments, never after the last one.
            if emit_delimiter and position != final_index:
                tokens.append(split_string)
    return tokens


def multi_split(in_strings: str,
                split_strings: tuple,
                strip_whitespace: bool) -> list:
    """
    Split one string at each of several separators, applied in order.

    The input is wrapped in a single-element list and then passed through
    one_split once per separator, each pass operating on the output of the
    previous one.

    :param in_strings: The string to be broken into substrings.
    :param split_strings: Sequence of separator strings to divide on.
    :param strip_whitespace: Remove leading/trailing whitespace from
        tokens?
    :return: A list of (probably) smaller strings.
    """
    pieces = [in_strings]
    for delimiter in split_strings:
        pieces = one_split(pieces, delimiter, strip_whitespace)
    return pieces


def word_tokenizer(raw_string: str,
                   delimiters: tuple =
                   (' ', '.', ',', '>', '!', ';', ':', '--'),
                   strip_whitespace: bool = False) -> list:
    """
    Coarse tokenizer: cut a string on spaces and assorted punctuation,
    delegating to multi_split.

    :param raw_string: string to tokenize
    :param delimiters: [(' ', '.', ',', '>', '!', ';', ':', '--')]
        separators to split on, applied in the given order.
    :param strip_whitespace: [False] Remove leading/trailing whitespace
        from tokens?
    :return: List of substrings
    """
    return multi_split(in_strings=raw_string,
                       split_strings=delimiters,
                       strip_whitespace=strip_whitespace)


def cover_tokens(coarse_grained_tokens: list,
                 fine_grained_tokens: list,
                 num_fine_tokens_to_be_matched=None) -> list:
    """
    Given two tokenizations of a sentence -- one coarse-grained (e.g.
    word-level tokenization), and one fine-grained (e.g., wordpiece-
    level tokenization), this method returns a covering of the
    coarse-grained tokens with fine-grained tokens.

    The concatenation of the lists of fine-grained tokens assigned to
    each coarse-grained token reproduces, in order, the fine-grained
    tokens that were drawn while matching. Note that matching stops as
    soon as the coarse-grained tokens are exhausted: fine-grained tokens
    that have not been drawn by then are not included in any covering
    entry.

    Additionally, the fine-grained tokens may include additional
    characters (which some tokenizers create), but MUST CONTAIN all the
    characters from the concatenated coarse-grained tokens in the same
    order (until num_fine_tokens_to_be_matched fine-tokens have been
    processed, if specified).

    Further, a fine-grained token belongs to one and only one
    coarse-grained token: the one that is active when the fine-grained
    token is drawn. It doesn't need to end in the same coarse-grained
    token... this helps to accommodate tokenizers that may split
    whitespace and punctuation differently.

    The method returns None if a suitable covering cannot be defined.

    Example:

      sentence = 'coarse tokens fine.'

      coarse = word_tokenizer(sentence)
      # ['coarse', ' ', 'tokens', ' ', 'fine', '.']

      fine = ['coa', 'rse', ' ', 'to', 'ken', 's ', 'fine', '.']
      # (e.g. the output of a subword tokenizer)

      cover_tokens(coarse, fine)

      # [['coarse', ['coa', 'rse']],
      #  [' ', [' ']],
      #  ['tokens', ['to', 'ken', 's ']],
      #  [' ', []],
      #  ['fine', ['fine']],
      #  ['.', ['.']]]

    Notice that 's ' in the fine tokenization straddles two coarse-
    tokens 'tokens' and ' '. It is associated with the first, but still
    satisfies the requirement that the second is character-for-
    character matched.

    :param coarse_grained_tokens: List with tokens from a
    coarse-grained tokenization (e.g., word-level tokenization) of the
    input sentence

    :param fine_grained_tokens: List with tokens from a
    fine-grained tokenization (e.g., wordpiece-level or character-level
    tokenization) of the  input sentence.

    :param num_fine_tokens_to_be_matched: [Default None] If None (or any
    other falsy value such as 0), require all characters in
    coarse-grained tokens to be matched. If this is passed a positive
    integer, the covering built so far is also accepted when the
    fine-grained tokens run out after at least this many of them have
    been processed. Helpful when model takes a specific number of input
    tokens.

    :returns token_covering: List of two-element lists where the i^th
    entry consists of the i^th coarse-grained token followed by a list
    of fine-grained tokens it maps to; None if covering isn't possible.
    An empty coarse-grained token list yields an empty list.
    """
    coverings = []
    num_fine_tokens_processed = 0

    fine_token_iter = iter(fine_grained_tokens)
    coarse_token_iter = iter(coarse_grained_tokens)

    # These init values will kick-off the draw loops for coarse and fine chars:
    # both char iterators start exhausted and None == None counts as a "match",
    # so the first pass draws the first token of each kind.
    coarse_char_iter = iter('')
    fine_char_iter = iter('')
    coarse_char = None
    fine_char = None

    # This while loop compares coarse and fine tokens, one character at a time.
    # If they match, both step; if not, only the fine character steps. This
    # allows the algorithm to skip extra characters that the fine tokenizer
    # might have added.
    #
    # Each time a new fine-grained token is drawn, it is added to the current
    # active coarse-grained token right away.
    #
    # If a coarse-grained token is consumed, a new one is picked up and a new
    # covering entry is created for it.  Any partially processed fine tokens
    # will continue to match characters in the new coarse token.  However
    # the fine-token will continue to be associated with only the previous
    # covering.  See the docstring example for a straddling case with the
    # 's ' fine-grained token.

    while True:
        if coarse_char == fine_char:
            while True:  # Draw coarse_char until valid
                try:
                    coarse_char = next(coarse_char_iter)
                    break
                except StopIteration:  # Need a new token
                    try:
                        coarse_token = next(coarse_token_iter)
                        coverings.append([coarse_token, []])
                        coarse_char_iter = iter(coarse_token)
                    except StopIteration:  # End of coarse tokens
                        # Every coarse character was matched: success.
                        # Fine tokens not drawn yet are left out.
                        return coverings

        # Increment fine_char whether or not there was a match.
        while True:  # Draw fine_char until valid
            try:
                fine_char = next(fine_char_iter)
                break
            except StopIteration:  # Need a new token
                try:
                    fine_token = next(fine_token_iter)
                    num_fine_tokens_processed += 1
                    coverings[-1][1].append(fine_token)
                    fine_char_iter = iter(fine_token)
                except StopIteration:  # End of fine tokens
                    # Coarse characters remain unmatched. Accept the partial
                    # covering only if the caller set a (truthy) threshold
                    # and that many fine tokens were processed.
                    if (num_fine_tokens_to_be_matched and
                            num_fine_tokens_processed >=
                            num_fine_tokens_to_be_matched):
                        return coverings
                    else:
                        return None


def regroup_attributions(coverings: list, fine_attributions: list) -> list:
    """
    Sums fine-grained attributions into one value per coarse-grained token.

    The fine attributions are consumed left to right: each covering takes
    the next len(fine_tokens) values, so fine_attributions must be ordered
    like the concatenation of the fine-token lists in coverings.

    Example:

        coverings = [("simple", ["simple"]),
                     ("example", ["exam#", "#ple"])]

        fine_attributions = [0.1, 0.3, 0.4]

        regroup_attributions(coverings, fine_attributions)

        # [0.1, 0.7] <- one for each coarse token

    :param coverings: List of (coarse_token, fine_tokens) pairs grouping
        fine tokens under the coarse token that covers them.  None (which
        cover_tokens returns when it cannot build a covering) or an empty
        list produces an empty result instead of raising.
    :param fine_attributions: List of attribution values, one for
        each entry in the concatenated covering list.
    :return: A list of summed attributions, one for each pair in
        coverings.  A coarse token that covers no fine tokens gets 0.0.
    """
    # Nothing to regroup: lets callers test the result for truthiness and
    # fall back, rather than crashing on a failed covering.
    if not coverings:
        return []

    coarse_attributions = []
    offset = 0

    for _coarse_token, fine_tokens_covered in coverings:
        num_tokens = len(fine_tokens_covered)
        if num_tokens:
            total = sum(fine_attributions[offset:offset + num_tokens])
        else:
            # Keep a float placeholder for coarse tokens without fine tokens.
            total = 0.
        coarse_attributions.append(total)
        offset += num_tokens
    return coarse_attributions


Writing imdb_rnn_model/cover_tokens.py


In [26]:
%%writefile imdb_rnn_model/package.py

import numpy as np
import pathlib
import pickle
import logging
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from .cover_tokens import strip_accents_and_special_characters
from .cover_tokens import word_tokenizer
from .cover_tokens import cover_tokens
from .cover_tokens import regroup_attributions
from .tf_saved_model_wrapper_ig import TFSavedModelWrapperIg


# Directory that contains this module; packaged artifacts are resolved
# relative to it.
PACKAGE_PATH = pathlib.Path(__file__).parent
# Saved-model location inside the package directory.
SAVED_MODEL_PATH = PACKAGE_PATH.joinpath('saved_model')
# Pickled tokenizer file inside the package directory.
TOKENIZER_PATH = PACKAGE_PATH.joinpath('tokenizer.pkl')

# Module-level logger named after this module.
LOG = logging.getLogger(__name__)


class MyModel(TFSavedModelWrapperIg):
    """
    IMDB RNN model wrapper: tokenizes sentences for the saved model and maps
    token attributions back onto the words of the original sentence.
    """

    def __init__(self, saved_model_path, sig_def_key, tokenizer_path,
                 is_binary_classification=False,
                 output_key=None,
                 batch_size=8,
                 output_columns=[],
                 input_tensor_to_differentiable_layer_mapping={},
                 max_allowed_error=None):
        """
        Class to load and run the IMDB RNN model.
        See: TFSavedModelWrapper

        Every argument except tokenizer_path is forwarded unchanged to the
        parent constructor.  The list/dict defaults are only forwarded and
        are never mutated in this class.

        :param tokenizer_path: Path of the pickled tokenizer.  The loaded
            object is used through its encode() and decode() methods.
        """
        super().__init__(saved_model_path, sig_def_key,
                         is_binary_classification=is_binary_classification,
                         output_key=output_key,
                         batch_size=batch_size,
                         output_columns=output_columns,
                         input_tensor_to_differentiable_layer_mapping=
                         input_tensor_to_differentiable_layer_mapping,
                         max_allowed_error=max_allowed_error)
        # NOTE: unpickling executes code embedded in the file, so only load
        # a tokenizer file that comes from a trusted model package.
        with open(tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        # Every encoded sentence is padded to this many token ids.
        self.max_seq_length = 300

    def _pad_token_ids(self, token_id_lists):
        """
        Pad token-id lists to max_seq_length (padding appended at the end)
        and wrap them in a single-column 'inputs' dataframe.
        """
        # NOTE(review): no `truncating` argument is passed, so the Keras
        # default applies to sequences longer than max_seq_length — confirm
        # that default matches the preprocessing used for training and the
        # token alignment done in project_attributions.
        padded = sequence.pad_sequences(token_id_lists,
                                        maxlen=self.max_seq_length,
                                        padding="post"
                                        )
        return pd.DataFrame({'inputs': padded.tolist()})

    def transform_input(self, input_df):
        """
        Transform the provided dataframe into one that complies with the input
        interface of the model.

        Overrides the transform_input method of TFSavedModelWrapper.

        :param input_df: Dataframe with a 'sentence' column of raw text.
        :return: Dataframe with one 'inputs' column holding, per row, a list
            of max_seq_length token ids.
        """

        input_tokens = (input_df['sentence']
                        .apply(lambda x: self.tokenizer.encode(
                                strip_accents_and_special_characters(x))))

        return self._pad_token_ids(input_tokens)

    def generate_baseline(self, input_df):
        """
        Build the baseline input: the encoding of the empty string, padded
        to max_seq_length, repeated once per row of input_df.

        :param input_df: Dataframe with a 'sentence' column; only its row
            count matters here.
        :return: Dataframe with one 'inputs' column, shaped like the output
            of transform_input.
        """

        input_tokens = input_df['sentence'].apply(lambda x:
                                                  self.tokenizer.encode(''))

        return self._pad_token_ids(input_tokens)

    def project_attributions(self, input_df, transformed_input_df,
                             attributions):
        """
        Maps the transformed input to original input space so that the
        attributions correspond to the features of the original input.
        Overrides the project_attributions method of TFSavedModelWrapper.

        Only the first row of each argument is used.

        :param input_df: Dataframe with the original 'sentence' column.
        :param transformed_input_df: Output of transform_input for input_df.
        :param attributions: Mapping whose 'inputs' entry holds per-token
            attribution arrays, indexed by row.
        :return: {'embedding_input': [tokens, attributions]} where tokens are
            the words of the sentence when they could be aligned with the
            wordpiece tokens, and the wordpiece tokens otherwise.
        """

        # Decode each id on its own so there is one string per model token.
        wordpiece_tokens = [self.tokenizer.decode([int(t)]) for t in
                            (transformed_input_df['inputs'][0])]
        wordpiece_attributions = attributions['inputs'][0].astype(
            'float').tolist()

        word_tokens = word_tokenizer(
            strip_accents_and_special_characters(
                input_df['sentence'].iloc[0]))

        coverings = cover_tokens(word_tokens,
                                 wordpiece_tokens,
                                 num_fine_tokens_to_be_matched=
                                 self.max_seq_length)

        # cover_tokens returns None when the two tokenizations cannot be
        # aligned.  Guard here so that case reaches the wordpiece fallback
        # below instead of failing inside regroup_attributions.
        word_attributions = (
            regroup_attributions(coverings, wordpiece_attributions)
            if coverings else None)

        if word_attributions:
            return {'embedding_input': [word_tokens, word_attributions]}

        LOG.info('Cover tokens failed.  Falling back to wordpiece tokens')
        return {'embedding_input': [wordpiece_tokens,
                                    wordpiece_attributions]}



def get_model():
    """
    Build the packaged IMDB RNN model wrapper and load its saved model.

    :return: The MyModel instance, after load_model() has been called on it.
    """
    serving_key = (tf.saved_model.signature_constants
                   .DEFAULT_SERVING_SIGNATURE_DEF_KEY)
    # Input tensor name -> name of the layer used for differentiation.
    layer_mapping = {'inputs': 'embedding/embedding_lookup:0'}
    options = dict(
        is_binary_classification=True,
        batch_size=128,
        output_columns=['inputs'],
        input_tensor_to_differentiable_layer_mapping=layer_mapping,
        max_allowed_error=5,
    )

    loaded_model = MyModel(SAVED_MODEL_PATH, serving_key, TOKENIZER_PATH,
                           **options)
    loaded_model.load_model()
    return loaded_model


Overwriting imdb_rnn_model/package.py


# Upload model

In [27]:
# Presumably clears an earlier upload so the package can be uploaded again —
# TODO confirm delete_model tolerates a model that does not exist yet.
# project_id, model_id and model_dir must already be defined at this point.
client.delete_model(project_id, model_id)
client.upload_model_package(model_dir, project_id, model_id)

# Run model

In [29]:
# Score the first ten rows of the training input with the uploaded model.
prediction_input = train_input[:10]
result = client.run_model(project_id, model_id, prediction_input)
# Bare last expression so the notebook displays the returned result.
result

Unnamed: 0,inputs
0,0.148647
1,0.959517
2,0.155386
3,0.603902
4,0.951737
5,0.909023
6,0.151743
7,0.105851
8,0.15047
9,0.140991


# Get explanation

In [30]:
# Single example to explain: the first row of the full dataframe.
selected_point = df.head(1)

In [31]:
# NOTE(review): project_id and model_id are already used by the upload and
# run-model cells above, so these assignments only matter if no earlier cell
# defines them — confirm, and consider keeping them in one config cell.
project_id = 'tutorial'
model_id = 'imdb_rnn_model'

# Request an explanation for the selected row against the 'imdb_rnn'
# dataset; 'ig' presumably selects Integrated Gradients — TODO confirm.
ex_ig = client.run_explanation(
    project_id=project_id,
    model_id=model_id, 
    df=selected_point, 
    dataset_id='imdb_rnn',
    explanations='ig')