# Initialize Fiddler Client

In [1]:
import fiddler as fdl
client = fdl.FiddlerApi()

# Load dataset

In [2]:
import pandas as pd
df = pd.read_csv('/app/fiddler_samples/samples/datasets/imdb_rnn/imdb_rnn.csv')
df_schema = fdl.DatasetInfo.from_dataframe(df, max_inferred_cardinality=1000)

In [3]:
df.head()

Unnamed: 0,sentence,polarity
0,A real blow-up of the film literally. This Bri...,False
1,"I only wish that Return of the Jedi, have been...",True
2,"""I like cheap perfume better; it doesn't last ...",True
3,On the eighth day God created Georges. But the...,True
4,"No, this is not no Alice fairy tale my friends...",True


# Upload dataset

In [4]:
if 'imdb_rnn' not in client.list_datasets():
    upload_result = client.upload_dataset(
        dataset={'train': df}, 
        dataset_id='imdb_rnn')

# Create model schema

In [5]:
target = 'polarity'
feature_columns = ['sentence']
train_input = df[feature_columns]
train_target = df[target]

model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=client.get_dataset_info('imdb_rnn'),
    target=target, 
    features=feature_columns,
    display_name='Text IG',
    description='this is a tensorflow model using text data and IG enabled from tutorial',
    input_type=fdl.ModelInputType.TEXT
)

# Install Tensorflow if necessary

In [None]:
!pip install tensorflow==1.14

In [None]:
!pip install tensorflow-datasets==1.2.0

# Train model

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_target = le.fit_transform(train_target)
train_target = train_target.reshape(-1,1)

In [7]:
import tensorflow_datasets as tfds

vocab_size = 2000
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(train_input['sentence'],
                                                                  target_vocab_size=vocab_size)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [8]:
from unidecode import unidecode

def strip_accents_and_special_characters(s):
    return unidecode(s)

In [9]:
from tensorflow.keras.preprocessing import sequence

input_tokens = (train_input['sentence']
                .apply(lambda x: encoder.encode(
                    strip_accents_and_special_characters(x))))

max_seq_length = 300

input_tokens_padd = sequence.pad_sequences(
    input_tokens,
    maxlen=max_seq_length,
    padding="post"
)

In [10]:
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model

def RNN():
    inputs = Input(name='inputs', shape=[max_seq_length])
    layer = Embedding(vocab_size, 64, input_length=max_seq_length)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [11]:
from tensorflow.keras.optimizers import RMSprop

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 64)           128000    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257   

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
from tensorflow.keras.callbacks import EarlyStopping
model.fit(input_tokens_padd, train_target, batch_size=128, epochs=5,
          validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.001)])

Train on 22500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f861db7bd50>

# Save model and schema

In [13]:
import pathlib
import shutil
import pickle
import yaml
import tensorflow as tf

project_id = 'tutorial'
model_id = 'imdb_rnn_model'

# create temp dir
model_dir = pathlib.Path(model_id)
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# save model
tf.keras.experimental.export_saved_model(model, str(model_dir / 'saved_model'))

# save model schema
with open(model_dir / 'model.yaml', 'w') as yaml_file:
    yaml.dump({'model': model_info.to_dict()}, yaml_file)

# save tokenizer
with open(model_dir / 'tokenizer.pkl', 'wb') as tok_file:
    tok_file.write(pickle.dumps(encoder))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Train: ['train']


INFO:tensorflow:Signatures INCLUDED in export for Train: ['train']


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None






INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: ['eval']


INFO:tensorflow:Signatures INCLUDED in export for Eval: ['eval']






INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to save.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: imdb_rnn_model/saved_model/saved_model.pb


INFO:tensorflow:SavedModel written to: imdb_rnn_model/saved_model/saved_model.pb


# Write package.py and related wrappers

### Import related wrappers

We need to import 2 wrappers for tensorflow. Those files are stored in the utils directory.
- The tf_saved_model_wrapper.py file contains a wrapper to load and run a TF model from a saved_model path.
- The tf_saved_model_wrapper_ig.py file contains a wrapper to support Integrated Gradients (IG) computation for a TF model loaded from a saved_model path.

In [None]:
files = ['utils/tf_saved_model_wrapper.py', 'utils/tf_saved_model_wrapper_ig.py']
for f in files:
    shutil.copy(f, model_dir)

### Write package.py file

Next, we need to write the package.py file. This file contains functions to transform the input, generate the baseline and get the attributions.

The project_attributions() function uses functionalities from the cover_tokens.py file that we need to import as well.

In [26]:
%%writefile imdb_rnn_model/package.py

import numpy as np
import pathlib
import pickle
import logging
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from .cover_tokens import strip_accents_and_special_characters
from .cover_tokens import word_tokenizer
from .cover_tokens import cover_tokens
from .cover_tokens import regroup_attributions
from .tf_saved_model_wrapper_ig import TFSavedModelWrapperIg


PACKAGE_PATH = pathlib.Path(__file__).parent
SAVED_MODEL_PATH = PACKAGE_PATH / 'saved_model'
TOKENIZER_PATH = PACKAGE_PATH / 'tokenizer.pkl'

LOG = logging.getLogger(__name__)


class MyModel(TFSavedModelWrapperIg):
    def __init__(self, saved_model_path, sig_def_key, tokenizer_path,
                 is_binary_classification=False,
                 output_key=None,
                 batch_size=8,
                 output_columns=[],
                 input_tensor_to_differentiable_layer_mapping={},
                 max_allowed_error=None):
        """
        Class to load and run the IMDB RNN model.
        See: TFSavedModelWrapper

        """
        super().__init__(saved_model_path, sig_def_key,
                         is_binary_classification=is_binary_classification,
                         output_key=output_key,
                         batch_size=batch_size,
                         output_columns=output_columns,
                         input_tensor_to_differentiable_layer_mapping=
                         input_tensor_to_differentiable_layer_mapping,
                         max_allowed_error=max_allowed_error)
        with open(tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.max_seq_length = 300

    def transform_input(self, input_df):
        """
        Transform the provided dataframe into one that complies with the input
        interface of the model.

        Overrides the transform_input method of TFSavedModelWrapper.
        """

        input_tokens = (input_df['sentence']
                        .apply(lambda x: self.tokenizer.encode(
                                strip_accents_and_special_characters(x))))

        input_tokens = sequence.pad_sequences(input_tokens,
                                              maxlen=self.max_seq_length,
                                              padding="post"
                                             )

        return pd.DataFrame({'inputs': input_tokens.tolist()})

    def generate_baseline(self, input_df):

        input_tokens = input_df['sentence'].apply(lambda x:
                                                  self.tokenizer.encode(''))
        input_tokens = sequence.pad_sequences(input_tokens,
                                              maxlen=self.max_seq_length,
                                              padding="post"
                                             )

        return pd.DataFrame({'inputs': input_tokens.tolist()})

    def project_attributions(self, input_df, transformed_input_df,
                             attributions):
        """
        Maps the transformed input to original input space so that the
        attributions correspond to the features of the original input.
        Overrides the project_attributions method of TFSavedModelWrapper.
        """
        
        wordpiece_tokens = [self.tokenizer.decode([int(t)]) for t in
                            (transformed_input_df['inputs'][0])]

        word_tokens = word_tokenizer(
            strip_accents_and_special_characters(
                input_df['sentence'].iloc[0]))

        coverings = cover_tokens(word_tokens,
                                 wordpiece_tokens,
                                 num_fine_tokens_to_be_matched=
                                 self.max_seq_length)

        word_attributions = regroup_attributions(
            coverings,
            attributions['inputs'][0].astype(
                'float').tolist())
        if word_attributions:
            return {'embedding_input': [word_tokens, word_attributions]}
        else:
            LOG.info('Cover tokens failed.  Falling back to wordpiece tokens')
            return {'embedding_input': [wordpiece_tokens,
                                        attributions['inputs'
                                                     ][0].astype(
                                                     'float').tolist()
                                        ]}



def get_model():
    model = MyModel(
        SAVED_MODEL_PATH,
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
        TOKENIZER_PATH,
        is_binary_classification=True,
        batch_size=128,
        output_columns=['inputs'],
        input_tensor_to_differentiable_layer_mapping=
        {'inputs': 'embedding/embedding_lookup:0'},
        max_allowed_error=5)
    model.load_model()
    return model


Overwriting imdb_rnn_model/package.py


### Import cover_tokens.py file

The cover_tokens.py file regroups functions to get attributions for word tokens.

The idea is to regroup the wordpiece-level tokenization that we used in the model to a word-level tokenization.  

For example:  
sentence = 'coarse tokens fine.'  
coarse = word_tokenizer(sentence)
-> ['coarse', ' ', 'tokens', ' ', 'fine', '.']  
fine = imdb_rnn_tokenizer(sentence)
-> ['coa', 'rse', ' ', 'to', 'ken', 's ', 'fine', '.']  
cover_tokens(coarse, fine)  
-> [('coarse', ['coa', 'rse']),  
   (' ', [' ']),  
   ('tokens', ['to', 'ken', 's ']),  
   (' ', []),  
   ('fine', ['fine']),  
   ('.', ['.'])]  

Then the regroup_attributions() function takes this mapping of word-level to wordpiece-level and regroup the fine attributions to a word-level attribution.

For example:  
covering =[(“simple”, [“simple”]),  (“example”, [“exam#”, “#ple”])]  
fine_attributions = [0.1, 0.3. 0.4]  
regroup_attributions(covering, fine_attributions)
-> [ 0.1,  0.7 ]  

In [None]:
shutil.copy('utils/cover_tokens.py', model_dir)

# Upload model

In [27]:
client.delete_model(project_id, model_id)
client.upload_model_package(model_dir, project_id, model_id)

# Run model

In [29]:
prediction_input = train_input[:10]
result = client.run_model(project_id, model_id, prediction_input)
result

Unnamed: 0,inputs
0,0.148647
1,0.959517
2,0.155386
3,0.603902
4,0.951737
5,0.909023
6,0.151743
7,0.105851
8,0.15047
9,0.140991


# Get explanation

In [30]:
selected_point = df.head(1)

In [31]:
project_id = 'tutorial'
model_id = 'imdb_rnn_model'

ex_ig = client.run_explanation(
    project_id=project_id,
    model_id=model_id, 
    df=selected_point, 
    dataset_id='imdb_rnn',
    explanations='ig')