Based on *Working with Hugging Face Transformers and TF 2.0* by Akash Desarda ([link](https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a)).

The following is a general pipeline for any transformer model:

Tokenizer definition → Tokenization of Documents → Model Definition → Model Training → Inference

In [19]:
#tokenizer definition
from transformers import DistilBertTokenizer, TFDistilBertModel
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig
import tensorflow as tf


distil_bert = 'distilbert-base-uncased'  # Pick any desired pre-trained model

# Define the DistilBERT tokenizer.
# Only constructor options belong here. `add_special_tokens`, `max_length` and
# `pad_to_max_length` are per-call encoding options; they are supplied to
# `encode_plus` at the point where sentences are actually encoded.
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True)


In [20]:
#tokenization 

def tokenize(sentences, tokenizer, max_length=128):
    """Encode sentences into fixed-length integer arrays.

    Each sentence is passed to ``tokenizer.encode_plus`` with special tokens
    enabled and padding up to ``max_length``; the ``input_ids``,
    ``attention_mask`` and ``token_type_ids`` entries of every result are
    collected.

    Args:
        sentences: Iterable of sentences to encode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            the keys ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_length: Length every encoded sequence is padded to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, input_masks, input_segments)`` of ``int32``
        NumPy arrays, one row per sentence.
    """
    # Imported locally because no notebook cell imports these names; without
    # this the function raises NameError on a freshly started kernel.
    import numpy as np
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def progress(iterable):
            return iterable

    input_ids, input_masks, input_segments = [], [], []
    for sentence in progress(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), \
           np.asarray(input_masks, dtype='int32'), \
           np.asarray(input_segments, dtype='int32')


In [21]:
#model definition
distil_bert = 'distilbert-base-uncased'
MAX_LEN = 128     # input sequence length; must match the max_length used when tokenizing
NUM_CLASSES = 6   # width of the softmax output layer

# Start from the checkpoint's own configuration and override only the dropout
# rates, so the architecture settings always match the pre-trained weights
# being loaded (a bare DistilBertConfig() would use library defaults instead).
config = DistilBertConfig.from_pretrained(distil_bert,
                                          dropout=0.2,
                                          attention_dropout=0.2,
                                          output_hidden_states=False)
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

input_ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='masked_token', dtype='int32')

# First element of the transformer output: one vector per input position.
embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
# Keep only the vector at position 0 of every sequence as the sentence representation.
cls_token = embedding_layer[:, 0, :]
X = tf.keras.layers.BatchNormalization()(cls_token)
X = tf.keras.layers.Dense(192, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

# Freeze the pre-trained transformer so only the classification head is trained.
# Done by reference rather than by slicing model.layers, which silently depends
# on the order in which Keras lists the layers.
transformer_model.trainable = False

In [23]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the Keras model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB ((None, 128, 768),)  66362880    input_token[0][0]                
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 768)]        0           tf_distil_bert_model[0][0]       
______________________________________________________________________________________________

## Next example

In [31]:

import tensorflow_datasets
from transformers import glue_convert_examples_to_features, TFBertForSequenceClassification
from transformers import (TFBertModel, BertTokenizer,
                         TFGPT2Model, GPT2Tokenizer)

# Fine-tune BERT for sequence classification on the GLUE/MRPC task.
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# NOTE(review): the recorded run of this cell stopped with an UnknownError
# ("Failed to rename ... .incomplete...") while the dataset was being prepared.
# That looks environmental (file-system access) rather than a code problem;
# confirm before re-running.
data = tensorflow_datasets.load("glue/mrpc")
train_dataset = glue_convert_examples_to_features(data["train"], tokenizer,
                                                  max_length=128, task='mrpc')
# The converted dataset yields single examples; Keras `fit` expects batches,
# so shuffle and batch before training.
train_dataset = train_dataset.shuffle(100).batch(32)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# The loss is configured with from_logits=True, i.e. it treats the model outputs as raw scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(train_dataset, epochs=3)


INFO:absl:Load pre-computed datasetinfo (eg: splits) from bucket.
INFO:absl:Loading info from GCS for glue/mrpc/0.0.2
INFO:absl:Generating dataset glue (C:\Users\Fotis\tensorflow_datasets\glue\mrpc\0.0.2)


[1mDownloading and preparing dataset glue (1.43 MiB) to C:\Users\Fotis\tensorflow_datasets\glue\mrpc\0.0.2...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

INFO:absl:URL https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc already downloaded: reusing C:\Users\Fotis\tensorflow_datasets\downloads\fire.goog.com_v0_b_mtl-sent-repr.apps.com_o_2Flx13Z9gbmX_ZBgreDsI8T8McuyJqVdG9ShusR064Hcc.tsvalt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc.
INFO:absl:URL https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt already downloaded: reusing C:\Users\Fotis\tensorflow_datasets\downloads\dl.fbaip.com_sente_sente_msr_parap_trainYKmwkIRSjwZz7t7itpy5QZIPC4zQ7szvxGSph2hFf4k.txt.
INFO:absl:URL https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt already downloaded: reusing C:\Users\Fotis\tensorflow_datasets\downloads\dl.fbaip.com_sente_sente_msr_parap_testoE4nEJCHmqumQj1luUlQwIkphYfZwIS_nNdDm9eF94Q.txt.
INFO:absl:Generating split train
INFO:absl:Writing TFRecords








HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, description='Shuffling...', max=1.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Reading...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=0.0, description='Writing...', max=3668.0, style=ProgressStyle(description_…

INFO:absl:Generating split validation
INFO:absl:Writing TFRecords




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, description='Shuffling...', max=1.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Reading...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=0.0, description='Writing...', max=408.0, style=ProgressStyle(description_w…

INFO:absl:Generating split test
INFO:absl:Writing TFRecords




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, description='Shuffling...', max=1.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Reading...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=0.0, description='Writing...', max=1725.0, style=ProgressStyle(description_…

INFO:absl:Skipping computing stats for mode ComputeStatsMode.AUTO.




UnknownError: Failed to rename: C:\Users\Fotis\tensorflow_datasets\glue\mrpc\0.0.2.incompleteEUHEUR to: C:\Users\Fotis\tensorflow_datasets\glue\mrpc\0.0.2 : Zugriff verweigert
; Input/output error

In [29]:
# Display the installed TensorFlow version, recorded for reproducibility.
tf.__version__

'2.2.0'