# About

See what BERT is all about. This version is meant to run on our server.

### various implementations of BERT

https://pypi.org/project/pytorch-pretrained-bert/

https://github.com/huggingface/transformers

https://github.com/tensorflow/models/tree/master/official/nlp/bert

https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1

# Test of a model from TensorFlow Hub

In [4]:
# see
# https://www.tensorflow.org/hub

import tensorflow as tf
import tensorflow_hub as hub

# Smoke test: load a small text-embedding module from TensorFlow Hub and
# embed a handful of strings to confirm that hub downloads work here.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)

sample_texts = ["A long sentence.", "single-word", "http://example.com"]
embeddings = embed(sample_texts)

# One 128-dimensional vector per input string, i.e. (3, 128).
print(embeddings.shape)

(3, 128)


# Test of the BERT model from TensorFlow Hub

In [5]:
# Build a symbolic Keras graph around the BERT module published at
# https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"

max_seq_length = 128  # Your choice here.

# The layer is called with three int32 inputs of identical shape; create
# them in the order the layer expects them.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

bert_layer = hub.KerasLayer(module_url, trainable=True)
bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

In [10]:
# pooled_output has shape [batch_size, 768]: one vector representing each
# entire input sequence. The batch dimension is symbolic here, so the
# tensor is displayed with shape (None, 768).
pooled_output

<tf.Tensor 'keras_layer_3/Identity:0' shape=(None, 768) dtype=float32>

In [11]:
# sequence_output has shape [batch_size, max_seq_length, 768]: one
# representation for each input token (in context).
# NOTE(review): the symbolic tensor is displayed as (None, None, 768), so
# the sequence axis is not statically fixed to max_seq_length here.
sequence_output

<tf.Tensor 'keras_layer_3/Identity_1:0' shape=(None, None, 768) dtype=float32>

The tokenization of input text can be performed in Python with the FullTokenizer class from:

https://github.com/tensorflow/models/blob/master/official/nlp/bert/tokenization.py

In [None]:
# `tokenization` is the TensorFlow Model Garden module
# official/nlp/bert/tokenization.py (pip package: tf-models-official).
# It has to be imported explicitly, otherwise the FullTokenizer line below
# fails with a NameError.
# NOTE(review): newer Model Garden releases may have relocated this module;
# adjust the import path if this one is not found.
from official.nlp.bert import tokenization

# Read the vocabulary file and the lower-casing flag from the loaded hub
# module itself, so the tokenizer is configured from the same assets as the
# model.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

For complete usage examples, see run_classifier.py and run_squad.py from tensorflow/models/official/nlp/bert/ on GitHub.

https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py
https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py

# Hugging Face

https://github.com/huggingface/transformers#quick-tour

In [1]:
import torch
from transformers import *

In [2]:
# Transformers exposes one unified API across its architectures. Each entry
# of MODELS is a (model class, tokenizer class, pretrained-weights shortcut)
# triple.
#
# Full table from the quick tour, kept for reference:
#
#   Model           | Tokenizer           | Pretrained weights shortcut
#   BertModel       | BertTokenizer       | 'bert-base-uncased'
#   OpenAIGPTModel  | OpenAIGPTTokenizer  | 'openai-gpt'
#   GPT2Model       | GPT2Tokenizer       | 'gpt2'
#   CTRLModel       | CTRLTokenizer       | 'ctrl'
#   TransfoXLModel  | TransfoXLTokenizer  | 'transfo-xl-wt103'
#   XLNetModel      | XLNetTokenizer      | 'xlnet-base-cased'
#   XLMModel        | XLMTokenizer        | 'xlm-mlm-enfr-1024'
#   DistilBertModel | DistilBertTokenizer | 'distilbert-base-uncased'
#   RobertaModel    | RobertaTokenizer    | 'roberta-base'
#
# Running every model in one session crashed the kernel, so only BERT is
# enabled here.
MODELS = [
    (BertModel, BertTokenizer, 'bert-base-uncased'),
]

In [7]:
# The TensorFlow 2.0 variants of these models use the same class names with a
# 'TF' prefix, e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch
# model `RobertaModel`.

# Encode one sentence into a sequence of hidden states with every configured
# model and show what comes back.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Tokenizer and model are both loaded from the same weights shortcut.
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # add_special_tokens=True inserts [CLS], [SEP], <s>... in the way each
    # model expects; the list wrapper adds the batch dimension.
    token_ids = tokenizer.encode("Here is some text to encode", add_special_tokens=True)
    input_ids = torch.tensor([token_ids])

    # Inference only, so skip gradient tracking. Model outputs are tuples;
    # element 0 is the last layer's hidden states.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]

    for shown in (type(last_hidden_states), last_hidden_states.shape, last_hidden_states[0]):
        print(shown)

<class 'torch.Tensor'>
torch.Size([1, 9, 768])
tensor([[-0.0549,  0.1053, -0.1065,  ..., -0.3551,  0.0686,  0.6506],
        [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
        [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
        ...,
        [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
        [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
        [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]])


In [8]:
# Every architecture comes with several classes for fine-tuning on
# down-stream tasks; these are the ones for BERT.
BERT_MODEL_CLASSES = [
    BertModel,
    BertForPreTraining,
    BertForMaskedLM,
    BertForNextSentencePrediction,
    BertForSequenceClassification,
    BertForTokenClassification,
    BertForQuestionAnswering,
]

In [10]:
# All the classes for an architecture can be instantiated from the pretrained
# weights of that architecture. Note that the additional weights added for
# fine-tuning are only initialized and still need to be trained on the
# down-stream task.
import os

pretrained_weights = 'bert-base-uncased'

# Target directory for the serialization round trips below. Create it up
# front: some transformers releases expect the directory to exist already
# when save_pretrained() is called.
p_models = "../models/bert/"
os.makedirs(p_models, exist_ok=True)

# Simple serialization for tokenizers: save, then re-load from disk. This
# does not depend on the model class, so it is done once rather than on
# every loop iteration.
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
tokenizer.save_pretrained(p_models)  # save
tokenizer = BertTokenizer.from_pretrained(p_models)  # re-load

# The example input is the same for every model class, so encode it once.
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])

for model_class in BERT_MODEL_CLASSES:
    # Models can return the full list of hidden-states & attention weights
    # at each layer. The checkpoint is loaded directly with these options;
    # a plain load beforehand would be discarded immediately.
    model = model_class.from_pretrained(pretrained_weights,
                                        output_hidden_states=True,
                                        output_attentions=True)
    # Inference only: no autograd graph is needed for inspecting outputs.
    with torch.no_grad():
        all_hidden_states, all_attentions = model(input_ids)[-2:]

    # Models are compatible with Torchscript
    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
    traced_model = torch.jit.trace(model, (input_ids,))

    # Simple serialization for models. Every class writes to the same
    # directory, so each iteration overwrites the previous checkpoint.
    model.save_pretrained(p_models)  # save
    model = model_class.from_pretrained(p_models)  # re-load

    # SOTA examples for GLUE, SQUAD, text generation...