In [4]:
# Install the pinned TensorFlow 1.15.0 release plus tensorflow-hub (quiet mode).
# The export code in this notebook is written against the TF1 graph/session API.
! pip install -q tensorflow==1.15.0 tensorflow-hub

In [5]:
# Clone Google's reference BERT repository into ./bert so that
# `from bert import modeling` in the next cell can be resolved.
!git clone https://github.com/google-research/bert.git


Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 317.84 KiB | 260.00 KiB/s, done.
Resolving deltas: 100% (185/185), done.


In [6]:
import sys
import tensorflow.compat.v1 as tf
from bert import modeling
import shutil
import os
from shutil import copyfile

# Quieten TensorFlow logging and switch to TF1-style graph/session behaviour.
tf_logger = tf.get_logger()
tf_logger.setLevel('WARN')
tf.disable_v2_behavior()

# Report the TensorFlow and bundled Keras versions in use.
for version in (tf.__version__, tf.keras.__version__):
    print(version)

Instructions for updating:
non-resource variables are not supported in the long term
1.15.0
2.2.4-tf


In [7]:
def save_model(config_path, meta_path, ckpt_path, export_dir):
    """Export a pre-trained BERT checkpoint as a TensorFlow SavedModel.

    Builds the BERT inference graph from ``config_path``, restores the weights
    stored under the checkpoint prefix ``ckpt_path`` and writes a SavedModel to
    ``export_dir`` whose serving signature has the inputs ``input_ids``,
    ``input_mask`` and ``segment_ids`` and the output ``sequence_output``.

    Args:
        config_path: Path to the ``bert_config.json`` describing the model.
        meta_path: Unused. Kept only so existing callers keep working.
        ckpt_path: Checkpoint prefix (e.g. ``.../model.ckpt``) to restore from.
        export_dir: Target directory. It is deleted first if it already exists.
    """

    with tf.Graph().as_default():
        tf.random.set_random_seed(44)
        # these names are important, we look for these in Spark NLP when we feed the BERT model
        bert_inputs = dict(
            input_ids=tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_ids"),
            input_mask=tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_mask"),
            segment_ids=tf.placeholder(dtype=tf.int32, shape=(None, None), name="segment_ids")
        )

        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)) as sess:

            # allow_soft_placement above lets TensorFlow fall back to another
            # device when no GPU is available.
            with tf.device('/gpu:0'):

                bert_config = modeling.BertConfig.from_json_file(config_path)

                model = modeling.BertModel(
                    config=bert_config,
                    is_training=False,
                    input_ids=bert_inputs['input_ids'],
                    input_mask=bert_inputs['input_mask'],
                    token_type_ids=bert_inputs['segment_ids'],
                    use_one_hot_embeddings=False
                )

                # this name is important, we look for this when we want to fetch the result
                # you can post-process this output inside TensorFlow as you like;
                # as long as the result is DT_FLOAT with the shape of (-1, -1, 768) you can use the same name
                # and access the results in Spark NLP
                sequence_output = tf.identity(model.get_sequence_output(), name="sequence_output")
                bert_outputs = dict(
                    sequence_output=sequence_output
                )

                saver = tf.train.Saver()

                variables_init_op = tf.global_variables_initializer()
                tables_init_op = tf.initializers.tables_initializer(name='init_all_tables')

                # Initialise BEFORE restoring. Running the variable initialiser
                # after Saver.restore would overwrite the pre-trained weights
                # with freshly initialised values, and those would be exported.
                sess.run([variables_init_op, tables_init_op])
                saver.restore(sess, ckpt_path)

                shutil.rmtree(export_dir, ignore_errors=True)

                # Only the table initialiser is exported as the init op: loaders
                # run it after restoring the variables, so including the variable
                # initialiser here would reset the weights at load time.
                tf.saved_model.simple_save(
                    sess,
                    export_dir,
                    inputs=bert_inputs,
                    outputs=bert_outputs,
                    legacy_init_op=tables_init_op
                )

In [8]:
# Download the Portuguese BERT (base, cased) TensorFlow checkpoint and unpack it
# into its own directory.
!wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_tensorflow_checkpoint.zip
!unzip bert-base-portuguese-cased_tensorflow_checkpoint.zip -d bert-base-portuguese-cased_tensorflow_checkpoint

# The Portuguese archive does not ship vocab.txt (most BERT checkpoints do),
# so it is downloaded separately into the same directory.
# export_bert() below expects to find it there.
!wget -P bert-base-portuguese-cased_tensorflow_checkpoint "https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/vocab.txt"

--2020-09-14 15:41:46--  https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_tensorflow_checkpoint.zip
Resolving neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)... 52.219.100.250
Connecting to neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)|52.219.100.250|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1205655266 (1,1G) [application/zip]
Saving to: ‘bert-base-portuguese-cased_tensorflow_checkpoint.zip’


2020-09-14 15:51:07 (2,05 MB/s) - ‘bert-base-portuguese-cased_tensorflow_checkpoint.zip’ saved [1205655266/1205655266]

Archive:  bert-base-portuguese-cased_tensorflow_checkpoint.zip
  inflating: bert-base-portuguese-cased_tensorflow_checkpoint/bert_config.json  
  inflating: bert-base-portuguese-cased_tensorflow_checkpoint/model.ckpt.data-00000-of-00001  
  inflating: bert-base-portuguese-cased_tensorflow_checkpoint/model.ckpt.index  

In [10]:
def export_bert(pretrain_path, save_path):
    """Export the checkpoint in ``pretrain_path`` as a SavedModel in ``save_path``.

    Derives the config, meta, checkpoint and vocabulary paths from
    ``pretrain_path``, delegates the export to ``save_model`` and then copies
    the vocabulary into ``<save_path>/assets/vocab.txt``.
    """
    # Build every input path from the checkpoint directory in one go.
    config_path, meta_path, ckpt_path, vocab = [
        pretrain_path + suffix
        for suffix in ('/bert_config.json', '/model.ckpt.meta', '/model.ckpt', '/vocab.txt')
    ]

    save_model(config_path, meta_path, ckpt_path, save_path)

    assets_dir = os.path.dirname(save_path + "/assets/")
    os.makedirs(assets_dir, exist_ok=True)

    # Spark NLP needs vocab.txt in assets with the same name
    vocab_target = save_path + "/assets/vocab.txt"
    copyfile(vocab, vocab_target)

In [13]:
# Export the downloaded Portuguese checkpoint as a SavedModel.
export_bert(
    pretrain_path='./bert-base-portuguese-cased_tensorflow_checkpoint',
    save_path='./bert_saved_models/bert-base-portuguese-cased',
)






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


In [14]:
# Inspect the exported SavedModel: list its signatures with the input/output
# tensor names, dtypes and shapes.
!saved_model_cli show --all --dir ./bert_saved_models/bert-base-portuguese-cased/



MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_ids'] tensor_info:
        dtype: DT_INT32
        shape: (-1, -1)
        name: input_ids:0
    inputs['input_mask'] tensor_info:
        dtype: DT_INT32
        shape: (-1, -1)
        name: input_mask:0
    inputs['segment_ids'] tensor_info:
        dtype: DT_INT32
        shape: (-1, -1)
        name: segment_ids:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['sequence_output'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, -1, 768)
        name: sequence_output:0
  Method name is: tensorflow/serving/predict


In [15]:
import os
# Point JAVA_HOME at the Java 8 JDK and put its bin directory first on PATH,
# then print the Java version that will actually be picked up.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark (pinned) and Spark NLP.
# NOTE(review): spark-nlp is not pinned, so a fresh run installs the latest
# release, which may not be compatible with pyspark 2.4.6. Consider pinning it
# to the release this notebook was written against.
! pip install -q pyspark==2.4.6
! pip install -q spark-nlp

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)


In [16]:
import sparknlp
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.base import *

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

In [18]:
# Import the exported TensorFlow SavedModel as a Spark NLP BertEmbeddings annotator.
bert = (
    BertEmbeddings.loadSavedModel('./bert_saved_models/bert-base-portuguese-cased/', spark)
    .setInputCols(["sentence", "token"])
    .setOutputCol("bert")
    .setCaseSensitive(True)
    .setDimension(768)
)

In [19]:
# Persist the annotator so it can later be loaded with BertEmbeddings.load(PATH).
# overwrite() keeps this cell re-runnable: without it, save() raises once the
# target path already exists from a previous run.
bert.write().overwrite().save('./BertEmbeddings_bert-base-portuguese-cased')


In [21]:
# Each stage reads the column(s) named in its input setter and writes the
# column named in its output setter.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

token = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# you can load an offline model by using .load(PATH)
bert = (
    BertEmbeddings.load('./BertEmbeddings_bert-base-portuguese-cased')
    .setInputCols(["sentence", "token"])
    .setOutputCol("bert")
)

# Stages run in the listed order.
pipeline_stages = [document, sentence, token, bert]
pipeline = Pipeline(stages=pipeline_stages)

In [22]:
# One-row DataFrame with a single "text" column holding a Portuguese sentence.
sample_rows = [["A alemanha é um lugar legal"]]
prediction_data = spark.createDataFrame(sample_rows).toDF("text")

# Fit the pipeline on the sample data, then apply it to the same data.
fitted_pipeline = pipeline.fit(prediction_data)
prediction = fitted_pipeline.transform(prediction_data)

In [27]:
# Show the `result` field of the "bert" column for the first row, untruncated.
bert_result = prediction.select("bert.result")
bert_result.show(n=1, truncate=False)


+----------------------------------+
|result                            |
+----------------------------------+
|[A, alemanha, é, um, lugar, legal]|
+----------------------------------+



+--------+------+
|chunk   |entity|
+--------+------+
|A       |null  |
|alemanha|null  |
|é       |null  |
|um      |null  |
|lugar   |null  |
|legal   |null  |
+--------+------+

