In [1]:
import sys
import os
import tensorflow as tf
import shutil

from embeddings_resolver import BertEmbeddingsResolver
from ner_model_saver import NerModelSaver

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.embeddings import *
import sparknlp 

import time
import zipfile

# Root of the corpus tree. It defaults to the original author's checkout but can
# be redirected through the TFM_CORPUS_PATH environment variable, so the notebook
# can run on another machine without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the plain string
# concatenations below (and BERT_PATH + name in the helpers) rely on.
CORPUS_PATH = os.path.join(
    os.environ.get("TFM_CORPUS_PATH") or "/home/rcuesta/TFM/es.rcs.tfm/es.rcs.tfm.corpus/",
    "",
)
DATASET_PATH = CORPUS_PATH + "datasets/"
# Google BERT archives are downloaded and unpacked here (see download_and_convert).
BERT_PATH = DATASET_PATH + 'bert/'
# BioBERT checkpoints are read from here (see convert).
BIOBERT_PATH = DATASET_PATH + 'biobert/'

# Default destination for the converted Spark NLP models.
SPARKNLP_BERT_MODEL_PATH = CORPUS_PATH + "models/bert"


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


ModuleNotFoundError: No module named 'embeddings_resolver'

In [3]:
# Create the Spark session through Spark NLP's helper; `spark` is used later as a
# global by create_model() when loading the exported model.
spark = sparknlp.start()

# Report the library versions this notebook was run with.
print("Spark NLP version")
# Not the last expression of the cell, so its return value is not displayed;
# the recorded output suggests this call prints the version itself.
sparknlp.version()
print("Apache Spark version")
# Last expression of the cell: the notebook displays its value.
spark.version

Spark NLP version
2.2.0
Apache Spark version


'2.4.3'

In [2]:
def download_model(url, destination_bert_folder, name):
    """Download a zipped BERT checkpoint and unpack it, skipping work already done.

    Args:
        url: Location of the ``.zip`` archive to fetch.
        destination_bert_folder: Folder that receives both the archive
            (``<name>.zip``) and its extracted contents. Created if missing.
        name: Base name of the archive. ``<destination_bert_folder>/<name>`` is
            also the path whose existence marks the archive as already
            extracted, so it should match the top-level folder inside the zip.

    The download is skipped when ``<name>.zip`` is already present and the
    extraction is skipped when ``<name>`` already exists.
    """
    import os
    from pathlib import Path
    import urllib.request
    import zipfile

    # urlretrieve does not create missing parent folders.
    if destination_bert_folder:
        os.makedirs(destination_bert_folder, exist_ok=True)

    # os.path.join gives the same result as concatenation when the folder ends
    # with a separator, and a correct path when it does not.
    model_name = os.path.join(destination_bert_folder, name)
    zip_file = model_name + ".zip"

    if not Path(zip_file).is_file():
        print("Downloading " + url + " to " + str(Path(zip_file).resolve()))
        urllib.request.urlretrieve(url, zip_file)

    if not Path(model_name).exists():
        print("Unziping " + str(Path(zip_file).resolve()) + " to " + str(Path(model_name).resolve()))
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(destination_bert_folder)

'''
def get_service_token_ids(source_bert_folder):
    start_id = 0
    end_id = 0
    with open(os.path.join(source_bert_folder, "vocab.txt")) as f:
        for line, row in enumerate(f):
            row = row.strip()
            if row == '[CLS]':
                start_id = line
            if row == '[SEP]':
                end_id = line
    return (start_id, end_id)
'''

def create_model(source_bert_folder, export_dir, max_sentence_length = 128, batch_size = 32):
    """Export a TensorFlow BERT checkpoint and load it as a Spark NLP BertEmbeddings.

    Args:
        source_bert_folder: Path (str) of the unpacked checkpoint folder. It must
            contain ``vocab.txt``. The model is treated as uncased when the
            lower-cased path contains the substring ``uncased`` anywhere, so
            parent folder names count too.
        export_dir: Folder the exported model and a copy of ``vocab.txt`` are
            written to.
        max_sentence_length: Passed to the resolver and to
            ``setMaxSentenceLength``.
        batch_size: Passed to ``setBatchSize``.

    Returns:
        The ``BertEmbeddings`` annotator loaded from ``export_dir`` with input
        columns ``sentence``/``token`` and output column ``embeddings``.

    Raises:
        FileNotFoundError: If ``source_bert_folder`` does not exist.

    Relies on the notebook-level globals ``tf``, ``spark``,
    ``BertEmbeddingsResolver``, ``NerModelSaver`` and ``BertEmbeddings``.
    """
    from pathlib import Path

    source_path = Path(source_bert_folder)
    # Fail fast with a clear message instead of failing later inside the resolver.
    if not source_path.exists():
        raise FileNotFoundError(
            "BERT checkpoint folder not found: {}".format(source_path.resolve()))

    tf.reset_default_graph()
    is_cased = 'uncased' not in source_bert_folder.lower()
    print("source_bert_folder: {}".format(str(source_path.resolve())))
    print("is_cased: {}".format(is_cased))
    print("lowercase: {}".format(not is_cased))

    resolver = BertEmbeddingsResolver(source_bert_folder, max_sentence_length, lowercase = not is_cased)
    try:
        saver = NerModelSaver(resolver, None)
        saver.save_models(export_dir)
    finally:
        # Release the TensorFlow session even when the export fails.
        resolver.session.close()

    # The vocabulary is copied next to the exported model.
    shutil.copyfile(os.path.join(source_bert_folder, 'vocab.txt'),
                    os.path.join(export_dir, 'vocab.txt'))

    dim = resolver.config.hidden_size
    layers = resolver.config.num_hidden_layers
    print("Number of hidden units: {}".format(dim))
    print("Number of layers: {}".format(layers))

    model = (
        BertEmbeddings.loadFromPython(export_dir, spark)
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
        .setMaxSentenceLength(max_sentence_length)
        .setBatchSize(batch_size)
        .setDimension(dim)
        .setCaseSensitive(is_cased)
    )

    return model


def download_and_convert(url, name, max_sentence_length = 128, batch_size = 32, destination_model_folder = SPARKNLP_BERT_MODEL_PATH):
    """Download a Google BERT archive, convert it and save it as a Spark NLP model.

    Args:
        url: Location of the zipped checkpoint.
        name: Archive base name; the checkpoint is expected at ``BERT_PATH + name``
            after extraction.
        max_sentence_length: Forwarded to ``create_model``; also part of the
            saved model's name.
        batch_size: Forwarded to ``create_model``; also part of the saved
            model's name.
        destination_model_folder: Folder the converted model is saved under.
            Created if missing.

    Returns:
        The model returned by ``create_model``, after it has been saved to
        ``<destination_model_folder>/<name>_M-<max_sentence_length>_B-<batch_size>``
        (overwriting any previous save).
    """
    # exist_ok makes this safe whether or not the folder is already there.
    os.makedirs(destination_model_folder, exist_ok=True)

    download_model(url, BERT_PATH, name)

    bert_name = BERT_PATH + name
    export_dir = bert_name + '_export_dir_tmp'
    try:
        model = create_model(bert_name, export_dir, max_sentence_length, batch_size)
    finally:
        # The intermediate export is only needed while create_model loads it;
        # remove it on failure too so a broken run leaves nothing behind.
        if os.path.isdir(export_dir):
            shutil.rmtree(export_dir)

    final_model_name = name + '_M-{}'.format(max_sentence_length) + '_B-{}'.format(batch_size)
    # Build the path once so the log line shows exactly where the model was saved.
    final_model_path = os.path.join(destination_model_folder, final_model_name)
    model.write().overwrite().save(final_model_path)
    print("SPARKNLP BERT model has been saved: {}".format(final_model_path))
    return model


NameError: name 'CORPUS_PATH' is not defined

## Find models and source code here https://github.com/google-research/bert 

In [None]:
# 1. Base uncased
# `name` is the archive's base name; download_and_convert() expects the unpacked
# checkpoint at BERT_PATH + name, and 'uncased' in it makes create_model() lowercase input.
url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'
name = 'uncased_L-12_H-768_A-12'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

Downloading https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Unziping 
source_bert_folder: uncased_L-12_H-768_A-12
is_cased: False
lowercase: True
INFO:tensorflow:Restoring parameters from uncased_L-12_H-768_A-12/bert_model.ckpt
Number of hidden units: 768
Number of layers: 12


In [None]:
# 2. Large uncased (the L-24_H-1024_A-16 checkpoint from the 2018_10_18 release)
url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'
name = 'uncased_L-24_H-1024_A-16'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

In [11]:
# 3. Base cased
# The name does not contain 'uncased', so create_model() keeps the model case-sensitive.
url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip'
name = 'cased_L-12_H-768_A-12'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

Unziping 
source_bert_folder: ../../../../es.rcs.tfm.corpus/training/test/cased_L-12_H-768_A-12
is_cased: True
lowercase: False
INFO:tensorflow:Restoring parameters from ../../../../es.rcs.tfm.corpus/training/test/cased_L-12_H-768_A-12/bert_model.ckpt
Number of hidden units: 768
Number of layers: 12
BERT model has been saved: ../../../../es.rcs.tfm.corpus/models/test/cased_L-12_H-768_A-12_M-128_B-32


BERT_EMBEDDINGS_dfaabcfcf440

In [None]:
# 4. Large cased (the L-24_H-1024_A-16 checkpoint from the 2018_10_18 release)
url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip'
name = 'cased_L-24_H-1024_A-16'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

In [None]:
# 5. Multilingual Cased (New, recommended)
# Treated as cased by create_model(): the name has no 'uncased' substring.
url = 'https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip'
name = 'multi_cased_L-12_H-768_A-12'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

In [None]:
# 6. Large uncased, Whole Word Masking ("wwm", 2019_05_30 release) -- not the same
#    checkpoint as cell 2.
url = 'https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip'
name = 'wwm_uncased_L-24_H-1024_A-16'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

In [None]:
# 7. Large cased, Whole Word Masking ("wwm", 2019_05_30 release) -- not the same
#    checkpoint as cell 4.
url = 'https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip'
name = 'wwm_cased_L-24_H-1024_A-16'
download_and_convert(url, name, max_sentence_length = 128, batch_size = 32)

In [None]:
# Static reminder for the reader; the text is fixed and is not derived from
# SPARKNLP_BERT_MODEL_PATH, which is where the helpers save by default.
print('All generated models are inside "models/" directory')

In [14]:
def convert(name, max_sentence_length = 128, batch_size = 32, destination_model_folder = SPARKNLP_BERT_MODEL_PATH):
    """Convert an already-unpacked checkpoint under BIOBERT_PATH to a Spark NLP model.

    Unlike ``download_and_convert`` nothing is downloaded: the checkpoint must
    already exist at ``BIOBERT_PATH + name``.

    Args:
        name: Folder name of the checkpoint inside ``BIOBERT_PATH``.
        max_sentence_length: Forwarded to ``create_model``; also part of the
            saved model's name.
        batch_size: Forwarded to ``create_model``; also part of the saved
            model's name.
        destination_model_folder: Folder the converted model is saved under.

    Returns:
        The model returned by ``create_model``, after it has been saved to
        ``<destination_model_folder>/<name>_M-<max_sentence_length>_B-<batch_size>``
        (overwriting any previous save).
    """
    # The intermediate export is written under BERT_PATH, not BIOBERT_PATH.
    export_dir = BERT_PATH + name + '_export_dir'
    try:
        model = create_model(BIOBERT_PATH + name, export_dir, max_sentence_length, batch_size)
    finally:
        # Remove the intermediate export on failure too, so a broken run
        # leaves nothing behind.
        if os.path.isdir(export_dir):
            shutil.rmtree(export_dir)

    final_model_name = name + '_M-{}'.format(max_sentence_length) + '_B-{}'.format(batch_size)
    # Build the path once so the log line shows exactly where the model was saved.
    final_model_path = os.path.join(destination_model_folder, final_model_name)
    model.write().overwrite().save(final_model_path)
    print("SPARKNLP BERT model has been saved: {}".format(final_model_path))

    return model

## Find models in: 
GOOGLE: https://github.com/google-research/bert

BIOBERT: https://github.com/naver/biobert-pretrained/releases

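El nombre de la carpeta del modelo condiciona la conversión:

- `create_model` busca la cadena `uncased` en la ruta de la carpeta (en minúsculas) para decidir si el modelo es uncased y, por tanto, si el texto se pasa a minúsculas.
- `embeddings_resolver.py` requiere que el modelo se denomine internamente `bert_model.ckpt`.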

In [16]:
# BioBERT v1.1 (PubMed): the checkpoint must already be unpacked at BIOBERT_PATH + name.
# The name has no 'uncased' substring, so create_model() treats it as cased.
name = 'biobert_v1.1_pubmed_bert'
convert(name, max_sentence_length = 128, batch_size = 32)

source_bert_folder: ../../../../es.rcs.tfm.corpus/datasets/biobert/biobert_v1.1_pubmed_bert
is_cased: True
lowercase: False
INFO:tensorflow:Restoring parameters from ../../../../es.rcs.tfm.corpus/datasets/biobert/biobert_v1.1_pubmed_bert/bert_model.ckpt
Number of hidden units: 768
Number of layers: 12
BERT model has been saved: ../../../../es.rcs.tfm.corpus/models/test/biobert_v1.1_pubmed_bert_M-128_B-32


BERT_EMBEDDINGS_66651538929c