# Document representation from BERT

Copyright 2021 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

In [None]:
import collections
import math
import random
import sys
import time
from typing import Dict, List, Tuple
from sklearn.metrics import pairwise
# Use Tensorflow 2.0
import tensorflow as tf
import numpy as np

In [2]:
# Set BigQuery application credentials
from google.cloud import bigquery
import os
# Placeholder path: replace with the location of your own credentials JSON
# file before running this cell.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/file.json"

# Placeholder: replace with the id of the project the client should use.
project_id = "your_bq_project_id"
# Client used further down to run the patents query.
bq_client = bigquery.Client(project=project_id)

'# Set BigQuery application credentials\nfrom google.cloud import bigquery\nimport os\nos.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/file.json"\n\nproject_id = "your_bq_project_id"\nbq_client = bigquery.Client(project=project_id)'

In [None]:
# You will have to clone the BERT repo
!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
# Make the cloned repo importable; the membership check keeps re-running this
# cell from adding a duplicate entry.
if 'bert_repo' not in sys.path:
  sys.path.append('bert_repo')

The BERT repo was written for TensorFlow 1, and a few of the functions it relies on have been moved or renamed in TensorFlow 2. For the BERT tokenizer to work under TensorFlow 2, one line in the repo that was just cloned must be modified. Line 125 of the BERT `tokenization.py` file (`bert_repo/tokenization.py`) must be changed as follows:

From => `with tf.gfile.GFile(vocab_file, "r") as reader:`

To => `with tf.io.gfile.GFile(vocab_file, "r") as reader:`

Once that is complete and the file is saved, the tokenization library can be imported.

In [None]:
import tokenization

# Load BERT

In [None]:
# Fixed length that every tokenized text is truncated/padded to, counting the
# [CLS] and [SEP] markers (see get_bert_token_input below).
MAX_SEQ_LENGTH = 512
# Placeholder: directory of the exported BERT SavedModel loaded below.
MODEL_DIR = 'path/to/model'
# Placeholder: vocabulary file handed to the tokenizer.
VOCAB = 'path/to/vocab'

# Tokenizer from the cloned BERT repo, built with do_lower_case=True.
tokenizer = tokenization.FullTokenizer(VOCAB, do_lower_case=True)

In [None]:
# Load the SavedModel exported under MODEL_DIR, selecting the 'serve' tag.
model = tf.compat.v2.saved_model.load(export_dir=MODEL_DIR, tags=['serve'])
# Rebind `model` to its 'serving_default' signature; the rest of the notebook
# calls it with keyword tensors and reads outputs by name.
model = model.signatures['serving_default']

In [None]:
# Mean pooling layer, used below to average the BERT encoder output vectors of
# a document into a single embedding.
pooling = tf.keras.layers.GlobalAveragePooling1D()

# Get a couple of Patents

Here we do a simple query from the BigQuery patents data to collect the claims for a sample set of patents.

In [None]:
# Put your publications here.
test_pubs = (
    'US-8000000-B2', 'US-2007186831-A1', 'US-2009030261-A1', 'US-10722718-B2'
)

# Body of the JavaScript UDF that splits a claims text into individual claims.
# It splits on the "<period> <claim number> <period>" separators; when no
# separator is found the function returns nothing, so `claims` is presumably
# NULL for such rows.
js = r"""
  // Regex to find the separations of the claims data
  var pattern = new RegExp(/[.][\\s]+[0-9]+[\\s]*[.]/, 'g');
  if (pattern.test(text)) {
    return text.split(pattern);
  }
"""

# Only the UDF body is spliced into the SQL text. The publication numbers are
# bound as an array query parameter instead of formatting the Python tuple
# into the statement: a tuple's repr is not a dependable SQL list (an empty
# tuple renders as `()` and a one-element tuple as `('X',)`), and binding
# avoids any quoting issues in the values.
query = r'''
  #standardSQL
  CREATE TEMPORARY FUNCTION breakout_claims(text STRING) RETURNS ARRAY<STRING>
  LANGUAGE js AS """
  {}
  """;

  SELECT
    pubs.publication_number,
    title.text as title,
    breakout_claims(claims.text) as claims
  FROM `patents-public-data.patents.publications` as pubs,
    UNNEST(claims_localized) as claims,
    UNNEST(title_localized) as title
  WHERE
    publication_number IN UNNEST(@publication_numbers)
'''.format(js)

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            'publication_numbers', 'STRING', list(test_pubs)),
    ]
)

# One row per (publication, localized claims, localized title) combination.
df = bq_client.query(query, job_config=job_config).to_dataframe()

In [None]:
# Preview the first rows of the query result.
df.head()

Unnamed: 0,publication_number,title,claims
0,US-2009030261-A1,Drug delivery system,[1 . A drug delivery system comprising:\n a ca...
1,US-2007186831-A1,Sewing machine,[1 . A sewing machine comprising:\n a needle b...
2,US-8000000-B2,Visual prosthesis,[1. A visual prosthesis apparatus comprising:\...
3,US-10722718-B2,Systems and methods for treatment of dry eye,[What is claimed is: \n \n 1. A meth...


In [None]:
def get_bert_token_input(texts):
  """Tokenizes texts and packs them into the model's input tensors.

  Each text is tokenized, cut down to at most MAX_SEQ_LENGTH - 2 tokens,
  wrapped in '[CLS]' / '[SEP]' and right-padded with zeros up to
  MAX_SEQ_LENGTH.

  Args:
    texts: Iterable of strings, one per sequence to encode.

  Returns:
    Dict of int64 tensors keyed by 'segment_ids', 'input_mask', 'input_ids'
    and 'mlm_positions'. The first three hold one row of MAX_SEQ_LENGTH
    values per text; 'mlm_positions' is always built from an empty list.
  """
  # Leave room for the [CLS] and [SEP] markers.
  body_limit = MAX_SEQ_LENGTH - 2
  id_rows, mask_rows, segment_rows = [], [], []

  for text in texts:
    pieces = ['[CLS]', *tokenizer.tokenize(text)[:body_limit], '[SEP]']
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)

    used = len(piece_ids)
    unused = MAX_SEQ_LENGTH - used
    id_rows.append(piece_ids + [0] * unused)
    # 1 marks a real token, 0 marks padding.
    mask_rows.append([1] * used + [0] * unused)
    # Every position belongs to segment 0.
    segment_rows.append([0] * MAX_SEQ_LENGTH)

  def as_int64(values):
    return tf.convert_to_tensor(values, dtype=tf.int64)

  return {
      'segment_ids': as_int64(segment_rows),
      'input_mask': as_int64(mask_rows),
      'input_ids': as_int64(id_rows),
      'mlm_positions': as_int64([]),
  }

In [None]:
# Build one embedding per patent: run all of its claims through BERT, flatten
# the encoder output of every claim into a single sequence of 1024-wide
# vectors, and average that sequence.
docs_embeddings = []
for claims in df['claims']:
  encoded = model(**get_bert_token_input(claims))
  token_vectors = tf.reshape(encoded['encoder_layer'], shape=[1, -1, 1024])
  doc_vector = pooling(token_vectors).numpy()[0]
  docs_embeddings.append(doc_vector)

In [None]:
# Cosine similarity between every pair of document embeddings.
pairwise.cosine_similarity(docs_embeddings)

array([[0.9999988 , 0.68387157, 0.83200616, 0.86913264],
       [0.68387157, 1.0000013 , 0.7299322 , 0.73105675],
       [0.83200616, 0.7299322 , 0.99999964, 0.9027555 ],
       [0.86913264, 0.73105675, 0.9027555 , 0.9999996 ]], dtype=float32)

In [None]:
# Inspect the shape of a single document embedding.
docs_embeddings[0].shape

(1024,)