In [1]:
#import bert
# from bert import run_classifier
# from bert import optimization
#from bert import tokenization
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import hashlib
from tensorflow.python.ops import math_ops

from tensorflow.metrics import accuracy

tf.enable_eager_execution()

BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [4]:
### Testing the preprocessing module

from preprocessing.preprocessing import convert_example, FeatureWriter

def create_tokenizer_from_hub_module():
    """Build a BERT ``FullTokenizer`` from the tokenization info of the Hub module.

    The module at ``BERT_MODEL_HUB`` is loaded and its ``tokenization_info``
    signature is evaluated to obtain the vocabulary file path and the
    lower-casing flag, which are then handed to ``FullTokenizer``.

    Returns:
        A ``bert.tokenization.FullTokenizer`` configured with the module's
        ``vocab_file`` and ``do_lower_case`` values.
    """
    # Local import: the notebook-level `bert` imports are commented out, so the
    # bare name `bert` is undefined on a fresh kernel and the return statement
    # would raise NameError.
    from bert import tokenization

    # Build and evaluate the lookup in a dedicated graph/session; eager
    # execution is enabled at the top of the notebook.
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

In [5]:
token = create_tokenizer_from_hub_module()

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [9]:
import jsonlines
import re
_train_file = '/data/nq/natural_questions/v1.0/sample_train/train/nq-train-sample.jsonl'
# Escape the dot and anchor at the end so that only the file extension is replaced
# (an unescaped "." matches any character and could hit elsewhere in the path).
_train_file_out = re.sub(r"\.jsonl$", ".tf_record", _train_file)

# Number of examples to convert. The cell is a smoke test of the preprocessing
# module and deliberately stops after the first example.
_max_examples = 1

# NOTE(review): `features` is never appended to in this cell, so it stays empty;
# later cells that index into `features` rely on state from another run.
features, examples = [], []
train_writer = FeatureWriter(
    filename=_train_file_out,
    is_training=True)
try:
    with jsonlines.open(_train_file) as reader:
        for i, example in enumerate(reader):
            if i % 1000 == 0:
                tf.logging.info("{}:{}".format(_train_file, i))
            examples.append(example)
            dt = convert_example(example,
                                 tokenizer=token,
                                 is_training=True,
                                 max_seq_length=384,
                                 doc_stride=128,
                                 max_query_length=64,
                                 train_writer=train_writer.process_feature)
            if i + 1 >= _max_examples:
                break
finally:
    # Close exactly once, also when the input is empty or conversion raises.
    train_writer.close()

INFO:tensorflow:/data/nq/natural_questions/v1.0/sample_train/train/nq-train-sample.jsonl:0
INFO:tensorflow:(276, 279)
INFO:tensorflow:(148, 151)
INFO:tensorflow:(20, 23)
INFO:tensorflow:3 examples found


In [8]:
len(examples), len(features)

(200, 0)

In [10]:
# yes / no
#[ex['annotations'][0]['yes_no_answer'] for ex in examples]

In [20]:
feature = features[10]
feature.example_id

-2975172535563055798

In [27]:
def test(i):
    """Return the i-th feature together with the example it was derived from.

    Reads the module-level ``features`` and ``examples`` lists. The example is
    the first entry of ``examples`` whose ``'example_id'`` equals the feature's
    ``example_id``; an IndexError is raised when there is no such entry.
    """
    feature = features[i]
    wanted_id = feature.example_id
    matches = [candidate for candidate in examples
               if candidate['example_id'] == wanted_id]
    return feature, matches[0]

In [72]:
def get_annotations(example):
    """Extract the answer span of an example's first annotation.

    The first short answer is used when the annotation lists any short
    answers; otherwise the long answer is used.

    Args:
        example: mapping with an ``'annotations'`` list whose entries carry
            ``'short_answers'`` (a list of spans) and ``'long_answer'`` (a span).

    Returns:
        dict with the span's ``end_byte``, ``start_byte``, ``start_token`` and
        ``end_token`` under the keys ``'end_byte_ix'``, ``'start_byte_ix'``,
        ``'start_token'`` and ``'end_token'``.
    """
    first_annotation = example['annotations'][0]
    short_answers = first_annotation['short_answers']
    # An empty short-answer list falls back to the long answer.
    span = short_answers[0] if short_answers else first_annotation['long_answer']
    return {'end_byte_ix': span['end_byte'],
            'start_byte_ix': span['start_byte'],
            'start_token': span['start_token'],
            'end_token': span['end_token']}

In [131]:
def _validate(i):
    """Check that the i-th feature's target span matches the annotated byte span.

    Returns:
        ``(i, True)`` when the feature's first target index is 0 (no comparison
        is made in that case) or when the start/end bytes selected by the
        targets equal the annotation's start/end bytes; ``(i, False)`` otherwise.
    """
    # Feature plus the raw example it came from, and that example's annotation.
    feature, example = test(i)
    gold = get_annotations(example)

    # NOTE(review): a first target of 0 presumably marks "no answer in this
    # window" - confirm against the preprocessing module.
    if feature.targets[0] == 0:
        return (i, True)

    # Map the target positions back to document byte offsets.
    predicted_start = feature.start_bytes[feature.targets[0]]
    predicted_end = feature.end_bytes[feature.targets[1]]
    spans_agree = (predicted_start == gold['start_byte_ix']
                   and predicted_end == gold['end_byte_ix'])
    return (i, bool(spans_agree))

In [127]:
feature, example = test(1)
gt = get_annotations(example)
print(gt)
start_bytes = feature.start_bytes[feature.targets[0]]
end_bytes = feature.end_bytes[feature.targets[1]]
feature.targets, start_bytes, end_bytes
_validate(1)

{'end_byte_ix': 96731, 'start_byte_ix': 96715, 'start_token': 3521, 'end_token': 3525}
True
True


(1, True)

In [139]:
# Collect the positions of all features whose span check fails.
ix = []
for i, _ in enumerate(features):
    _assertion = _validate(i)
    if _assertion[1]:
        continue
    ix.append(i)

In [150]:
# assertion fails
feature, example = test(ix[2])
gt = get_annotations(example)
print('ground truth')
print(gt)
start_bytes = feature.start_bytes[feature.targets[0]]
end_bytes = feature.end_bytes[feature.targets[1]]
start_bytes, end_bytes

ground truth
{'end_byte_ix': 55798, 'start_byte_ix': 55137, 'start_token': 893, 'end_token': 1001}


(55140, 55794)

In [151]:
example['annotations']

[{'annotation_id': 13306123758205215060,
  'long_answer': {'candidate_index': 32,
   'end_byte': 55798,
   'end_token': 1001,
   'start_byte': 55137,
   'start_token': 893},
  'short_answers': [],
  'yes_no_answer': 'NONE'}]

In [12]:
#ann = get_annotations(example)
#[t for t in example['document_tokens'] if t['start_byte'] >= ann['start_byte_ix'] and t['end_byte'] <= ann['end_byte_ix']]

# TF Records

In [10]:
from run_nq import input_fn_builder
seq_length=384

# Parsing spec for the serialized records: five int64 vectors of length
# `seq_length` plus two scalar int64 answer positions.
# The original literal listed "input_ids" twice; the first (scalar) entry was
# silently overwritten by the second, so only the vector spec is kept.
name_to_features = {
  "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
  "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "start_bytes": tf.FixedLenFeature([seq_length], tf.int64),
  "end_bytes": tf.FixedLenFeature([seq_length], tf.int64),
  "start_positions": tf.FixedLenFeature([], tf.int64),
  "end_positions": tf.FixedLenFeature([], tf.int64),
}

def _decode_record(record):
  """Parse one serialized record into a dict of tensors.

  The parsing spec is the module-level `name_to_features` mapping.
  """
  return tf.parse_single_example(serialized=record, features=name_to_features)

In [11]:
import os
import run_nq
#bert_data_dir = '/Users/deniz/natural_questions/data/'
#_train_path = '/Users/deniz/natural_questions/data/v1.0_sample_nq-train-sample.jsonl'
# Directory holding the converted training shards.
_train_path = os.path.join('/data/nq/natural_questions/v1.0/sample_train/', 'train')
#_dev_path = os.path.join(bert_data_dir, 'dev')

# Full paths of every .tf_record file found directly in that directory.
train_files = [
    os.path.join(_train_path, _name)
    for _name in os.listdir(_train_path)
    if _name.endswith(".tf_record")
]

# Input function over those shards, configured for training.
_input_fn_kwargs = dict(
    input_files=train_files,
    seq_length=384,
    is_training=True,
    mode='train')
train_input_fn = input_fn_builder(**_input_fn_kwargs)

In [12]:
import tensorflow as tf
# Read the shards, decode each record, shuffle within a 100-element buffer and
# group into batches of 32.
dt = (
    tf.data.TFRecordDataset(train_files)
    .map(_decode_record, num_parallel_calls=10)
    .shuffle(buffer_size=100)
    .batch(32)
)
# Pull a single batch and list the feature names it contains.
it = dt.make_one_shot_iterator()
a = it.get_next()
a.keys()

dict_keys(['end_bytes', 'end_positions', 'input_ids', 'input_mask', 'segment_ids', 'start_bytes', 'start_positions'])

In [16]:
a['start_positions'], a['end_positions']

(<tf.Tensor: id=168, shape=(3,), dtype=int64, numpy=array([148,  20, 276])>,
 <tf.Tensor: id=163, shape=(3,), dtype=int64, numpy=array([151,  23, 279])>)

In [13]:
#### argmax

In [None]:
# context
# Broadcasting experiment: a [1, 1, 3] tensor plus a [1, 2, 1] tensor gives
# every pairwise sum in a [1, 2, 3] tensor.
# (The earlier integer constants for `a` and `b` were overwritten immediately
# and have been dropped.)
a = tf.constant([[0.2,0.5,0.3]])
a = tf.expand_dims(a,1)
print(a.shape)
# Float literals so both operands are float32: `a + b` raises on a
# float32/int32 mix because TF 1.x does not promote dtypes implicitly.
b = tf.constant([[1.,2.]])
b = tf.expand_dims(b,2)
print(b.shape)
c = a + b
out = c.numpy()

In [None]:
a = tf.constant([[0.2, 0.5, 0.3]])
b = tf.constant([[0.8, 0.1, 0.1]])

In [None]:
tf.matmul(a,b, transpose_b=True)

In [None]:
_a = tf.expand_dims(a,0)
_b = tf.expand_dims(b,-1)
c = _a * _b
d = c.numpy().T

In [None]:
# Flatten everything after the leading axis so argmax yields one flat index per
# row of `c` (for an N x N grid the flat index lies in [0, N*N)).
_num_cols = 3
indices = tf.argmax(tf.reshape(c, [tf.shape(c)[0], -1]), axis=1)
# Integer division: `/` on integer tensors is true division and yields floats.
col_indices = indices // _num_cols
row_indices = indices % _num_cols
# tf.stack takes a list of tensors; a second positional argument is the axis.
final_indices = tf.transpose(tf.stack([col_indices, row_indices]))
final_indices

In [None]:
# ndarray.argmax only accepts a single integer axis (or None), not a list of
# axes. Take the argmax of the flattened array and convert it back to a
# coordinate tuple instead.
np.unravel_index(d.argmax(), d.shape)

In [None]:
def argmax_2d(tensor):
  """Locate the maximum over the two middle axes of a rank-4 tensor.

  Args:
    tensor: rank-4 tensor, laid out as BxHxWxD.

  Returns:
    int32 tensor that stacks, along axis 1, the flat argmax divided by W and
    the flat argmax modulo W.
  """
  # only rank-4 input is accepted
  assert rank(tensor) == 4

  dims = tf.shape(tensor)
  batch, width, depth = dims[0], dims[2], dims[3]

  # merge the height and width axes into a single axis
  flattened = tf.reshape(tensor, (batch, -1, depth))

  # position of the maximum along the merged axis
  flat_index = tf.cast(tf.argmax(flattened, axis=1), tf.int32)

  # split the flat position back into two coordinates
  coords = (flat_index // width, flat_index % width)
  return tf.stack(coords, axis=1)

def rank(tensor):
  """Return the number of dimensions reported by `tensor.get_shape()`."""
  static_shape = tensor.get_shape()
  return len(static_shape)

In [None]:
argmax_2d(c)

In [None]:
def batch_gather(tensor, indices):
  """Gather entries per batch element from a tensor of arbitrary size.

  Equivalent pseudocode:
  output[i] = tf.gather(tensor[i], indices[i])

  Args:
    tensor: Tensor of arbitrary size.
    indices: Vector of indices.
  Returns:
    output: A tensor of gathered values.
  """
  dims = get_shape(tensor)
  batch_size, inner_size = dims[0], dims[1]

  # Merge the first two axes so that one gather can address every batch element.
  merged = tf.reshape(tensor, [batch_size * inner_size] + dims[2:])
  indices = tf.convert_to_tensor(indices)

  # Start of each batch element inside the merged axis, shaped to broadcast
  # against `indices`.
  row_starts = tf.range(batch_size) * inner_size
  offset = tf.reshape(row_starts, [batch_size] + [1] * (indices.shape.ndims - 1))

  return tf.gather(merged, indices + offset)

In [None]:
tf.ar

In [None]:
#########################
#####accuracy metric#####
#########################

tf.reset_default_graph()
init = tf.global_variables_initializer()

In [None]:
from tensorflow import metrics

# Exact-match accuracy over (start, end) span predictions, accumulated with a
# streaming mean metric.
#
# The ops are built inside an explicit graph: eager execution is enabled at the
# top of the notebook, and tf.metrics.* / Session.run only work in graph mode.
# The `init` op from the previous cell is not run here because it does not
# belong to this graph; the metric only needs its local variables initialized.
with tf.Graph().as_default():
    start_ix = tf.expand_dims(tf.constant([10,20,30,40,50]),1)
    end_ix =  tf.expand_dims(tf.constant([10,20,30,40,50]),1)
    start_positions = tf.expand_dims(tf.constant([10,20,30,40,50]),1)
    end_positions = tf.expand_dims(tf.constant([10,20,30,40,60]),1) #80% accuracy

    y_pred = tf.concat([start_ix, end_ix], axis=-1) #[batch_size, 2]
    y_true = tf.concat([start_positions, end_positions], axis=-1) #[batch_size, 2]
    # A prediction counts as correct only when both start and end match.
    acc = tf.reduce_all(math_ops.equal(y_true, y_pred), axis=-1)
    is_correct = math_ops.to_float(acc)
    mean_value, update_op = metrics.mean(is_correct)

    # The metric keeps its running totals in local variables.
    running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES)
    running_vars_initializer = tf.variables_initializer(var_list=running_vars)

    with tf.Session() as sess:
        sess.run(running_vars_initializer)

        # value op, read before any update has been applied
        a_out = sess.run(mean_value)
        # update op: folds this batch into the running totals
        b_out = sess.run(update_op)

In [None]:
a_out, b_out

In [None]:
#from preprocessing.preprocessing import *
from run_nq import *

In [None]:
bert_data_dir = "/data/nq/natural_questions/v1.0/sample_train"

In [None]:

# TFRecord shards that sit directly under `bert_data_dir`.
train_files = [_file for _file in os.listdir(bert_data_dir) if _file.endswith(".tf_record")]
_file_path = [os.path.join(bert_data_dir, _file) for _file in train_files]

# Define both directories here: `_dev_path` is not assigned by any live cell,
# so the original line raised NameError.
_train_path = os.path.join(bert_data_dir, 'train')
_dev_path = os.path.join(bert_data_dir, 'dev')
# Plain loop instead of a list comprehension used only for its side effects.
for _dir in (_train_path, _dev_path):
    tf.gfile.MakeDirs(_dir)

print(_file_path)
# NOTE(review): the earlier call to input_fn_builder in this notebook passes
# `input_files=` and `mode=` and uses seq_length=384; confirm which signature
# and sequence length are current.
train_input_fn = input_fn_builder(
    input_file=_file_path,
    seq_length=512,
    is_training=True,
    drop_remainder=True)

In [None]:
params = {}
params['batch_size'] = 32
_iter = train_input_fn(params)

In [None]:
_iter = _iter.make_one_shot_iterator()

In [None]:
out = _iter.get_next()
out.keys()

In [None]:
#dt['document_tokens'][:10]

In [None]:
dt['annotations']

In [None]:
outputs = convert_examples_to_features(dt, tokenizer, 512)

In [None]:
# answer
if outputs:
    print(outputs[0].targets)

In [None]:
{i: t for i, t in enumerate(outputs[0].tokens) if i >= outputs[0].targets[0][0] and i <= outputs[0].targets[0][1]}

In [None]:
short_answer_start = dt['annotations'][0]['short_answers'][0]['start_byte']
short_answer_end = dt['annotations'][0]['short_answers'][0]['end_byte']
[t for t in dt['document_tokens'] if t['start_byte'] >= short_answer_start and t['end_byte'] <= short_answer_end]

In [None]:
dt['annotations']

In [None]:
# Long answer of the first annotation: byte offsets plus the document tokens
# from its start token through its end token (inclusive).
_long_answer = dt['annotations'][0]['long_answer']
long_answer_start = _long_answer['start_byte']
long_answer_end = _long_answer['end_byte']
start, end = _long_answer['start_token'], _long_answer['end_token'] + 1
dt['document_tokens'][slice(start, end)]

In [None]:
dt = data[0]

In [None]:
dt.keys()

In [None]:
dt['document_tokens'][:100]

In [None]:
#set([t['token'] for dt in data for t in dt['document_tokens'] if t['html_token']])

In [None]:
from IPython.core.display import display, HTML
display(HTML(dt['document_html']))

In [None]:
#dt['long_answer_candidates']

In [None]:
dt['question_tokens']

In [None]:
dt['annotations'][0]

In [None]:
tokens = dt['document_tokens']

In [None]:
[token for token in tokens if token['start_byte'] >= start_byte_ix and token['end_byte'] <= end_byte_ix]

In [None]:
#' '.join([token['token'] for token in tokens if token['start_byte'] >= start_byte_ix_long and token['end_byte'] <= end_byte_ix_long and not token['html_token']])