In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Colab publishes the TPU endpoint through the COLAB_TPU_ADDR environment
# variable; stop right away when it is absent.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()

with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Load the credentials file from the Colab VM and hand its contents to the
  # TPU so that it can access GCS.
  with open('/content/adc.json', 'r') as credentials_file:
    auth_info = json.load(credentials_file)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.60.183.170:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 3241415426056758343),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 7440913165967643381),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10595590061960707615),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16706664639743592845),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 8761786898783724765),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0

In [0]:
# Mount Google Drive at /content/drive/ so the task data stored there can be
# read through regular file paths. force_remount=True is passed so the mount
# is redone even when a previous mount exists (presumably; see the Colab
# `drive.mount` documentation).
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


**Secondly**, prepare and import BERT modules.

In [0]:
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

Cloning into 'bert_repo'...
remote: Enumerating objects: 333, done.[K
remote: Total 333 (delta 0), reused 0 (delta 0), pack-reused 333[K
Receiving objects: 100% (333/333), 282.45 KiB | 3.67 MiB/s, done.
Resolving deltas: 100% (183/183), done.


**Thirdly**, prepare for training:

*  Specify task and download training data.
*  Specify the pretrained BERT model.
*  Specify the GCS bucket and create the output directory for model checkpoints and eval results.



In [0]:
# Task name: lower-cased it selects the data processor, and it is the last
# path component of OUTPUT_DIR below.
TASK = "CRS"
# Folder on the mounted Google Drive that is handed to the processor as its
# data directory (train.tsv / val.tsv / test.tsv are read from it).
TASK_DATA_DIR = "/content/drive/My Drive/Colab Notebooks2"
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
# List the checkpoint files to confirm the pretrained directory is reachable.
!gsutil ls $BERT_PRETRAINED_DIR

# GCS bucket that receives model checkpoints and eval results; the assert
# message states that it must already exist.
BUCKET = 'usnavy_bert1' #@param {type:"string"}
assert BUCKET, 'Must specify an existing GCS bucket name'
OUTPUT_DIR = 'gs://{}/bert/models/{}'.format(BUCKET, TASK)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12 *****
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_config.json
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.index
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.meta
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/checkpoint
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/vocab.txt
***** Model output directory: gs://usnavy_bert1/bert/models/CRS *****


**Now, let's go**

In [0]:
import run_classifier
import csv

class CRSProcessor(run_classifier.DataProcessor):
  """Processor for the CRS data set.

  Reads `train.tsv`, `val.tsv` and `test.tsv` from a data directory. Every
  file is a tab-separated table with a header row. Each row must provide the
  columns `ti` and `ab` (presumably title and abstract), which are joined into
  the single input text. Train and validation rows must also provide a
  `y_<group_name>` column whose value "True" marks a positive example.
  """

  def get_train_examples(self, data_dir, group_name='stroke'):
    """Returns the `InputExample`s built from `<data_dir>/train.tsv`."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", group_name)

  def get_val_examples(self, data_dir, group_name='stroke'):
    """Returns the `InputExample`s built from `<data_dir>/val.tsv`."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "val.tsv")), "val", group_name)

  def get_dev_examples(self, data_dir, group_name='stroke'):
    """See base class.

    The base `DataProcessor` names its validation hook `get_dev_examples`;
    this forwards to `get_val_examples` so code written against the base
    interface works with this processor as well.
    """
    return self.get_val_examples(data_dir, group_name)

  def get_test_examples(self, data_dir, group_name='stroke'):
    """Returns the `InputExample`s built from `<data_dir>/test.tsv`.

    Every returned example carries the placeholder label "0"; the label column
    of the file (if any) is not read.
    """
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", group_name)

  def get_labels(self):
    """See base class."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type, group_name='stroke'):
    """Creates examples for the train, val and test sets.

    Args:
      lines: Rows as returned by `_read_tsv` (one dict per row).
      set_type: Split name; used as the guid prefix. The value "test" switches
        labelling to the placeholder label "0".
      group_name: Suffix of the label column, i.e. labels are read from
        `y_<group_name>`.

    Returns:
      A list of `run_classifier.InputExample`, one per row, in input order.

    Raises:
      KeyError: If a row lacks `ti`, `ab` or (for non-test splits) the label
        column.
    """
    label_column = 'y_%s' % (group_name,)
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      # The text is built the same way for every split; only the label differs.
      text_a = "%s \n\n %s" % (line['ti'], line['ab'])
      if set_type == "test":
        label = "0"
      else:
        label = "1" if line[label_column] == "True" else "0"
      examples.append(
          run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

  @classmethod
  def _read_tsv(cls, input_file):
    """Reads a tab separated value file with a header row.

    Unlike the list-of-lists rows produced by the base implementation, each
    row is returned as a dict keyed by the column names of the header.
    """
    with tf.gfile.Open(input_file, "r") as f:
      return list(csv.DictReader(f, delimiter="\t"))


In [0]:
#checking if all files in the directory are there
import os
#for root, dirs, files in os.walk('.', topdown=True): root directory
for _top_dir, _subdirs, top_level_files in os.walk(TASK_DATA_DIR, topdown=True):
    for filename in top_level_files:
        print(filename)
    # The first entry yielded by a top-down walk is TASK_DATA_DIR itself;
    # stop here so that subdirectories are never visited.
    break

cnn_vocab_map.pck
PubMed_token_map_50k.pck
PubMed_embeddings_50k.pck
val.tsv
train.tsv
test.tsv
SVM_TrialRegister.ipynb
uuu.csv
DataPrep_TrialRegister.ipynb
CNN_TrialRegister.ipynb
Copy of SVM_TrialRegister.ipynb
Copy of BERT.ipynb
BioBERT.ipynb
Copy of BioBERT.ipynb
BERT.ipynb


In [0]:
# Echo the configured data directory as the cell output.
TASK_DATA_DIR

'/content/drive/My Drive/Colab Notebooks2'

In [0]:
# Setup task specific model and TPU running config.

import modeling
import optimization
import run_classifier
import tokenization


# Model Hyper Parameters
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Fraction of the training steps used as warmup steps (see num_warmup_steps).
WARMUP_PROPORTION = 0.1
# Sequence length passed to the feature conversion and the input functions.
MAX_SEQ_LENGTH = 128
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8
# Files of the pretrained model; all of them live under BERT_PRETRAINED_DIR.
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
# Lower-casing is switched on exactly when the model name starts with 'uncased'.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

# Maps the lower-cased task name to its processor class.
processors = {
    "crs": CRSProcessor
}
processor = processors[TASK.lower()]()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

# Run configuration: the estimator talks to the TPU at TPU_ADDRESS and writes
# its checkpoints to OUTPUT_DIR every SAVE_CHECKPOINTS_STEPS steps.
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

# The training examples are loaded here because the number of training steps
# is derived from their count: examples / batch size * epochs, truncated to
# an int.
train_examples = processor.get_train_examples(TASK_DATA_DIR)
num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Build the model function from the pretrained config and checkpoint, with
# one output per entry of label_list.
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)

# Estimator used by the training, evaluation and prediction cells.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

INFO:tensorflow:Using config: {'_model_dir': 'gs://usnavy_bert1/bert/models/CRS', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.60.183.170:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f420260d898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.60.183.170:8470', '_evaluation_master': 'grpc://10.60.183.170:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_

In [0]:
# Train the model.
# The progress message names the configured task. The previous text was
# copied from the MRPC/CoLA tutorial and promised a 2-3 minute run, which
# does not describe this task or its data size.
print('Training {} on the BERT model; this may take a while. Please wait...'.format(TASK))
# Tokenize the training examples and convert them to fixed-length features.
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started training at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("  Num steps = %d", num_train_steps)
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)
# Train until the global step reaches num_train_steps.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))

MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...

INFO:tensorflow:Writing example 0 of 388437
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] effects of n - ace ##ty ##lc ##yst ##ein ##e on se ##men parameters and ox ##ida ##tive / anti ##ox ##ida ##nt status . objectives to examine whether a beneficial effect of n - ace ##ty ##lc ##yst ##ein ##e ( na ##c ) on se ##men parameters and ox ##ida ##tive / anti ##ox ##ida ##nt status in id ##io ##pathic male in ##fer ##tility exists . the production of reactive oxygen species is a normal ph ##ys ##iol ##og ##ic event in various organs . however , over ##pro ##duction of reactive oxygen species can be detrimental to sperm and has been associated with male in ##fer ##tility . methods our study included 120 patients who had attended our clinic and were diagnosed [SEP]
INFO:tensorflow:input_ids: 101 3896 1997 1050 1011 9078 3723 15472 27268 12377 2063 2006 7367 3549 11709 

In [0]:
# Eval the model.
# NOTE(review): the examples come from get_test_examples, and
# CRSProcessor._create_examples gives every example of the "test" split the
# placeholder label "0". The metrics computed below are therefore measured
# against placeholder labels, not real ones. Confirm whether the validation
# split (get_val_examples) was meant to be evaluated here.
eval_examples = processor.get_test_examples(TASK_DATA_DIR)
eval_features = run_classifier.convert_examples_to_features(
    eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(eval_examples)))
print('  Batch size = {}'.format(EVAL_BATCH_SIZE))
# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
# The step count is rounded down, so a trailing partial batch is not part of
# the evaluation.
eval_steps = int(len(eval_examples) / EVAL_BATCH_SIZE)
eval_input_fn = run_classifier.input_fn_builder(
    features=eval_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)



result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
# Print the metrics and also write them to eval_results.txt under OUTPUT_DIR,
# one "key = value" line per metric, sorted by key.
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
  print("***** Eval results *****")
  for key in sorted(result.keys()):
    print('  {} = {}'.format(key, str(result[key])))
    writer.write("%s = %s\n" % (key, str(result[key])))

INFO:tensorflow:Writing example 0 of 166853
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-0
INFO:tensorflow:tokens: [CLS] a 7 - year prospective radio ##graphic evaluation of marginal bone level around two different implant systems : a random ##ized clinical trial . objective the aim of this study was to evaluate the change in marginal bone level radio ##graphic ##ally around two different implant systems after 7 years of use . material and methods twenty fully eden ##tu ##lous patients were included in the study and randomly assigned to two treatment groups of machine ##d surface implant ##s ( bran ##ema ##rk , n = 40 ) and rough - surface implant ##s ( xiv ##e , n = 40 ) . the implant ##s were early loaded with individual bar - retained over ##dent ##ures . all patients were treated by the same [SEP]
INFO:tensorflow:input_ids: 101 1037 1021 1011 2095 17464 2557 14773 9312 1997 14785 5923 2504 2105 2048 2367 27159 3001 1024 1037 6721 3550 6612 3979 1012 7863 1996 6614 199

In [0]:
# Run prediction over the same input function that was built for evaluation.
# NOTE(review): eval_input_fn was created with drop_remainder=True, so a
# trailing partial batch is presumably not predicted - confirm that the number
# of predictions matches the number of examples.
y_preds = estimator.predict(input_fn=eval_input_fn)


In [0]:
# Collect the predictions into a list; this rebinds y_preds from the object
# returned by estimator.predict to the list of its items.
y_preds = list(y_preds)