In [1]:
'''
This notebook is designed to implement further pretraining on BERT before the fine-tuning process.
It is based on the code provided in the BERT GitHub repo by the architects: 
https://github.com/google-research/bert/#pre-training-with-bert
'''

'\nThis notebook is designed to implement further pretraining on BERT before the fine-tuning process.\nIt is based on the code provided in the BERT GitHub repo by the architects: \nhttps://github.com/google-research/bert/#pre-training-with-bert\n'

In [2]:
# Manual parameters: none of these are derived automatically, so review every value before each run.

# Bookkeeping only: these three are written to the stats CSV and are not passed to either BERT script here.
BERT_MODEL = "bert-base-uncased"
PRETRAINING_TYPE = "both"
# NOTE(review): this note mentions 100000 steps while TRAIN_STEPS below is 1234 - confirm both before running.
NOTES = "Stepping up to 100000 - LAST ITERATION"

# Passed to create_pretraining_data.py only.
SEED = 12345             # --random_seed
MASKED_LM_PROB = 0.15    # --masked_lm_prob
DUPE_FACTOR = 5          # --dupe_factor

# Passed to both create_pretraining_data.py and run_pretraining.py.
MAX_SEQ_LEN = 128        # --max_seq_length
MAX_PRED_PER_SEQ = 20    # --max_predictions_per_seq

# Passed to run_pretraining.py only.
BATCH_SIZE = 32          # --train_batch_size
TRAIN_STEPS = 1234       # --num_train_steps
WARMUP_STEPS = 10        # --num_warmup_steps
LEARNING_RATE = 2e-5     # --learning_rate

In [3]:
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow 1.x selected.
1.15.2
Found GPU at: /device:GPU:0


In [4]:
# Mount Google Drive at /content/drive; the later cells read and write files under the mounted Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Working directory for the rest of the notebook, given as an absolute path rooted at the
# /content/drive mount point. A relative path here only resolves while the current directory
# is still /content, so the `%cd $RUN_PATH` cell would fail when re-run after a successful cd.
# The embedded double quotes around "My Drive" are kept because the value is expanded
# verbatim into the `%cd` line.
RUN_PATH = '/content/drive/"My Drive"/'

# The remaining paths are relative to RUN_PATH.
BERT_PATH = 'colab/bert/bert_base_uncased/vocab.txt'  # vocabulary file (passed as --vocab_file), not a model directory
CORAL_PATH = 'colab/data/coral_bleaching_sentences.txt'  # text input for create_pretraining_data.py
TF_RECORD_PATH = 'colab/bert/bert_pretraining/coral_bleaching_pretraining.tfrecord'  # written by create_pretraining_data.py, read by run_pretraining.py

In [6]:
# Switch into the Drive folder so the relative colab/... paths used by the other cells resolve,
# then list its contents as a quick sanity check.
%cd $RUN_PATH
%ls

/content/drive/My Drive
 [0m[01;34mcolab[0m/  [01;34m'Colab Notebooks'[0m/   [01;34mCSC594[0m/


In [7]:
# Build the pretraining data: create_pretraining_data.py reads the text file at CORAL_PATH and
# writes TF_RECORD_PATH, using the vocab file at BERT_PATH and the sequence-length, masking,
# seed and duplication parameters from the manual-parameters cell.
!python colab/bert/create_pretraining_data.py \
  --input_file=$CORAL_PATH \
  --output_file=$TF_RECORD_PATH \
  --vocab_file=$BERT_PATH \
  --do_lower_case=True \
  --max_seq_length=$MAX_SEQ_LEN \
  --max_predictions_per_seq=$MAX_PRED_PER_SEQ \
  --masked_lm_prob=$MASKED_LM_PROB \
  --random_seed=$SEED \
  --dupe_factor=$DUPE_FACTOR



W1030 13:39:32.145976 140652228425600 module_wrapper.py:139] From colab/bert/create_pretraining_data.py:437: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1030 13:39:32.146166 140652228425600 module_wrapper.py:139] From colab/bert/create_pretraining_data.py:437: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1030 13:39:32.146296 140652228425600 module_wrapper.py:139] From /content/drive/My Drive/colab/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W1030 13:39:32.968843 140652228425600 module_wrapper.py:139] From colab/bert/create_pretraining_data.py:444: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.


W1030 13:39:33.303391 140652228425600 module_wrapper.py:139] From colab/bert/create_pretraining_data.py:446: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

INF

In [8]:
# Inputs and output location for run_pretraining.py (all relative to the Drive working directory).
OUTPUT_PATH = 'colab/bert/bert_pretraining/coral_bleaching_pretraining_output'  # passed as --output_dir
CONFIG_PATH = 'colab/bert/bert_base_uncased/bert_config.json'  # passed as --bert_config_file
# NOTE(review): this points at the ".index" file; the BERT README passes the checkpoint prefix
# (".../bert_model.ckpt") to --init_checkpoint - confirm the pretrained weights are actually restored.
CKPT_PATH = 'colab/bert/bert_base_uncased/bert_model.ckpt.index'  # passed as --init_checkpoint

In [9]:
# Run run_pretraining.py on the TFRecord file, initialised from CKPT_PATH and writing to OUTPUT_PATH.
# Training and evaluation are both enabled and only one --input_file is supplied, so the evaluation
# presumably runs on the same records used for training - keep that in mind when reading the metrics.
!python colab/bert/run_pretraining.py \
  --input_file=$TF_RECORD_PATH \
  --output_dir=$OUTPUT_PATH \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=$CONFIG_PATH \
  --init_checkpoint=$CKPT_PATH \
  --train_batch_size=$BATCH_SIZE \
  --max_seq_length=$MAX_SEQ_LEN \
  --max_predictions_per_seq=$MAX_PRED_PER_SEQ \
  --num_train_steps=$TRAIN_STEPS \
  --num_warmup_steps=$WARMUP_STEPS \
  --learning_rate=$LEARNING_RATE




W1030 13:39:47.712918 140695103870848 module_wrapper.py:139] From colab/bert/run_pretraining.py:407: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1030 13:39:47.713079 140695103870848 module_wrapper.py:139] From colab/bert/run_pretraining.py:407: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1030 13:39:47.713202 140695103870848 module_wrapper.py:139] From /content/drive/My Drive/colab/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W1030 13:39:48.394889 140695103870848 module_wrapper.py:139] From colab/bert/run_pretraining.py:414: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.


W1030 13:39:48.395463 140695103870848 module_wrapper.py:139] From colab/bert/run_pretraining.py:418: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.


W1030 13:39:48.397002 140695103870848 mod

In [10]:
# Metrics for this run, entered by hand; they are appended to the stats CSV at the end of the notebook.
# NOTE(review): presumably copied from the evaluation output of the run_pretraining.py cell -
# update them after every run, otherwise stale values are logged against the new run.
TOTAL_LOSS = 0.07374086
MASKED_LM_ACC = 0.9909886
MASKED_LM_LOSS = 0.07412144
NSP_ACC = 1.0
NSP_LOSS = 7.45058e-10

In [11]:
# Create the file to store the stats of the pretraining if it doesn't already exist
import os

STATS_PATH = "colab/stats/coral_bleaching_pretraining_stats.csv"

# Column names of the stats file, in the order the values are appended for each run.
STATS_COLUMNS = [
  "number", "datetime", "bert_model", "pretraining_type", "seed", "max_seq_len",
  "masked_lm_prob", "dupe_factor", "max_pred_per_seq", "batch_size", "train_steps",
  "warmup_steps", "learning_rate", "masked_lm_acc", "masked_lm_loss", "nsp_acc",
  "nsp_loss", "total_loss", "notes",
]
# Built with join rather than a backslash-continued string literal: a continuation inside a
# literal keeps the next line's indentation, which would embed runs of spaces in column names.
STATS_HEADER = ",".join(STATS_COLUMNS) + "\n"

if not os.path.isfile(STATS_PATH):
  # open() does not create missing parent folders, so make sure the stats folder exists first.
  stats_dir = os.path.dirname(STATS_PATH)
  if stats_dir:
    os.makedirs(stats_dir, exist_ok=True)
  # The context manager closes the file even if the write fails.
  with open(STATS_PATH, "w") as stats_file:
    stats_file.write(STATS_HEADER)
  print("{} NOT found - creating".format(os.path.basename(STATS_PATH)))

In [12]:
def getLastModelNumber(stats_path=None):
  """Return the run number stored in the last data row of the stats CSV.

  Args:
    stats_path: path of the CSV to inspect; defaults to the notebook-level STATS_PATH.

  Returns:
    The integer in the first column of the last non-blank line, or -1 when the file
    cannot be opened, is empty, or its last non-blank line does not start with an
    integer (for example when only the header row exists).
  """
  if stats_path is None:
    stats_path = STATS_PATH

  last_line = None
  try:
    with open(stats_path, "r") as stats_file:
      # Stream the file and remember only the last non-blank line; trailing blank
      # lines must not hide the final data row.
      for line in stats_file:
        if line.strip():
          last_line = line
  except OSError:
    # Missing or unreadable file: no run has been recorded yet.
    return -1

  if last_line is None:
    return -1
  try:
    return int(last_line.split(',')[0])
  except ValueError:
    # Header-only file (first field is "number") or a malformed row.
    return -1

In [13]:
# Identifier of the current run: one more than the last number recorded in the stats file
# (0 when getLastModelNumber() returns -1). Kept as a string; it is written as the first
# column of this run's stats row.
num = str(getLastModelNumber() + 1)
num

'9'

In [14]:
# Get date and time
import datetime
import pytz

# Timestamp for the stats row: current US/Central time at minute precision, followed by a
# fixed "CT" label. strftime produces the same "YYYY-MM-DD HH:MM" text as slicing
# str(datetime) by hand, without leaving scratch globals such as `date` and `time` behind.
DATE_TIME = datetime.datetime.now(tz = pytz.timezone('US/Central')).strftime('%Y-%m-%d %H:%M') + " CT"
DATE_TIME

'2020-10-30 08:40 CT'

In [15]:
# Add line to params, then save and close
import csv

# Values in the same order as the header columns of the stats file.
stats_row = [
  num, DATE_TIME, BERT_MODEL, PRETRAINING_TYPE, SEED,
  MAX_SEQ_LEN, MASKED_LM_PROB, DUPE_FACTOR, MAX_PRED_PER_SEQ,
  BATCH_SIZE, TRAIN_STEPS, WARMUP_STEPS, LEARNING_RATE,
  MASKED_LM_ACC, MASKED_LM_LOSS, NSP_ACC, NSP_LOSS, TOTAL_LOSS, NOTES,
]

# csv.writer instead of a backslash-continued format string: the continuation kept the next
# line's indentation inside the literal (leading spaces in the learning_rate field), and the
# writer also quotes NOTES if it ever contains a comma. lineterminator="\n" keeps the same
# line endings as rows already in the file.
with open(STATS_PATH, "a", newline="") as f:
  csv.writer(f, lineterminator="\n").writerow(stats_row)