In [1]:
import os
import json
from datetime import datetime
import shutil
import subprocess
import pandas as pd
import seqeval
os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [2]:
ROOT_DIR = !pwd
ROOT_DIR = "/".join(ROOT_DIR[0].split("/")[:-1])
ROOT_DIR

'/home/gmichalo/umlsbert/UmlsBERT'

## Training Pipeline


In [3]:
downstream_dir = ROOT_DIR + "/token-classification/"
os.chdir(downstream_dir)

In [4]:
!pip install tokenizers sentencepiece sacremoses

/bin/bash: pip: command not found


In [5]:
def generate_command(config):
  command = "python3"
  command += " " + config["run_file"] + " "
  command += "--output_dir " + config["output_dir"] + " "
  command += "--model_name_or_path " + config["model_name_or_path"] + " "
  command += "--data_dir " + config["data_dir"] + " "
  command += "--num_train_epochs " + str(config["num_train_epochs"]) + " "
  command += "--per_device_train_batch_size " + str(config["per_device_train_batch_size"]) + " "
  command += "--learning_rate " + str(config["learning_rate"]) + " "
  command += "--max_seq_length " + str(config["max_seq_length"]) + " "


  if "do_train" in config:
    command += "--do_train "
  if "do_eval" in config:
    command += "--do_eval "
  if "do_predict" in config:
    command += "--do_predict "

  command += "--seed " + str(config["seed"]) + " "
  if "umls" in config:
    command += "--umls "
    command += "--med_document " + str(config["med_document"]) + " "

  command += "--labels " + config["labels"]
  command += " --save_steps 50000"

  return command

In [6]:
'''
############################################# 
Clinical/umls(both 1500000 and 3000000) BERT fine-tuning params
- Output dir: ./models/clinicalBert-v1 | ./models/umls-clinicalBert-v1
- model_name_or_path: emilyalsentzer/Bio_ClinicalBERT | ../checkpoint/clinicalbert300000
- Learning Rate: {2e-5, 3e-5, 5e-5}
- Batch size: {16, 32}
- Epochs: {20} # ner needs longer training
############################################# 
'''

# seeds = [6809, 36275, 5317, 82958, 25368] # five seeds average
seeds = [6809] # fine tuning

learning_rate_set = [2e-5, 3e-5, 5e-5]
batch_size_set = [16, 32]
epoch_set = [20]

In [7]:
path_set = [
            ("./models/i2b2_2012/clinicalBert", "emilyalsentzer/Bio_ClinicalBERT"), 
            ("./models/i2b2_2012/BertBased", "bert-base-cased")
           ]

for seed in seeds:
  for lr in learning_rate_set:
    for epoch in epoch_set:
      for batch_size in batch_size_set:
        for path in path_set:
          config = {
              "run_file"                    :     "run_ner.py",
              "labels"                      :     "dataset/NER/2006/label.txt",
              "output_dir"                  :     path[0] + "-" +  str(seed)+"-"+ datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
              "model_name_or_path"          :     path[1],
              "data_dir"                    :     "dataset/NER/2006",
              "num_train_epochs"            :     epoch,
              "per_device_train_batch_size" :     batch_size,
              "learning_rate"               :     lr,
              "max_seq_length"              :     258,
              "seed"                        :     seed,
              "do_train"                    :     True,
              "do_eval"                     :     True,
              "do_predict"                  :     True
          }

          # Run Downstream tasks with given config
          !rm dataset/NER/2006/cache*
          command = generate_command(config)
          print(command)
          subprocess.run(command, shell=True)

          # Save config to output dir
          with open(config["output_dir"] + '/fine_tune_config.json', 'w') as f:
            json.dump(config, f)
          assert "fine_tune_config.json" in os.listdir(config["output_dir"])

          # delete all checkpoints
          for path in os.listdir(config["output_dir"]):
            if path.startswith("checkpoint"):
              shutil.rmtree(config["output_dir"] + "/" +path)
            if path.startswith("pytorch_model.bin"):
              os.remove(config["output_dir"] + "/" +path)


rm: cannot remove 'dataset/NER/2006/cache*': No such file or directory
python3 run_ner.py --output_dir ./models/i2b2_2012/clinicalBert-6809-2020-09-13-23-36-57 --model_name_or_path emilyalsentzer/Bio_ClinicalBERT --data_dir dataset/NER/2006 --num_train_epochs 20 --per_device_train_batch_size 16 --learning_rate 2e-05 --max_seq_length 258 --do_train --do_eval --do_predict --seed 6809 --labels dataset/NER/2006/label.txt --save_steps 50000
python3 run_ner.py --output_dir ./models/i2b2_2012/BertBased-6809-2020-09-13-23-37-36 --model_name_or_path bert-base-cased --data_dir dataset/NER/2006 --num_train_epochs 20 --per_device_train_batch_size 16 --learning_rate 2e-05 --max_seq_length 258 --do_train --do_eval --do_predict --seed 6809 --labels dataset/NER/2006/label.txt --save_steps 50000


In [8]:
path_set = [("./models/2006-umlsbert/", "../checkpoint/umlsbert")]

for seed in seeds:
  for lr in learning_rate_set:
    for epoch in epoch_set:
      for batch_size in batch_size_set:
        for path in path_set:
          config = {
              "run_file"                    :     "run_ner.py",
              "labels"                      :     "dataset/NER/2006/label.txt",
              "output_dir"                  :     path[0] + "-" +  str(seed)+"-"+ datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
              "model_name_or_path"          :     path[1],
              "data_dir"                    :     "dataset/NER/2006",
              "num_train_epochs"            :     epoch,
              "per_device_train_batch_size" :     batch_size,
              "learning_rate"               :     lr,
              "max_seq_length"              :     258,
              "seed"                        :     seed,
              "do_train"                    :     True,
              "do_eval"                     :     True,
              "umls"                        :     True,
              "med_document"                :     "voc/vocab_updated.txt",
              "do_predict"                  :     True
          }

          # Run Downstream tasks with given config
          !rm dataset/NER/2006/cache*
          command = generate_command(config)
          print(command)
          subprocess.run(command, shell=True)

          # Save config to output dir
          with open(config["output_dir"] + '/fine_tune_config.json', 'w') as f:
            json.dump(config, f)
          assert "fine_tune_config.json" in os.listdir(config["output_dir"])

          # delete all checkpoints
          for path in os.listdir(config["output_dir"]):
            if path.startswith("checkpoint"):
              shutil.rmtree(config["output_dir"] + "/" +path)
            if path.startswith("pytorch_model.bin"):
              os.remove(config["output_dir"] + "/" +path)

python3 run_ner.py --output_dir ./models/2006-umlsbert/-6809-2020-09-13-23-38-19 --model_name_or_path ../checkpoint/umlsbert --data_dir dataset/NER/2006 --num_train_epochs 20 --per_device_train_batch_size 16 --learning_rate 2e-05 --max_seq_length 258 --do_train --do_eval --do_predict --seed 6809 --umls --med_document voc/vocab_updated.txt --labels dataset/NER/2006/label.txt --save_steps 50000
