In [1]:
import json
from datetime import datetime
import shutil
import subprocess
import pandas as pd
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [2]:
ROOT_DIR = !pwd
ROOT_DIR = "/".join(ROOT_DIR[0].split("/")[:-1])
ROOT_DIR

'/home/gmichalo/umlsbert/UmlsBERT'

## Training Pipeline


In [3]:
downstream_dir = ROOT_DIR + "/text-classification/"
os.chdir(downstream_dir)

In [4]:
!pip3 install tokenizers sentencepiece sacremoses

Collecting tokenizers
  Using cached https://files.pythonhosted.org/packages/e9/ee/fedc3509145ad60fe5b418783f4a4c1b5462a4f0e8c7bbdbda52bdcda486/tokenizers-0.8.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting sentencepiece
  Using cached https://files.pythonhosted.org/packages/68/e5/0366f50a00db181f4b7f3bdc408fc7c4177657f5bf45cb799b79fb4ce15c/sentencepiece-0.1.92-cp36-cp36m-manylinux1_x86_64.whl
Collecting sacremoses
Collecting six (from sacremoses)
  Using cached https://files.pythonhosted.org/packages/ee/ff/48bde5c0f013094d729fe4b0316ba2a24774b3ff1c52d924a8a4cb04078a/six-1.15.0-py2.py3-none-any.whl
Collecting joblib (from sacremoses)
  Using cached https://files.pythonhosted.org/packages/51/dd/0e015051b4a27ec5a58b02ab774059f3289a94b0906f880a3f9507e74f38/joblib-0.16.0-py3-none-any.whl
Collecting regex (from sacremoses)
  Using cached https://files.pythonhosted.org/packages/3e/eb/85f375a102e95cde14a184ee985a35e1a20c4ceb3fe7f57fa128a9326283/regex-2020.7.14-cp36-cp36m-manylinux1_x86_64.whl
C

In [5]:
def generate_command(config):
  command = "python3"
  command += " " + config["run_file"] + " "
  command += "--output_dir " + config["output_dir"] + " "
  command += "--model_name_or_path " + config["model_name_or_path"] + " "
  command += "--data_dir " + config["data_dir"] + " "
  command += "--num_train_epochs " + str(config["num_train_epochs"]) + " "
  command += "--per_device_train_batch_size " + str(config["per_device_train_batch_size"]) + " "
  command += "--learning_rate " + str(config["learning_rate"]) + " "

  if "do_train" in config:
    command += "--do_train "
  if "do_eval" in config:
    command += "--do_eval "
  if "do_predict" in config:
    command += "--do_predict "

  command += "--seed " + str(config["seed"]) + " "
  if "umls" in config:
    command += "--umls "
    command += "--med_document " + str(config["med_document"]) + " "

  command += "--task_name " + config["task_name"]
  command += " --save_steps 5000"
  return command

In [6]:
'''
############################################# 
Clinical/umls(both 1500000 and 3000000) BERT fine-tuning params
- Output dir: ./models/clinicalBert-v1 | ./models/umls-clinicalBert-v1
- model_name_or_path: emilyalsentzer/Bio_ClinicalBERT | ../checkpoint/clinicalbert300000
- Learning Rate: {2e-5, 3e-5, 5e-5}
- Batch size: {16, 32}
- Epochs: {3, 4}
############################################# 
'''

# seeds = [6809, 36275, 5317, 82958, 25368] # five seeds average
seeds = [6809] # fine tuning

learning_rate_set = [2e-5, 3e-5, 5e-5]
batch_size_set = [16, 32]
epoch_set = [3, 4]

### Fine-tune Clinical BERT and BERT

In [7]:
path_set = [
    ("./models/mednli/clinicalBert", "emilyalsentzer/Bio_ClinicalBERT"), 
    ("./models/mednli/BertBased", "bert-base-cased")
]

for seed in seeds:
    for lr in learning_rate_set:
        for epoch in epoch_set:
            for batch_size in batch_size_set:
                for path in path_set:
                    config = {
                    "run_file"                    :     "run_glue.py",
                    "output_dir"                  :     path[0] + "-" +  str(seed)+"-"+ datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
                    "model_name_or_path"          :     path[1],
                    "data_dir"                    :     "dataset/mednli/mednli",
                    "num_train_epochs"            :     epoch,
                    "seed"                        :     seed,
                    "per_device_train_batch_size" :     batch_size,
                    "learning_rate"               :     lr,
                    "do_train"                    :     True,
                    "do_eval"                     :     True,
                    "do_predict"                  :     True,
                    "task_name"                   :     "mnli"
                    }

                    # Run Downstream tasks with given config
                    !rm dataset/mednli/mednli/cache*
                    command = generate_command(config)
                    print(command)
                    subprocess.call(command, shell=True)

                    # Save config to output dir
                    with open(config["output_dir"] + '/fine_tune_config.json', 'w') as f:
                        json.dump(config, f)
                    assert "fine_tune_config.json" in os.listdir(config["output_dir"])

                    # delete all checkpoints
                    for path in os.listdir(config["output_dir"]):
                        if path.startswith("checkpoint"):
                            shutil.rmtree(config["output_dir"] + "/" +path)
                        if path.startswith("pytorch_model.bin"):
                            os.remove(config["output_dir"] + "/" +path)


rm: cannot remove 'dataset/mednli/mednli/cache*': No such file or directory
python3 run_glue.py --output_dir ./models/mednli/clinicalBert-6809-2020-09-13-23-26-53 --model_name_or_path emilyalsentzer/Bio_ClinicalBERT --data_dir dataset/mednli/mednli --num_train_epochs 3 --per_device_train_batch_size 32 --learning_rate 5e-05 --do_train --do_eval --do_predict --seed 6809 --task_name mnli --save_steps 5000
python3 run_glue.py --output_dir ./models/mednli/BertBased-6809-2020-09-13-23-27-47 --model_name_or_path bert-base-cased --data_dir dataset/mednli/mednli --num_train_epochs 3 --per_device_train_batch_size 32 --learning_rate 5e-05 --do_train --do_eval --do_predict --seed 6809 --task_name mnli --save_steps 5000


### Fine-tune UMLS Bert

In [8]:
path_set = [("./models/mednli/umlsbert", "../checkpoint/umlsbert")]

for seed in seeds:
  for lr in learning_rate_set:
    for epoch in epoch_set:
      for batch_size in batch_size_set:
        for path in path_set:
          config = {
              "run_file"                    :     "run_glue.py",
              "output_dir"                  :     path[0] + "-" +  str(seed)+"-"+ datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
              "model_name_or_path"          :     path[1],
              "data_dir"                    :     "dataset/mednli/mednli",
              "num_train_epochs"            :     epoch,
              "seed"                        :     seed,
              "per_device_train_batch_size" :     batch_size,
              "learning_rate"               :     lr,
              "do_train"                    :     True,
              "do_eval"                     :     True,
              "do_predict"                  :     True,
              "umls"                        :     True,
              "med_document"                :     "voc/vocab_updated.txt",
              "task_name"                   :     "mnli"
          }
          
          # Run Downstream tasks with given config
          !rm dataset/mednli/mednli/cache*
          command = generate_command(config)
          print(command)
          subprocess.run(command, shell=True)

          # Save config to output dir
          with open(config["output_dir"] + '/fine_tune_config.json', 'w') as f:
            json.dump(config, f)
          assert "fine_tune_config.json" in os.listdir(config["output_dir"])

          # delete all checkpoints
          for path in os.listdir(config["output_dir"]):
            if path.startswith("checkpoint"):
              shutil.rmtree(config["output_dir"] + "/" +path)
            if path.startswith("pytorch_model.bin"):
              os.remove(config["output_dir"] + "/" +path)

python3 run_glue.py --output_dir ./models/mednli/umlsbert-6809-2020-09-13-23-28-41 --model_name_or_path ../checkpoint/umlsbert --data_dir dataset/mednli/mednli --num_train_epochs 3 --per_device_train_batch_size 32 --learning_rate 5e-05 --do_train --do_eval --do_predict --seed 6809 --umls --med_document voc/vocab_updated.txt --task_name mnli --save_steps 5000
