As found in the [previous notebook](01_Data_Exploration.ipynb), the column "diagnosis" is missing from the pkl files because tokenization didn't run. Let's run some experiments with the author's code to see which files and columns are missing in particular, so that we can narrow down the problem and fix it.

## Imports

In [1]:
import datetime
import shlex
import subprocess
from pathlib import Path

## Functions

In [2]:
def iso_to_posix(iso):
    """Make an ISO-format timestamp string usable as a POSIX file name.

    Every ':' and '.' in ``iso`` is replaced by '-'; all other characters
    are returned unchanged.
    """
    dash_for_separators = str.maketrans({':': '-', '.': '-'})
    return iso.translate(dash_for_separators)

def get_time():
    """Return the current local time as a file-name-safe ISO-format string."""
    return iso_to_posix(datetime.datetime.now().isoformat())

In [3]:
# Sanity check: show what a generated timestamp string looks like.
get_time()

'2024-04-08T23-49-18-711652'

## Experiments

### Pretraining CodeEmb with word2vec

In [4]:
def listify_command(command):
    """Split a command string into the argument list expected by ``subprocess``.

    The string is tokenised with POSIX shell rules (``shlex.split``): runs of
    whitespace separate arguments, and quoted arguments such as
    ``--input_path "/data/my dir"`` stay together as a single token instead of
    being cut at the inner space.

    Args:
        command: The command line as a single string.

    Returns:
        A list of argument strings; empty for an empty or all-blank command.

    Raises:
        ValueError: If ``command`` contains an unterminated quote.
    """
    return shlex.split(command)

def run_command(command):
    """Run a command and echo its output line by line while it executes.

    stderr is merged into stdout, so both streams appear interleaved in the
    order the child process writes them.

    Args:
        command: Either an argument list, or a single string, which is first
            split into an argument list with ``listify_command``.

    Returns:
        The exit status of the process, so callers can detect a failed run
        instead of having it pass silently.
    """
    if isinstance(command, str):
        command = listify_command(command)

    # The context manager closes the pipe and waits for the child on exit,
    # including when printing is interrupted part-way through.
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        bufsize=1,  # line buffered
    ) as process:
        # Stream the output as it arrives rather than after the process ends.
        for line in process.stdout:
            print(line, end='')

    return process.returncode

In [5]:
# Settings for the word2vec (`--task w2v`) pretraining run of the `codeemb` model.
input_path = "/data/DescEmb/output_predict"
data = "mimiciii"
model = "codeemb"
WORLDSIZE = 1
value_mode = "NV"
# Timestamp string that makes the checkpoint prefix differ on every execution of this cell.
run = get_time()
# NOTE(review): the recorded log below shows files ending in
# "_checkpoint_best_best.pt", i.e. the trainer appears to append "_best"
# itself, so the "_checkpoint_best" suffix here looks redundant -- confirm.
checkpoint_prefix = f"{model}_{data}_{value_mode}_run_{run}_checkpoint_best"
# The f-prefix has no placeholders here; the value is the plain string "checkpoints".
checkpoint_path = f"checkpoints"

# Command line for DescEmb's main.py. The trailing backslashes continue the
# string literal, so the resulting command contains no newlines.
command = f"python ../DescEmb/main.py \
    --distributed_world_size {WORLDSIZE} \
    --input_path {input_path} \
    --src_data {data} \
    --task w2v \
    --model {model} \
    --value_mode {value_mode} \
    --save_dir {checkpoint_path} \
    --save_prefix {checkpoint_prefix}"
# Disabled: the command is executed through the shell escape in the next cell instead.
#run_command(command)

In [6]:
# Run the command string built in the previous cell through the system shell
# (IPython expands $command to the value of the Python variable `command`).
! $command

[2024-04-08 23:37:43,213][trainers.word2vec_trainer][INFO] - epoch: 0, loss: 1.386
[2024-04-08 23:37:43,214][trainers.word2vec_trainer][INFO] - Saving checkpoint to checkpoints/codeemb_mimiciii_NV_run_2024-04-08T23-37-27-746146_checkpoint_best_best.pt
[2024-04-08 23:37:43,243][trainers.word2vec_trainer][INFO] - Finished saving checkpoint to checkpoints/codeemb_mimiciii_NV_run_2024-04-08T23-37-27-746146_checkpoint_best_best.pt
[2024-04-08 23:37:43,384][trainers.word2vec_trainer][INFO] - epoch: 1, loss: 1.386
Validation AUROC increased (0.000000 --> -1.386221)
[2024-04-08 23:37:43,385][trainers.word2vec_trainer][INFO] - Saving checkpoint to checkpoints/codeemb_mimiciii_NV_run_2024-04-08T23-37-27-746146_checkpoint_best_best.pt
[2024-04-08 23:37:43,415][trainers.word2vec_trainer][INFO] - Finished saving checkpoint to checkpoints/codeemb_mimiciii_NV_run_2024-04-08T23-37-27-746146_checkpoint_best_best.pt
[2024-04-08 23:37:43,554][trainers.word2vec_trainer][INFO] - epoch: 2, loss: 1.386
Valid

### Fine-tune a pre-trained DescEmb model

In [None]:
# NOTE(review): this cell repeats the configuration of the
# "Pretraining CodeEmb with word2vec" section unchanged (still `--task w2v`
# and `--model codeemb`) although it sits under the fine-tuning heading --
# confirm whether fine-tuning arguments were intended here.
input_path = "/data/DescEmb/output_predict"
data = "mimiciii"
model = "codeemb"
WORLDSIZE = 1
value_mode = "NV"
# Fresh timestamp string, so this run gets its own checkpoint prefix.
run = get_time()
checkpoint_prefix = f"{model}_{data}_{value_mode}_run_{run}_checkpoint_best"
# The f-prefix has no placeholders here; the value is the plain string "checkpoints".
checkpoint_path = f"checkpoints"

# The trailing backslashes continue the string literal, so the resulting
# command contains no newlines.
command = f"python ../DescEmb/main.py \
    --distributed_world_size {WORLDSIZE} \
    --input_path {input_path} \
    --src_data {data} \
    --task w2v \
    --model {model} \
    --value_mode {value_mode} \
    --save_dir {checkpoint_path} \
    --save_prefix {checkpoint_prefix}"
# Runs the command via subprocess and streams its output into the cell.
run_command(command)

In [None]:
# NOTE(review): this is a shell command template, not Python -- executing the
# cell as-is in a Python kernel raises a SyntaxError. $WORLDSIZE, $embed_model,
# $data, $ratio, $value and $task are shell-style placeholders that are not set
# as shell variables in the visible cells, and the /path/to/... arguments still
# need real values.
python main.py \
    --distributed_world_size $WORLDSIZE \
    --input_path /path/to/data \
    --model_path /path/to/model.pt \
    --transfer \
    --model ehr_model \
    --embed_model $embed_model \
    --pred_model rnn \
    --src_data $data \
    --ratio $ratio \
    --value_mode $value \
    --task $task

In [7]:
# Settings gathered into a plain dictionary; keys keep the order listed here.
# NOTE(review): several entries hold the *string* "None", not the Python value
# None. A non-empty string is truthy, so a check such as `if args.transfer:`
# would succeed -- confirm this is what the consuming code expects.
arg_dict = dict(
    distributed_world_size=1,
    input_path="None",
    model_path="None",
    save_dir="checkpoints",
    save_prefix="checkpoint",
    patience=5,
    disable_validation="None",
    src_data="None",
    eval_data="None",
    value_mode="NV",
    fold="None",
    valid_subsets="valid, test",
    task="readmission",
    seed=1,
    ratio="100",  # kept as a string, unlike the other numeric settings
    n_epochs=1000,
    lr=0.0001,
    batch_size=128,
    enc_embed_dim=128,
    enc_hidden_dim=256,
    rnn_layer=1,
    dropout=0.3,
    pred_embed_dim=128,
    pred_hidden_dim=256,
    max_event_len=150,
    mlm_prob=0.3,
    load_pretrained_weights="None",
    transfer="None",
    model="None",
    embed_model="None",
    pred_model="None",
    bert_model="bert_tiny",
    init_bert_params="None",
    init_bert_params_with_freeze="None",
)


In [8]:
# Display the dictionary to check its contents.
arg_dict

{'distributed_world_size': 1,
 'input_path': 'None',
 'model_path': 'None',
 'save_dir': 'checkpoints',
 'save_prefix': 'checkpoint',
 'patience': 5,
 'disable_validation': 'None',
 'src_data': 'None',
 'eval_data': 'None',
 'value_mode': 'NV',
 'fold': 'None',
 'valid_subsets': 'valid, test',
 'task': 'readmission',
 'seed': 1,
 'ratio': '100',
 'n_epochs': 1000,
 'lr': 0.0001,
 'batch_size': 128,
 'enc_embed_dim': 128,
 'enc_hidden_dim': 256,
 'rnn_layer': 1,
 'dropout': 0.3,
 'pred_embed_dim': 128,
 'pred_hidden_dim': 256,
 'max_event_len': 150,
 'mlm_prob': 0.3,
 'load_pretrained_weights': 'None',
 'transfer': 'None',
 'model': 'None',
 'embed_model': 'None',
 'pred_model': 'None',
 'bert_model': 'bert_tiny',
 'init_bert_params': 'None',
 'init_bert_params_with_freeze': 'None'}

In [18]:
class Argument:
    """Attribute-style view of a settings dictionary.

    Each key of ``arg_dict`` becomes an attribute of the instance, so that
    ``Argument({'lr': 0.1}).lr`` evaluates to ``0.1``.
    """

    def __init__(self, arg_dict: dict):
        """Copy every key/value pair of ``arg_dict`` onto the instance."""
        super().__init__()
        for name in arg_dict:
            setattr(self, name, arg_dict[name])

    def __repr__(self):
        """Render the instance the same way as its attribute dictionary."""
        return repr(vars(self))

In [19]:
# Wrap the settings so each key can be read as an attribute (e.g. args.task).
args = Argument(arg_dict)

In [20]:
# Display the object; Argument.__repr__ prints its attribute dictionary.
args

{'distributed_world_size': 1, 'input_path': 'None', 'model_path': 'None', 'save_dir': 'checkpoints', 'save_prefix': 'checkpoint', 'patience': 5, 'disable_validation': 'None', 'src_data': 'None', 'eval_data': 'None', 'value_mode': 'NV', 'fold': 'None', 'valid_subsets': 'valid, test', 'task': 'readmission', 'seed': 1, 'ratio': '100', 'n_epochs': 1000, 'lr': 0.0001, 'batch_size': 128, 'enc_embed_dim': 128, 'enc_hidden_dim': 256, 'rnn_layer': 1, 'dropout': 0.3, 'pred_embed_dim': 128, 'pred_hidden_dim': 256, 'max_event_len': 150, 'mlm_prob': 0.3, 'load_pretrained_weights': 'None', 'transfer': 'None', 'model': 'None', 'embed_model': 'None', 'pred_model': 'None', 'bert_model': 'bert_tiny', 'init_bert_params': 'None', 'init_bert_params_with_freeze': 'None'}