In [1]:
!pip install datasets -q
!pip install transformers -q -U
!pip install wandb -q -U

In [2]:
import os
import pandas as pd
import os
from collections import defaultdict
import datasets
import sys, importlib
import random
import shutil
# Show the full contents of each column when a DataFrame is displayed
# (a max_colwidth of None disables truncation of long text values).
pd.set_option('display.max_colwidth', None)

In [46]:
# Root directory for non-sensitive artifacts read and written by this notebook.
DATA_HOME = './misc-data'
# THE DIRECTORY WHERE YOUR PATIENT DATA IS STORED.
# Set the INJURY_ICD_SECRET_HOME environment variable to point at your own copy;
# when it is unset (or empty) the original author's local path is used as a fallback.
SECRET_HOME = os.environ.get('INJURY_ICD_SECRET_HOME') or '/Users/yifu/Documents/Stanford/Academics/Papers/TraumaICD-Paper/injury-icd-dataset/'
# Directory containing the ICD code vocabulary files.
VOCAB_HOME = os.path.join(DATA_HOME, 'icd-codes')
SECRET_PATH = os.path.join(DATA_HOME, 'injury-icd-dataset')
DS_HOME_PRETRAIN = os.path.join(DATA_HOME, 'pretrain', 'injury-icd-dataset')
PLOT_HOME = os.path.join(DATA_HOME, "publishing", "figures")
# YOUR OWN PATIENT DATA CSV FILE HERE. The columns this notebook reads from it are:
# patient_id, icd_code, icd_name, diagnosis_region, ais_code, ais_name,
# tertiary_exam, tertiary_imaging_report, tertiary_impression.
RAW_DATA_PATH = os.path.join(SECRET_HOME, "injury_icd_dataset.csv")
# When True, the label-definition cell is meant to drop superficial-injury codes.
EXCLUDE_SUPERFICIAL = False

In [4]:
### READ THE INJURY CODE VOCABULARY ###

icd10_concepts = pd.read_csv(os.path.join(VOCAB_HOME, "injury_codes_ICD10CM.csv"), low_memory=False)

# Keep concepts whose code starts with "S" and is at most 5 characters long.
# The vectorized .str accessors yield NaN for missing or non-string codes, which
# is treated as "no match" here instead of raising inside a per-row lambda.
concept_codes = icd10_concepts["concept_code"]
is_short_s_code = concept_codes.str.len().le(5) & concept_codes.str.startswith("S", na=False)
injuries = icd10_concepts[is_short_s_code]
injuries.to_csv(os.path.join(VOCAB_HOME, 'injury_ICD10.csv'))

# Subset of those concepts whose class is the 4-character non-billable code level.
injuries_4_char = injuries[injuries.concept_class_id == '4-char nonbill code']
injuries_4_char.to_csv(os.path.join(VOCAB_HOME, 'injury_ICD10_4_char.csv'))

In [38]:
### READ THE ANNOTATED PATIENT DATA ###

# Malformed CSV lines are skipped instead of aborting the load.
raw_data = pd.read_csv(RAW_DATA_PATH, on_bad_lines='skip')

# Annotation rows (patient + ICD code); rows without an ICD code are dropped.
case_icd_codes = raw_data[['patient_id', 'icd_code', 'icd_name', 'diagnosis_region', 'ais_code', 'ais_name']].copy()
case_icd_codes = case_icd_codes[case_icd_codes.icd_code.notnull()]
# The first 5 characters of the code are used as the 4-char-level code (e.g. "S00.0").
case_icd_codes["icd_code_4_char"] = case_icd_codes.icd_code.apply(lambda x: str(x)[:5])
# NOTE(review): this truncates the *name* to its first 5 characters, which looks like a
# copy of the line above rather than a real 4-char-level name — confirm the intent.
case_icd_codes["icd_name_4_char"] = case_icd_codes.icd_name.apply(lambda x: str(x)[:5])
# Keep only annotations whose 4-char code exists in the injury vocabulary. The join key
# is named explicitly so the merge cannot silently pick up other same-named columns.
case_icd_codes = case_icd_codes.merge(
    injuries_4_char.rename(columns={'concept_code': 'icd_code_4_char'}),
    how='inner',
    on='icd_code_4_char',
)

# Keep records that have text in the tertiary impression or the tertiary exam.
# read_csv loads empty fields as NaN and `NaN != ''` is True, so comparing against ''
# would keep every row; test for missing/blank values explicitly instead.
text_columns = ['tertiary_exam', 'tertiary_imaging_report', 'tertiary_impression']
impression_has_text = raw_data.tertiary_impression.fillna('').astype(str).str.strip().ne('')
exam_has_text = raw_data.tertiary_exam.fillna('').astype(str).str.strip().ne('')
cases = raw_data.loc[impression_has_text | exam_has_text, ['patient_id'] + text_columns].copy()

# Combined character count of the three text fields; a missing field contributes 0
# (str(NaN) would otherwise add the 3 characters of "nan").
cases['total_text_len'] = sum(cases[col].fillna('').astype(str).str.len() for col in text_columns)

In [51]:
### EXPLORATORY DATA ANALYSIS ###

# Number of annotation rows (with a non-null patient_id) per 4-char ICD code,
# restricted to codes that occur more than 5 times, then saved for inspection.
n_cases_by_4_char_code = (
    case_icd_codes
    .groupby('icd_code_4_char')['patient_id']
    .count()
    .reset_index(name='n_cases')
    .loc[lambda counts: counts['n_cases'] > 5]
)
counts_csv_path = os.path.join(DATA_HOME, 'n_cases_by_4_char_code.csv')
n_cases_by_4_char_code.to_csv(counts_csv_path)

In [47]:
### FILTER & DEFINE GROUND TRUTH LABELS ###
if EXCLUDE_SUPERFICIAL:
    # The 'uperficial' match must run against code descriptions: icd_code_4_char values
    # are at most 5 characters long, so matching that text on the codes excludes nothing.
    # Assumes the vocabulary frame has a `concept_name` description column — TODO confirm.
    if 'concept_name' not in injuries_4_char.columns:
        raise KeyError("injuries_4_char has no 'concept_name' column to identify superficial-injury codes")
    is_superficial = injuries_4_char.concept_name.astype(str).str.contains('uperficial')
    superficial_codes = set(injuries_4_char.loc[is_superficial, 'concept_code'])
    ground_truth_labels = n_cases_by_4_char_code[~n_cases_by_4_char_code.icd_code_4_char.isin(superficial_codes)].icd_code_4_char.tolist()
else:
    ground_truth_labels = n_cases_by_4_char_code.icd_code_4_char.tolist()

# Persist the label set, one code per line.
with open(os.path.join(VOCAB_HOME, 'label.txt'), 'w') as f:
    f.write('\n'.join(ground_truth_labels))

In [58]:
### SPLIT PATIENTS INTO TRAIN, VALIDATION, TEST SETS ###

# Shuffle the rows with a fixed seed, then de-duplicate: the result is a reproducible
# random ordering of the distinct patient ids.
patient_ids = cases.patient_id.sample(frac=1, random_state=42).unique()

train_ratio, validation_ratio, test_ratio = 0.7, 0.15, 0.15

# Slice boundaries in the shuffled id array; the test split takes whatever remains.
train_end = int(len(patient_ids)*train_ratio)
validation_end = int(len(patient_ids)*(train_ratio+validation_ratio))

train = patient_ids[:train_end]
validation = patient_ids[train_end:validation_end]
test = patient_ids[validation_end:]

# Verify that no patient appears in more than one split BEFORE anything is written,
# using set operations instead of a linear array scan per patient.
train_ids, validation_ids, test_ids = set(train), set(validation), set(test)
assert train_ids.isdisjoint(validation_ids)
assert train_ids.isdisjoint(test_ids)
assert validation_ids.isdisjoint(test_ids)

# Write one patient id per line to <DATA_HOME>/<split>.txt.
os.makedirs(DATA_HOME, exist_ok=True)
for split_name, split_ids in (('train', train), ('validation', validation), ('test', test)):
  with open(os.path.join(DATA_HOME, split_name + '.txt'), 'w') as f:
    f.write('\n'.join([str(x) for x in split_ids]))

In [None]:
### DEFINE TRAINING HYPERPARAMETERS ###
# These values are interpolated into the `train.py` command line in the next cell.
# The trailing `#@param` comments look like Colab form-field annotations; keep each
# one on the same line as its assignment.

valid_labels = "4-char"                          #@param ["4-char", "4-char-top50", "4-char-top10", "5-char", "4-and-5-char"]
experiment_name = "4-char-with-superficial" #@param {type:"string",  allow-input: true} ["non-sup", "non-sup-pretrain", "non-sup-tune-after-pretrain", "non-sup-4and5-char-train-on-full", "non-sup-train-on-full", "4-char-with-superficial"]
model_name = "michiyasunaga/BioLinkBERT-base"         #@param ["michiyasunaga/BioLinkBERT-base", "michiyasunaga/BioLinkBERT-large", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"]
metric_for_best_model = "eval_f1_score_weighted"          #@param ["eval_f1_score_macro", "eval_f1_score_micro", "eval_f1_score_weighted", "eval_auc_score_macro", "eval_auc_score_micro", "eval_auc_score_weighted"]
# NOTE(review): num_epochs stays a string; unlike the values below it is never cast.
num_epochs = "20"                 #@param [6, 10, 20, 30]
# Whether to train on both train and val dataset, and do final eval on holdout test set
train_on_full = True            #@param {type:"boolean"} 
evaluate_only = False           #@param {type:"boolean"}
learning_rate = "0.00002"        #@param [2e-5, 1e-5, 7e-6, 2e-6]
warmup_steps =  5000            #@param [1000, 2000, 3000, 5000]
per_device_train_batch_size = "16" #@param [16, 8, 6, 4]
per_device_eval_batch_size = "32" #@param [32, 16, 12, 8]
# Convert the string-valued fields above to numeric types.
learning_rate = float(learning_rate)
warmup_steps = int(warmup_steps)
per_device_train_batch_size = int(per_device_train_batch_size)
per_device_eval_batch_size = int(per_device_eval_batch_size)

# Print a batch-size hint chosen by whether the model name contains "base".
if "base" in model_name:
  print("The suggested batch size for train and eval of base-sized model on P100 GPU is 16 and 32")
else:
  print("The suggested batch size for train and eval of large-sized model on P100 GPU is 6 and 12")

In [59]:
# Launch train.py through the shell; each `$name` refers to a notebook variable that
# IPython substitutes into the command line before it is executed.
# NOTE(review): DS_HOME and MODEL_HOME are not assigned in any cell shown in this
# notebook (only DS_HOME_PRETRAIN is) — confirm they are defined before running.
# NOTE(review): the recorded output below shows train.py was not found in the working
# directory — confirm the notebook is run from the directory that contains train.py.
!python train.py --model_name=$model_name \
                 --data_dir=$DS_HOME \
                 --model_dir=$MODEL_HOME \
                 --experiment_name=$experiment_name \
                 --valid_labels=$valid_labels \
                 --is_evaluate=$evaluate_only \
                 --train_on_full=$train_on_full \
                 --num_train_epochs=$num_epochs \
                 --metric_for_best_model=$metric_for_best_model \
                 --learning_rate=$learning_rate \
                 --warmup_steps=$warmup_steps \
                 --per_device_train_batch_size=$per_device_train_batch_size \
                 --per_device_eval_batch_size=$per_device_eval_batch_size


python: can't open file '/Users/yifu/Documents/Stanford/Academics/Papers/TraumaICD-Paper/TraumaICDBERT/train.py': [Errno 2] No such file or directory
