In [2]:
import json
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# Path to the raw prediction file analysed in this notebook (the `mimic3_valid` evaluation run).
# NOTE: the spelling "infernece" is kept on purpose; it matches the `--infernece_result_path`
# flag passed to T5/abstain_with_entropy.py in the shell cells further down.
infernece_result_path = './outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction_raw.json'
# Alternative input (the `t5_base_schema` run); uncomment to analyse that run instead.
# infernece_result_path = './outputs/eval_ehrsql_mimic3_t5_base_schema__mimic3_valid/prediction_raw.json'

In [4]:
# NOTE(review): num_workers is not referenced anywhere in the visible notebook — confirm before removing.
num_workers = -1

# JSON text is UTF-8 by specification; decode it explicitly instead of relying on
# the platform's locale encoding (which differs between Linux, macOS and Windows).
with open(infernece_result_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f'[result] {len(data)} lines loaded')

# Parallel lists with one entry per example, in the key order of the JSON object.
data_id = []      # example keys of the prediction file
query_real = []   # value of the 'real' field
query_pred = []   # value of the 'pred' field
entropy = []      # largest value in each example's 'sequence_entropy' list
impossible = []   # value of the 'is_impossible' field
for idx_, line in data.items():
    data_id.append(idx_)
    query_real.append(line['real'])
    query_pred.append(line['pred'])
    # Score each example by its single most uncertain step.
    entropy.append(max(line['sequence_entropy']))
    impossible.append(line['is_impossible'])

[result] 1122 lines loaded


# k-means

In [4]:
def get_threshold_kmeans(entropy):
    """Return a threshold that separates the scores into two k-means clusters.

    The scores are clustered into two groups with 1-D k-means. The threshold
    is the midpoint of the gap between the groups, i.e. halfway between the
    largest score of the lower cluster and the smallest score of the upper
    cluster.

    Args:
        entropy: 1-D sequence of numeric scores, one per example.

    Returns:
        The midpoint between the two clusters.

    Raises:
        ValueError: if the scores cannot be split into two non-empty
            clusters (for example when every score is identical).
    """
    # Convert once; the original rebuilt the array for every min/max lookup.
    scores = np.asarray(entropy, dtype=float)

    # n_init is pinned to 10 (the historical scikit-learn default) so that the
    # threshold does not drift on releases where the default became 'auto'.
    kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(scores[:, np.newaxis])

    labels = np.asarray(kmeans.labels_)
    cluster_zero = scores[labels == 0]
    cluster_one = scores[labels == 1]
    if cluster_zero.size == 0 or cluster_one.size == 0:
        raise ValueError(
            'k-means produced an empty cluster; at least two distinct entropy '
            'values are required to derive a threshold'
        )

    # Cluster ids are arbitrary, so decide which cluster is the upper one by
    # comparing their maxima.
    if cluster_one.max() > cluster_zero.max():
        lower, upper = cluster_zero, cluster_one
    else:
        lower, upper = cluster_one, cluster_zero

    return (lower.max() + upper.min()) / 2

In [5]:
# k-means threshold for the per-example maximum entropies loaded from the prediction file.
get_threshold_kmeans(entropy)

0.744074136018753

In [6]:
# mimic3 k-means: 0.744074136018753
# mimic3_schema k-means: 0.8807361125946045

# percentile

In [7]:
# 67th percentile of the per-example maximum entropies; used below as an alternative threshold.
# NOTE(review): q=67.0 is a magic number — presumably chosen to match the share of answerable
# questions (precision_ans is 67.74 in the recorded --threshold -1 run); confirm.
np.percentile(entropy, q=[67.0])

array([0.14923561])

In [8]:
# mimic3 67th percentile: 0.14923561
# mimic3_schema 67th percentile: 0.09932577

# choosing a threshold

In [9]:
# Baseline: run the abstention script with --threshold -1, then score the result with evaluate.py.
# NOTE(review): -1 presumably disables abstention (recall_ans is 100.0 in the recorded output) —
# confirm in T5/abstain_with_entropy.py.
# NOTE(review): evaluate.py reads prediction.json from the same output directory, presumably the
# file the first command just wrote, so every re-run of this pair replaces the previous result.
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold -1
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 67.74,
  "recall_ans": 100.0,
  "f1_ans": 80.77,
  "precision_exec": 65.24,
  "recall_exec": 96.32,
  "f1_exec": 77.79
}


In [10]:
# Same abstain-then-evaluate pair, using the k-means threshold (0.744074136018753) computed
# earlier in this notebook by get_threshold_kmeans(entropy).
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.744074136018753
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 83.84,
  "recall_ans": 97.63,
  "f1_ans": 90.21,
  "precision_exec": 82.03,
  "recall_exec": 95.53,
  "f1_exec": 88.27
}


In [11]:
# Same abstain-then-evaluate pair, using the 67th-percentile threshold (0.14923561) computed
# earlier in this notebook with np.percentile.
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.14923561
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 94.81,
  "recall_ans": 93.82,
  "f1_ans": 94.31,
  "precision_exec": 93.88,
  "recall_exec": 92.89,
  "f1_exec": 93.39
}


In [None]:
# high-precision setting

In [8]:
# Same abstain-then-evaluate pair with a hand-picked threshold of 0.1, which is below the
# 67th-percentile value (0.14923561) tried earlier.
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.1
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 95.82,
  "recall_ans": 93.42,
  "f1_ans": 94.6,
  "precision_exec": 95.14,
  "recall_exec": 92.76,
  "f1_exec": 93.94
}


In [12]:
# Manual threshold sweep, step 1: --threshold 0.000050 (recorded precision_exec: 98.46).
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.000050
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 98.46,
  "recall_ans": 8.42,
  "f1_ans": 15.52,
  "precision_exec": 98.46,
  "recall_exec": 8.42,
  "f1_exec": 15.52
}


In [13]:
# Manual threshold sweep, step 2: --threshold 0.000040 (recorded precision_exec: 97.78).
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.000040
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 97.78,
  "recall_ans": 5.79,
  "f1_ans": 10.93,
  "precision_exec": 97.78,
  "recall_exec": 5.79,
  "f1_exec": 10.93
}


In [5]:
# Manual threshold sweep, step 3: --threshold 0.000030 (recorded precision_exec: 100.0).
!python T5/abstain_with_entropy.py --infernece_result_path outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid --input_file prediction_raw.json --output_file prediction.json --threshold 0.000030
!python evaluate.py --db_path ./dataset/ehrsql/mimic_iii/mimic_iii.db --data_file dataset/ehrsql/mimic_iii/valid.json --pred_file ./outputs/eval_ehrsql_mimic3_t5_base__mimic3_valid/prediction.json

{
  "precision_ans": 100.0,
  "recall_ans": 3.68,
  "f1_ans": 7.11,
  "precision_exec": 100.0,
  "recall_exec": 3.68,
  "f1_exec": 7.11
}


In [15]:
# precision_exec is 100.0 on the validation set, which satisfies the >= 99.0 requirement.
# Set the threshold to 0.000030; the model is now ready to be evaluated on the test set.