In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

## initialize brat reader

In [2]:
cleaned_train_dir=r'..\data\n2c2\cleaned_training'
cleaned_test_dir=r'..\data\n2c2\cleaned_test'
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [3]:
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [4]:
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

## Read eHOST annotations | load from pickles

In [5]:
pickle_file= r'..\data\n2c2\spacy_docs.joblib'

In [6]:
if not Path(pickle_file).exists():
    train_docs=dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs=dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)
else:
    print(f'{pickle_file} already exists, load them directly')
    # before load from pickle, initiate EhostDirReader or EhostDocReader first, because some Doc extension used to store meta data will not be automatically recreated by loading.
    train_docs, test_docs=joblib.load(pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


## define sampling function

In [7]:
rounds =10
seed= 14

In [8]:
len(train_docs), len(test_docs)

(303, 202)

## CRF Wrapper (only use for eval)

In [9]:
from CRFWrapper_Sentence import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel


In [10]:
## Get all annotation types: 
annos=set()
for d in train_docs:
    for anno in d.spans.keys():
        annos.add(anno)
print(annos)

{'Frequency', 'Duration', 'Form', 'ADE', 'Drug', 'Strength', 'Dosage', 'Route', 'Reason'}


In [11]:
crf_model=CRFModel(anno_types=annos)

## converting docs into sentence level dataframe

In [12]:
from ALLSampler_Sentence import SamplingSimulator, ModelSamplingSimulator, VBSamplingSimulator, convert_docs_medspacyIOvec

In [13]:
sdf_labels_train=convert_docs_medspacyIOvec(train_docs)

In [14]:
_, train_df=convert_docs(train_docs, anno_types=annos)

In [15]:
_, test_df=convert_docs(test_docs, anno_types=annos)

In [16]:
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib'#r'..\data\n2c2\embedding_df.joblib'
embedding_df=joblib.load(pickle_embedding_file)

In [17]:
embedding_df

Unnamed: 0,sentence_id,sentence,embedding
0,0,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
3,1,[**2115-2-22**] Discharge Date: ...,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
35,2,[**2078-8-9**] Sex: M\n\nService...,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
113,3,[**Known lastname 3234**] is a 36 year old gen...,"[0.023170307, 0.03989108, 0.026217388, -0.0272..."
163,4,The patient initially presented to LGH ED with...,"[0.008176211, -0.06342948, 0.048615105, -0.045..."
...,...,...,...
3059,929923,"Cyanocobalamin 1,000 mcg/mL Injection once a m...","[0.050521564, -0.08905716, -0.0019493615, -0.0..."
3071,929924,"Lorazepam 0.25 QAM, O.25 QPM, 0.5 mg QHS\n8 Ca...","[-0.030010266, -0.062390286, 0.00167252, 0.016..."
3086,929925,Cream Topical TID\n9.,"[0.026732022, -0.04987913, 0.024520764, -0.016..."
3092,929926,Acetaminophen 1000 mg PO Q6H\n10.,"[-0.017295217, -0.10513715, -0.0030776137, -0...."


In [18]:
sdf_labels_train

Unnamed: 0,sentence,concept,y,doc_name
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt
...,...,...,...,...
90405,[**Name (NI) **],,NEG,198406.txt
90406,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt
90407,[**Telephone/Fax (1) 92788**](C)\n\n\nMedicati...,,NEG,198406.txt
90408,3.,,NEG,198406.txt


In [19]:
train_df

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,0,100035.txt,Admission,O,Admission Date:
1,0,100035.txt,Date,O,Admission Date:
2,0,100035.txt,:,O,Admission Date:
3,1,100035.txt,[,O,[**2115-2-22**] Discharge Date: ...
4,1,100035.txt,*,O,[**2115-2-22**] Discharge Date: ...
...,...,...,...,...,...
3151,929927,198406.txt,Followup,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3152,929927,198406.txt,Instructions,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3153,929927,198406.txt,:,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3154,929927,198406.txt,\n,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...


In [20]:
train_df_embedding = train_df.merge(embedding_df, how='inner', on='sentence_id')

In [21]:
train_df_embedding

Unnamed: 0,sentence_id,doc_name,token,label,sentence_x,sentence_y,embedding
0,0,100035.txt,Admission,O,Admission Date:,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
1,0,100035.txt,Date,O,Admission Date:,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
2,0,100035.txt,:,O,Admission Date:,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
3,1,100035.txt,[,O,[**2115-2-22**] Discharge Date: ...,[**2115-2-22**] Discharge Date: ...,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
4,1,100035.txt,*,O,[**2115-2-22**] Discharge Date: ...,[**2115-2-22**] Discharge Date: ...,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
...,...,...,...,...,...,...,...
932901,929927,198406.txt,Followup,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,"[0.02337662, 0.021309359, 0.039473698, -0.0429..."
932902,929927,198406.txt,Instructions,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,"[0.02337662, 0.021309359, 0.039473698, -0.0429..."
932903,929927,198406.txt,:,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,"[0.02337662, 0.021309359, 0.039473698, -0.0429..."
932904,929927,198406.txt,\n,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...,"[0.02337662, 0.021309359, 0.039473698, -0.0429..."


In [22]:
sdf_labels_sid = sdf_labels_train.merge(embedding_df, how='inner', on='sentence') 

In [23]:
sdf_labels_sid

Unnamed: 0,sentence,concept,y,doc_name,sentence_id,embedding
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
...,...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907,"[0.02065785, -0.06587324, 0.055154495, 0.01074..."
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910,"[0.030450102, -0.042418838, 0.00325665, 0.0384..."
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911,"[0.023342747, 0.013347558, -0.01095362, -0.052..."
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914,"[0.02003492, 0.029056935, -0.0140215475, 0.007..."


In [24]:
train_sentID_list = train_df['sentence_id'].to_list()
train_sentID_set = set(train_sentID_list)
train_sentID_uniqList = list(train_sentID_set)
test_sentID_list = test_df['sentence_id'].to_list()
test_sentID_set = set(test_sentID_list)
test_sentID_uniqList = list(test_sentID_set)

In [25]:
print(len(train_sentID_uniqList), len(test_sentID_uniqList))

51798 34334


In [26]:
int(1.0*len(train_sentID_uniqList)/10)

5179

In [27]:
sdf_labels_sid#[['sentence','concept', 'y', 'doc_name_x','sid']]

Unnamed: 0,sentence,concept,y,doc_name,sentence_id,embedding
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
...,...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907,"[0.02065785, -0.06587324, 0.055154495, 0.01074..."
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910,"[0.030450102, -0.042418838, 0.00325665, 0.0384..."
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911,"[0.023342747, 0.013347558, -0.01095362, -0.052..."
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914,"[0.02003492, 0.029056935, -0.0140215475, 0.007..."


In [62]:
max(sdf_labels_sid['sid'])

47529

In [63]:
max(train_df['sentence_id'].to_list())

929927

In [22]:
train_df

Unnamed: 0,sentence_id,doc_name,token,label
0,0,100035.txt,Admission,O
1,0,100035.txt,Date,O
2,0,100035.txt,:,O
3,1,100035.txt,[,O
4,1,100035.txt,*,O
...,...,...,...,...
3151,929927,198406.txt,Followup,O
3152,929927,198406.txt,Instructions,O
3153,929927,198406.txt,:,O
3154,929927,198406.txt,\n,O


In [57]:
len(set(train_df['sentence_id'].to_list()))

51798

In [59]:
max(train_df['sentence_id'].to_list())

929927

In [26]:
#list(set(train_df['sentence_id'].to_list()))

In [24]:
sdf_labels_train

Unnamed: 0,sentence,concept,y,doc_name
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt
...,...,...,...,...
90405,[**Name (NI) **],,NEG,198406.txt
90406,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt
90407,[**Telephone/Fax (1) 92788**](C)\n\n\nMedicati...,,NEG,198406.txt
90408,3.,,NEG,198406.txt


In [25]:
embedding_df

Unnamed: 0,sid,sentence,doc_name,embedding
0,0,Admission Date:,100035.txt,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
1,1,[**2115-2-22**] Discharge Date: ...,100035.txt,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
2,2,[**2078-8-9**] Sex: M\n\nService...,100035.txt,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
3,3,[**Known lastname 3234**] is a 36 year old gen...,100035.txt,"[0.023170307, 0.03989108, 0.026217388, -0.0272..."
4,4,The patient initially presented to LGH ED with...,100035.txt,"[0.008176211, -0.06342948, 0.048615105, -0.045..."
...,...,...,...,...
47525,47525,"Cyanocobalamin 1,000 mcg/mL Injection once a m...",198406.txt,"[0.050521564, -0.08905716, -0.0019493615, -0.0..."
47526,47526,"Lorazepam 0.25 QAM, O.25 QPM, 0.5 mg QHS\n8 Ca...",198406.txt,"[-0.030010268, -0.062390275, 0.0016725484, 0.0..."
47527,47527,Cream Topical TID\n9.,198406.txt,"[0.026732022, -0.04987913, 0.024520764, -0.016..."
47528,47528,Acetaminophen 1000 mg PO Q6H\n10.,198406.txt,"[-0.017295217, -0.10513715, -0.0030776137, -0...."


## sampling simulator

In [28]:
faiss_index_path= r'..\data\n2c2\faiss_index_st768'

In [29]:
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib'# r'..\data\n2c2\embedding_df.joblib'
#sentence	concept	y	doc_name	sentence_id	embedding
embedding_df=joblib.load(pickle_embedding_file)
vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=10, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=seed, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels=sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']],
                                 min_dist_diff=True
                                )

[32m2024-05-10 15:17:18.121[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m221[0m - [34m[1mLoading index...[0m
[32m2024-05-10 15:17:18.938[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m223[0m - [34m[1mdone[0m


## test run

In [25]:
scores=vb_simulator.simulate_rounds(boostrap_times=3)

[32m2024-04-26 16:26:50.340[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 0.[0m
[32m2024-04-26 16:26:50.342[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m75[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-04-26 16:26:50.749[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-04-26 16:26:52.840[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m284[0m - [34m[1mReset and train CRF model...[0m
[32m2024-04-26 16:27:54.528[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m292[0m - [34m[1mTraining complete.[0m
[32m2024-04-26 16:27:54.620[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstrap_eval_DFsent[0m:[36m352[0m - [34m[1mPredicting eval sents...[0m
[32m2024-04

In [26]:
def compute_mean_ci(scores):
    ave=np.mean(scores)
    ci=np.percentile(scores, [2.5, 97.5])
    return ave, ci

summary={'precision': [], 'pl':[], 'pu': [], 'recall': [], 'rl':[], 'ru': [], 'f1':[], 'fl':[], 'fu': []}
for s in scores:    
    for k,v in s.items():
        ave, (l, u)=compute_mean_ci(v)
        summary[k].append(ave)
        summary[k[0]+'l'].append(l)
        summary[k[0]+'u'].append(u)

In [27]:
pd.options.display.float_format='{:,.5f}'.format
pd.DataFrame(summary)

Unnamed: 0,precision,pl,pu,recall,rl,ru,f1,fl,fu
0,0.94531,0.9427,0.94725,0.77002,0.7682,0.77201,0.84871,0.84658,0.85026
1,0.94039,0.93887,0.94243,0.77318,0.7692,0.77592,0.84862,0.84565,0.85037
2,0.9438,0.94254,0.94482,0.77315,0.76866,0.7774,0.84999,0.84736,0.85294
3,0.94651,0.94501,0.94757,0.77203,0.76746,0.77531,0.85041,0.84704,0.85262
4,0.94268,0.94131,0.94346,0.77167,0.76674,0.77575,0.84864,0.84594,0.85055
5,0.94405,0.94231,0.94527,0.76865,0.76422,0.77249,0.84736,0.84493,0.849
6,0.94495,0.94232,0.9478,0.77309,0.77217,0.77455,0.85042,0.84887,0.85126
7,0.94517,0.94283,0.94774,0.77105,0.76978,0.77258,0.84928,0.84756,0.85124
8,0.94493,0.94294,0.94814,0.77255,0.76759,0.77913,0.85008,0.84649,0.85536
9,0.94562,0.94501,0.94632,0.85615,0.85242,0.85917,0.89866,0.89635,0.90029


## bootstrap 3 runs

In [30]:
logger.remove()
logger.add(sys.stderr, level='INFO')

1

In [31]:
boostrap_runs=3
total_round=10

In [32]:
random.seed(14)
seeds=[random.randint(1,10000000) for  _ in range(boostrap_runs)]
seeds

[1792286, 8843471, 4142887]

In [33]:
all_scores=[]
embedding_df=joblib.load(pickle_embedding_file)
for si, seed  in enumerate(seeds):
    logger.info(f'start run {si}.')
    pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib' #r'..\data\n2c2\embedding_df.joblib'
    crf_model=CRFModel(anno_types=annos)
    vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=10, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=seed, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels=sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']],
                                 min_dist_diff=True
                                )
    #v_simulator=VBSamplingSimulator(train_docs, 
    #                                total_round=10, 
    #                                modelWrapper=crf_model, 
    #                                eval_docs=test_docs, 
    #                                init_seed=seed, 
    #                                faiss_index_path=faiss_index_path, 
    #                                embedding_df=embedding_df, 
    #                                min_dist_diff=True)
    scores=vb_simulator.simulate_rounds(boostrap_times=200)
    all_scores.append(scores) 

[32m2024-05-10 15:19:53.534[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mstart run 0.[0m
[32m2024-05-10 15:19:54.340[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 0.[0m
[32m2024-05-10 15:19:54.765[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-05-10 15:25:53.851[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 1.[0m
[32m2024-05-10 15:26:35.789[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m342[0m - [1mdistance shape: (51798, 10), max to retrieve 46620 sentences[0m
[32m2024-05-10 15:32:53.424[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 5669, remaining sentences: 46129[0m
[32m2024-05-1

KeyboardInterrupt: 

In [32]:
joblib.dump(all_scores, r'../data/n2c2/scores_sentence_sampling/ner_VBmin_scores_sentenceSampling.joblib')


['../data/n2c2/scores_sentence_sampling/ner_VBmin_scores_sentenceSampling.joblib']

## Debug

In [34]:
logger.remove()
logger.add(sys.stderr, level='DEBUG')

2

In [35]:
#sdf_labels_intSid = sdf_labels_sid({'sid':'int'}) #convert this to int

In [36]:
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib'
embedding_df=joblib.load(pickle_embedding_file)
vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=10, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=seed, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels=sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']],
                                 min_dist_diff=True
                                )

[32m2024-05-10 17:55:35.523[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m221[0m - [34m[1mLoading index...[0m
[32m2024-05-10 17:55:36.036[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m223[0m - [34m[1mdone[0m


In [37]:
vb_simulator.num_per_round=20

In [39]:
Sampled_1roundRandom,Remaining_1roundRandom=vb_simulator.keep_sample(True)

[32m2024-05-10 17:57:35.945[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m75[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-05-10 17:57:36.375[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 20, remaining sentences: 51778[0m


In [40]:
def distribution(df):
    gdf=df.groupby('sentence_id')
    nsent={}
    psent={}
    for i,sent in gdf:
        labels=list(sent.label.unique()) #sentence labe is in `concept column`
        labels.remove('O')
        if len(labels)>0:
            psent[i]=(labels, ' '.join([str(t) for t in sent.token]))
        else:
            nsent[i]=(labels,' '.join([str(t) for t in sent.token]))
    return psent, nsent

In [41]:
psent, nsent=distribution(vb_simulator.sampled)

In [42]:
len(nsent), len(psent)

(15, 5)

In [43]:
psent

{16: (['B-Drug'],
  'Rescucitation last approximately 10 - 15 minutes with multiple \n rounds of epi and bicarb , with ROSC .'),
 92551: (['B-Drug',
   'I-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'Refills:*2 * \n 2 . ranitidine HCl 300 mg Tablet Sig : One ( 1 ) Tablet PO at \n bedtime .'),
 425908: (['B-Reason', 'I-Reason', 'B-Drug'],
  'Brief Hospital Course : \n 79F with locally advanced pancreatic CA on Xeloda and \n oxaliplatin ( C2D1'),
 471728: (['B-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'Captopril 25 mg Tablet Sig : One ( 1 ) Tablet PO BID ( 2 times a \n day ) .'),
 619225: (['B-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'I-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'verapamil 120 mg Tablet Extended Release Sig : One ( 1 ) Table

In [44]:
vb_simulator.sampled

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,16,100035.txt,Rescucitation,O,Rescucitation last approximately 10-15 minutes...
1,16,100035.txt,last,O,Rescucitation last approximately 10-15 minutes...
2,16,100035.txt,approximately,O,Rescucitation last approximately 10-15 minutes...
3,16,100035.txt,10,O,Rescucitation last approximately 10-15 minutes...
4,16,100035.txt,-,O,Rescucitation last approximately 10-15 minutes...
...,...,...,...,...,...
444,859499,182160.txt,the,O,"Fatigue is\nnormal, especially for the first m..."
445,859499,182160.txt,first,O,"Fatigue is\nnormal, especially for the first m..."
446,859499,182160.txt,month,O,"Fatigue is\nnormal, especially for the first m..."
447,859499,182160.txt,postoperative,O,"Fatigue is\nnormal, especially for the first m..."


In [45]:
sampled_sendIDlist = set(vb_simulator.sampled['sentence_id'])
logger.debug(f'sampled_sendIDlist: {len(sampled_sendIDlist)}')
sampled_sdf = vb_simulator.sdf_labels[vb_simulator.sdf_labels['sentence_id'].isin(sampled_sendIDlist)]

[32m2024-05-10 18:20:36.957[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1msampled_sendIDlist: 20[0m


In [46]:
type(list(sampled_sendIDlist)[0])

int

In [47]:
type(vb_simulator.sdf_labels.iloc[0].sentence_id) # type in consistent: Int is not instance of numpy.int64

numpy.int32

In [48]:
vb_simulator.sdf_labels

Unnamed: 0,sentence,concept,y,doc_name,sentence_id
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5
...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914


In [49]:
sdf_labels_intSid = vb_simulator.sdf_labels.astype({'sentence_id':'int'})

In [50]:
type(sdf_labels_intSid.iloc[0].sentence_id) #int32 is type of int

numpy.int32

In [51]:
sampled_sendIDlist = set(vb_simulator.sampled['sentence_id'])
logger.debug(f'sampled_sendIDlist: {len(sampled_sendIDlist)}')
sampled_sdf_intSid = sdf_labels_intSid[sdf_labels_intSid['sentence_id'].isin(sampled_sendIDlist)]

[32m2024-05-10 18:21:04.536[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1msampled_sendIDlist: 20[0m


In [52]:
sampled_sendIDlist = set(vb_simulator.sampled['sentence_id'])


In [53]:
sampled_sendIDlist

{16,
 15910,
 92551,
 278115,
 278202,
 281654,
 362102,
 425908,
 471728,
 591966,
 619225,
 644536,
 647271,
 662894,
 662924,
 679460,
 691088,
 735111,
 761429,
 859499}

In [54]:
vb_simulator.sampled

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,16,100035.txt,Rescucitation,O,Rescucitation last approximately 10-15 minutes...
1,16,100035.txt,last,O,Rescucitation last approximately 10-15 minutes...
2,16,100035.txt,approximately,O,Rescucitation last approximately 10-15 minutes...
3,16,100035.txt,10,O,Rescucitation last approximately 10-15 minutes...
4,16,100035.txt,-,O,Rescucitation last approximately 10-15 minutes...
...,...,...,...,...,...
444,859499,182160.txt,the,O,"Fatigue is\nnormal, especially for the first m..."
445,859499,182160.txt,first,O,"Fatigue is\nnormal, especially for the first m..."
446,859499,182160.txt,month,O,"Fatigue is\nnormal, especially for the first m..."
447,859499,182160.txt,postoperative,O,"Fatigue is\nnormal, especially for the first m..."


### next round

In [55]:
vb_simulator.modelWrapper.fit(vb_simulator.sampled)

[32m2024-05-10 18:24:37.167[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-10 18:24:37.384[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m293[0m - [34m[1mTraining complete.[0m


In [56]:
vb_simulator.fit(vb_simulator.sampled)

[32m2024-05-10 18:24:50.023[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m270[0m - [34m[1m8 centroids detected from the given sampled_docs[0m


In [57]:
vb_simulator.remaining.shape

(932457, 5)

In [58]:
psent

{16: (['B-Drug'],
  'Rescucitation last approximately 10 - 15 minutes with multiple \n rounds of epi and bicarb , with ROSC .'),
 92551: (['B-Drug',
   'I-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'Refills:*2 * \n 2 . ranitidine HCl 300 mg Tablet Sig : One ( 1 ) Tablet PO at \n bedtime .'),
 425908: (['B-Reason', 'I-Reason', 'B-Drug'],
  'Brief Hospital Course : \n 79F with locally advanced pancreatic CA on Xeloda and \n oxaliplatin ( C2D1'),
 471728: (['B-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'Captopril 25 mg Tablet Sig : One ( 1 ) Tablet PO BID ( 2 times a \n day ) .'),
 619225: (['B-Drug',
   'B-Strength',
   'I-Strength',
   'B-Form',
   'I-Form',
   'B-Dosage',
   'I-Dosage',
   'B-Route',
   'B-Frequency',
   'I-Frequency'],
  'verapamil 120 mg Tablet Extended Release Sig : One ( 1 ) Table

In [59]:
sampled=vb_simulator.sampled

In [60]:
gsampled=sampled.groupby('sentence_id')

In [61]:
len(gsampled)

20

In [62]:
vb_simulator.sdf_labels[vb_simulator.sdf_labels.sentence_id==611466]

Unnamed: 0,sentence,concept,y,doc_name,sentence_id
610685,Transthoracic echo\nrevealed diastolic heart f...,diastolic heart failure,Reason,118418.txt,611466
610686,Transthoracic echo\nrevealed diastolic heart f...,enalapril,Drug,118418.txt,611466
610687,Transthoracic echo\nrevealed diastolic heart f...,Lasix,Drug,118418.txt,611466


In [52]:
len(set(sampled_sendIDlist))

20

In [64]:
vb_simulator.sample_next_round(vb_simulator.sampled, vb_simulator.remaining, randomly=False)

[32m2024-05-10 18:29:44.568[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m326[0m - [34m[1mCalculating centroids...[0m
[32m2024-05-10 18:29:45.112[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m270[0m - [34m[1m8 centroids detected from the given sampled_docs[0m
[32m2024-05-10 18:29:45.114[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m329[0m - [34m[1mSearching from the vector index...[0m
[32m2024-05-10 18:29:45.119[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m342[0m - [1mdistance shape: (51798, 8), max to retrieve 46620 sentences[0m
[32m2024-05-10 18:29:45.120[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m346[0m - [34m[1msearch for centroid: Dosage[0m
[32m2024-05-10 18:30:24.696[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[

(     sentence_id    doc_name          token label  \
 0             16  100035.txt  Rescucitation     O   
 1             16  100035.txt           last     O   
 2             16  100035.txt  approximately     O   
 3             16  100035.txt             10     O   
 4             16  100035.txt              -     O   
 ..           ...         ...            ...   ...   
 936           23  100035.txt           Poor     O   
 937           23  100035.txt            air     O   
 938           23  100035.txt       movement     O   
 939           23  100035.txt    bilaterally     O   
 940           23  100035.txt              .     O   
 
                                               sentence  
 0    Rescucitation last approximately 10-15 minutes...  
 1    Rescucitation last approximately 10-15 minutes...  
 2    Rescucitation last approximately 10-15 minutes...  
 3    Rescucitation last approximately 10-15 minutes...  
 4    Rescucitation last approximately 10-15 minutes...  
 .

In [65]:
vb_simulator.sampled

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,16,100035.txt,Rescucitation,O,Rescucitation last approximately 10-15 minutes...
1,16,100035.txt,last,O,Rescucitation last approximately 10-15 minutes...
2,16,100035.txt,approximately,O,Rescucitation last approximately 10-15 minutes...
3,16,100035.txt,10,O,Rescucitation last approximately 10-15 minutes...
4,16,100035.txt,-,O,Rescucitation last approximately 10-15 minutes...
...,...,...,...,...,...
444,859499,182160.txt,the,O,"Fatigue is\nnormal, especially for the first m..."
445,859499,182160.txt,first,O,"Fatigue is\nnormal, especially for the first m..."
446,859499,182160.txt,month,O,"Fatigue is\nnormal, especially for the first m..."
447,859499,182160.txt,postoperative,O,"Fatigue is\nnormal, especially for the first m..."


In [69]:
len(set(vb_simulator.sampled.sentence_id.to_list())) #20
len(set(vb_simulator.remaining.sentence_id.to_list())) #51778

51778

In [66]:
vb_simulator.remaining

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,0,100035.txt,Admission,O,Admission Date:
1,0,100035.txt,Date,O,Admission Date:
2,0,100035.txt,:,O,Admission Date:
3,1,100035.txt,[,O,[**2115-2-22**] Discharge Date: ...
4,1,100035.txt,*,O,[**2115-2-22**] Discharge Date: ...
...,...,...,...,...,...
3151,929927,198406.txt,Followup,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3152,929927,198406.txt,Instructions,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3153,929927,198406.txt,:,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3154,929927,198406.txt,\n,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...


## Isssue

In [53]:
sampled_sdf = vb_simulator.sdf_labels[vb_simulator.sdf_labels['sentence_id'].isin(list(set(sampled_sendIDlist)))]

In [54]:
vb_simulator.sdf_labels

Unnamed: 0,sentence,concept,y,doc_name,sentence_id
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5
...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914


In [55]:
sampled_sdf

Unnamed: 0,sentence,concept,y,doc_name,sentence_id
81840,2.,,NEG,100035.txt,637949
82129,2.,,NEG,100039.txt,637949
82418,2.,,NEG,100039.txt,637949
82707,2.,,NEG,100039.txt,637949
82996,2.,,NEG,100187.txt,637949
...,...,...,...,...,...
610685,Transthoracic echo\nrevealed diastolic heart f...,diastolic heart failure,Reason,118418.txt,611466
610686,Transthoracic echo\nrevealed diastolic heart f...,enalapril,Drug,118418.txt,611466
610687,Transthoracic echo\nrevealed diastolic heart f...,Lasix,Drug,118418.txt,611466
621891,"Denies\nparoxysmal nocturnal dyspnea, orthopne...",,NEG,143451.txt,731921


In [56]:
sampled_sendIDlist

{143370,
 173777,
 177318,
 179101,
 238893,
 241062,
 247359,
 335612,
 383947,
 384017,
 457226,
 541227,
 541265,
 545918,
 588558,
 606928,
 611466,
 637949,
 731921,
 827217}

In [58]:
#sampled_sendIDlist

In [59]:
vb_simulator.centroid.keys()

dict_keys(['Drug', 'Form', 'NEG', 'Reason'])

In [62]:
set(sampled_sdf.y.to_list())

{'Drug', 'Form', 'NEG', 'Reason'}