In [2]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

## Initialize Brat Reader

In [3]:
# Folders holding the cleaned training / test data, relative to this notebook.
# They are assembled from path components so the separator matches the host OS;
# the previous raw strings hard-coded Windows backslashes and did not resolve on POSIX.
# Kept as plain strings so every later use receives the same type as before.
cleaned_train_dir = str(Path('..', 'data', 'n2c2', 'cleaned_training'))
cleaned_test_dir = str(Path('..', 'data', 'n2c2', 'cleaned_test'))
# Sanity check: both entries of the displayed tuple should be True.
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [4]:
# Load spaCy's 'en_core_web_sm' pipeline with its 'ner' component disabled.
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [5]:
# Build the Brat directory reader on top of `nlp`, using the annotation.conf located in
# the training folder as schema file; overlapping annotations are enabled.
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

## Read Brat annotations | load from pickles

In [6]:
# Cache file for the parsed (train_docs, test_docs) tuple.
# Built from path components so it resolves on any OS; the previous raw string
# hard-coded Windows backslashes. Kept as a plain string, as before.
pickle_file = str(Path('..', 'data', 'n2c2', 'spacy_docs.joblib'))

In [7]:
# Reuse the cached docs when the joblib file is present; otherwise parse both Brat
# folders once and write the cache for the next run.
cache_path = Path(pickle_file)
if cache_path.exists():
    print(f'{pickle_file} already exists, load them directly')
    # The reader has to be constructed before loading: per the original note, some Doc
    # extensions used to store metadata are not recreated automatically by unpickling.
    # NOTE(review): joblib.load unpickles arbitrary objects; only load trusted cache files.
    train_docs, test_docs = joblib.load(pickle_file)
else:
    train_docs = dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs = dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


## Define sampling parameters

In [8]:
# Sampling configuration: number of rounds and the base random seed.
rounds, seed = 10, 14

In [9]:
# Number of training and test documents that were loaded.
len(train_docs), len(test_docs)

(303, 202)

## CRF Wrapper

In [10]:
from CRFWrapper_Sentence import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel


In [11]:
# Collect the key of every span group that occurs in any training doc;
# these keys are the annotation types.
annos = {
    group_name
    for train_doc in train_docs
    for group_name in train_doc.spans.keys()
}
print(annos)

{'Strength', 'Duration', 'Route', 'Frequency', 'Drug', 'Dosage', 'Reason', 'Form', 'ADE'}


In [12]:
# Instantiate the CRF wrapper for the annotation types collected from the training docs.
crf_model=CRFModel(anno_types=annos)

## Convert training and testing docs into sentence level dataframe

In [13]:
# Convert the training docs; only the second returned value (a DataFrame) is kept.
_, train_df=convert_docs(train_docs, anno_types=annos)

In [14]:
# Convert the test docs the same way; only the second returned value (a DataFrame) is kept.
_, test_df=convert_docs(test_docs, anno_types=annos)

In [15]:
# Display the training frame (columns: sentence_id, doc_name, token, label).
train_df

Unnamed: 0,sentence_id,doc_name,token,label
0,0,100035.txt,Admission,O
1,0,100035.txt,Date,O
2,0,100035.txt,:,O
3,1,100035.txt,[,O
4,1,100035.txt,*,O
...,...,...,...,...
3151,929927,198406.txt,Followup,O
3152,929927,198406.txt,Instructions,O
3153,929927,198406.txt,:,O
3154,929927,198406.txt,\n,O


In [16]:
# Distinct values of the label column in the training frame.
train_df.label.unique()

array(['O', 'B-Drug', 'B-Route', 'B-Reason', 'I-Reason', 'B-Strength',
       'I-Strength', 'B-Frequency', 'B-Duration', 'I-Duration', 'B-Form',
       'B-Dosage', 'I-Dosage', 'B-ADE', 'I-Frequency', 'I-Drug', 'I-Form',
       'I-Route', 'I-ADE'], dtype=object)

In [15]:
# Group the training rows by their sentence_id.
gdf=train_df.groupby('sentence_id')

In [27]:
# Split the grouped sentences into two buckets by label variety:
#   psent <- groups whose rows carry more than one distinct label
#   nsent <- groups whose rows all share a single label
# NOTE(review): a group consisting solely of one non-'O' label would also land in
# nsent; confirm that this is acceptable.
psent, nsent = [], []
for sent_id, sent_rows in gdf:
    has_mixed_labels = len(sent_rows.label.unique()) > 1
    (psent if has_mixed_labels else nsent).append(sent_rows)

In [68]:
from spacy import displacy
from spacy.tokens import Span

In [77]:
# Toy example for the span visualizer: two overlapping spans stored under the 'mk' key.
text = 'welcome to the bank of china.'
doc = nlp(text)
org_span = Span(doc, 3, 6, 'ORG')
gpe_span = Span(doc, 5, 6, "GPE")
doc.spans['mk'] = [org_span, gpe_span]

In [91]:
# Render the spans of the current `doc`. displaCy's span visualizer reads
# doc.spans['sc'] unless told otherwise, so the 'mk' group has to be named
# explicitly; without it the spans stored under 'mk' are not shown.
displacy.render(doc, style='span', options={'spans_key': 'mk'})

In [82]:
# Point `doc` at the first training document.
doc=train_docs[0]

In [83]:
# Flatten every span group of `doc` into a single list. The 'sc' group is skipped
# because that is where the flattened list is stored afterwards; without the filter,
# re-running this cell would fold the previously merged spans back in and duplicate them.
sps = [span for key, group in doc.spans.items() if key != 'sc' for span in group]

In [85]:
# Store the flattened spans under 'sc', the key displaCy's span style reads by default.
doc.spans['sc']=sps

In [33]:
# Number of sentences in the second training document.
len(list(train_docs[1].sents))

372

In [35]:
# First span of the 'Drug' group in the second training document.
span=train_docs[1].spans['Drug'][0]

In [37]:
# Materialise the sentences of the second training document as a list.
sents=list(train_docs[1].sents)

In [55]:
# Index the sentences by their length. Sentences of equal length overwrite each other,
# so only the last sentence seen for each length is kept.
slens={len(s):s for s in sents}

In [58]:
# Distinct sentence lengths in ascending order.
sls=sorted(slens.keys())

In [64]:
# Show the sentence stored for the third-largest distinct length.
slens[sls[-3]]

[**2173-1-11**], course
complicated by neutropenic fever and acute skin GVHD

OTHER MEDICAL HISTORY:
- Embolic stroke in [**3-/2174**] on coumadin
- Cardiomyopathy due to early anthracycline-related
cardiotoxicity [**10/2172**]
- Chronic kidney disease stage III/IV, baseline creatinine
~2.0-2.2
- Asthma
- HTN
- Cervical Intraepithelial neoplasia
- C-section in [**2165**]


Social History:
Smoke: never
EtOH:

In [None]:
# Visualise every annotation of the second training document. displaCy's span style
# only draws doc.spans['sc'] by default, while the annotations are stored in one span
# group per type, so they are merged into 'sc' first. The 'sc' group itself is left
# out of the merge so the cell can be re-run without duplicating spans.
doc_to_render = train_docs[1]
doc_to_render.spans['sc'] = [
    span
    for key, group in doc_to_render.spans.items()
    if key != 'sc'
    for span in group
]
displacy.render(doc_to_render, style='span')

In [21]:
# NOTE(review): the recorded run of this cell failed with a TypeError (unexpected keyword
# argument 'target_span_type'), and no import of `visualize_ent` is visible in this
# notebook. Confirm which function and signature are intended before relying on it.
visualize_ent(train_docs[0], target_span_type='group', span_group_name='B-Drug')

TypeError: visualize_ent() got an unexpected keyword argument 'target_span_type'

In [28]:
# Number of sentences collected in psent. The cell previously held the bare name
# `visualize_ent`, which does not match its recorded integer output.
len(psent)

12029

In [29]:
# Number of sentences collected in nsent.
len(nsent)

39769

In [31]:
# Sentence ids of the training and test frames, each kept in three forms:
# the per-row id list, the set of ids, and a list of the distinct ids.
train_sentID_list = train_df['sentence_id'].to_list()
test_sentID_list = test_df['sentence_id'].to_list()

train_sentID_set = {*train_sentID_list}
test_sentID_set = {*test_sentID_list}

train_sentID_uniqList = [*train_sentID_set]
test_sentID_uniqList = [*test_sentID_set]

In [32]:
# Number of distinct sentence ids in the training and test frames.
print(len(train_sentID_uniqList), len(test_sentID_uniqList))

51798 34334


In [33]:
# Distinct training sentences divided evenly over the configured number of rounds.
# Integer division on `rounds` replaces the float round-trip and the magic number 10.
len(train_sentID_uniqList) // rounds

5179

## sampling simulator

In [35]:
from ALLSampler_Sentence import SamplingSimulator, ModelSamplingSimulator

In [66]:
# Build the sampling simulator from the training frame, the CRF wrapper and the test
# frame. The round count comes from `rounds` rather than a repeated literal 10, so the
# configuration defined at the top of the notebook is the single source of truth.
m_simulator = ModelSamplingSimulator(
    total_sents=train_df,
    total_round=rounds,
    modelWrapper=crf_model,
    eval_sents=test_df,
    init_seed=seed,
)

In [67]:
# Column names of the training frame.
train_df.columns

Index(['sentence_id', 'doc_name', 'token', 'label'], dtype='object')

In [81]:
# Initial draw: start from an empty "already sampled" frame that has the same columns
# as train_df, with all of train_df passed as the second argument.
empty_sampled = pd.DataFrame({column: [] for column in train_df.columns})
sampled, remaining = m_simulator.sample_next_round(empty_sampled, train_df)

In [69]:
# Group the sampled rows by their sentence_id.
gdf=sampled.groupby('sentence_id')

In [70]:
# Split the grouped sentences into two buckets by label variety:
#   psent <- groups whose rows carry more than one distinct label
#   nsent <- groups whose rows all share a single label
# NOTE(review): a group consisting solely of one non-'O' label would also land in
# nsent; confirm that this is acceptable.
psent, nsent = [], []
for sent_id, sent_rows in gdf:
    has_mixed_labels = len(sent_rows.label.unique()) > 1
    (psent if has_mixed_labels else nsent).append(sent_rows)

In [71]:
# Sizes of the two sentence buckets (psent, nsent).
len(psent), len(nsent)

(1153, 4026)

In [72]:
# Fit the simulator's wrapped model on the rows sampled so far.
m_simulator.modelWrapper.fit(sampled)

[32m2024-05-01 13:39:27.654[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m284[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-01 13:40:27.417[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m292[0m - [34m[1mTraining complete.[0m


In [73]:
# Evaluate the fitted model on the simulator's evaluation sentences.
# The second argument (1) appears to be the number of bootstrap iterations, going by
# the logged message "bootstrapping 1 times"; confirm against the wrapper.
round_scores=m_simulator.modelWrapper.bootstrap_eval_DFsent(m_simulator.eval_sents, 1)

[32m2024-05-01 13:40:27.521[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstrap_eval_DFsent[0m:[36m352[0m - [34m[1mPredicting eval sents...[0m
[32m2024-05-01 13:40:52.404[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstrap_eval_DFsent[0m:[36m356[0m - [34m[1mCalculate scores from bootstrapping 1 times[0m
[32m2024-05-01 13:40:53.648[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstrap_eval_DFsent[0m:[36m365[0m - [34m[1mcomplete[0m


In [74]:
# Display the scores of this round (precision / recall / f1 lists).
round_scores

{'precision': [0.9461084511747021],
 'recall': [0.77212697643424],
 'f1': [0.8503093883950165]}

In [82]:
# Draw the next batch with randomly=False. Going by the logged messages, the model is
# trained on the current sample and its certainty on the remaining data is used to
# order the candidates; confirm the details against the sampler.
sampled, remaining=m_simulator.sample_next_round(sampled, remaining, randomly=False)

[32m2024-05-01 13:50:24.462[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m143[0m - [34m[1mTrain model wrapper on sampled 5179 sentences samples[0m
[32m2024-05-01 13:50:24.463[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m144[0m - [34m[1mUse trained model to estimate the remaining data certainty.[0m
[32m2024-05-01 13:51:25.184[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m153[0m - [34m[1mremain 839161 rows, sort indx on certainty for 46619 sentences[0m
[32m2024-05-01 13:51:25.331[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m169[0m - [34m[1mUpdate model with new sampled data[0m


In [88]:
# Regroup the enlarged sample by sentence_id and split the groups by label variety:
#   psent <- groups whose rows carry more than one distinct label
#   nsent <- groups whose rows all share a single label
# NOTE(review): a group consisting solely of one non-'O' label would also land in
# nsent; confirm that this is acceptable.
gdf = sampled.groupby('sentence_id')
psent, nsent = [], []
for sent_id, sent_rows in gdf:
    has_mixed_labels = len(sent_rows.label.unique()) > 1
    (psent if has_mixed_labels else nsent).append(sent_rows)
# Sizes of the two buckets.
len(psent), len(nsent)

(2367, 7991)

In [1]:
# Refit the wrapped model on the enlarged sample.
# NOTE(review): the recorded NameError comes from a run with execution count 1, i.e. on a
# kernel where `m_simulator` had not been created yet; re-run the notebook top to bottom.
m_simulator.modelWrapper.fit(sampled)

NameError: name 'm_simulator' is not defined

In [None]:
# Re-evaluate on the simulator's evaluation sentences and display the resulting scores.
round_scores=m_simulator.modelWrapper.bootstrap_eval_DFsent(m_simulator.eval_sents, 1)
round_scores

## test run

In [20]:
# Run the full simulation. `scores` is iterated per round by the summary cell, which
# reads each element as a mapping from metric name to a list of values.
scores=m_simulator.simulate_rounds() #sample sentences for 10 rounds; each round bootstrap sampling 200 for evaluation

[32m2024-04-23 23:49:06.744[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 0.[0m
[32m2024-04-23 23:49:06.746[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m75[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-04-23 23:49:07.146[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-04-23 23:49:09.124[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m284[0m - [34m[1mReset and train CRF model...[0m
[32m2024-04-23 23:50:07.596[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m292[0m - [34m[1mTraining complete.[0m
[32m2024-04-23 23:50:07.690[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstrap_eval_DFsent[0m:[36m352[0m - [34m[1mPredicting eval sents...[0m
[32m2024-04

In [21]:
def compute_mean_ci(scores, ci_level=95.0):
    """Return the mean of ``scores`` and a central percentile interval.

    Args:
        scores: Sequence or array of numeric scores.
        ci_level: Width of the central interval in percent. The default of 95
            reproduces the previously hard-coded 2.5th / 97.5th percentiles.

    Returns:
        A tuple ``(ave, ci)`` where ``ave`` is ``np.mean(scores)`` and ``ci`` is the
        two-element array ``[lower, upper]`` returned by ``np.percentile``.

    Raises:
        ValueError: If ``ci_level`` is not strictly between 0 and 100.
    """
    if not 0 < ci_level < 100:
        raise ValueError(f'ci_level must be between 0 and 100 (exclusive), got {ci_level}')
    # Split the excluded mass evenly between the two tails.
    tail = (100 - ci_level) / 2
    ave = np.mean(scores)
    ci = np.percentile(scores, [tail, 100 - tail])
    return ave, ci

# For every round, record the mean of each metric together with its lower / upper bound.
# Bound columns are named after the metric's first letter: 'pl'/'pu', 'rl'/'ru', 'fl'/'fu'.
summary = {
    column: []
    for column in ('precision', 'pl', 'pu', 'recall', 'rl', 'ru', 'f1', 'fl', 'fu')
}
for metrics_of_round in scores:
    for metric, values in metrics_of_round.items():
        ave, bounds = compute_mean_ci(values)
        lower, upper = bounds
        prefix = metric[0]
        summary[metric].append(ave)
        summary[prefix + 'l'].append(lower)
        summary[prefix + 'u'].append(upper)


In [22]:
# Show floats with four decimals (and thousands separators) in DataFrame output.
pd.options.display.float_format='{:,.4f}'.format

In [23]:
# Tabulate the summary: one row per round, metric means next to their lower/upper bounds.
pd.DataFrame(summary)

Unnamed: 0,precision,pl,pu,recall,rl,ru,f1,fl,fu
0,0.9461,0.9429,0.9494,0.7718,0.7655,0.7785,0.8501,0.846,0.8547
1,0.9419,0.9387,0.9452,0.8132,0.8063,0.82,0.8728,0.8689,0.877
2,0.943,0.9401,0.9459,0.8264,0.8206,0.8327,0.8808,0.8774,0.885
3,0.9446,0.9413,0.9476,0.8359,0.8306,0.8412,0.887,0.883,0.8907
4,0.9439,0.9405,0.9474,0.8453,0.8402,0.8507,0.8919,0.8883,0.8956
5,0.9456,0.942,0.9485,0.8474,0.8415,0.8522,0.8938,0.8899,0.8973
6,0.9462,0.9432,0.949,0.8494,0.8444,0.8544,0.8952,0.8915,0.8988
7,0.9455,0.9424,0.9485,0.8539,0.8486,0.8589,0.8974,0.8936,0.9011
8,0.9463,0.9431,0.9489,0.8573,0.8528,0.8622,0.8996,0.8962,0.9029
9,0.9459,0.9424,0.9489,0.8603,0.8557,0.8653,0.901,0.8981,0.9043


## Bootstrap: 3 runs with different initial seeds

In [20]:
# Settings for the repeated simulation runs: number of runs and rounds per run.
boostrap_runs=3
total_round=10
# Drop the existing loguru handlers and log to stderr at INFO level instead.
logger.remove()
logger.add(sys.stderr, level='INFO')

1

In [21]:
# Derive one simulator seed per run from a fixed master seed so the runs are repeatable.
random.seed(14)
seeds = []
for _run in range(boostrap_runs):
    seeds.append(random.randint(1, 10000000))
seeds

[1792286, 8843471, 4142887]

In [22]:
# Repeat the whole simulation once per seed, each time with a freshly built model, and
# collect the scores of every run.
# - The loop variable is `run_seed` so it no longer overwrites the notebook-level `seed`.
# - The round count comes from `total_round` instead of a repeated literal 10.
all_scores = []
for si, run_seed in enumerate(seeds):
    logger.info(f'start run {si}.')
    crf_model = CRFModel(anno_types=annos)
    m_simulator = ModelSamplingSimulator(total_sents=train_df,
                                         total_round=total_round,
                                         modelWrapper=crf_model,
                                         eval_sents=test_df,
                                         init_seed=run_seed)
    scores = m_simulator.simulate_rounds()
    all_scores.append(scores)

[32m2024-04-25 15:39:40.634[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mstart run 0.[0m
[32m2024-04-25 15:39:40.759[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 0.[0m
[32m2024-04-25 15:39:41.151[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-04-25 15:45:20.787[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 1.[0m
[32m2024-04-25 15:46:21.181[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m82[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0m
[32m2024-04-25 15:53:05.117[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m92[0m - [1msimulate round 2.[0m
[32m2024-04-25 15:53:58.390[0m | [1mINFO    [0m | [36m

In [23]:
# Persist the scores of all runs. The target folder is created first so that the dump
# does not fail when the output directory is missing.
scores_file = r'../data/n2c2/scores_sentence_sampling/ner_crf_scores_sentenceSampling.joblib'
Path(scores_file).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(all_scores, scores_file)


['../data/n2c2/scores_sentence_sampling/ner_crf_scores_sentenceSampling.joblib']