This notebook demonstrates using a vector index to sample. There are different ways to determine the uncertainty, selected by `min_dist_diff`:
min_dist_diff: if true, prioritize the sentences whose distances to two centroids differ the least;
                if false, prioritize the sentences with a smaller spread of distances across all centroids (max distance - min distance).
This notebook uses min_dist_diff=False.

# Import 

In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy

In [2]:
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

## Initiate BratDirReader

In [3]:
# Locations of the cleaned n2c2 training/test corpora, relative to this notebook.
# Built from path components so the separator matches the host OS (the previous
# raw strings with backslashes only resolved on Windows). Kept as plain strings
# because later cells pass them on as str paths.
cleaned_train_dir = str(Path('..', 'data', 'n2c2', 'cleaned_training'))
cleaned_test_dir = str(Path('..', 'data', 'n2c2', 'cleaned_test'))
# Displayed as a quick check that both folders are reachable from here.
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [4]:
# Load the small English spaCy pipeline with its NER component disabled
# (presumably because entity spans come from the Brat annotations instead — confirm).
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [5]:
# Reader for Brat-annotated directories; the annotation schema is taken from
# annotation.conf inside the training folder.
# NOTE(review): support_overlap=True presumably lets overlapping spans coexist — confirm in medspacy_io.
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

## Read Brat annotations | load from pickles

In [6]:
# Cache file holding the parsed (train_docs, test_docs) tuple. Assembled from
# components so the separator is correct on any OS; kept as a str so it prints
# and is consumed exactly as before on Windows.
pickle_file = str(Path('..', 'data', 'n2c2', 'spacy_docs.joblib'))

In [7]:
# Reuse the cached docs when the cache file is present; otherwise parse both
# corpora with the Brat reader and write the cache for later sessions.
if Path(pickle_file).exists():
    print(f'{pickle_file} already exists, load them directly')
    # The reader (BratDirReader above) has to be instantiated before unpickling:
    # some Doc extensions used to store metadata are not recreated automatically on load.
    train_docs, test_docs = joblib.load(pickle_file)
else:
    train_docs = dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs = dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


# Define sampling functions

In [8]:
# Number of sampling rounds (presumably; confirm against the simulator's total_round argument).
rounds =10
# Seed handed to the simulator as init_seed in the test-run setup below.
seed= 14

In [9]:
# Sanity check: how many training and test documents were loaded.
len(train_docs), len(test_docs)

(303, 202)

### Import CRF wrapper and sampling simulators

In [18]:
from CRFWrapper import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel
from ALSampler import SamplingSimulator, ModelSamplingSimulator, VBSamplingSimulator


In [11]:
# Location of the prebuilt FAISS index handed to the sampling simulator.
# Built from components so it resolves on any OS, not just with Windows
# backslashes; kept as a str, matching what was passed before.
faiss_index_path = str(Path('..', 'data', 'n2c2', 'faiss_index_st768'))

In [12]:
# Collect every annotation type, i.e. every span-group key found on any training doc.
annos = set()
for doc in train_docs:
    annos.update(doc.spans.keys())
print(annos)

{'Frequency', 'Drug', 'ADE', 'Form', 'Dosage', 'Strength', 'Duration', 'Route', 'Reason'}


In [13]:
# Cached embedding table; the path is built from components so it resolves on
# any OS (the raw backslash literal only worked on Windows).
pickle_embedding_file = str(Path('..', 'data', 'n2c2', 'embedding_df.joblib'))
crf_model = CRFModel(anno_types=annos)
# NOTE(review): joblib.load unpickles arbitrary objects — only load files produced by this project.
embedding_df = joblib.load(pickle_embedding_file)
# Simulator for the quick test run. The round count comes from the `rounds`
# config constant instead of a repeated literal 10, so the two cannot drift apart.
r_simulator = VBSamplingSimulator(
    train_docs,
    total_round=rounds,
    modelWrapper=crf_model,
    eval_docs=test_docs,
    init_seed=seed,
    faiss_index_path=faiss_index_path,
    embedding_df=embedding_df,
)

[32m2024-04-04 22:57:25.610[0m | [34m[1mDEBUG   [0m | [36mALSampler[0m:[36m__init__[0m:[36m109[0m - [34m[1mLoading index...[0m
[32m2024-04-04 22:57:26.153[0m | [34m[1mDEBUG   [0m | [36mALSampler[0m:[36m__init__[0m:[36m111[0m - [34m[1mdone[0m


### Test run

In [14]:
# Quick end-to-end test of the simulation with a small bootstrap count (3).
# NOTE: `boostrap_times` (sic) is the keyword as spelled at every call site in this notebook;
# presumably it matches the simulate_rounds signature in ALSampler — do not "fix" it here.
scores=r_simulator.simulate_rounds(boostrap_times=3)

[32m2024-04-04 22:57:53.232[0m | [1mINFO    [0m | [36mALSampler[0m:[36msimulate_rounds[0m:[36m45[0m - [1msimulate round 0.[0m
[32m2024-04-04 22:57:53.234[0m | [34m[1mDEBUG   [0m | [36mALSampler[0m:[36mkeep_sample[0m:[36m35[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-04-04 22:57:53.237[0m | [1mINFO    [0m | [36mALSampler[0m:[36mkeep_sample[0m:[36m39[0m - [1mcurrent sampled: 30, remaining: 273[0m
[32m2024-04-04 22:57:57.029[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper[0m:[36mfit[0m:[36m251[0m - [34m[1mReset and train CRF model...[0m
[32m2024-04-04 22:58:53.034[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper[0m:[36mfit[0m:[36m259[0m - [34m[1mTraining complete.[0m
[32m2024-04-04 22:58:53.133[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper[0m:[36mbootstrap_eval[0m:[36m280[0m - [34m[1mPredicting eval docs...[0m
[32m2024-04-04 22:59:29.807[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper[0m:[36mbootstrap_eval[0

In [15]:
def compute_mean_ci(scores):
    """Summarise a collection of scores by its mean and a 95% percentile interval.

    Args:
        scores: array-like of numeric values.

    Returns:
        A tuple ``(mean, bounds)`` where ``bounds`` is a NumPy array holding the
        2.5th and 97.5th percentiles of ``scores``, in that order.
    """
    percentile_bounds = [2.5, 97.5]
    center = np.mean(scores)
    return center, np.percentile(scores, percentile_bounds)

# One list per output column: each metric's mean plus its lower ('<initial>l')
# and upper ('<initial>u') percentile bounds, e.g. precision -> pl / pu.
summary = {
    'precision': [], 'pl': [], 'pu': [],
    'recall': [], 'rl': [], 'ru': [],
    'f1': [], 'fl': [], 'fu': [],
}
for round_scores in scores:
    for metric, values in round_scores.items():
        mean_value, (lower, upper) = compute_mean_ci(values)
        # Bound columns are keyed by the metric's first letter.
        prefix = metric[0]
        summary[metric].append(mean_value)
        summary[prefix + 'l'].append(lower)
        summary[prefix + 'u'].append(upper)


In [16]:
# Show the summary as a table: one row per entry of `scores`, with the mean and
# lower/upper percentile bounds of each metric as columns.
pd.DataFrame(summary)

Unnamed: 0,precision,pl,pu,recall,rl,ru,f1,fl,fu
0,0.942246,0.94163,0.942988,0.776956,0.774381,0.780254,0.851653,0.849855,0.853937
1,0.939879,0.939207,0.94035,0.813052,0.809604,0.816221,0.871875,0.869971,0.873879
2,0.939547,0.937581,0.941075,0.822603,0.820527,0.825724,0.87719,0.87626,0.878127
3,0.944308,0.943111,0.946018,0.837092,0.834839,0.839625,0.887472,0.885732,0.888681
4,0.945256,0.944303,0.946406,0.845447,0.843417,0.848065,0.892568,0.891948,0.8936
5,0.941062,0.939423,0.943176,0.850685,0.847509,0.853065,0.893589,0.892786,0.894164
6,0.944856,0.942882,0.946125,0.853192,0.852803,0.85375,0.896687,0.895583,0.897567
7,0.94507,0.943365,0.946529,0.854524,0.85091,0.857363,0.897514,0.896176,0.89831
8,0.945791,0.944488,0.947165,0.856824,0.854951,0.860053,0.899109,0.898077,0.900329
9,0.947499,0.945725,0.949551,0.861023,0.857975,0.863436,0.902193,0.899715,0.904448


## Bootstrap 3 runs

In [17]:
# Replace loguru's existing handlers with a single stderr handler at INFO level,
# so DEBUG messages no longer flood the output during the long runs below.
logger.remove()
logger.add(sys.stderr, level='INFO')

1

In [15]:
# Number of independent simulation runs; one seed is generated per run in the next cell.
boostrap_runs=3
# Number of sampling rounds per run (presumably; confirm against VBSamplingSimulator's total_round argument).
total_round=10

In [16]:
# Fix the master seed so the per-run seeds drawn below are reproducible.
random.seed(14)
# One initial seed per simulation run.
seeds=[random.randint(1,10000000) for  _ in range(boostrap_runs)]
seeds

[1792286, 8843471, 4142887]

In [17]:
# Run the whole simulation once per seed and keep every run's scores.
all_scores = []

# Loop-invariant inputs are prepared once, before the runs start. The path used
# to be reassigned inside the loop, after its only use; it is now defined
# before the load and built from components so it resolves on any OS.
pickle_embedding_file = str(Path('..', 'data', 'n2c2', 'embedding_df.joblib'))
# NOTE(review): joblib.load unpickles arbitrary objects — only load files produced by this project.
embedding_df = joblib.load(pickle_embedding_file)
# NOTE(review): embedding_df is shared by all runs; confirm the simulator does not mutate it.

# The loop variable is `run_seed` so the notebook-level `seed` config value is
# not overwritten by the last run's seed.
for si, run_seed in enumerate(seeds):
    logger.info(f'start run {si}.')
    # Fresh model per run so nothing learned in one run leaks into the next.
    crf_model = CRFModel(anno_types=annos)
    v_simulator = VBSamplingSimulator(
        train_docs,
        total_round=total_round,  # config constant instead of a repeated literal 10
        modelWrapper=crf_model,
        eval_docs=test_docs,
        init_seed=run_seed,
        faiss_index_path=faiss_index_path,
        embedding_df=embedding_df,
    )
    # `boostrap_times` (sic) is the keyword as spelled at every call site in this notebook.
    scores = v_simulator.simulate_rounds(boostrap_times=200)
    all_scores.append(scores)

[32m2024-03-24 21:53:27.141[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mstart run 0.[0m
[32m2024-03-24 21:53:37.286[0m | [1mINFO    [0m | [36mALSampler[0m:[36msimulate_rounds[0m:[36m45[0m - [1msimulate round 0.[0m
[32m2024-03-24 21:53:37.289[0m | [1mINFO    [0m | [36mALSampler[0m:[36mkeep_sample[0m:[36m39[0m - [1mcurrent sampled: 30, remaining: 273[0m
[32m2024-03-24 21:59:31.233[0m | [1mINFO    [0m | [36mALSampler[0m:[36msimulate_rounds[0m:[36m45[0m - [1msimulate round 1.[0m
[32m2024-03-24 21:59:31.612[0m | [1mINFO    [0m | [36mALSampler[0m:[36msample_next_round[0m:[36m192[0m - [1mdistance shape: (47530, 10), max to retrieve 47501 sentences[0m
[32m2024-03-24 21:59:33.539[0m | [1mINFO    [0m | [36mALSampler[0m:[36mkeep_sample[0m:[36m39[0m - [1mcurrent sampled: 60, remaining: 243[0m
[32m2024-03-24 22:06:38.235[0m | [1mINFO    [0m | [36mALSampler[0m:[36msimulate_rounds[0m:[36m45[0m - 

In [18]:
# Persist the scores of all runs. The output folder is created first so a
# missing directory cannot make the dump fail after the long simulation above;
# the path is built from components so it resolves on any OS.
scores_path = Path('..', 'data', 'vb_sampling', 'ner_vbmax_scores.joblib')
scores_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(all_scores, str(scores_path))

['..\\data\\vb_sampling\\ner_vbmax_scores.joblib']