In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

### initialize Brat reader

In [2]:
cleaned_train_dir=r'..\data\n2c2\cleaned_training'
cleaned_test_dir=r'..\data\n2c2\cleaned_test'
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [3]:
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [4]:
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

### Read eHOST annotations | load from pickles

In [5]:
pickle_file= r'..\data\n2c2\spacy_docs.joblib'

In [6]:
if not Path(pickle_file).exists():
    train_docs=dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs=dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)
else:
    print(f'{pickle_file} already exists, load them directly')
    # before load from pickle, initiate EhostDirReader or EhostDocReader first, because some Doc extension used to store meta data will not be automatically recreated by loading.
    train_docs, test_docs=joblib.load(pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


In [7]:
len(train_docs), len(test_docs)

(303, 202)

### CRF Wrapper (only use for eval)

In [8]:
from CRFWrapper_Sentence import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel


In [9]:
## Get all annotation types: 
annos=set()
for d in train_docs:
    for anno in d.spans.keys():
        annos.add(anno)
print(annos)

{'Reason', 'Drug', 'Frequency', 'Duration', 'Strength', 'Dosage', 'Route', 'Form', 'ADE'}


In [10]:
crf_model=CRFModel(anno_types=annos)

### converting docs to DF

In [11]:
from ALLSampler_Sentence import SamplingSimulator, ModelSamplingSimulator, VBSamplingSimulator, convert_docs_medspacyIOvec

In [12]:
_, train_df=convert_docs(train_docs, anno_types=annos)
_, test_df=convert_docs(test_docs, anno_types=annos)

In [13]:
train_df #sentence_id	doc_name	token	label	sentence

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,0,100035.txt,Admission,O,Admission Date:
1,0,100035.txt,Date,O,Admission Date:
2,0,100035.txt,:,O,Admission Date:
3,1,100035.txt,[,O,[**2115-2-22**] Discharge Date: ...
4,1,100035.txt,*,O,[**2115-2-22**] Discharge Date: ...
...,...,...,...,...,...
3151,929927,198406.txt,Followup,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3152,929927,198406.txt,Instructions,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3153,929927,198406.txt,:,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3154,929927,198406.txt,\n,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...


In [14]:
sdf_labels_train=convert_docs_medspacyIOvec(train_docs)
sdf_labels_train #sentence	concept	y	doc_name

Unnamed: 0,sentence,concept,y,doc_name
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt
...,...,...,...,...
90405,[**Name (NI) **],,NEG,198406.txt
90406,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt
90407,[**Telephone/Fax (1) 92788**](C)\n\n\nMedicati...,,NEG,198406.txt
90408,3.,,NEG,198406.txt


In [15]:
# load embedding for unique sentences
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib'
embedding_df=joblib.load(pickle_embedding_file)

In [16]:
embedding_df #51798 unique sentences

Unnamed: 0,sentence_id,sentence,embedding
0,0,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
3,1,[**2115-2-22**] Discharge Date: ...,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
35,2,[**2078-8-9**] Sex: M\n\nService...,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
113,3,[**Known lastname 3234**] is a 36 year old gen...,"[0.023170307, 0.03989108, 0.026217388, -0.0272..."
163,4,The patient initially presented to LGH ED with...,"[0.008176211, -0.06342948, 0.048615105, -0.045..."
...,...,...,...
3059,929923,"Cyanocobalamin 1,000 mcg/mL Injection once a m...","[0.050521564, -0.08905716, -0.0019493615, -0.0..."
3071,929924,"Lorazepam 0.25 QAM, O.25 QPM, 0.5 mg QHS\n8 Ca...","[-0.030010266, -0.062390286, 0.00167252, 0.016..."
3086,929925,Cream Topical TID\n9.,"[0.026732022, -0.04987913, 0.024520764, -0.016..."
3092,929926,Acetaminophen 1000 mg PO Q6H\n10.,"[-0.017295217, -0.10513715, -0.0030776137, -0...."


In [17]:
#join the embedding to get sentence ID
sdf_labels_sid = sdf_labels_train.merge(embedding_df, how='inner', on='sentence') 

In [18]:
sdf_labels_sid

Unnamed: 0,sentence,concept,y,doc_name,sentence_id,embedding
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
...,...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907,"[0.02065785, -0.06587324, 0.055154495, 0.01074..."
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910,"[0.030450102, -0.042418838, 0.00325665, 0.0384..."
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911,"[0.023342747, 0.013347558, -0.01095362, -0.052..."
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914,"[0.02003492, 0.029056935, -0.0140215475, 0.007..."


### test the sampling simulation

In [19]:
faiss_index_path= r'..\data\n2c2\faiss_index_st768'
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib'

In [20]:
# Debugging mode
logger.remove()
logger.add(sys.stderr, level='INFO')
logger.add(sys.stderr, level='DEBUG')

2

In [21]:
# initialize VB max: sample 3 round each round 10 sents
seed=14
vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=3, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=seed, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels=sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']]
                                 #,
                                 #min_dist_diff=True
                                )

[32m2024-05-15 16:06:27.010[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m69[0m - [34m[1mnum per found unique sent: 10[0m
[32m2024-05-15 16:06:27.018[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m224[0m - [34m[1mLoading index...[0m
[32m2024-05-15 16:06:27.545[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m226[0m - [34m[1mdone[0m


In [22]:
def distribution(df):
    gdf=df.groupby('sentence_id')
    nsent={}
    psent={}
    for i,sent in gdf:
        labels=list(sent.label.unique()) #sentence labe is in `concept column`
        labels.remove('O')
        if len(labels)>0:
            psent[i]=(labels, ' '.join([str(t) for t in sent.token]))
        else:
            nsent[i]=(labels,' '.join([str(t) for t in sent.token]))
    return psent, nsent

#### first round sampling completely random

In [23]:
vb_simulator.num_per_round=10

In [24]:
Sampled_1roundRandom,Remaining_1roundRandom=vb_simulator.keep_sample(True)

[32m2024-05-15 16:06:33.003[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m78[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-05-15 16:06:33.428[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10, remaining sentences: 51788[0m
[32m2024-05-15 16:06:33.428[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10, remaining sentences: 51788[0m


In [25]:
psent, nsent=distribution(vb_simulator.sampled)

In [26]:
len(nsent), len(psent) #neg sent, labeled sent

(9, 1)

In [27]:
set(vb_simulator.sampled.sentence_id.to_list()) # unique sentences sampled in 1 round (random sampling)

{173777,
 238893,
 247359,
 384017,
 541227,
 541265,
 545918,
 606928,
 611466,
 827217}

In [28]:
type(vb_simulator.sdf_labels.iloc[0].sentence_id) #int32 is type of int

numpy.int32

#### 2nd round sample 10 sentence according to VBmax

In [29]:
vb_simulator.modelWrapper.fit(vb_simulator.sampled) #CRF model in VBsimulator is only for evaluation performances

[32m2024-05-15 16:06:40.572[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-15 16:06:40.618[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m293[0m - [34m[1mTraining complete.[0m


In [30]:
vb_simulator.sampled.shape, vb_simulator.remaining.shape

((141, 5), (932765, 5))

In [31]:
# this is to get the centroids from sampled dataset
vb_simulator.fit(vb_simulator.sampled)

[32m2024-05-15 16:06:44.510[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m265[0m - [34m[1m4 centroids detected from the given sampled_docs[0m


In [32]:
# test the sample_next_round
sampled_2nd, remaining_2nd =vb_simulator.sample_next_round(vb_simulator.sampled, vb_simulator.remaining, randomly=False)

[32m2024-05-15 16:06:48.431[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m319[0m - [34m[1mCalculating centroids...[0m
[32m2024-05-15 16:06:48.644[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m265[0m - [34m[1m4 centroids detected from the given sampled_docs[0m
[32m2024-05-15 16:06:48.645[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m322[0m - [34m[1mSearching from the vector index...[0m
[32m2024-05-15 16:06:48.753[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m338[0m - [1mdistance shape: (929928, 4), max to retrieve 51789 sentences[0m
[32m2024-05-15 16:06:48.753[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m338[0m - [1mdistance shape: (929928, 4), max to retrieve 51789 sentences[0m
[32m2024-05-15 16:06:48.755[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:

DISTANCE max_values each row [1.87979615 1.72878361 1.47523391 ... 1.69934332 1.51038969 1.32937431] size: 929928
DISTANCE min_values each row [1.76865101 1.4595108  0.92393023 ... 1.37233245 1.11906171 1.02086699] size: 929928
DISTANCE max_diff each row [0.11114514 0.2692728  0.55130368 ... 0.32701087 0.39132798 0.30850732] size: 929928
after masking max_diff size 51780
new_sampled_sentID is Unique Sent in sortedID: {0, 917505, 917506, 917507, 917508, 917509, 917504, 7, 917512, 917513}


[32m2024-05-15 16:06:50.058[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m381[0m - [34m[1mBEFORE update model with old sampled data 141, old remaining data 932765 [0m
[32m2024-05-15 16:06:50.060[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m384[0m - [34m[1mnew_sampled 123, new remaining data 932642 [0m
[32m2024-05-15 16:06:50.062[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m387[0m - [34m[1mUpdate model with new sampled data 264, new remaining data 932642 [0m


sampled sentID before updating: [611466, 541227, 238893, 606928, 173777, 384017, 541265, 827217, 545918, 247359]
new sampled current round: {0, 917505, 917506, 917507, 917508, 917509, 917504, 7, 917512, 917513}
sampled sentID after updating: [0, 917504, 917505, 917506, 917507, 917508, 917509, 7, 917512, 917513, 611466, 384017, 541227, 238893, 247359, 606928, 173777, 541265, 827217, 545918]


In [33]:
# now update the sampled to self.sampled
sampled_2nd_self, remaining_2nd_self=vb_simulator.keep_sample(False)

[32m2024-05-15 16:07:02.035[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m81[0m - [34m[1mSample according to certainties[0m
[32m2024-05-15 16:07:02.156[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m319[0m - [34m[1mCalculating centroids...[0m
[32m2024-05-15 16:07:02.369[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m265[0m - [34m[1m4 centroids detected from the given sampled_docs[0m
[32m2024-05-15 16:07:02.371[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m322[0m - [34m[1mSearching from the vector index...[0m
[32m2024-05-15 16:07:02.481[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m338[0m - [1mdistance shape: (929928, 4), max to retrieve 51789 sentences[0m
[32m2024-05-15 16:07:02.481[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36

DISTANCE max_values each row [1.87979615 1.72878361 1.47523391 ... 1.69934332 1.51038969 1.32937431] size: 929928
DISTANCE min_values each row [1.76865101 1.4595108  0.92393023 ... 1.37233245 1.11906171 1.02086699] size: 929928
DISTANCE max_diff each row [0.11114514 0.2692728  0.55130368 ... 0.32701087 0.39132798 0.30850732] size: 929928
after masking max_diff size 51780
new_sampled_sentID is Unique Sent in sortedID: {0, 917505, 917506, 917507, 917508, 917509, 917504, 7, 917512, 917513}


[32m2024-05-15 16:07:03.869[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m381[0m - [34m[1mBEFORE update model with old sampled data 141, old remaining data 932765 [0m
[32m2024-05-15 16:07:03.872[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m384[0m - [34m[1mnew_sampled 123, new remaining data 932642 [0m
[32m2024-05-15 16:07:03.874[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m387[0m - [34m[1mUpdate model with new sampled data 264, new remaining data 932642 [0m
[32m2024-05-15 16:07:04.008[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20, remaining sentences: 51778[0m
[32m2024-05-15 16:07:04.008[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20, remaining sentences: 51778[0m


sampled sentID before updating: [611466, 541227, 238893, 606928, 173777, 384017, 541265, 827217, 545918, 247359]
new sampled current round: {0, 917505, 917506, 917507, 917508, 917509, 917504, 7, 917512, 917513}
sampled sentID after updating: [0, 917504, 917505, 917506, 917507, 917508, 917509, 7, 917512, 917513, 611466, 384017, 541227, 238893, 247359, 606928, 173777, 541265, 827217, 545918]


In [34]:
# check if self.sampled updated after 2nd round
vb_simulator.sampled.shape, vb_simulator.remaining.shape

((264, 5), (932642, 5))

#### 3rd round VB

In [35]:
vb_simulator.modelWrapper.fit(vb_simulator.sampled) #CRF model in VBsimulator is only for evaluation performances

[32m2024-05-15 16:07:13.315[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-15 16:07:13.378[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m293[0m - [34m[1mTraining complete.[0m


In [36]:
vb_simulator.sampled.shape, vb_simulator.remaining.shape #output of 2nd round; input of 3rd round

((264, 5), (932642, 5))

In [37]:
# obtain the centroid from the sampled data in 2nd round
vb_simulator.fit(vb_simulator.sampled)

[32m2024-05-15 16:07:17.358[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m265[0m - [34m[1m4 centroids detected from the given sampled_docs[0m


In [38]:
# sampled the 3rd round and updated self.sampled and self.remaining
sampled_3rd_self, remaining_3rd_self=vb_simulator.keep_sample(False) #this should be 3rd round

[32m2024-05-15 16:07:20.381[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m81[0m - [34m[1mSample according to certainties[0m
[32m2024-05-15 16:07:20.499[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m319[0m - [34m[1mCalculating centroids...[0m
[32m2024-05-15 16:07:21.038[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mfit[0m:[36m265[0m - [34m[1m4 centroids detected from the given sampled_docs[0m
[32m2024-05-15 16:07:21.040[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m322[0m - [34m[1mSearching from the vector index...[0m
[32m2024-05-15 16:07:21.153[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m338[0m - [1mdistance shape: (929928, 4), max to retrieve 51789 sentences[0m
[32m2024-05-15 16:07:21.153[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36

DISTANCE max_values each row [1.87979615 1.72878361 1.47523391 ... 1.69934332 1.51038969 1.40727425] size: 929928
DISTANCE min_values each row [0.04539243 0.85744834 0.89775473 ... 1.36736798 0.98108822 0.93269664] size: 929928
DISTANCE max_diff each row [1.83440372 0.87133527 0.57747918 ... 0.33197534 0.52930146 0.47457761] size: 929928
after masking max_diff size 51764
new_sampled_sentID is Unique Sent in sortedID: {3, 5, 917510, 917511, 8, 917514, 12, 13, 917519, 917521}


[32m2024-05-15 16:07:22.615[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m381[0m - [34m[1mBEFORE update model with old sampled data 264, old remaining data 932642 [0m
[32m2024-05-15 16:07:22.618[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m384[0m - [34m[1mnew_sampled 191, new remaining data 932451 [0m
[32m2024-05-15 16:07:22.620[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m387[0m - [34m[1mUpdate model with new sampled data 455, new remaining data 932451 [0m
[32m2024-05-15 16:07:22.758[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 30, remaining sentences: 51768[0m
[32m2024-05-15 16:07:22.758[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 30, remaining sentences: 51768[0m


sampled sentID before updating: [0, 917504, 917505, 917506, 917507, 917508, 917509, 7, 917512, 917513, 611466, 384017, 541227, 238893, 247359, 606928, 173777, 541265, 827217, 545918]
new sampled current round: {3, 5, 917510, 917511, 8, 917514, 12, 13, 917519, 917521}
sampled sentID after updating: [0, 917504, 917505, 917506, 917507, 917508, 917509, 7, 917512, 917513, 611466, 3, 5, 8, 12, 13, 917511, 384017, 917514, 917519, 917521, 917510, 541227, 238893, 247359, 606928, 173777, 541265, 827217, 545918]


In [39]:
vb_simulator.sampled.shape, vb_simulator.remaining.shape #output of 3rd round

((455, 5), (932451, 5))

In [40]:
sampled_3rd_self.shape, remaining_3rd_self.shape

((455, 5), (932451, 5))

In [40]:
type(vb_simulator.centroid)

dict

In [41]:
vb_simulator.centroid.keys()

dict_keys(['Drug', 'Form', 'NEG', 'Reason', 'Route'])

In [42]:
vb_simulator.centroid['Drug']

array([ 4.45337072e-02, -6.01181239e-02,  1.02942521e-02,  5.56362746e-03,
        2.74320059e-02,  2.50571650e-02, -2.82390062e-02,  5.34454212e-02,
       -2.75464337e-02,  4.29910747e-03,  1.62351150e-02,  1.04062436e-02,
       -8.26748554e-03, -2.50051860e-02,  2.88324663e-03,  1.71642490e-02,
       -7.18276063e-03,  1.39244320e-02, -4.65564393e-02, -1.82620175e-02,
       -2.82244366e-02,  7.65391439e-03,  7.15207634e-03,  2.77388226e-02,
        7.08213374e-02,  3.61935329e-03,  3.89691652e-03,  2.15454120e-03,
       -8.40971433e-03, -4.74486351e-02, -4.16325890e-02,  2.21200585e-02,
        7.94919953e-03, -9.37988758e-02,  1.55014959e-06,  1.58981644e-02,
        5.98545093e-03,  1.31536061e-02, -2.28002053e-02, -7.13021830e-02,
        4.48224507e-03, -9.83978808e-02,  9.67068132e-03, -1.59962028e-02,
       -1.31882094e-02, -6.07571639e-02,  2.45078187e-03,  4.51347642e-02,
       -3.03639472e-02, -3.09917983e-02,  2.09520776e-02,  4.98968363e-02,
       -1.83546413e-02,  

In [43]:
D, I=vb_simulator.index.search(vb_simulator.centroid['Drug'].reshape(1, len(vb_simulator.centroid['Drug'])), vb_simulator.max_retrieve)

In [44]:
I.shape

(1, 51789)

In [33]:
I

NameError: name 'I' is not defined