In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

## initialize brat reader

In [2]:
cleaned_train_dir=r'..\data\n2c2\cleaned_training'
cleaned_test_dir=r'..\data\n2c2\cleaned_test'
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [3]:
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [4]:
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

## Read eHOST annotations | load from pickles

In [5]:
pickle_file= r'..\data\n2c2\spacy_docs.joblib'

In [6]:
if not Path(pickle_file).exists():
    train_docs=dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs=dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)
else:
    print(f'{pickle_file} already exists, load them directly')
    # before load from pickle, initiate EhostDirReader or EhostDocReader first, because some Doc extension used to store meta data will not be automatically recreated by loading.
    train_docs, test_docs=joblib.load(pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


In [7]:
len(train_docs), len(test_docs)

(303, 202)

## CRF Wrapper (only use for eval)

In [8]:
from CRFWrapper_Sentence import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel


In [9]:
## Get all annotation types: 
annos=set()
for d in train_docs:
    for anno in d.spans.keys():
        annos.add(anno)
print(annos)

{'Reason', 'Duration', 'Form', 'Dosage', 'Strength', 'ADE', 'Route', 'Drug', 'Frequency'}


In [10]:
crf_model=CRFModel(anno_types=annos)

## converting docs into sentence level dataframe

In [11]:
from ALLSampler_Sentence import SamplingSimulator, ModelSamplingSimulator, VBSamplingSimulator, convert_docs_medspacyIOvec

In [12]:
# get sentence label from docs
sdf_labels_train=convert_docs_medspacyIOvec(train_docs)
sdf_labels_train

Unnamed: 0,sentence,concept,y,doc_name
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt
...,...,...,...,...
90405,[**Name (NI) **],,NEG,198406.txt
90406,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt
90407,[**Telephone/Fax (1) 92788**](C)\n\n\nMedicati...,,NEG,198406.txt
90408,3.,,NEG,198406.txt


In [13]:
# get token level label from docs
_, train_df=convert_docs(train_docs, anno_types=annos)
_, test_df=convert_docs(test_docs, anno_types=annos)

In [14]:
train_df

Unnamed: 0,sentence_id,doc_name,token,label,sentence
0,0,100035.txt,Admission,O,Admission Date:
1,0,100035.txt,Date,O,Admission Date:
2,0,100035.txt,:,O,Admission Date:
3,1,100035.txt,[,O,[**2115-2-22**] Discharge Date: ...
4,1,100035.txt,*,O,[**2115-2-22**] Discharge Date: ...
...,...,...,...,...,...
3151,929927,198406.txt,Followup,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3152,929927,198406.txt,Instructions,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3153,929927,198406.txt,:,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...
3154,929927,198406.txt,\n,O,Mirtazapine 15 mg PO QHS\n\n\nDischarge Medica...


In [15]:
# embedding for unique sentence
pickle_embedding_file= r'..\data\n2c2\embedding_df_uniqueSentID.joblib' 
embedding_df=joblib.load(pickle_embedding_file)

In [16]:
embedding_df

Unnamed: 0,sentence_id,sentence,embedding
0,0,Admission Date:,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
3,1,[**2115-2-22**] Discharge Date: ...,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
35,2,[**2078-8-9**] Sex: M\n\nService...,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
113,3,[**Known lastname 3234**] is a 36 year old gen...,"[0.023170307, 0.03989108, 0.026217388, -0.0272..."
163,4,The patient initially presented to LGH ED with...,"[0.008176211, -0.06342948, 0.048615105, -0.045..."
...,...,...,...
3059,929923,"Cyanocobalamin 1,000 mcg/mL Injection once a m...","[0.050521564, -0.08905716, -0.0019493615, -0.0..."
3071,929924,"Lorazepam 0.25 QAM, O.25 QPM, 0.5 mg QHS\n8 Ca...","[-0.030010266, -0.062390286, 0.00167252, 0.016..."
3086,929925,Cream Topical TID\n9.,"[0.026732022, -0.04987913, 0.024520764, -0.016..."
3092,929926,Acetaminophen 1000 mg PO Q6H\n10.,"[-0.017295217, -0.10513715, -0.0030776137, -0...."


In [17]:
sdf_labels_sid = sdf_labels_train.merge(embedding_df, how='inner', on='sentence') 

In [18]:
sdf_labels_sid

Unnamed: 0,sentence,concept,y,doc_name,sentence_id,embedding
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
...,...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907,"[0.02065785, -0.06587324, 0.055154495, 0.01074..."
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910,"[0.030450102, -0.042418838, 0.00325665, 0.0384..."
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911,"[0.023342747, 0.013347558, -0.01095362, -0.052..."
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914,"[0.02003492, 0.029056935, -0.0140215475, 0.007..."


In [19]:
train_sentID_list = train_df['sentence_id'].to_list()
train_sentID_set = set(train_sentID_list)
train_sentID_uniqList = list(train_sentID_set)
test_sentID_list = test_df['sentence_id'].to_list()
test_sentID_set = set(test_sentID_list)
test_sentID_uniqList = list(test_sentID_set)
print(len(train_sentID_uniqList), len(test_sentID_uniqList))

51798 34334


In [20]:
# 10 round, how many sample in each round
int(1.0*len(train_sentID_uniqList)/10)

5179

In [21]:
# only input some columns from sentence label dataframe
sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']]

Unnamed: 0,sentence,concept,y,doc_name,sentence_id
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5
...,...,...,...,...,...
638687,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,929907
638688,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,929910
638689,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,929911
638690,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,929914


## Define sampling simulator

In [22]:
logger.remove()
logger.add(sys.stderr, level='INFO')
logger.add(sys.stderr, level='DEBUG')

2

In [25]:
faiss_index_path= r'..\data\n2c2\faiss_index_st768' #FAISS indexing regenerated for unique sentence embedding

In [26]:
pickle_embedding_file=  r'..\data\n2c2\embedding_df_uniqueSentID.joblib' #unique sentence embeddings
embedding_df=joblib.load(pickle_embedding_file)
vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=10, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=14, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels= sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']]
                                )

[32m2024-05-16 11:31:43.761[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m69[0m - [34m[1mnum per found unique sent: 5179[0m
[32m2024-05-16 11:31:43.769[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m224[0m - [34m[1mLoading index...[0m
[32m2024-05-16 11:31:44.455[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m226[0m - [34m[1mdone[0m


## run test

In [27]:
scores=vb_simulator.simulate_rounds(boostrap_times=3)

[32m2024-05-16 11:32:40.713[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m95[0m - [1msimulate round 0.[0m
[32m2024-05-16 11:32:40.713[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m95[0m - [1msimulate round 0.[0m
[32m2024-05-16 11:32:40.716[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m78[0m - [34m[1mThe first round sampling will be random[0m
[32m2024-05-16 11:32:41.150[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-05-16 11:32:41.150[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 5179, remaining sentences: 46619[0m
[32m2024-05-16 11:32:43.285[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0

DISTANCE max_values each row [1.54133558 1.37747204 1.32899022 ... 1.21001327 1.27035177 1.36610007] size: 929928
DISTANCE min_values each row [0.89112169 0.94867206 0.55982554 ... 0.84658188 0.60211623 0.62695014] size: 929928
DISTANCE max_diff each row [0.6502139  0.42879999 0.76916468 ... 0.36343139 0.66823554 0.73914993] size: 929928
after masking max_diff size 36740


[32m2024-05-16 11:34:59.913[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 93745, old remaining data 839161 [0m
[32m2024-05-16 11:34:59.915[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105256, new remaining data 733905 [0m
[32m2024-05-16 11:34:59.933[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 199001, new remaining data 733905 [0m
[32m2024-05-16 11:35:00.126[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0m
[32m2024-05-16 11:35:00.126[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0

DISTANCE max_values each row [1.53036296 1.37846041 1.31468606 ... 1.19105077 1.24852931 1.33505845] size: 929928
DISTANCE min_values each row [0.7285589  0.95670348 0.53414804 ... 0.83298016 0.5780527  0.6140101 ] size: 929928
DISTANCE max_diff each row [0.80180407 0.42175692 0.78053802 ... 0.35807061 0.67047662 0.72104836] size: 929928
after masking max_diff size 37115


[32m2024-05-16 11:39:03.777[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 199001, old remaining data 733905 [0m
[32m2024-05-16 11:39:03.779[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105413, new remaining data 628492 [0m
[32m2024-05-16 11:39:03.804[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 304414, new remaining data 628492 [0m
[32m2024-05-16 11:39:03.982[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[0m
[32m2024-05-16 11:39:03.982[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[

DISTANCE max_values each row [1.52868569 1.37109375 1.30065179 ... 1.14915967 1.20131803 1.28639793] size: 929928
DISTANCE min_values each row [0.69629812 0.96059871 0.53160524 ... 0.83288103 0.57208395 0.62128359] size: 929928
DISTANCE max_diff each row [0.83238757 0.41049504 0.76904655 ... 0.31627864 0.62923408 0.66511434] size: 929928
after masking max_diff size 37450


[32m2024-05-16 11:44:47.629[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 304414, old remaining data 628492 [0m
[32m2024-05-16 11:44:47.631[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 101161, new remaining data 527331 [0m
[32m2024-05-16 11:44:47.662[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 405575, new remaining data 527331 [0m
[32m2024-05-16 11:44:47.836[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[0m
[32m2024-05-16 11:44:47.836[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[

DISTANCE max_values each row [1.53267741 1.3715924  1.29785645 ... 1.13458061 1.13151419 1.19651365] size: 929928
DISTANCE min_values each row [0.79375386 0.95696282 0.53569335 ... 0.82673502 0.56830567 0.61862803] size: 929928
DISTANCE max_diff each row [0.73892355 0.41462958 0.7621631  ... 0.30784559 0.56320852 0.57788563] size: 929928
after masking max_diff size 37532


[32m2024-05-16 11:52:50.982[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 405575, old remaining data 527331 [0m
[32m2024-05-16 11:52:50.984[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 104915, new remaining data 422416 [0m
[32m2024-05-16 11:52:51.021[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 510490, new remaining data 422416 [0m
[32m2024-05-16 11:52:51.188[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[0m
[32m2024-05-16 11:52:51.188[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[

DISTANCE max_values each row [1.51914692 1.37026489 1.2940346  ... 1.12030435 1.07396555 1.13556898] size: 929928
DISTANCE min_values each row [0.80917788 0.95746559 0.53847408 ... 0.83628345 0.56692439 0.61421239] size: 929928
DISTANCE max_diff each row [0.70996904 0.4127993  0.75556052 ... 0.2840209  0.50704116 0.52135658] size: 929928
after masking max_diff size 37646


[32m2024-05-16 12:02:50.790[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 510490, old remaining data 422416 [0m
[32m2024-05-16 12:02:50.792[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 102073, new remaining data 320343 [0m
[32m2024-05-16 12:02:50.846[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 612563, new remaining data 320343 [0m
[32m2024-05-16 12:02:51.029[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[0m
[32m2024-05-16 12:02:51.029[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[

DISTANCE max_values each row [1.50327945 1.37355494 1.29575694 ... 1.10466051 1.02925253 1.09002256] size: 929928
DISTANCE min_values each row [0.73097253 0.9125905  0.538692   ... 0.83835727 0.57264888 0.60956693] size: 929928
DISTANCE max_diff each row [0.77230692 0.46096444 0.75706494 ... 0.26630324 0.45660365 0.48045564] size: 929928
after masking max_diff size 37724


[32m2024-05-16 12:14:48.459[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 612563, old remaining data 320343 [0m
[32m2024-05-16 12:14:48.460[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 99541, new remaining data 220802 [0m
[32m2024-05-16 12:14:48.523[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 712104, new remaining data 220802 [0m
[32m2024-05-16 12:14:48.695[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0m
[32m2024-05-16 12:14:48.695[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0

DISTANCE max_values each row [1.50693154 1.37769508 1.30233932 ... 1.10875797 0.99599051 1.06034946] size: 929928
DISTANCE min_values each row [0.70650148 0.89141464 0.54826581 ... 0.83451259 0.57855093 0.60684997] size: 929928
DISTANCE max_diff each row [0.80043006 0.48628044 0.7540735  ... 0.27424538 0.41743958 0.4534995 ] size: 929928
after masking max_diff size 37753


[32m2024-05-16 12:28:39.555[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 69831, new remaining data 150971 [0m
[32m2024-05-16 12:28:39.622[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 781935, new remaining data 150971 [0m
[32m2024-05-16 12:28:39.791[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 12:28:39.791[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 12:28:56.727[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 12:37:22.472[0m | [34m[1mDEBUG   [0m | 

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.9810766  1.04472613] size: 929928
DISTANCE min_values each row [0.69877166 0.88358521 0.54771459 ... 0.83410776 0.57197213 0.6033752 ] size: 929928
DISTANCE max_diff each row [0.81272823 0.4914335  0.75008368 ... 0.27908301 0.40910447 0.44135094] size: 929928
after masking max_diff size 37793


[32m2024-05-16 12:43:41.719[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 12:43:41.883[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 12:43:41.883[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 12:43:59.823[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 12:52:26.340[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m293[0m - [34m[1mTraining complete.[0m
[32m2024-05-16 12:52:27.169[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstra

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.98107576 1.04472482] size: 929928
DISTANCE min_values each row [0.69877207 0.88358474 0.54785913 ... 0.83415127 0.5719558  0.60331142] size: 929928
DISTANCE max_diff each row [0.81272781 0.49143398 0.74993914 ... 0.2790395  0.40911996 0.4414134 ] size: 929928
after masking max_diff size 37792


[32m2024-05-16 12:58:45.422[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 12:58:45.589[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 12:58:45.589[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 12:58:45.639[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 12:58:45.639[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 12:59:06.605[0m | [34m[1mD

In [28]:
def compute_mean_ci(scores):
    ave=np.mean(scores)
    ci=np.percentile(scores, [2.5, 97.5])
    return ave, ci

summary={'precision': [], 'pl':[], 'pu': [], 'recall': [], 'rl':[], 'ru': [], 'f1':[], 'fl':[], 'fu': []}
for s in scores:    
    for k,v in s.items():
        ave, (l, u)=compute_mean_ci(v)
        summary[k].append(ave)
        summary[k[0]+'l'].append(l)
        summary[k[0]+'u'].append(u)

In [29]:
pd.options.display.float_format='{:,.5f}'.format
pd.DataFrame(summary)

Unnamed: 0,precision,pl,pu,recall,rl,ru,f1,fl,fu
0,0.94531,0.9427,0.94725,0.77002,0.7682,0.77201,0.84871,0.84658,0.85026
1,0.93646,0.93479,0.93845,0.8244,0.82155,0.82649,0.87686,0.87505,0.87889
2,0.94099,0.93953,0.94205,0.83791,0.8348,0.84116,0.88646,0.88514,0.88845
3,0.94317,0.94231,0.94392,0.84744,0.84531,0.84922,0.89274,0.89119,0.89381
4,0.94305,0.94193,0.94484,0.84944,0.84684,0.8515,0.8938,0.89316,0.89443
5,0.94547,0.94453,0.94688,0.85296,0.8497,0.85681,0.89683,0.89483,0.89858
6,0.94629,0.94354,0.94848,0.85626,0.85475,0.85771,0.89903,0.89699,0.90016
7,0.94601,0.94402,0.94909,0.85846,0.85737,0.8599,0.90011,0.89905,0.90095
8,0.94506,0.9435,0.94772,0.85873,0.85356,0.86665,0.89983,0.8964,0.90537
9,0.94562,0.94501,0.94632,0.85615,0.85242,0.85917,0.89866,0.89635,0.90029


## bootstrap 3 runs

In [30]:
logger.remove()
logger.add(sys.stderr, level='INFO')
logger.add(sys.stderr, level='DEBUG')

4

In [31]:
boostrap_runs=3
total_round=10

In [32]:
random.seed(14)
seeds=[random.randint(1,10000000) for  _ in range(boostrap_runs)]
seeds

[1792286, 8843471, 4142887]

In [33]:
all_scores=[]
embedding_df=joblib.load(pickle_embedding_file)
for si, seed  in enumerate(seeds):
    logger.info(f'start run {si}.')
    pickle_embedding_file=  r'..\data\n2c2\embedding_df_uniqueSentID.joblib' #r'..\data\n2c2\embedding_df.joblib'
    crf_model=CRFModel(anno_types=annos)
    vb_simulator=VBSamplingSimulator(total_sents=train_df, 
                                 total_round=10, 
                                 modelWrapper=crf_model, 
                                 eval_sents=test_df, 
                                 init_seed=14, 
                                 faiss_index_path=faiss_index_path, 
                                 embedding_df=embedding_df,
                                 sdf_labels= sdf_labels_sid[['sentence','concept', 'y', 'doc_name','sentence_id']]
                                    # sdf_labels_sid[['sentence','concept', 'y', 'doc_name_x','sid']]
                                )
    #v_simulator=VBSamplingSimulator(train_docs, 
    #                                total_round=10, 
    #                                modelWrapper=crf_model, 
    #                                eval_docs=test_docs, 
    #                                init_seed=seed, 
    #                                faiss_index_path=faiss_index_path, 
    #                                embedding_df=embedding_df)
    scores=vb_simulator.simulate_rounds(boostrap_times=200)
    all_scores.append(scores) 

[32m2024-05-16 13:55:42.414[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mstart run 0.[0m
[32m2024-05-16 13:55:42.414[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mstart run 0.[0m
[32m2024-05-16 13:55:42.580[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m69[0m - [34m[1mnum per found unique sent: 5179[0m
[32m2024-05-16 13:55:42.583[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m224[0m - [34m[1mLoading index...[0m
[32m2024-05-16 13:55:43.093[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36m__init__[0m:[36m226[0m - [34m[1mdone[0m
[32m2024-05-16 13:55:43.249[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m95[0m - [1msimulate round 0.[0m
[32m2024-05-16 13:55:43.249[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m95[0m - [1msimulate round 0.[0m


DISTANCE max_values each row [1.54133558 1.37747204 1.32899022 ... 1.21001327 1.27035177 1.36610007] size: 929928
DISTANCE min_values each row [0.89112169 0.94867206 0.55982554 ... 0.84658188 0.60211623 0.62695014] size: 929928
DISTANCE max_diff each row [0.6502139  0.42879999 0.76916468 ... 0.36343139 0.66823554 0.73914993] size: 929928
after masking max_diff size 36740


[32m2024-05-16 14:02:13.369[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 93745, old remaining data 839161 [0m
[32m2024-05-16 14:02:13.371[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105256, new remaining data 733905 [0m
[32m2024-05-16 14:02:13.387[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 199001, new remaining data 733905 [0m
[32m2024-05-16 14:02:13.559[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0m
[32m2024-05-16 14:02:13.559[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0

DISTANCE max_values each row [1.53036296 1.37846041 1.31468606 ... 1.19105077 1.24852931 1.33505845] size: 929928
DISTANCE min_values each row [0.7285589  0.95670348 0.53414804 ... 0.83298016 0.5780527  0.6140101 ] size: 929928
DISTANCE max_diff each row [0.80180407 0.42175692 0.78053802 ... 0.35807061 0.67047662 0.72104836] size: 929928
after masking max_diff size 37115


[32m2024-05-16 14:10:21.499[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 199001, old remaining data 733905 [0m
[32m2024-05-16 14:10:21.502[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105413, new remaining data 628492 [0m
[32m2024-05-16 14:10:21.529[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 304414, new remaining data 628492 [0m
[32m2024-05-16 14:10:21.700[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[0m
[32m2024-05-16 14:10:21.700[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[

DISTANCE max_values each row [1.52868569 1.37109375 1.30065179 ... 1.14915967 1.20131803 1.28639793] size: 929928
DISTANCE min_values each row [0.69629812 0.96059871 0.53160524 ... 0.83288103 0.57208395 0.62128359] size: 929928
DISTANCE max_diff each row [0.83238757 0.41049504 0.76904655 ... 0.31627864 0.62923408 0.66511434] size: 929928
after masking max_diff size 37450


[32m2024-05-16 14:20:13.016[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 304414, old remaining data 628492 [0m
[32m2024-05-16 14:20:13.018[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 101161, new remaining data 527331 [0m
[32m2024-05-16 14:20:13.048[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 405575, new remaining data 527331 [0m
[32m2024-05-16 14:20:13.221[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[0m
[32m2024-05-16 14:20:13.221[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[

DISTANCE max_values each row [1.53267741 1.3715924  1.29785645 ... 1.13458061 1.13151419 1.19651365] size: 929928
DISTANCE min_values each row [0.79375386 0.95696282 0.53569335 ... 0.82673502 0.56830567 0.61862803] size: 929928
DISTANCE max_diff each row [0.73892355 0.41462958 0.7621631  ... 0.30784559 0.56320852 0.57788563] size: 929928
after masking max_diff size 37532


[32m2024-05-16 14:32:34.314[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 405575, old remaining data 527331 [0m
[32m2024-05-16 14:32:34.316[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 104915, new remaining data 422416 [0m
[32m2024-05-16 14:32:34.353[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 510490, new remaining data 422416 [0m
[32m2024-05-16 14:32:34.529[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[0m
[32m2024-05-16 14:32:34.529[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[

DISTANCE max_values each row [1.51914692 1.37026489 1.2940346  ... 1.12030435 1.07396555 1.13556898] size: 929928
DISTANCE min_values each row [0.80917788 0.95746559 0.53847408 ... 0.83628345 0.56692439 0.61421239] size: 929928
DISTANCE max_diff each row [0.70996904 0.4127993  0.75556052 ... 0.2840209  0.50704116 0.52135658] size: 929928
after masking max_diff size 37646


[32m2024-05-16 14:46:46.491[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 510490, old remaining data 422416 [0m
[32m2024-05-16 14:46:46.492[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 102073, new remaining data 320343 [0m
[32m2024-05-16 14:46:46.547[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 612563, new remaining data 320343 [0m
[32m2024-05-16 14:46:46.732[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[0m
[32m2024-05-16 14:46:46.732[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[

DISTANCE max_values each row [1.50327945 1.37355494 1.29575694 ... 1.10466051 1.02925253 1.09002256] size: 929928
DISTANCE min_values each row [0.73097253 0.9125905  0.538692   ... 0.83835727 0.57264888 0.60956693] size: 929928
DISTANCE max_diff each row [0.77230692 0.46096444 0.75706494 ... 0.26630324 0.45660365 0.48045564] size: 929928
after masking max_diff size 37724


[32m2024-05-16 15:02:48.437[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 612563, old remaining data 320343 [0m
[32m2024-05-16 15:02:48.439[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 99541, new remaining data 220802 [0m
[32m2024-05-16 15:02:48.502[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 712104, new remaining data 220802 [0m
[32m2024-05-16 15:02:48.680[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0m
[32m2024-05-16 15:02:48.680[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0

DISTANCE max_values each row [1.50693154 1.37769508 1.30233932 ... 1.10875797 0.99599051 1.06034946] size: 929928
DISTANCE min_values each row [0.70650148 0.89141464 0.54826581 ... 0.83451259 0.57855093 0.60684997] size: 929928
DISTANCE max_diff each row [0.80043006 0.48628044 0.7540735  ... 0.27424538 0.41743958 0.4534995 ] size: 929928
after masking max_diff size 37753


[32m2024-05-16 15:20:52.856[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 69831, new remaining data 150971 [0m
[32m2024-05-16 15:20:52.924[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 781935, new remaining data 150971 [0m
[32m2024-05-16 15:20:53.098[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 15:20:53.098[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 15:21:10.183[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 15:29:34.835[0m | [34m[1mDEBUG   [0m | 

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.9810766  1.04472613] size: 929928
DISTANCE min_values each row [0.69877166 0.88358521 0.54771459 ... 0.83410776 0.57197213 0.6033752 ] size: 929928
DISTANCE max_diff each row [0.81272823 0.4914335  0.75008368 ... 0.27908301 0.40910447 0.44135094] size: 929928
after masking max_diff size 37793


[32m2024-05-16 15:40:16.833[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 1345, new remaining data 149626 [0m
[32m2024-05-16 15:40:16.904[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 15:40:17.084[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 15:40:17.084[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 15:40:32.900[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 15:49:02.468[0m | [34m[1mDEBUG   [0m | [

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.98107576 1.04472482] size: 929928
DISTANCE min_values each row [0.69877207 0.88358474 0.54785913 ... 0.83415127 0.5719558  0.60331142] size: 929928
DISTANCE max_diff each row [0.81272781 0.49143398 0.74993914 ... 0.2790395  0.40911996 0.4414134 ] size: 929928
after masking max_diff size 37792


[32m2024-05-16 15:59:39.710[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 15:59:39.880[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 15:59:39.880[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 15:59:39.930[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 15:59:39.930[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 16:00:01.265[0m | [34m[1mD

DISTANCE max_values each row [1.54133558 1.37747204 1.32899022 ... 1.21001327 1.27035177 1.36610007] size: 929928
DISTANCE min_values each row [0.89112169 0.94867206 0.55982554 ... 0.84658188 0.60211623 0.62695014] size: 929928
DISTANCE max_diff each row [0.6502139  0.42879999 0.76916468 ... 0.36343139 0.66823554 0.73914993] size: 929928
after masking max_diff size 36740


[32m2024-05-16 16:21:50.547[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 93745, old remaining data 839161 [0m
[32m2024-05-16 16:21:50.549[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105256, new remaining data 733905 [0m
[32m2024-05-16 16:21:50.566[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 199001, new remaining data 733905 [0m
[32m2024-05-16 16:21:50.741[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0m
[32m2024-05-16 16:21:50.741[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0

DISTANCE max_values each row [1.53036296 1.37846041 1.31468606 ... 1.19105077 1.24852931 1.33505845] size: 929928
DISTANCE min_values each row [0.7285589  0.95670348 0.53414804 ... 0.83298016 0.5780527  0.6140101 ] size: 929928
DISTANCE max_diff each row [0.80180407 0.42175692 0.78053802 ... 0.35807061 0.67047662 0.72104836] size: 929928
after masking max_diff size 37115


[32m2024-05-16 16:30:04.011[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 199001, old remaining data 733905 [0m
[32m2024-05-16 16:30:04.012[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105413, new remaining data 628492 [0m
[32m2024-05-16 16:30:04.038[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 304414, new remaining data 628492 [0m
[32m2024-05-16 16:30:04.210[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[0m
[32m2024-05-16 16:30:04.210[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[

DISTANCE max_values each row [1.52868569 1.37109375 1.30065179 ... 1.14915967 1.20131803 1.28639793] size: 929928
DISTANCE min_values each row [0.69629812 0.96059871 0.53160524 ... 0.83288103 0.57208395 0.62128359] size: 929928
DISTANCE max_diff each row [0.83238757 0.41049504 0.76904655 ... 0.31627864 0.62923408 0.66511434] size: 929928
after masking max_diff size 37450


[32m2024-05-16 16:39:52.294[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 304414, old remaining data 628492 [0m
[32m2024-05-16 16:39:52.295[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 101161, new remaining data 527331 [0m
[32m2024-05-16 16:39:52.325[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 405575, new remaining data 527331 [0m
[32m2024-05-16 16:39:52.503[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[0m
[32m2024-05-16 16:39:52.503[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[

DISTANCE max_values each row [1.53267741 1.3715924  1.29785645 ... 1.13458061 1.13151419 1.19651365] size: 929928
DISTANCE min_values each row [0.79375386 0.95696282 0.53569335 ... 0.82673502 0.56830567 0.61862803] size: 929928
DISTANCE max_diff each row [0.73892355 0.41462958 0.7621631  ... 0.30784559 0.56320852 0.57788563] size: 929928
after masking max_diff size 37532


[32m2024-05-16 16:51:59.818[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 405575, old remaining data 527331 [0m
[32m2024-05-16 16:51:59.819[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 104915, new remaining data 422416 [0m
[32m2024-05-16 16:51:59.856[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 510490, new remaining data 422416 [0m
[32m2024-05-16 16:52:00.038[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[0m
[32m2024-05-16 16:52:00.038[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[

DISTANCE max_values each row [1.51914692 1.37026489 1.2940346  ... 1.12030435 1.07396555 1.13556898] size: 929928
DISTANCE min_values each row [0.80917788 0.95746559 0.53847408 ... 0.83628345 0.56692439 0.61421239] size: 929928
DISTANCE max_diff each row [0.70996904 0.4127993  0.75556052 ... 0.2840209  0.50704116 0.52135658] size: 929928
after masking max_diff size 37646


[32m2024-05-16 17:06:15.493[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 510490, old remaining data 422416 [0m
[32m2024-05-16 17:06:15.495[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 102073, new remaining data 320343 [0m
[32m2024-05-16 17:06:15.551[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 612563, new remaining data 320343 [0m
[32m2024-05-16 17:06:15.743[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[0m
[32m2024-05-16 17:06:15.743[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[

DISTANCE max_values each row [1.50327945 1.37355494 1.29575694 ... 1.10466051 1.02925253 1.09002256] size: 929928
DISTANCE min_values each row [0.73097253 0.9125905  0.538692   ... 0.83835727 0.57264888 0.60956693] size: 929928
DISTANCE max_diff each row [0.77230692 0.46096444 0.75706494 ... 0.26630324 0.45660365 0.48045564] size: 929928
after masking max_diff size 37724


[32m2024-05-16 17:22:16.811[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 612563, old remaining data 320343 [0m
[32m2024-05-16 17:22:16.812[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 99541, new remaining data 220802 [0m
[32m2024-05-16 17:22:16.875[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 712104, new remaining data 220802 [0m
[32m2024-05-16 17:22:17.048[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0m
[32m2024-05-16 17:22:17.048[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0

DISTANCE max_values each row [1.50693154 1.37769508 1.30233932 ... 1.10875797 0.99599051 1.06034946] size: 929928
DISTANCE min_values each row [0.70650148 0.89141464 0.54826581 ... 0.83451259 0.57855093 0.60684997] size: 929928
DISTANCE max_diff each row [0.80043006 0.48628044 0.7540735  ... 0.27424538 0.41743958 0.4534995 ] size: 929928
after masking max_diff size 37753


[32m2024-05-16 17:39:58.494[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 69831, new remaining data 150971 [0m
[32m2024-05-16 17:39:58.564[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 781935, new remaining data 150971 [0m
[32m2024-05-16 17:39:58.744[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 17:39:58.744[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 17:40:15.153[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 17:48:27.760[0m | [34m[1mDEBUG   [0m | 

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.9810766  1.04472613] size: 929928
DISTANCE min_values each row [0.69877166 0.88358521 0.54771459 ... 0.83410776 0.57197213 0.6033752 ] size: 929928
DISTANCE max_diff each row [0.81272823 0.4914335  0.75008368 ... 0.27908301 0.40910447 0.44135094] size: 929928
after masking max_diff size 37793


[32m2024-05-16 17:58:54.142[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 17:58:54.309[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 17:58:54.309[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 17:59:11.498[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 18:07:35.666[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m293[0m - [34m[1mTraining complete.[0m
[32m2024-05-16 18:07:36.387[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mbootstra

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.98107576 1.04472482] size: 929928
DISTANCE min_values each row [0.69877207 0.88358474 0.54785913 ... 0.83415127 0.5719558  0.60331142] size: 929928
DISTANCE max_diff each row [0.81272781 0.49143398 0.74993914 ... 0.2790395  0.40911996 0.4414134 ] size: 929928
after masking max_diff size 37792


[32m2024-05-16 18:18:02.282[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 18:18:02.451[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 18:18:02.451[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 18:18:02.502[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 18:18:02.502[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 18:18:23.382[0m | [34m[1mD

DISTANCE max_values each row [1.54133558 1.37747204 1.32899022 ... 1.21001327 1.27035177 1.36610007] size: 929928
DISTANCE min_values each row [0.89112169 0.94867206 0.55982554 ... 0.84658188 0.60211623 0.62695014] size: 929928
DISTANCE max_diff each row [0.6502139  0.42879999 0.76916468 ... 0.36343139 0.66823554 0.73914993] size: 929928
after masking max_diff size 36740


[32m2024-05-16 18:39:42.682[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 93745, old remaining data 839161 [0m
[32m2024-05-16 18:39:42.684[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105256, new remaining data 733905 [0m
[32m2024-05-16 18:39:42.700[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 199001, new remaining data 733905 [0m
[32m2024-05-16 18:39:42.876[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0m
[32m2024-05-16 18:39:42.876[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 10358, remaining sentences: 41440[0

DISTANCE max_values each row [1.53036296 1.37846041 1.31468606 ... 1.19105077 1.24852931 1.33505845] size: 929928
DISTANCE min_values each row [0.7285589  0.95670348 0.53414804 ... 0.83298016 0.5780527  0.6140101 ] size: 929928
DISTANCE max_diff each row [0.80180407 0.42175692 0.78053802 ... 0.35807061 0.67047662 0.72104836] size: 929928
after masking max_diff size 37115


[32m2024-05-16 18:47:57.208[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 199001, old remaining data 733905 [0m
[32m2024-05-16 18:47:57.210[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 105413, new remaining data 628492 [0m
[32m2024-05-16 18:47:57.235[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 304414, new remaining data 628492 [0m
[32m2024-05-16 18:47:57.406[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[0m
[32m2024-05-16 18:47:57.406[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 15537, remaining sentences: 36261[

DISTANCE max_values each row [1.52868569 1.37109375 1.30065179 ... 1.14915967 1.20131803 1.28639793] size: 929928
DISTANCE min_values each row [0.69629812 0.96059871 0.53160524 ... 0.83288103 0.57208395 0.62128359] size: 929928
DISTANCE max_diff each row [0.83238757 0.41049504 0.76904655 ... 0.31627864 0.62923408 0.66511434] size: 929928
after masking max_diff size 37450


[32m2024-05-16 18:57:46.879[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 304414, old remaining data 628492 [0m
[32m2024-05-16 18:57:46.881[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 101161, new remaining data 527331 [0m
[32m2024-05-16 18:57:46.912[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 405575, new remaining data 527331 [0m
[32m2024-05-16 18:57:47.098[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[0m
[32m2024-05-16 18:57:47.098[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 20716, remaining sentences: 31082[

DISTANCE max_values each row [1.53267741 1.3715924  1.29785645 ... 1.13458061 1.13151419 1.19651365] size: 929928
DISTANCE min_values each row [0.79375386 0.95696282 0.53569335 ... 0.82673502 0.56830567 0.61862803] size: 929928
DISTANCE max_diff each row [0.73892355 0.41462958 0.7621631  ... 0.30784559 0.56320852 0.57788563] size: 929928
after masking max_diff size 37532


[32m2024-05-16 19:10:16.034[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 405575, old remaining data 527331 [0m
[32m2024-05-16 19:10:16.036[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 104915, new remaining data 422416 [0m
[32m2024-05-16 19:10:16.073[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 510490, new remaining data 422416 [0m
[32m2024-05-16 19:10:16.262[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[0m
[32m2024-05-16 19:10:16.262[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 25895, remaining sentences: 25903[

DISTANCE max_values each row [1.51914692 1.37026489 1.2940346  ... 1.12030435 1.07396555 1.13556898] size: 929928
DISTANCE min_values each row [0.80917788 0.95746559 0.53847408 ... 0.83628345 0.56692439 0.61421239] size: 929928
DISTANCE max_diff each row [0.70996904 0.4127993  0.75556052 ... 0.2840209  0.50704116 0.52135658] size: 929928
after masking max_diff size 37646


[32m2024-05-16 19:24:18.357[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 510490, old remaining data 422416 [0m
[32m2024-05-16 19:24:18.358[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 102073, new remaining data 320343 [0m
[32m2024-05-16 19:24:18.411[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 612563, new remaining data 320343 [0m
[32m2024-05-16 19:24:18.587[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[0m
[32m2024-05-16 19:24:18.587[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 31074, remaining sentences: 20724[

DISTANCE max_values each row [1.50327945 1.37355494 1.29575694 ... 1.10466051 1.02925253 1.09002256] size: 929928
DISTANCE min_values each row [0.73097253 0.9125905  0.538692   ... 0.83835727 0.57264888 0.60956693] size: 929928
DISTANCE max_diff each row [0.77230692 0.46096444 0.75706494 ... 0.26630324 0.45660365 0.48045564] size: 929928
after masking max_diff size 37724


[32m2024-05-16 19:40:43.908[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m377[0m - [34m[1mBEFORE update model with old sampled data 612563, old remaining data 320343 [0m
[32m2024-05-16 19:40:43.910[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 99541, new remaining data 220802 [0m
[32m2024-05-16 19:40:43.972[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 712104, new remaining data 220802 [0m
[32m2024-05-16 19:40:44.152[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0m
[32m2024-05-16 19:40:44.152[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 36253, remaining sentences: 15545[0

DISTANCE max_values each row [1.50693154 1.37769508 1.30233932 ... 1.10875797 0.99599051 1.06034946] size: 929928
DISTANCE min_values each row [0.70650148 0.89141464 0.54826581 ... 0.83451259 0.57855093 0.60684997] size: 929928
DISTANCE max_diff each row [0.80043006 0.48628044 0.7540735  ... 0.27424538 0.41743958 0.4534995 ] size: 929928
after masking max_diff size 37753


[32m2024-05-16 19:58:45.939[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 69831, new remaining data 150971 [0m
[32m2024-05-16 19:58:46.008[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 781935, new remaining data 150971 [0m
[32m2024-05-16 19:58:46.180[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 19:58:46.180[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39767, remaining sentences: 12031[0m
[32m2024-05-16 19:59:03.097[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 20:07:40.650[0m | [34m[1mDEBUG   [0m | 

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.9810766  1.04472613] size: 929928
DISTANCE min_values each row [0.69877166 0.88358521 0.54771459 ... 0.83410776 0.57197213 0.6033752 ] size: 929928
DISTANCE max_diff each row [0.81272823 0.4914335  0.75008368 ... 0.27908301 0.40910447 0.44135094] size: 929928
after masking max_diff size 37793


[32m2024-05-16 20:18:19.888[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m380[0m - [34m[1mnew_sampled 1345, new remaining data 149626 [0m
[32m2024-05-16 20:18:19.961[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 20:18:20.143[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 20:18:20.143[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 20:18:37.384[0m | [34m[1mDEBUG   [0m | [36mCRFWrapper_Sentence[0m:[36mfit[0m:[36m285[0m - [34m[1mReset and train CRF model...[0m
[32m2024-05-16 20:27:03.370[0m | [34m[1mDEBUG   [0m | [

DISTANCE max_values each row [1.51149988 1.37501872 1.29779828 ... 1.11319077 0.98107576 1.04472482] size: 929928
DISTANCE min_values each row [0.69877207 0.88358474 0.54785913 ... 0.83415127 0.5719558  0.60331142] size: 929928
DISTANCE max_diff each row [0.81272781 0.49143398 0.74993914 ... 0.2790395  0.40911996 0.4414134 ] size: 929928
after masking max_diff size 37792


[32m2024-05-16 20:37:34.355[0m | [34m[1mDEBUG   [0m | [36mALLSampler_Sentence[0m:[36msample_next_round[0m:[36m383[0m - [34m[1mAFTER update model with new sampled data 783280, new remaining data 149626 [0m
[32m2024-05-16 20:37:34.534[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 20:37:34.534[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m85[0m - [1mcurrent sampled sentences: 39860, remaining sentences: 11938[0m
[32m2024-05-16 20:37:34.584[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 20:37:34.584[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m100[0m - [1mIt's the last round, now add all docs to sampled.[0m
[32m2024-05-16 20:37:54.772[0m | [34m[1mD

In [34]:
joblib.dump(all_scores, r'../data/n2c2/scores_sentence_sampling/ner_VBmax_scores_sentenceSampling_fixed.joblib')


['../data/n2c2/scores_sentence_sampling/ner_VBmax_scores_sentenceSampling_fixed.joblib']