In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

###  initiate brat reader

In [2]:
cleaned_train_dir=r'..\data\n2c2\cleaned_training'
cleaned_test_dir=r'..\data\n2c2\cleaned_test'
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [3]:
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [4]:
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

### Read eHOST annotations | load from pickles

In [5]:
pickle_file= r'..\data\n2c2\spacy_docs.joblib'

In [6]:
if not Path(pickle_file).exists():
    train_docs=dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs=dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)
else:
    print(f'{pickle_file} already exists, load them directly')
    # before load from pickle, initiate EhostDirReader or EhostDocReader first, because some Doc extension used to store meta data will not be automatically recreated by loading.
    train_docs, test_docs=joblib.load(pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


### define sampling functions

In [7]:
rounds =10
seed= 14

In [8]:
len(train_docs), len(test_docs)

(303, 202)

### Convert training and testing docs into sentence level dataframe

In [9]:
from medspacy_io.vectorizer import Vectorizer

In [10]:
sdf_labels=Vectorizer.docs_to_sents_df(train_docs, track_doc_name=True).rename(columns={"X":"sentence"})

In [11]:
sdf_labels

Unnamed: 0,sentence,concept,y,doc_name
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt
...,...,...,...,...
90405,[**Name (NI) **],,NEG,198406.txt
90406,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt
90407,[**Telephone/Fax (1) 92788**](C)\n\n\nMedicati...,,NEG,198406.txt
90408,3.,,NEG,198406.txt


In [12]:
uniq_sentSet = set(sdf_labels['sentence'].to_list())
len(list(uniq_sentSet))

43875

In [13]:
pickle_embedding_file= r'..\data\n2c2\embedding_df.joblib'
embedding_df=joblib.load(pickle_embedding_file)

In [14]:
embedding_df

Unnamed: 0,sid,sentence,doc_name,embedding
0,0,Admission Date:,100035.txt,"[0.026282, 0.03218903, -0.022386529, 0.0493732..."
1,1,[**2115-2-22**] Discharge Date: ...,100035.txt,"[0.016159855, 0.042264156, -0.018290585, -0.05..."
2,2,[**2078-8-9**] Sex: M\n\nService...,100035.txt,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
3,3,[**Known lastname 3234**] is a 36 year old gen...,100035.txt,"[0.023170307, 0.03989108, 0.026217388, -0.0272..."
4,4,The patient initially presented to LGH ED with...,100035.txt,"[0.008176211, -0.06342948, 0.048615105, -0.045..."
...,...,...,...,...
47525,47525,"Cyanocobalamin 1,000 mcg/mL Injection once a m...",198406.txt,"[0.050521564, -0.08905716, -0.0019493615, -0.0..."
47526,47526,"Lorazepam 0.25 QAM, O.25 QPM, 0.5 mg QHS\n8 Ca...",198406.txt,"[-0.030010268, -0.062390275, 0.0016725484, 0.0..."
47527,47527,Cream Topical TID\n9.,198406.txt,"[0.026732022, -0.04987913, 0.024520764, -0.016..."
47528,47528,Acetaminophen 1000 mg PO Q6H\n10.,198406.txt,"[-0.017295217, -0.10513715, -0.0030776137, -0...."


In [15]:
embedding_df[['sentence', 'embedding']].drop_duplicates(subset='sentence',keep='first', inplace=True)

In [19]:
embedding_df[embedding_df['sid']==1]

Unnamed: 0,sid,sentence,doc_name,embedding
1,1,[**2115-2-22**] Discharge Date: ...,100035.txt,"[0.016159855, 0.042264156, -0.018290585, -0.05..."


In [17]:
sdf_labels.merge(embedding_df, how='inner', on='sentence')  

Unnamed: 0,sentence,concept,y,doc_name_x,sid,doc_name_y,embedding
0,[**2078-8-9**] Sex: M\n\nService...,Vicodin,Drug,100035.txt,2,100035.txt,"[0.025958579, -0.05749655, 0.012378361, -0.009..."
1,"While at the OSH, he received CTX,\nazithromyc...",CTX,Drug,100035.txt,5,100035.txt,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
2,"While at the OSH, he received CTX,\nazithromyc...",azithromycin,Drug,100035.txt,5,100035.txt,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
3,"While at the OSH, he received CTX,\nazithromyc...",epinephrine,Drug,100035.txt,5,100035.txt,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
4,"While at the OSH, he received CTX,\nazithromyc...",solumedrol,Drug,100035.txt,5,100035.txt,"[0.038356796, -0.054362558, 0.028156247, -0.02..."
...,...,...,...,...,...,...,...
272209,Patient had some cardiac enzyme leaks\nduring ...,,NEG,198406.txt,47513,198406.txt,"[0.02065785, -0.06587324, 0.055154495, 0.01074..."
272210,Patient was given cardiac healthy diet during ...,,NEG,198406.txt,47515,198406.txt,"[0.030450102, -0.042418838, 0.00325665, 0.0384..."
272211,# CODE: DNR/DNI (discussed with patient and so...,,NEG,198406.txt,47516,198406.txt,"[0.023342747, 0.013347558, -0.01095362, -0.052..."
272212,"[**Telephone/Fax (1) 92787**](H),",,NEG,198406.txt,47518,198406.txt,"[0.02003492, 0.029056935, -0.0140215475, 0.007..."
