In [1]:
import os
import pandas as pd
# display and HTML are part of the public IPython.display API; importing display from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS into the notebook page: .container is set to 80% width ...
display(HTML("<style>.container { width:80% !important; }</style>"))
# ... and header cells of rendered tables are capped at 120px.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [290]:
def readTerminology(file):
    """Reads a terminology file into two dictionaries.

    Every non-blank line is expected to have the form ``cui||name1|name2|...``.

    Args:
        file: Path of the terminology file.

    Returns:
        A tuple ``(cuiName, nameCui)``:
        cuiName maps each cui to the list of names on its line (if a cui occurs on
        several lines, the last line wins); nameCui maps each name to the list of
        cuis whose lines contain it, in file order.
    """
    cuiName = {}
    nameCui = {}
    with open(file) as f:
        # Iterate over the handle directly instead of loading the whole file with readlines().
        for line in f:
            if not line.strip():
                # A blank line (typically a trailing newline at the end of the file)
                # has no '||' separator and would otherwise fail with IndexError.
                continue
            fields = line.split('||')
            cui = fields[0]
            names = fields[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Reads all .concept files from path into a single dataframe.

    Each file is parsed as '||'-separated records with the fields
    file_id, ix, type, name, cui (no header row).

    Args:
        path: Directory to scan. Every entry whose file name contains '.concept' is read.

    Returns:
        A dataframe with the columns ['cui', 'name', 'file'], where 'file' is the name
        of the file the row came from. Row indices restart at 0 for every file.
        If the directory contains no matching file, an empty dataframe with those
        three columns is returned.
    """
    fields = ['file_id','ix','type','name','cui']
    frames = []
    for file in os.listdir(path):
        if '.concept' in file:
            # The separator is a regular expression, so it is written as a raw string
            # (avoids the invalid-escape warning) and the python parser engine is
            # requested explicitly (the C engine cannot handle regex separators).
            df = pd.read_table(f'{path}/{file}', sep=r'\|\|', header=None, names=fields, engine='python')
            df['file'] = file
            frames.append(df)

    if not frames:
        # Selecting the columns from a completely empty frame would raise KeyError.
        return pd.DataFrame(columns=['cui','name','file'])

    # One concat at the end instead of re-copying the accumulated frame per file.
    annotations = pd.concat(frames)
    return annotations[['cui','name','file']]

def readCuiType(cuis):
    """Reads in a dictionary mapping cuis to semantic types.

    The mapping is loaded from the '|'-separated file umls/mrsty.txt (path relative to
    the current working directory).

    Args:
        cuis: Collection of cuis to keep; rows for any other cui are dropped.

    Returns:
        A dict mapping each retained cui to the list of its 'type' values, in file order.

    Raises:
        FileNotFoundError: if umls/mrsty.txt does not exist.
    """
    # Load UMLS semantic type mapping file.
    # header=0 together with names= means the first line of the file is consumed as a
    # header row and replaced by these names.
    # NOTE(review): assumes mrsty.txt starts with a header line; otherwise the first
    # record is dropped -- confirm against "Load UMLS data.ipynb".
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
    except FileNotFoundError as err:
        # Raising a plain string is itself a TypeError; raise a real exception, and only
        # for the case the message actually describes.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err
    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two columns directly; iterrows() would build a Series for every row.
    for cui, semanticType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semanticType)
    return cuiType

def getStats(df):
    """Computes summary counts and ratios for the rows of df.

    Reads the columns prediction, goldCui and normalized.

    Returns:
        (n, tp, fp, recall, precision) where n is the number of rows, tp the rows whose
        prediction equals goldCui, fp the rows with normalized == True whose prediction
        differs from goldCui, recall = tp/n and precision = tp/(tp+fp). Both ratios are
        rounded to two decimals and are 0 when their denominator is 0.
    """
    rowCount = len(df)

    matches = df.prediction == df.goldCui
    mismatches = df.prediction != df.goldCui
    wasNormalized = df.normalized==True

    truePositives = sum(matches)
    falsePositives = sum(mismatches & wasNormalized)

    if rowCount > 0:
        recall = round(truePositives/rowCount,2)
    else:
        recall = 0

    attempted = truePositives + falsePositives
    if attempted > 0:
        precision = round(truePositives/attempted,2)
    else:
        precision = 0

    return rowCount, truePositives, falsePositives, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    Args:
        results: Dataframe with the columns normalizingSieveLevel and
            normalizingSieveName plus the columns getStats() reads.

    Returns:
        A dataframe with one row per level 1..max(normalizingSieveLevel), indexed by
        level, followed by a 'Total' row at index max level + 1. Columns:
        sieve (name of the level, "Unknown" if the level has no rows), n, tp, fp,
        sieve_acc (tp/n of that level), agg_recall and agg_precision (ratios over all
        levels up to and including the current one). The Total row holds getStats() of
        the whole frame and '-' as sieve_acc.
    """
    columns = ['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # A frame with no rows (or only missing levels) has no maximum; treat it as "no levels"
    # instead of failing, and make sure range() gets an int even for a float column.
    topLevel = results.normalizingSieveLevel.max()
    topLevel = 0 if pd.isnull(topLevel) else int(topLevel)

    index = []
    rows = []
    # Running totals over the levels processed so far.
    aggN = aggTp = aggFp = 0

    # Results for each sieve
    for level in range(1, topLevel + 1):
        df = results[results.normalizingSieveLevel==level]
        n, tp, fp, acc, _ = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        aggN += n
        aggTp += tp
        aggFp += fp
        # Guard the ratios: leading levels without any rows would divide by zero.
        # NOTE(review): agg_recall divides by the rows seen in levels 1..level, whereas the
        # Total row divides by all rows of results -- confirm which denominator is intended.
        aggRecall = round(aggTp/aggN,2) if aggN > 0 else 0
        aggPrecision = round(aggTp/(aggTp+aggFp),2) if (aggTp+aggFp) > 0 else 0

        index.append(level)
        rows.append([sieve, n, tp, fp, acc, aggRecall, aggPrecision])

    # Total results. The index is derived from topLevel, not from the loop variable,
    # so it is defined even when the loop body never ran.
    n, tp, fp, recall, precision = getStats(results)
    index.append(topLevel + 1)
    rows.append(['Total', n, tp, fp, '-', recall, precision])
    return pd.DataFrame(rows, columns=columns, index=index)

def stratifyByCol(df, col, asc=False):
    """Stratifies the results by given column.

    Args:
        df: Dataframe with the column col plus the columns getStats() reads.
        col: Name of the column to group by. If its cells hold lists, each list is
            exploded into one row per element first, so a row counts once per element.
        asc: Sort order of the result by precision (descending by default).

    Returns:
        A dataframe with one row per distinct non-null value of col and the columns
        [col, 'n', 'tp', 'fp', 'recall', 'precision'], sorted by precision.
    """
    # If the column is a list, explode list into individual rows.
    # Checked on the column itself: sampling 100 rows fails on frames with fewer than
    # 100 rows and makes the decision depend on a random draw.
    if df[col].map(lambda value: isinstance(value, list)).any():
        df = df.explode(col)

    # A single groupby pass replaces one full scan of df per distinct key;
    # groupby skips null keys, as the explicit isnull check did.
    rows = [[key] + list(getStats(sub)) for key, sub in df.groupby(col, sort=False)]
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratifies the results by the combination of several columns.

    Args:
        df: Dataframe containing every column in cols.
        cols: List of column names. List-valued columns are exploded into one row per
            element; the (string) values of the columns are then joined with '-' into
            a new column named 'combined'.
        asc: Passed through to stratifyByCol().

    Returns:
        The result of stratifyByCol() on the 'combined' column.
    """
    for col in cols:
        # If the column is a list, explode list into individual rows.
        # Checked on the column itself: sampling 100 rows fails on frames with fewer
        # than 100 rows and makes the decision depend on a random draw.
        if df[col].map(lambda value: isinstance(value, list)).any():
            df = df.explode(col)
    combined = df[cols].agg('-'.join, axis=1)
    return stratifyByCol(df.assign(combined=combined), 'combined',asc=asc)

def getAmbiguous(df):
    """Finds ambiguous names, i.e. names that occur with more than one cui.

    Names and cuis are lower-cased and stripped before they are compared.

    Args:
        df: Dataframe with string columns 'name' and 'cui'.

    Returns:
        A dataframe with the columns 'name', 'cuis' (sorted list of the distinct cuis
        seen for that name) and 'ambiguous' (always True), one row per ambiguous name.
    """
    cuisByName = {}
    # Iterate over the two columns directly (df.iloc[i] builds a whole row object per
    # access) and collect into sets instead of rebuilding list(set(...)) for every row.
    for name, cui in zip(df['name'], df['cui']):
        cuisByName.setdefault(name.lower().strip(), set()).add(cui.lower().strip())

    # sorted() gives the cui lists a reproducible order; set order varies between runs.
    namesToCuis = pd.DataFrame(
        [(name, sorted(cuis)) for name, cuis in cuisByName.items()],
        columns=['name','cuis'],
    )
    # Explicit bool dtype so the column is a valid row mask even when there are no rows.
    namesToCuis['ambiguous'] = pd.Series(
        [len(cuis) > 1 for cuis in namesToCuis.cuis],
        index=namesToCuis.index,
        dtype=bool,
    )
    return namesToCuis[namesToCuis.ambiguous]

In [554]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','namePermutations','keyPhrase']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# goldNames and goldTypes hold lists, so the placeholder to look for is ['Missing'].
# Comparing the whole column with the string 'Missing' is False for every row, which
# made the previous asserts impossible to trigger.
# The findings are reported instead of asserted: since the old checks could never fail,
# rows with failed lookups may already be present and should not abort the setup.
goldNamesMissing = sum(names == ['Missing'] for names in analysis.goldNames)
# 'CUI-less' is not a real cui, so it is not expected to have a semantic type.
goldTypesMissing = sum(
    cui != 'CUI-less' and types == ['Missing']
    for cui, types in zip(analysis.goldCui, analysis.goldTypes)
)
if goldNamesMissing:
    print(f'WARNING: Gold names missing for {goldNamesMissing} rows')
if goldTypesMissing:
    print(f'WARNING: Gold types missing for {goldTypesMissing} rows')



Wall time: 8.66 s


In [555]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
# Only the last expression of a cell is rendered automatically; the first table was
# computed and then thrown away. Both are now shown explicitly.
display(stratifyByCol(normalized, 'normalizingSource'))
display(stratifyByCol(normalized, 'normalizingSieveName'))
# Further stratifications, disabled by default:
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
0,ExactMatchSieve,4202,3973,229,0.95,0.95
6,PrepositionalTransformSieve,39,36,3,0.92,0.92
2,HyphenationSieve,18,16,2,0.89,0.89
4,RemoveStopwordsSieve,318,284,34,0.89,0.89
5,AmbiguitySieve,37,32,5,0.86,0.86
3,AbbreviationExpansionSieve,73,61,12,0.84,0.84
1,UmlsEndingSieve,44,32,12,0.73,0.73


In [556]:
# Performance of the RemoveStopwordsSieve rows, broken down by keyPhrase.
stratifyByCol(
    normalized.loc[normalized['normalizingSieveName'] == 'RemoveStopwordsSieve'],
    'keyPhrase',
)

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
0,'s,3,3,0,1.0,1.0
4,this,7,7,0,1.0,1.0
5,any,8,8,0,1.0,1.0
2,&apos;s,19,18,1,0.95,0.95
3,her,33,30,3,0.91,0.91
7,an,32,29,3,0.91,0.91
6,the,81,72,9,0.89,0.89
1,his,54,47,7,0.87,0.87
9,a,76,66,10,0.87,0.87
8,these,5,4,1,0.8,0.8


In [557]:
# Columns dropped from the error listing.
remove = ['normalized','normalizingSource','normalizingSieveName']#,'goldTypes','predTypes'
# Wrong predictions among the normalized rows. The mask is built from analysis itself:
# analysis carries the prediction/goldCui/normalized columns, while the global results
# is re-read by other cells and is not guaranteed to still line up with analysis.
errors = analysis[(analysis.prediction != analysis.goldCui) & (analysis.normalized==True)]
# Narrow down to one source / sieve combination.
errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='RemoveStopwordsSieve')]
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

25


Unnamed: 0,name,prediction,goldCui,namePermutations,keyPhrase,goldNames,predTypes,goldTypes
332,the epidural,C1283259,C0002913,"the epidural,epidural",the,"['epidural anaesthesia', 'epidural anesthesia', 'epidural anesthesia (procedure)', 'epidural block', 'extradural block', 'local anaesthetic epidural block', 'local anesthetic epidural block', 'peridural anaesthesia', 'peridural anesthesia']",['Medical Device'],['Therapeutic or Preventive Procedure']
338,the epidural,C1283259,C0002913,"the epidural,epidural",the,"['epidural anaesthesia', 'epidural anesthesia', 'epidural anesthesia (procedure)', 'epidural block', 'extradural block', 'local anaesthetic epidural block', 'local anesthetic epidural block', 'peridural anaesthesia', 'peridural anesthesia']",['Medical Device'],['Therapeutic or Preventive Procedure']
461,the heart sounds,C0018820,C0577816,"the heart sounds,heart sounds",the,"['finding of heart sounds', 'finding of heart sounds (finding)', 'observation of heart sounds']",['Organ or Tissue Function'],['Finding']
469,the bowel sounds,C0232693,C0577154,"the bowel sounds,bowel sounds",the,"['bowel sounds - finding', 'finding of bowel sounds', 'finding of bowel sounds (finding)', 'finding of bowel sounds [dup] (finding)', 'observation of bowel sounds']",['Finding'],['Finding']
605,the incision,C0184898,C3543005,"the incision,incision",the,"['operative incision', 'surgical incision', 'surgical incision wound', 'surgical incision wound (morphologic abnormality)']",['Therapeutic or Preventive Procedure'],['Acquired Abnormality']
865,a holter monitor,C0182920,C0430461,"a holter monitor,holter monitor",a,"['24 hour ecg', '24 hour ecg (procedure)', '24 hour ecg (regime/therapy)', '24 hour electrocardiogram', '24 hour electrocardiogram (procedure)', '24 hour holter tape']",['Medical Device'],['Diagnostic Procedure']
1099,a creatine kinase,C0010287,C0201973,"a creatine kinase,creatine kinase",a,"['ck measurement', 'cpk measurement', 'creatine (phospho)kinase (& level)', 'creatine (phospho)kinase (& level) (procedure)', 'creatine kinase measurement', 'creatine kinase measurement (procedure)', 'creatine phosphokinase measurement']","['Amino Acid, Peptide, or Protein', 'Enzyme']",['Laboratory Procedure']
1109,a exercise stress test,C0430120,C0015260,"a exercise stress test,exercise stress test",a,"['exercise test', 'exercise tolerance test (procedure)', 'exercise tolerance test nos', 'exercise tolerance test nos (procedure)']",['Diagnostic Procedure'],['Diagnostic Procedure']
1231,the incisions,C0184898,C3543005,"the incisions,incisions",the,"['operative incision', 'surgical incision', 'surgical incision wound', 'surgical incision wound (morphologic abnormality)']",['Therapeutic or Preventive Procedure'],['Acquired Abnormality']
1252,a foley catheter,C0179804,C1970989,"a foley catheter,foley catheter",a,"['bladder catheterisation', 'bladder catheterization', 'catheterise bladder', 'catheterization of bladder', 'catheterization of bladder, nos', 'catheterization of urinary bladder', 'catheterization of urinary bladder (procedure)', 'insertion of catheter into urinary bladder']",['Medical Device'],['Therapeutic or Preventive Procedure']


In [549]:
# Sieve-level breakdown of the results frame that is currently in memory.
# Optional reload of the results file, disabled here:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4202,3973,229,0.95,0.95,0.95
2,RemoveStopwordsSieve,322,286,36,0.89,0.94,0.94
3,UmlsEndingSieve,44,32,12,0.73,0.94,0.94
4,AbbreviationExpansionSieve,73,61,12,0.84,0.94,0.94
5,PrepositionalTransformSieve,38,35,3,0.92,0.94,0.94
6,Unknown,0,0,0,0,0.94,0.94
7,HyphenationSieve,18,16,2,0.89,0.94,0.94
8,Unknown,0,0,0,0,0.94,0.94
9,AmbiguitySieve,37,32,5,0.86,0.94,0.94
10,Total,6619,4435,299,-,0.67,0.94


In [550]:
# Re-read the tab-separated results file and show the sieve-level breakdown.
# Note that this rebinds the global results; analysis is not rebuilt by this cell.
results = pd.read_table(f'../{dataset}-data/output/results.txt', sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4202,3973,229,0.95,0.95,0.95
2,RemoveStopwordsSieve,318,284,34,0.89,0.94,0.94
3,UmlsEndingSieve,44,32,12,0.73,0.94,0.94
4,AbbreviationExpansionSieve,73,61,12,0.84,0.94,0.94
5,PrepositionalTransformSieve,39,36,3,0.92,0.94,0.94
6,Unknown,0,0,0,0,0.94,0.94
7,HyphenationSieve,18,16,2,0.89,0.94,0.94
8,Unknown,0,0,0,0,0.94,0.94
9,AmbiguitySieve,37,32,5,0.86,0.94,0.94
10,Total,6619,4434,297,-,0.67,0.94


In [262]:
# Number of distinct values in the filename column of results.
len(set(results['filename']))

50

In [527]:
# Rows without a prediction, restricted to those whose gold names do not contain
# the 'Missing' placeholder.
omissions = analysis.loc[analysis['prediction'].isna()]
omissions = omissions[['Missing' not in goldNames for goldNames in omissions['goldNames']]]
print(len(omissions))
omissions[['name','goldNames']].style.set_properties(subset=['goldNames'], width='500px')

1391


Unnamed: 0,name,goldNames
0,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
9,r leg pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
10,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
12,pelvis xr,"['pelvic x-ray', 'pelvis x-ray', 'pelvis x-ray (procedure)']"
13,this problem,"['problem', 'problem (finding)', 'problem, nos']"
25,any changes in bowel habits,"['[d]change in bowel habit', '[d]change in bowel habit (context-dependent category)', '[d]change in bowel habit (situation)', 'abnormal bowel habits', 'altered bowel function', 'altered bowel function (finding)', 'altered bowel habit', 'altered bowel habits', 'change in bowel habit', 'change in bowel habit (context-dependent category)', 'change in bowel habit (situation)', 'change in bowel pattern']"
26,non-insulin-requiring diabetes mellitus,"['diabetes mellitus - adult onset', 'diabetes mellitus -adult onset', 'diabetes mellitus type 2', 'diabetes mellitus type 2 (disorder)', 'diabetes mellitus type 2 disorder', 'diabetes mellitus type ii', 'diabetes mellitus: [adult onset] or [noninsulin dependent]', 'diabetes mellitus: [adult onset] or [noninsulin dependent] (disorder)', 'diabetes mellitus: [adult onset] or [noninsulin dependent] disorder', 'maturity onset diabetes', 'maturity onset diabetes mellitus', 'ncdmm', 'non-insulin dependent diabetes mellitus', 'non-insulin-dependent diabetes mellitus', 'noninsulin dependent diab.mell', 'type 2 diabetes mellitus', 'type ii diabetes mellitus', 'type ii diabetes mellitus (disorder)', 'type ii diabetes mellitus disorder']"
30,analgesics,"['analgesic', 'analgesic (product)', 'analgesic (substance)', 'analgesic agent', 'analgesic product', 'analgesic product (substance)', 'analgesic, nos', 'medicinal product acting as analgesic agent (product)']"
42,organomegaly,"['abdominal organomegaly', 'abdominal organomegaly (disorder)', 'abdominal organomegaly disorder']"
48,neurologic examination,"['assessing neurological performance', 'assessing neurological status', 'nervous sys.exam.-gener', 'nervous system examination - general', 'nervous system-general exam.', 'neurological assessment', 'neurological assessment (procedure)', 'neurological assessment (regime/therapy)', 'neurological examination', 'neurological examination (procedure)', 'neurological examination, nos']"


In [537]:
# TODO: Investigate
# Only the last expression of a cell is rendered automatically, so the first lookup
# is shown explicitly instead of being discarded.
display(results[results.name=='his intra-aortic balloon pump'])
# na=False: a missing name counts as "no match" rather than putting NaN into the mask,
# which boolean indexing rejects.
results[results.name.str.contains("patient", na=False)]

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
190,0070,the patient &apos;s temperature,,C0204688,True,ExactMatchSieve,trainTerminology,C0204688,the patient &apos;s temperature,1,"temperature taking,temperature taking (procedu..."
488,0250,the patient &apos;s gunshot wound,,,False,,,C0043252,"the patient &apos;s gunshot wound,patient &apo...",0,
1094,0467,the patient 's sma 7,,,False,,,C0438930,"the patient 's sma 7,patient 's sma 7,the pati...",0,"chem. metabolic function tests,chem. metabolic..."
1256,0477,the patient &apos;s white count,,,False,,,C0023508,"the patient &apos;s white count,patient &apos;...",0,"wbc count,white blood cell count,white blood c..."
1267,0477,the patient &apos;s medications,,,False,,,C0013216,"the patient &apos;s medications,patient &apos;...",0,"drug therapy,drug therapy (procedure),drug the..."
1684,433651389,the patient &apos;s extremity examination,,,False,,,C0436150,"the patient &apos;s extremity examination,pati...",0,"exam. of extremities nos (procedure),examinati..."
1747,433651389,the patient &apos;s coumadin,,,False,,,C0699129,"the patient &apos;s coumadin,patient &apos;s c...",0,"coumadin,his coumadin"
1884,622086964,the patient &apos;s urine culture,,,False,,,C0430404,"the patient &apos;s urine culture,patient &apo...",0,"microbial urine culture,urine culture,urine cu..."
2154,0038,the patient &apos;s epigastric pain,,,False,,,C0232493,"the patient &apos;s epigastric pain,patient &a...",0,"[d]epigastric pain,[d]epigastric pain (context..."
2179,0038,the patient &apos;s fever,,,False,,,C0015967,"the patient &apos;s fever,patient &apos;s feve...",0,"[d]fever nos,[d]fever nos (context-dependent c..."
