In [1]:
import os
import pandas as pd
# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from the internal `IPython.core.display` module is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so wide result tables fit on screen.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Last expression of the cell, so it is rendered as the cell output:
# caps the width of rendered table header cells.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [2]:
def readTerminology(file):
    """Read a terminology file into two lookup dictionaries.

    Each usable line has the form ``CUI||name1|name2|...``. Only the field
    directly after the first ``||`` is used as the name list. Lines without a
    ``||`` separator (for example blank lines) are skipped instead of raising
    ``IndexError``.

    Parameters
    ----------
    file : str or path-like
        Path of the terminology file, decoded as ISO-8859-1.

    Returns
    -------
    (dict, dict)
        ``cuiName`` maps each CUI to its list of names (a later line for the
        same CUI replaces an earlier one); ``nameCui`` maps each name to the
        list of CUIs it was listed under, in file order.
    """
    cuiName = {}
    nameCui = {}
    with open(file, encoding="ISO-8859-1") as f:
        # Iterate the handle directly so the whole file is never held in memory.
        for line in f:
            fields = line.split('||')
            if len(fields) < 2:
                # Blank or malformed line: nothing to index.
                continue
            cui = fields[0]
            names = fields[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Read every ``.concept`` file found in ``path`` into one dataframe.

    Files are selected when their name contains ``.concept``. Each file is
    parsed as ``||``-separated text with the columns
    ``file_id, ix, type, name, cui``; the source file name is added as a
    ``file`` column.

    Parameters
    ----------
    path : str
        Directory containing the annotation files.

    Returns
    -------
    pandas.DataFrame
        Columns ``cui``, ``name`` and ``file``. Row labels are kept per file
        (they restart at 0 for every file). An empty frame with those three
        columns is returned when no file matches.
    """
    columns = ['file_id', 'ix', 'type', 'name', 'cui']
    frames = []
    for file in os.listdir(path):
        if '.concept' in file:
            # Raw string: `\|` is not a valid escape in a normal string literal.
            # A multi-character/regex separator needs the python parser engine.
            df = pd.read_table(os.path.join(path, file), sep=r'\|\|', header=None,
                               names=columns, engine='python')
            df['file'] = file
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=['cui', 'name', 'file'])
    # Concatenate once instead of growing a dataframe inside the loop.
    return pd.concat(frames)[['cui', 'name', 'file']]

def readCuiType(cuis):
    """Build a dictionary mapping CUIs to their semantic type names.

    Reads ``umls/mrsty.txt`` (pipe separated, first line treated as a header
    row and replaced by the names ``cui, tui, stn, type, atui, cvf``) and keeps
    only the rows whose ``cui`` is in ``cuis``.

    Parameters
    ----------
    cuis : list-like
        CUIs to keep.

    Returns
    -------
    dict
        ``{cui: [type, ...]}`` with types in file order.

    Raises
    ------
    FileNotFoundError
        If ``umls/mrsty.txt`` does not exist. The message says which notebook
        creates it.
    """
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt', sep='|', header=0,
                              names=['cui', 'tui', 'stn', 'type', 'atui', 'cvf'])[['cui', 'tui', 'type']]
    except FileNotFoundError as err:
        # Raise a real exception: `raise('...')` with a plain string is itself a
        # TypeError and hides this message.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err
    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two needed columns directly; avoids building a Series per row.
    for cui, semType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semType)
    return cuiType

def getStats(df):
    """Compute count, true/false positives, recall and precision for ``df``.

    ``df`` must provide the columns ``prediction``, ``goldCui`` and
    ``normalized``. A row is a true positive when the prediction equals the
    gold CUI, and a false positive when it differs and ``normalized`` is True.

    Returns the tuple ``(n, tp, fp, recall, precision)``. Recall is ``tp / n``
    and precision is ``tp / (tp + fp)``, both rounded to two decimals; either
    one is 0 when its denominator is 0.
    """
    total = len(df)
    correct = df.prediction == df.goldCui
    wrongButNormalized = (df.prediction != df.goldCui) & (df.normalized==True)
    tp = sum(correct)
    fp = sum(wrongButNormalized)

    if total > 0:
        recall = round(tp/total,2)
    else:
        recall = 0

    answered = tp+fp
    if answered > 0:
        precision = round(tp/answered,2)
    else:
        precision = 0

    return total, tp, fp, recall, precision

def sieveResults(results):
    """Return a sieve-level analysis of ``results``.

    ``results`` must provide ``normalizingSieveLevel`` and
    ``normalizingSieveName`` plus the columns ``getStats`` reads. One row is
    produced for every level from 1 up to the highest level present, followed
    by a ``Total`` row computed over all of ``results``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sieve, n, tp, fp, sieve_acc, agg_recall, agg_precision``,
        indexed from 1. For sieve rows, ``sieve_acc`` is that level's own
        ``tp / n`` and the ``agg_*`` columns are cumulative over levels 1..i.
        For the ``Total`` row ``sieve_acc`` is ``'-'`` and the ``agg_*``
        columns hold overall recall and precision. A level with no rows is
        reported as ``"Unknown"``; any ratio with a zero denominator is 0.
    """
    columns = ['sieve', 'n', 'tp', 'fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # Series.max() skips NaN and returns NaN for empty input, unlike builtin max().
    topLevel = results.normalizingSieveLevel.max()
    levels = range(1, int(topLevel) + 1) if pd.notna(topLevel) else range(0)

    rows = []
    cumN = cumTp = cumFp = 0

    # Results for each sieve
    for level in levels:
        df = results[results.normalizingSieveLevel == level]
        n, tp, fp, recall, _ = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        # Running totals over all sieves applied so far.
        cumN += n
        cumTp += tp
        cumFp += fp
        aggRecall = round(cumTp / cumN, 2) if cumN > 0 else 0
        aggPrecision = round(cumTp / (cumTp + cumFp), 2) if (cumTp + cumFp) > 0 else 0
        rows.append([sieve, n, tp, fp, recall, aggRecall, aggPrecision])

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    rows.append(['Total', n, tp, fp, '-', recall, precision])

    # Index derives from the row count rather than a leaked loop variable,
    # so it is also defined when there are no sieve levels.
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))

def stratifyByCol(df, col, asc=False):
    """Stratify the results by the values of one column.

    If ``col`` holds lists, the frame is first exploded so that every list
    element gets its own row. ``getStats`` is then computed for each distinct
    non-null value of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results to stratify; must contain ``col`` and the columns ``getStats``
        reads.
    col : str
        Column to group by.
    asc : bool, default False
        Sort order applied to the ``precision`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``[col, 'n', 'tp', 'fp', 'recall', 'precision']``, one row per
        value, sorted by precision.
    """
    # Inspect the column itself: drawing a random sample of 10 rows fails on
    # frames with fewer than 10 rows and makes the detection non-deterministic.
    if df[col].map(lambda value: isinstance(value, list)).any():
        df = df.explode(col)

    # groupby drops null keys by default; sort=False keeps first-appearance
    # order and avoids comparing keys of mixed types.
    rows = [[key] + list(getStats(sub)) for key, sub in df.groupby(col, sort=False)]
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratify the results by the combination of several columns.

    Every column in ``cols`` that holds lists is exploded into individual
    rows. The values of ``cols`` are then joined with ``-`` into a ``combined``
    column, and the frame is passed to ``stratifyByCol`` on that column.
    The joined values must be strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Results to stratify.
    cols : list of str
        Columns whose value combinations define the strata.
    asc : bool, default False
        Sort order, forwarded to ``stratifyByCol``.

    Returns
    -------
    Whatever ``stratifyByCol`` returns for the ``combined`` column.
    """
    for col in cols:
        # Inspect the column itself: drawing a random sample of 10 rows fails on
        # frames with fewer than 10 rows and makes the detection non-deterministic.
        if df[col].map(lambda value: isinstance(value, list)).any():
            df = df.explode(col)

    # Build the key positionally: explode() leaves duplicate index labels, so
    # label-based alignment is best avoided here.
    combined = ['-'.join(parts) for parts in df[list(cols)].itertuples(index=False, name=None)]
    return stratifyByCol(df.assign(combined=combined), 'combined',asc=asc)

def getAmbiguous(df):
    """Find ambiguous names, i.e. names annotated with more than one CUI.

    Names and CUIs are lower-cased and stripped before comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``name`` and ``cui``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``cuis`` (sorted list of distinct CUIs) and
        ``ambiguous`` (always True in the returned rows), one row per
        ambiguous name in order of first appearance.
    """
    cuisByName = {}
    # Iterate the two columns directly instead of two iloc row lookups per row.
    for rawName, rawCui in zip(df['name'], df['cui']):
        cuisByName.setdefault(rawName.lower().strip(), set()).add(rawCui.lower().strip())

    # Sort each CUI set so the output does not depend on set iteration order.
    namesToCuis = pd.DataFrame(
        [(name, sorted(cuis)) for name, cuis in cuisByName.items()],
        columns=['name', 'cuis'],
    )
    # Explicit bool dtype keeps the mask valid for an empty frame too.
    namesToCuis['ambiguous'] = (namesToCuis['cuis'].map(len) > 1).astype(bool)
    return namesToCuis[namesToCuis['ambiguous']]

In [131]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
# `dataset` selects both the terminology file and the data directory below.
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
# Loading of the train/test annotations is currently disabled.
# train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
# Tab-separated results file.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
# Only the CUIs that occur as gold or predicted labels are looked up.
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
# Subset of result columns, extended with three list-valued columns:
#   goldNames - ['CUI-less'] for CUI-less gold labels, otherwise the terminology
#               names of the gold CUI, or ['Missing'] if it is not in cuiName
#   predTypes - semantic types of the predicted CUI, or ['Missing']
#   goldTypes - semantic types of the gold CUI, or ['Missing']
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','normalizingName','keyPhrase','filename']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# NOTE(review): goldNames/predTypes/goldTypes hold lists such as ['Missing'], so
# the `== 'Missing'` comparisons below would not match them as written; the
# second check also lacks parentheses around `analysis.goldTypes=='Missing'`.
# Confirm and adjust before re-enabling.
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
# assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
# assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'

Wall time: 8.58 s


In [132]:
# Stratify performance by column
# Restrict to the rows flagged as normalized.
normalized = analysis[analysis.normalized]
# Only the last bare expression of a cell is rendered, so the first table has
# to be displayed explicitly; otherwise it is computed and thrown away.
display(stratifyByCol(normalized, 'normalizingSource'))
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
0,AbbreviationExpansionSieve,79,74,5,0.94,0.94
5,ExactMatchSieve,4285,4034,251,0.94,0.94
3,SynonymSieve,41,38,3,0.93,0.93
1,PrepositionalTransformSieve,50,46,4,0.92,0.92
6,RemoveStopwordsSieve,330,292,38,0.88,0.88
2,SuffixationSieve,136,117,19,0.86,0.86
4,HyphenationSieve,27,23,4,0.85,0.85
7,AmbiguitySieve,42,35,7,0.83,0.83


In [118]:
# Break down the rows normalized by RemoveStopwordsSieve by their keyPhrase value.
stratifyByCol(normalized[normalized.normalizingSieveName=='RemoveStopwordsSieve'], 'keyPhrase')

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
3,'s,3,3,0,1.0,1.0
7,any,8,8,0,1.0,1.0
4,&apos;s,20,19,1,0.95,0.95
9,the,82,74,8,0.9,0.9
1,an,34,30,4,0.88,0.88
2,her,34,30,4,0.88,0.88
6,this,8,7,1,0.88,0.88
8,his,53,46,7,0.87,0.87
0,a,79,67,12,0.85,0.85
5,these,5,4,1,0.8,0.8


In [121]:
# Columns left out of the error table.
remove = ['normalized','normalizingSource']#,'normalizingSieveName','goldTypes','predTypes'
# Errors: normalized rows whose prediction differs from the gold CUI.
# The mask is computed on `results` and applied to `analysis`; this relies on
# both frames sharing the same row index (analysis is built from results[cols]).
errors = analysis[(results.prediction != results.goldCui) & (results.normalized==True)]
# errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='SuffixationSieve')]
# Keep only the errors attributed to one sieve.
errors = errors[errors.normalizingSieveName=='RemoveStopwordsSieve']
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
# Styled output: sets a CSS width on the goldNames column.
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

38


Unnamed: 0,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,goldNames,predTypes,goldTypes
461,RemoveStopwordsSieve,the heart sounds,C0018820,C0577816,heart sounds,the,0250,"['finding of heart sounds', 'finding of heart sounds (finding)', 'observation of heart sounds']",['Organ or Tissue Function'],['Finding']
469,RemoveStopwordsSieve,the bowel sounds,C0232693,C0577154,bowel sounds,the,0250,"['bowel sounds - finding', 'finding of bowel sounds', 'finding of bowel sounds (finding)', 'finding of bowel sounds [dup] (finding)', 'observation of bowel sounds']",['Finding'],['Finding']
605,RemoveStopwordsSieve,the incision,C0184898,C3543005,incision,the,0286,"['surgical incision wound', 'surgical incision wound (morphologic abnormality)']",['Therapeutic or Preventive Procedure'],['Acquired Abnormality']
865,RemoveStopwordsSieve,a holter monitor,C0182920,C0430461,holter monitor,a,0390,"['24 hour ecg', '24 hour ecg (procedure)', '24 hour ecg (regime/therapy)', '24 hour electrocardiogram', '24 hour electrocardiogram (procedure)', '24 hour holter tape']",['Medical Device'],['Diagnostic Procedure']
895,RemoveStopwordsSieve,a pulmonary artery pressure,C0428642,C0199629,pulmonary artery pressure,a,0390,"['pulmonary artery pressure monitoring', 'pulmonary artery pressure monitoring (procedure)', 'pulmonary artery pressure monitoring (regime/therapy)']",['Finding'],['Diagnostic Procedure']
1109,RemoveStopwordsSieve,a exercise stress test,C0430120,C0015260,exercise stress test,a,0467,"['exercise test', 'exercise tolerance test (procedure)', 'exercise tolerance test nos', 'exercise tolerance test nos (procedure)']",['Diagnostic Procedure'],['Diagnostic Procedure']
1231,RemoveStopwordsSieve,the incisions,C0184898,C3543005,incisions,the,0477,"['surgical incision wound', 'surgical incision wound (morphologic abnormality)']",['Therapeutic or Preventive Procedure'],['Acquired Abnormality']
1252,RemoveStopwordsSieve,a foley catheter,C0179804,C1970989,foley catheter,a,0477,"['bladder catheterisation', 'bladder catheterization', 'catheterise bladder', 'catheterization of bladder', 'catheterization of bladder, nos', 'catheterization of urinary bladder', 'catheterization of urinary bladder (procedure)', 'insertion of catheter into urinary bladder']",['Medical Device'],['Therapeutic or Preventive Procedure']
1253,RemoveStopwordsSieve,his foley catheter,C0179804,C1970989,foley catheter,his,0477,"['bladder catheterisation', 'bladder catheterization', 'catheterise bladder', 'catheterization of bladder', 'catheterization of bladder, nos', 'catheterization of urinary bladder', 'catheterization of urinary bladder (procedure)', 'insertion of catheter into urinary bladder']",['Medical Device'],['Therapeutic or Preventive Procedure']
1903,RemoveStopwordsSieve,a nasogastric,C0694637,C0085678,nasogastric,a,622086964,"['nasogastric tube', 'nasogastric tube, device', 'nasogastric tube, device (physical object)']",['Functional Concept'],['Medical Device']


In [219]:
# Sieve-by-sieve breakdown of the `results` frame currently in memory
# (the re-read from disk is disabled here).
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4285,4034,251,0.94,0.94,0.94
2,AbbreviationExpansionSieve,86,81,5,0.94,0.94,0.94
3,RemoveStopwordsSieve,332,293,39,0.88,0.94,0.94
4,SynonymSieve,41,38,3,0.93,0.94,0.94
5,SuffixationSieve,136,117,19,0.86,0.94,0.94
6,PrepositionalTransformSieve,50,46,4,0.92,0.93,0.93
7,HyphenationSieve,31,28,3,0.9,0.93,0.93
8,AmbiguitySieve,42,35,7,0.83,0.93,0.93
9,Total,6614,4672,331,-,0.71,0.93


In [220]:
# Re-read the results file from disk, then recompute the sieve breakdown.
# Note: this rebinds `results` only; `analysis` is not rebuilt here.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4285,4034,251,0.94,0.94,0.94
2,AbbreviationExpansionSieve,95,90,5,0.95,0.94,0.94
3,RemoveStopwordsSieve,334,295,39,0.88,0.94,0.94
4,SynonymSieve,41,38,3,0.93,0.94,0.94
5,SuffixationSieve,136,117,19,0.86,0.94,0.94
6,PrepositionalTransformSieve,50,46,4,0.92,0.94,0.94
7,HyphenationSieve,29,26,3,0.9,0.93,0.93
8,AmbiguitySieve,42,35,7,0.83,0.93,0.93
9,Total,6614,4681,331,-,0.71,0.93


In [163]:
# getAmbiguous(train)
# Omissions: rows for which no prediction was made.
omissions = analysis[analysis.prediction.isnull()]
# Drop rows whose goldNames list contains 'Missing'.
omissions = omissions[['Missing' not in x for x in omissions.goldNames]]
# Number of omitted rows and number of distinct names among them.
print(len(omissions),len(set(omissions.name)))
# Styled output: sets a CSS width on the goldNames column.
omissions[['filename','name','goldNames']].style.set_properties(subset=['goldNames'], **{'width': '1000px'})

1373 1226


Unnamed: 0,filename,name,goldNames
0,0034,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
9,0034,r leg pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
10,0034,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
26,0034,non-insulin-requiring diabetes mellitus,"['diabetes mellitus - adult onset', 'diabetes mellitus -adult onset', 'diabetes mellitus type 2', 'diabetes mellitus type 2 (disorder)', 'diabetes mellitus type ii', 'diabetes mellitus: [adult onset] or [noninsulin dependent]', 'diabetes mellitus: [adult onset] or [noninsulin dependent] (disorder)', 'maturity onset diabetes', 'maturity onset diabetes mellitus', 'ncdmm', 'non-insulin dependent diabetes mellitus', 'non-insulin-dependent diabetes mellitus', 'noninsulin dependent diab.mell', 'type 2 diabetes mellitus', 'type ii diabetes mellitus', 'type ii diabetes mellitus (disorder)']"
42,0034,organomegaly,"['abdominal organomegaly', 'abdominal organomegaly (disorder)']"
48,0034,neurologic examination,"['assessing neurological performance', 'assessing neurological status', 'nervous sys.exam.-gener', 'nervous system examination - general', 'nervous system-general exam.', 'neurological assessment', 'neurological assessment (procedure)', 'neurological assessment (regime/therapy)', 'neurological examination', 'neurological examination (procedure)', 'neurological examination, nos']"
49,0034,straight leg raise test,"['straight leg raise', 'straight leg raise test response', 'straight leg raise test response (observable entity)']"
52,0034,psychiatric examination,"['psychiatric interview and evaluation', 'psychiatric interview and evaluation (procedure)', 'psychiatric interview and evaluation, nos']"
53,0034,lumbar tenderness,"['lumbar spine - tender', 'lumbar spine - tender (finding)']"
56,0034,thad lesion,['CUI-less']


In [218]:
# Ad-hoc lookup: result rows whose name contains 'pbi'.
# NOTE(review): str.contains returns NaN for null names, which cannot be used as
# a boolean mask - presumably `name` is never null here; confirm.
results[results.name.str.contains('pbi')]
# results[results.normalizingName.notnull() & results.normalizingName.str.contains('tincture')]
# results[results.normalizingSieveName=='AbbreviationExpansionSieve']

Unnamed: 0,filename,name,keyPhrase,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,normalizingName,normalizingSieveLevel,goldNames


In [181]:
# Ad-hoc lookup: all terminology names recorded for one CUI.
cuiName['C0026266']

['mitral incompetence',
 'mitral insufficiency',
 'mitral regurgitation',
 'mitral regurgitation, nos',
 'mitral valve incompetence',
 'mitral valve incompetence, nos',
 'mitral valve insufficiency',
 'mitral valve insufficiency, nos',
 'mitral valve regurgitation',
 'mitral valve regurgitation (disorder)',
 'mitral valve regurgitation, nos']