In [1]:
import os
import pandas as pd
# `IPython.display` is the public home of display()/HTML(); importing display
# from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Widen the notebook container to 80% of the page.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Cap the width of rendered table header cells. As the cell's last expression
# this HTML object is rendered as the cell output.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [290]:
def readTerminology(file):
    """Reads a terminology file into two dictionaries.

    Each usable line has the form ``CUI||name1|name2|...``. Lines without a
    ``||`` separator (for example blank lines) are skipped.

    Args:
        file: Path of the terminology file.

    Returns:
        A ``(cuiName, nameCui)`` tuple. ``cuiName`` maps each CUI to the list
        of names on its line (a later line for the same CUI replaces an
        earlier one). ``nameCui`` maps each name to the list of CUIs whose
        lines mention it, in file order.
    """
    cuiName = {}
    nameCui = {}
    with open(file) as f:
        # Iterate the handle lazily instead of materialising every line.
        for line in f:
            fields = line.split('||')
            if len(fields) < 2:
                # No '||' separator: nothing to map, and fields[1] would fail.
                continue
            cui = fields[0]
            names = fields[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Reads all .concept files from path into single dataframe.

    Every file in ``path`` whose name contains ``.concept`` is parsed as a
    ``||``-separated table with the columns file_id, ix, type, name and cui.

    Args:
        path: Directory holding the .concept files.

    Returns:
        DataFrame with the columns ``cui``, ``name`` and ``file`` (the name of
        the file each row came from). Each file's rows keep their own 0-based
        index. If no file matches, an empty frame with those three columns is
        returned.
    """
    frames = []
    # Sort so row order does not depend on the arbitrary order of os.listdir().
    for file in sorted(os.listdir(path)):
        if '.concept' in file:
            df = pd.read_table(
                f'{path}/{file}',
                sep=r'\|\|',      # raw string: '\|' is not a valid str escape
                header=None,
                names=['file_id', 'ix', 'type', 'name', 'cui'],
                engine='python',  # regex separators need the python engine
            )
            df['file'] = file
            frames.append(df)

    if not frames:
        # Selecting columns from a column-less frame would raise KeyError.
        return pd.DataFrame(columns=['cui', 'name', 'file'])

    # Concatenate once instead of growing a frame inside the loop.
    annotations = pd.concat(frames)
    return annotations[['cui', 'name', 'file']]

def readCuiType(cuis):
    """Reads in a dictionary mapping cuis to semantic types.

    Args:
        cuis: Collection of CUIs to keep; rows for any other CUI are dropped.

    Returns:
        Dict mapping each kept CUI found in ``umls/mrsty.txt`` to the list of
        values in its ``type`` column, in file order.

    Raises:
        FileNotFoundError: If ``umls/mrsty.txt`` does not exist.
    """
    # Load UMLS semantic type mapping file.
    # header=0 together with names= discards the file's first line as a header
    # row. NOTE(review): confirm the file written by "Load UMLS data.ipynb"
    # really starts with a header line.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
    except FileNotFoundError as err:
        # Raise a real exception: raising a plain string is itself a TypeError,
        # and a bare except would also hide unrelated failures.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err
    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two needed columns directly instead of building a Series per row.
    for cui, semanticType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semanticType)
    return cuiType

def getStats(df):
    """Gets stats for given dataframe.

    Args:
        df: DataFrame with the columns ``prediction``, ``goldCui`` and
            ``normalized``.

    Returns:
        Tuple ``(n, tp, fp, recall, precision)``: n is the row count, tp the
        rows whose prediction equals the gold CUI, fp the rows flagged as
        normalized whose prediction differs from the gold CUI. recall is tp/n
        and precision is tp/(tp+fp), each rounded to two decimals and 0 when
        its denominator is 0.
    """
    total = len(df)

    matches = df.prediction == df.goldCui
    wrongButNormalized = (df.prediction != df.goldCui) & (df.normalized==True)
    tp = sum(matches)
    fp = sum(wrongButNormalized)

    if total > 0:
        recall = round(tp/total,2)
    else:
        recall = 0

    attempted = tp + fp
    if attempted > 0:
        precision = round(tp/attempted,2)
    else:
        precision = 0

    return total, tp, fp, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    Args:
        results: DataFrame with the columns used by getStats plus
            ``normalizingSieveLevel`` and ``normalizingSieveName``.

    Returns:
        DataFrame indexed by sieve level, with one row per level from 1 to the
        highest level present and a final 'Total' row. Columns: sieve, n, tp,
        fp, sieve_acc (that level's tp/n), agg_recall and agg_precision (both
        computed over all levels up to and including the row). Levels without
        rows are labelled "Unknown". In the 'Total' row sieve_acc is '-' and
        the two agg columns hold recall and precision over all of results.
    """
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])

    # Series.max() skips NaN; an empty or all-NaN column means there are no
    # sieve levels to report.
    topLevel = results.normalizingSieveLevel.max() if len(results) > 0 else 0
    topLevel = 0 if pd.isnull(topLevel) else int(topLevel)

    # Running totals over the levels processed so far.
    seenN = seenTp = seenFp = 0

    # Results for each sieve
    for level in range(1, topLevel+1):
        df = results[results.normalizingSieveLevel==level]
        n, tp, fp, recall, _ = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        seenN += n
        seenTp += tp
        seenFp += fp
        # Guard the divisions: the first level(s) may have no rows at all.
        aggRecall = round(seenTp/seenN,2) if seenN > 0 else 0
        aggPrecision = round(seenTp/(seenTp+seenFp),2) if (seenTp+seenFp) > 0 else 0
        sieves.loc[level] = [sieve, n, tp, fp, recall, aggRecall, aggPrecision]

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    # Index from topLevel rather than the loop variable, which is undefined
    # when the loop body never runs.
    sieves.loc[topLevel+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def stratifyByCol(df, col, asc=False):
    """Stratifies the results by given column.

    Args:
        df: DataFrame with ``col`` and the columns used by getStats.
        col: Column to group by. If it holds lists, each list element gets its
            own row before grouping.
        asc: Sort the result by ascending precision when True.

    Returns:
        DataFrame with one row per distinct non-null value of ``col`` and the
        columns [col, n, tp, fp, recall, precision], sorted by precision.
    """
    # If the column is a list, explode list into individual rows.
    # The values are inspected directly: sampling 100 rows fails on frames with
    # fewer than 100 rows and makes the decision depend on a random draw.
    if df[col].map(lambda value: isinstance(value, list)).any():
        df = df.explode(col)

    rows = []
    # unique() keeps first-appearance order, so the row order fed to the sort
    # is reproducible, unlike iteration over a set.
    for key in df[col].dropna().unique():
        sub = df[df[col]==key]
        rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratifies the results by the combination of several columns.

    The values of ``cols`` are joined with '-' into a ``combined`` column,
    which is then passed to stratifyByCol. The joined values must be strings.

    Args:
        df: DataFrame with ``cols`` and the columns used by getStats.
        cols: List of column names to combine. List-valued columns are
            exploded into one row per element first.
        asc: Passed through to stratifyByCol.

    Returns:
        The DataFrame produced by stratifyByCol for the ``combined`` column.
    """
    for col in cols:
        # If the column is a list, explode list into individual rows.
        # Checked on the values themselves: sampling 100 rows fails on frames
        # with fewer than 100 rows and is not deterministic.
        if df[col].map(lambda value: isinstance(value, list)).any():
            df = df.explode(col)
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    """Find ambiguous names.

    Names and CUIs are lower-cased and stripped before comparison.

    Args:
        df: DataFrame with string columns ``name`` and ``cui``.

    Returns:
        DataFrame with the columns ``name``, ``cuis`` (sorted list of the
        distinct CUIs seen for that name) and ``ambiguous`` (always True),
        holding only the names that map to more than one CUI.
    """
    dfMap = {}
    # Read the two columns directly; a positional lookup per row builds a
    # Series each time.
    for rawName, rawCui in zip(df['name'], df['cui']):
        name = rawName.lower().strip()
        cui = rawCui.lower().strip()
        dfMap.setdefault(name, set()).add(cui)

    # Sort each CUI set so the output is identical from run to run.
    rows = [(name, sorted(cuiSet)) for name, cuiSet in dfMap.items()]
    namesToCuis = pd.DataFrame(rows, columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    # Cast so that an empty column is still treated as a boolean row mask.
    return namesToCuis[namesToCuis.ambiguous.astype(bool)]

In [392]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','namePermutations']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# goldNames and goldTypes hold lists, so comparing the column with the string
# 'Missing' is False for every row and such an assert can never fail. Compare
# each cell with the ['Missing'] placeholder list instead.
goldNamesMissing = analysis.goldNames.map(lambda names: names == ['Missing'])
goldTypesMissing = analysis.goldTypes.map(lambda types: types == ['Missing'])
# Rows labelled 'CUI-less' are left out of the type check: they have no CUI to
# look up (presumably never present in the semantic type map).
goldTypesMissing = goldTypesMissing & (analysis.goldCui != 'CUI-less')
assert not goldNamesMissing.any(), 'Gold names missing'
assert not goldTypesMissing.any(), 'Gold types missing'



Wall time: 9.69 s


In [393]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
# Only a cell's last expression is rendered automatically, so the first table
# needs an explicit display(); otherwise it is computed and thrown away.
display(stratifyByCol(normalized, 'normalizingSource'))
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
2,HyphenationSieve,10,10,0,1.0,1.0
5,PrepositionalTransformSieve,28,27,1,0.96,0.96
1,ExactMatchSieve,4265,4022,243,0.94,0.94
4,UmlsEndingSieve,65,47,18,0.72,0.72
0,AbbreviationExpansionSieve,53,37,16,0.7,0.7
3,StemmingSieve,72,32,40,0.44,0.44


In [389]:
# nameCui['difficulty']
# remove = ['normalized','normalizingSource','normalizingSieveName']#,'goldTypes','predTypes'
# errors = analysis[(analysis.prediction != analysis.goldCui) & (analysis.normalized==True)]
# errors = errors.explode('goldNames')
# errors = errors[errors.goldNames.str.contains(', nos')]
# errors[errors.name == errors.goldNames.str.replace(', nos','')]
# Rows that have a nameExpansion differing from the original name
# (surrounding whitespace ignored).
results.loc[
    results['nameExpansion'].notna()
    & results['name'].str.strip().ne(results['nameExpansion'].str.strip())
]

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
11,0034,dm,diabetes mellitus,C0011849,True,AbbreviationExpansionSieve,trainTerminology,C0011849,"dm,dm, nos,diabetes mellitus",3,"diabetes mellitus,diabetes mellitus (disorder)..."
25,0034,any changes in bowel habits,any changes interstitial nephritis bowel habits,,False,,,C0278008,"any changes in bowel habits,any changes in bow...",0,"[d]change in bowel habit,[d]change in bowel ha..."
49,0034,straight leg raise test,straight leg raise tubal embryo stage transfer,,False,,,C0422926,"straight leg raise test,straight leg raise tes...",0,"straight leg raise,straight leg raise test res..."
59,0034,esr,electron spin resonance,C0013845,True,AbbreviationExpansionSieve,standardTerminology,C1176468,"esr,esr, nos,electron spin resonance",3,"erythrocyte sedimentation rate,erythrocyte sed..."
82,0034,pt,patch test,C0030646,True,AbbreviationExpansionSieve,standardTerminology,C0949766,"pt,pt, nos,patch test",3,"physical therapy procedure,physical therapy pr..."
105,0034,ap,abdominal pain,C0000737,True,AbbreviationExpansionSieve,trainTerminology,C0442212,"ap,ap, nos,abdominal pain",3,"antero-posterior projection,antero-posterior p..."
137,0070,dolls eye,dolls 2nd degree burn,,False,,,C0034944,"dolls eye,dolls eye, nos,dolls 2nd degree burn...",0,
139,0070,gag,glycosaminoglycan screening,C0430047,True,AbbreviationExpansionSieve,standardTerminology,C1287907,"gag,gag, nos,glycosaminoglycan screening",3,"pharyngeal gag reflex finding,pharyngeal gag r..."
169,0070,pt,paroxysmal tachycardia,C0039236,True,AbbreviationExpansionSieve,standardTerminology,C0033707,"pt,pt, nos,paroxysmal tachycardia",3,"one stage prothrombin time,one stage prothromb..."
176,0070,the head ct scan,the head abnormality of left atrioventricular ...,,False,,,C0202691,"the head ct scan,the head ct scan, nos,the hea...",0,"cat scan of head,computed tomography of head,c..."


In [394]:
remove = ['normalized','normalizingSource','normalizingSieveName']#,'goldTypes','predTypes'
# Build every mask from `analysis` itself. Filtering `analysis` with a mask
# computed on `results` only works while both frames share the same index.
wrongPrediction = (analysis.prediction != analysis.goldCui) & (analysis.normalized==True)
fromStandardAbbreviation = (analysis.normalizingSource=='standardTerminology') & (analysis.normalizingSieveName=='AbbreviationExpansionSieve')
errors = analysis[wrongPrediction & fromStandardAbbreviation]
# Drop the columns that are constant after the filters above.
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

16


Unnamed: 0,name,prediction,goldCui,namePermutations,goldNames,predTypes,goldTypes
139,gag,C0430047,C1287907,"gag,gag, nos,glycosaminoglycan screening","['pharyngeal gag reflex finding', 'pharyngeal gag reflex finding (finding)']",['Laboratory Procedure'],['Finding']
726,post,C0405869,C0687676,"post,post, nos,peritoneal ovum and sperm transfer","['after values', 'after values (qualifier value)']",['Therapeutic or Preventive Procedure'],['Temporal Concept']
2241,all,C0023449,CUI-less,"all,all, nos,acute lymphoblastic leukaemia,acute lymphoblastic leukemia",['CUI-less'],['Neoplastic Process'],['Missing']
2600,ppi,C0578068,C0358591,"ppi,ppi, nos,observation of present pain intensity","['h+/k+-exchanging atpase inhibitor product', 'h+/k+-exchanging atpase inhibitor-containing product', 'product containing h+/k+-exchanging atpase inhibitor', 'product containing h+/k+-exchanging atpase inhibitor (product)', 'product containing hydrogen/potassium adenosine triphosphatase enzyme system inhibitor (product)', 'proton pump inhibitor (product)', 'proton pump inhibitor (substance)', 'proton pump inhibitor product', 'proton pump inhibitor-containing product', 'proton pump inhibitors', 'substance with h+/k+-exchanging atpase inhibitor mechanism of action', 'substance with h+/k+-exchanging atpase inhibitor mechanism of action (substance)', 'substance with hydrogen/potassium adenosine triphosphatase enzyme system inhibitor mechanism of action (substance)']",['Finding'],['Pharmacologic Substance']
2611,vs,C0234198,C0150404,"vs,vs, nos,vaginal swab,vibration sense","['observation of vital signs', 'observing patient vital signs', 'observing patient vital signs, nos', 'taking patient vital signs', 'taking patient vital signs (procedure)', 'taking patient vital signs, nos', 'vital signs, nos']",['Organism Function'],['Health Care Activity']
2649,po,C1527415,C2316867,"po,po, nos,per os,plasmodium ovale","['administration of substance via oral route', 'administration of substance via oral route (procedure)']",['Functional Concept'],['Therapeutic or Preventive Procedure']
2743,ppi,C0578068,C0358591,"ppi,ppi, nos,observation of present pain intensity","['h+/k+-exchanging atpase inhibitor product', 'h+/k+-exchanging atpase inhibitor-containing product', 'product containing h+/k+-exchanging atpase inhibitor', 'product containing h+/k+-exchanging atpase inhibitor (product)', 'product containing hydrogen/potassium adenosine triphosphatase enzyme system inhibitor (product)', 'proton pump inhibitor (product)', 'proton pump inhibitor (substance)', 'proton pump inhibitor product', 'proton pump inhibitor-containing product', 'proton pump inhibitors', 'substance with h+/k+-exchanging atpase inhibitor mechanism of action', 'substance with h+/k+-exchanging atpase inhibitor mechanism of action (substance)', 'substance with hydrogen/potassium adenosine triphosphatase enzyme system inhibitor mechanism of action (substance)']",['Finding'],['Pharmacologic Substance']
3386,avf,C0397529,C0449215,"avf,avf, nos,pulmonary arteriovenous fistula operation",['avf (body structure)'],['Therapeutic or Preventive Procedure'],['Spatial Concept']
3482,okt3,C0108779,C0085379,"okt3,okt3, nos,ortho kung t3","['muromonab-cd3', 'muromonab-cd3 (product)', 'muromonab-cd3 (substance)', 'muromonab-cd3 product', 'muromonab-cd3-containing product', 'product containing muromonab-cd3', 'product containing muromonab-cd3 (medicinal product)']","['Amino Acid, Peptide, or Protein', 'Immunologic Factor']","['Amino Acid, Peptide, or Protein', 'Pharmacologic Substance', 'Immunologic Factor']"
4065,po,C1527415,C2316867,"po,po, nos,per os,plasmodium ovale","['administration of substance via oral route', 'administration of substance via oral route (procedure)']",['Functional Concept'],['Therapeutic or Preventive Procedure']


In [390]:
# Sieve-level breakdown of whatever `results` currently holds in the kernel;
# the reload below is disabled, so the table depends on earlier cells.
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4265,4022,243,0.94,0.94,0.94
2,UmlsEndingSieve,65,47,18,0.72,0.94,0.94
3,AbbreviationExpansionSieve,90,39,51,0.43,0.93,0.93
4,PrepositionalTransformSieve,28,27,1,0.96,0.93,0.93
5,Unknown,0,0,0,0,0.93,0.93
6,HyphenationSieve,10,10,0,1,0.93,0.93
7,DiseaseTermSynonymsSieve,34,6,28,0.18,0.92,0.92
8,StemmingSieve,71,32,39,0.45,0.92,0.92
9,Total,6621,4183,380,-,0.63,0.92


In [391]:
# Re-read the tab-separated results file, then show the sieve-level breakdown.
results = pd.read_table(f'../{dataset}-data/output/results.txt')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4265,4022,243,0.94,0.94,0.94
2,UmlsEndingSieve,65,47,18,0.72,0.94,0.94
3,AbbreviationExpansionSieve,53,37,16,0.7,0.94,0.94
4,PrepositionalTransformSieve,28,27,1,0.96,0.94,0.94
5,Unknown,0,0,0,0,0.94,0.94
6,HyphenationSieve,10,10,0,1,0.94,0.94
7,StemmingSieve,72,32,40,0.44,0.93,0.93
8,Total,6621,4175,318,-,0.63,0.93


In [262]:
# Number of distinct values in the filename column of results.
len(set(results['filename']))

50

In [192]:
# getAmbiguous(train)

In [11]:
# omissions = results[results.prediction.isnull() & results.goldNames.notnull()]
# omissions[['filename','name','namePermutations','goldCui','goldNames']].head(5)