In [1]:
import os
import pandas as pd
# `display` and `HTML` belong to the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container so wide result tables fit on screen.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Cap the width of table header cells; this is the cell's last expression, so
# the returned HTML object is rendered as the cell output.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [290]:
def readTerminology(file):
    """Reads a terminology file into two dictionaries.

    Every non-blank line is split on ``||``; the first field is used as the
    CUI and the second field is split on ``|`` into that CUI's names.

    Args:
        file: Path of the terminology file to read.

    Returns:
        A ``(cuiName, nameCui)`` tuple. ``cuiName`` maps each CUI to its list
        of names; ``nameCui`` maps each name to the list of CUIs whose line
        contained it.
    """
    cuiName = {}
    nameCui = {}
    with open(file) as f:
        # Iterate the handle directly instead of loading every line up front.
        for line in f:
            # A blank line has no '||' field, so indexing the second field
            # would raise IndexError; skip it.
            if not line.strip():
                continue
            fields = line.split('||')
            cui = fields[0]
            names = fields[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Reads all .concept files from path into single dataframe.

    Each file is parsed as ``||``-separated rows with the fields
    ``file_id, ix, type, name, cui`` and tagged with the name of the file it
    came from.

    Args:
        path: Directory that is scanned (non-recursively) for file names
            containing ``.concept``.

    Returns:
        A DataFrame with the columns ``cui``, ``name`` and ``file``. Each
        file's rows keep their own row index, so index labels repeat across
        files. An empty frame with those three columns is returned when the
        directory holds no matching file.
    """
    fields = ['file_id', 'ix', 'type', 'name', 'cui']
    frames = []
    # Sort the listing so the row order does not depend on the file system.
    for file in sorted(os.listdir(path)):
        if '.concept' in file:
            # The separator is a regular expression, which only the python
            # engine supports; a raw string keeps the backslashes literal.
            df = pd.read_table(os.path.join(path, file), sep=r'\|\|', header=None,
                               names=fields, engine='python')
            df['file'] = file
            frames.append(df)

    if not frames:
        # Selecting the output columns from an empty frame would raise KeyError.
        return pd.DataFrame(columns=['cui', 'name', 'file'])

    # Concatenate once rather than re-copying the accumulated frame per file.
    annotations = pd.concat(frames)
    return annotations[['cui', 'name', 'file']]

def readCuiType(cuis):
    """Reads in a dictionary mapping cuis to semantic types.

    Loads ``umls/mrsty.txt`` (pipe-separated, resolved relative to the current
    working directory), keeps the rows whose CUI is in ``cuis`` and groups the
    ``type`` values by CUI.

    Args:
        cuis: Collection of CUIs to keep.

    Returns:
        A dict mapping each CUI found in the file to the list of its ``type``
        values, in file order.

    Raises:
        FileNotFoundError: If ``umls/mrsty.txt`` does not exist.
    """
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
    except FileNotFoundError as err:
        # Raising a plain string is itself a TypeError, so wrap the hint in a
        # real exception. Only the missing-file case is translated; any other
        # failure propagates unchanged.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err

    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two needed columns directly instead of building a Series per row.
    for cui, semanticType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semanticType)
    return cuiType

def getStats(df):
    """Computes count, hit/miss totals, recall and precision for a results frame.

    Reads the ``prediction``, ``goldCui`` and ``normalized`` columns.

    Returns:
        ``(n, tp, fp, recall, precision)`` where ``n`` is the row count, ``tp``
        counts rows whose prediction equals the gold CUI, ``fp`` counts
        normalized rows whose prediction differs from the gold CUI, ``recall``
        is ``tp / n`` and ``precision`` is ``tp / (tp + fp)``, both rounded to
        two decimals and 0 when their denominator is 0.
    """
    total = len(df)

    hits = df.prediction == df.goldCui
    misses = (df.prediction != df.goldCui) & (df.normalized == True)
    tp = hits.sum()
    fp = misses.sum()

    if total > 0:
        recall = round(tp / total, 2)
    else:
        recall = 0

    answered = tp + fp
    if answered > 0:
        precision = round(tp / answered, 2)
    else:
        precision = 0

    return total, tp, fp, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    Produces one row per sieve level from 1 up to the highest value of
    ``normalizingSieveLevel``, followed by a final 'Total' row computed over
    the whole frame.

    Columns:
        sieve: Name taken from ``normalizingSieveName`` of the level's first
            row, or "Unknown" when the level has no rows.
        n, tp, fp: Counts returned by ``getStats`` for that level.
        sieve_acc: ``tp / n`` for that level alone ('-' on the Total row).
        agg_recall, agg_precision: Ratios over all levels up to and including
            this one; on the Total row, the recall and precision of the whole
            frame.

    Args:
        results: DataFrame with ``normalizingSieveLevel`` and
            ``normalizingSieveName`` plus the columns ``getStats`` reads.

    Returns:
        A DataFrame indexed 1..max_level+1 with the columns listed above.
    """
    columns = ['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # Ignore missing levels and coerce to int so range() also works when the
    # column is float-typed; an empty frame simply yields no sieve rows.
    knownLevels = results.normalizingSieveLevel.dropna()
    maxLevel = int(knownLevels.max()) if len(knownLevels) > 0 else 0

    rows = []
    cumN = cumTp = cumFp = 0

    # Results for each sieve
    for level in range(1, maxLevel + 1):
        df = results[results.normalizingSieveLevel==level]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        cumN += n
        cumTp += tp
        cumFp += fp
        # Guard the running ratios so empty leading sieves do not divide by zero.
        aggRecall = round(cumTp/cumN,2) if cumN > 0 else 0
        aggPrecision = round(cumTp/(cumTp+cumFp),2) if (cumTp+cumFp) > 0 else 0
        rows.append([sieve, n, tp, fp, recall, aggRecall, aggPrecision])

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    rows.append(['Total', n, tp, fp, '-', recall, precision])

    # Label the Total row from maxLevel rather than the loop variable, which
    # is undefined when there are no sieve levels.
    return pd.DataFrame(rows, columns=columns, index=range(1, maxLevel + 2))

def stratifyByCol(df, col, asc=False):
    """Stratifies the results by given column.

    Args:
        df: Results frame; must contain ``col`` and the columns ``getStats``
            reads.
        col: Column to group by. If it holds lists, each list element is
            treated as its own group (the frame is exploded on ``col``).
        asc: Sort direction for the ``precision`` column.

    Returns:
        A DataFrame with one row per distinct non-null value of ``col`` and
        the columns ``[col, 'n', 'tp', 'fp', 'recall', 'precision']``, sorted
        by precision.
    """
    # If the column is a list, explode list into individual rows.
    # Inspect only the requested column, and all of it: drawing a fixed-size
    # random sample fails on frames smaller than the sample and makes the
    # decision non-deterministic.
    if df[col].map(lambda value: isinstance(value, list)).any():
        df = df.explode(col)

    rows = []
    # unique() keeps first-appearance order, so the rows are built in a
    # reproducible order before sorting.
    for key in df[col].dropna().unique():
        sub = df[df[col]==key]
        rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratifies the results by the combination of several columns.

    The values of ``cols`` are joined with '-' into a temporary ``combined``
    column, which is then passed to ``stratifyByCol``.

    Args:
        df: Results frame containing every column in ``cols``.
        cols: List of column names to combine. List-valued columns are
            exploded first, so each element forms its own combination. The
            values are joined with ``str.join`` and must therefore be strings.
        asc: Sort direction forwarded to ``stratifyByCol``.

    Returns:
        The frame returned by ``stratifyByCol`` for the ``combined`` column.
    """
    for col in cols:
        # If the column is a list, explode list into individual rows.
        # Check the column itself rather than a fixed-size random sample,
        # which fails on small frames and is non-deterministic.
        if df[col].map(lambda value: isinstance(value, list)).any():
            df = df.explode(col)
    combined = df[cols].agg('-'.join, axis=1)
    return stratifyByCol(df.assign(combined=combined), 'combined',asc=asc)

def getAmbiguous(df):
    """Finds names that are annotated with more than one CUI.

    Names and CUIs are lower-cased and stripped before comparison.

    Args:
        df: DataFrame with string columns ``name`` and ``cui``.

    Returns:
        A DataFrame with the columns ``name``, ``cuis`` (list of the distinct
        CUIs seen for that name) and ``ambiguous`` (always True here),
        restricted to names that map to more than one CUI.
    """
    cuisByName = {}
    for rawName, rawCui in zip(df['name'], df['cui']):
        key = rawName.lower().strip()
        code = rawCui.lower().strip()

        seen = cuisByName.get(key, [])
        # Rebuild through a set so each CUI is recorded once per name.
        cuisByName[key] = list(set([code] + seen))

    table = pd.DataFrame(cuisByName.items(), columns=['name', 'cuis'])
    table['ambiguous'] = [len(codes) > 1 for codes in table['cuis']]
    return table[table['ambiguous']]

In [444]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','namePermutations']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'



Wall time: 8.45 s


In [427]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
stratifyByCol(normalized, 'normalizingSource')
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
0,ExactMatchSieve,4265,4022,243,0.94,0.94
5,PrepositionalTransformSieve,29,27,2,0.93,0.93
2,HyphenationSieve,13,12,1,0.92,0.92
4,AbbreviationExpansionSieve,82,60,22,0.73,0.73
1,UmlsEndingSieve,65,47,18,0.72,0.72
3,StemmingSieve,74,32,42,0.43,0.43


In [446]:
# Show every result row whose `name` value is exactly 'po'.
df = results.loc[results['name'] == 'po']
df

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
2649,0214,po,,C1527415,True,AbbreviationExpansionSieve,standardTerminology,C2316867,"po,po, nos,orally",3,"administration of substance via oral route,adm..."
4065,101407944_PUMC,po,,C1527415,True,AbbreviationExpansionSieve,standardTerminology,C2316867,"po,po, nos,orally",3,"administration of substance via oral route,adm..."
4067,101407944_PUMC,po,,C1527415,True,AbbreviationExpansionSieve,standardTerminology,C2316867,"po,po, nos,orally",3,"administration of substance via oral route,adm..."


In [440]:
# Columns that are constant after the filters below and therefore hidden.
remove = ['normalized','normalizingSource','normalizingSieveName']#,'goldTypes','predTypes'
# Build the masks from `analysis` itself: it carries the same prediction,
# goldCui and normalized columns, and a mask taken from `results` only lines
# up while `results` has not been reloaded or modified since `analysis` was
# derived from it.
isError = (analysis.prediction != analysis.goldCui) & (analysis.normalized==True)
fromSieve = (analysis.normalizingSource=='standardTerminology') & (analysis.normalizingSieveName=='AbbreviationExpansionSieve')
errors = analysis[isError & fromSieve]
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

20


Unnamed: 0,name,prediction,goldCui,namePermutations,goldNames,predTypes,goldTypes
105,ap,C1550742,C0442212,"ap,ap, nos,before dinner","['antero-posterior projection', 'antero-posterior projection (qualifier value)', 'anteroposterior projection', 'anteroposterior projection (qualifier value)', 'ap projection']",['Temporal Concept'],['Spatial Concept']
731,f,C0086287,C0449215,"f,f, nos,and e fluid and electrolyte,fahrenheit,female",['avf (body structure)'],['Organism Attribute'],['Spatial Concept']
735,f,C0086287,C0449215,"f,f, nos,and e fluid and electrolyte,fahrenheit,female",['avf (body structure)'],['Organism Attribute'],['Spatial Concept']
1603,i,C0021968,C0449212,"i,i, nos,iodine","['lead i', 'lead i (body structure)']","['Pharmacologic Substance', 'Biologically Active Substance', 'Element, Ion, or Isotope']",['Spatial Concept']
2241,all,C0023449,CUI-less,"all,all, nos,acute lymphocytic leukemia",['CUI-less'],['Neoplastic Process'],['Missing']
2623,cr,C0034936,C0201975,"cr,cr, nos,conditioned reflex,controlled release,crown-rump length","['creatinine measurement', 'creatinine measurement (procedure)', 'creatinine measurement, nos']",['Mental Process'],['Laboratory Procedure']
2649,po,C1527415,C2316867,"po,po, nos,orally","['administration of substance via oral route', 'administration of substance via oral route (procedure)']",['Functional Concept'],['Therapeutic or Preventive Procedure']
2838,c,C0332287,C0439744,"c,c, nos,calorie (kilocalorie),calorie (small calorie),celsius,with","['concentric', 'concentric (qualifier value)']",['Functional Concept'],['Spatial Concept']
2912,a biopsy,C1533601,C0005558,"a biopsy,a biopsy, nos,accommodation biopsy,acetum biopsy,anode biopsy,anterior biopsy,artery biopsy,before biopsy","['biopsy', 'biopsy (procedure)', 'biopsy - action', 'biopsy - action (qualifier value)', 'biopsy, nos', 'bx - biopsy']",['Diagnostic Procedure'],['Diagnostic Procedure']
3003,amp,C0002688,C1706498,"amp,amp, nos,ampule,amputation","['ampoule - unit of product usage', 'ampoules', 'ampule (qualifier value)', 'ampule - unit of product usage', 'ampule - unit of product usage (qualifier value)', 'ampules']",['Therapeutic or Preventive Procedure'],['Quantitative Concept']


In [441]:
# Sieve-level breakdown of the `results` frame currently in memory.
# Uncomment the next line to reload the results file from disk first.
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4265,4022,243,0.94,0.94,0.94
2,UmlsEndingSieve,65,47,18,0.72,0.94,0.94
3,AbbreviationExpansionSieve,82,60,22,0.73,0.94,0.94
4,PrepositionalTransformSieve,29,27,2,0.93,0.94,0.94
5,Unknown,0,0,0,0,0.94,0.94
6,HyphenationSieve,13,12,1,0.92,0.94,0.94
7,StemmingSieve,74,32,42,0.43,0.93,0.93
8,Total,6621,4200,328,-,0.63,0.93


In [442]:
# Reload the results file from disk and recompute the sieve-level breakdown.
# NOTE(review): this rebinds `results`; frames previously derived from it
# (e.g. `analysis`) are not rebuilt by this cell.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4250,4016,234,0.94,0.94,0.94
2,UmlsEndingSieve,65,47,18,0.72,0.94,0.94
3,AbbreviationExpansionSieve,78,60,18,0.77,0.94,0.94
4,PrepositionalTransformSieve,29,27,2,0.93,0.94,0.94
5,Unknown,0,0,0,0,0.94,0.94
6,HyphenationSieve,13,12,1,0.92,0.94,0.94
7,StemmingSieve,74,32,42,0.43,0.93,0.93
8,Total,6621,4194,315,-,0.63,0.93


In [262]:
# Number of distinct values in the `filename` column of the results.
len(set(results.filename))

50

In [192]:
# getAmbiguous(train)

In [11]:
# omissions = results[results.prediction.isnull() & results.goldNames.notnull()]
# omissions[['filename','name','namePermutations','goldCui','goldNames']].head(5)