In [1]:
import os
import pandas as pd
# `display` and `HTML` are part of the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so the wide result tables fit on screen.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Cap the width of rendered table headers. As the cell's last expression this HTML
# object is rendered automatically, which is what applies the style.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [290]:
def readTerminology(file):
    """Reads a terminology file into two dictionaries.

    Each usable line has the form ``CUI||name1|name2|...``. Lines that do not
    contain the ``||`` separator (for example blank lines) are skipped instead
    of raising an IndexError.

    Args:
        file: Path of the terminology file.

    Returns:
        A ``(cuiName, nameCui)`` tuple. ``cuiName`` maps each CUI to its list of
        names (a CUI that occurs on several lines keeps the names of its last
        line). ``nameCui`` maps each name to the list of CUIs it was listed
        under, in file order.
    """
    cuiName = {}
    nameCui = {}
    with open(file) as f:
        # Iterate the file lazily rather than loading every line with readlines().
        for line in f:
            fields = line.split('||')
            if len(fields) < 2:
                # Blank or malformed line: there is no name list to index.
                continue
            cui = fields[0]
            names = fields[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Reads all .concept files from path into single dataframe.

    Every file in ``path`` whose name contains ``.concept`` is parsed as a
    ``||``-separated table with the fields file_id, ix, type, name and cui.

    Args:
        path: Directory holding the annotation files.

    Returns:
        A DataFrame with the columns ``cui``, ``name`` and ``file`` (the name of
        the file each row came from). The per-file row index is kept, so index
        labels repeat across files. If no matching file exists an empty
        DataFrame with those three columns is returned.
    """
    columns = ['cui','name','file']
    frames = []
    # Sort the listing so the row order does not depend on the file system.
    for file in sorted(os.listdir(path)):
        if '.concept' in file:
            # The separator is a regular expression (raw string, so the escapes are
            # valid), and regex separators are only supported by the python engine.
            df = pd.read_table(os.path.join(path, file), sep=r'\|\|', engine='python',
                               header=None, names=['file_id','ix','type','name','cui'])
            df['file'] = file
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)[columns]

def readCuiType(cuis):
    """Reads in a dictionary mapping cuis to semantic types.

    Args:
        cuis: Collection of CUIs to keep; rows of ``umls/mrsty.txt`` for any
            other CUI are ignored.

    Returns:
        Dict mapping each retained CUI to the list of values found in the
        ``type`` column for it, in file order.

    Raises:
        FileNotFoundError: If ``umls/mrsty.txt`` does not exist.
    """
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
    except FileNotFoundError as err:
        # Raise a real exception: `raise('...')` with a plain string is itself a TypeError.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err
    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two columns directly; iterrows() would build a Series for every row.
    for cui, semanticType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semanticType)
    return cuiType

def getStats(df):
    """Gets stats for given dataframe.

    Reads the ``prediction``, ``goldCui`` and ``normalized`` columns of ``df``.

    Returns:
        Tuple ``(n, tp, fp, recall, precision)``: the row count, the rows whose
        prediction equals the gold CUI, the normalized rows whose prediction
        differs from the gold CUI, tp/n and tp/(tp+fp). Both ratios are rounded
        to two decimals and are 0 when their denominator is 0.
    """
    total = len(df)
    isCorrect = df.prediction == df.goldCui
    isWrongButNormalized = (df.normalized==True) & (df.prediction != df.goldCui)
    hits = sum(isCorrect)
    misses = sum(isWrongButNormalized)

    if total > 0:
        recall = round(hits/total,2)
    else:
        recall = 0

    answered = hits + misses
    if answered > 0:
        precision = round(hits/answered,2)
    else:
        precision = 0

    return total, hits, misses, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    Produces one row per sieve level from 1 up to the highest value found in
    ``results.normalizingSieveLevel`` (levels without rows are labelled
    "Unknown"), followed by a "Total" row computed over all of ``results``.

    Columns: ``sieve``, ``n``, ``tp``, ``fp``, ``sieve_acc`` (tp/n of that level),
    ``agg_recall`` and ``agg_precision`` (cumulative over the levels so far).

    NOTE(review): for the per-sieve rows ``agg_recall`` is cumulative tp divided by
    the cumulative number of rows handled by the sieves so far, whereas the "Total"
    row divides by every row of ``results``. That difference is kept as in the
    original; confirm it is intended.

    Args:
        results: DataFrame with ``normalizingSieveLevel``, ``normalizingSieveName``
            and the columns read by ``getStats``.

    Returns:
        DataFrame indexed 1..maxLevel for the sieves and maxLevel+1 for the total.
    """
    columns = ['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # Series.max() ignores NaN levels; an empty or all-NaN column yields NaN, which
    # is treated as "no sieve levels" instead of breaking range().
    maxLevel = results.normalizingSieveLevel.max()
    maxLevel = 0 if pd.isnull(maxLevel) else int(maxLevel)

    rows = []
    cumN = cumTp = cumFp = 0

    # Results for each sieve
    for level in range(1, maxLevel+1):
        df = results[results.normalizingSieveLevel==level]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        cumN += n
        cumTp += tp
        cumFp += fp
        # Guard the cumulative ratios: leading empty levels would otherwise divide by zero.
        aggRecall = round(cumTp/cumN,2) if cumN > 0 else 0
        aggPrecision = round(cumTp/(cumTp+cumFp),2) if (cumTp+cumFp) > 0 else 0
        rows.append([sieve, n, tp, fp, recall, aggRecall, aggPrecision])

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    rows.append(['Total', n, tp, fp, '-', recall, precision])

    # The total is placed right after the last level, even when there are no levels.
    return pd.DataFrame(rows, columns=columns, index=range(1, maxLevel+2), dtype=object)

def stratifyByCol(df, col, asc=False):
    """Stratifies the results by given column.

    If most values of ``col`` are lists, the column is exploded first so that each
    list element gets its own row. ``getStats`` is then computed for every distinct
    non-null value of the column.

    Args:
        df: DataFrame holding ``col`` plus the columns read by ``getStats``.
        col: Name of the column to stratify by.
        asc: Sort order for the ``precision`` column (descending by default).

    Returns:
        DataFrame with the columns ``col``, n, tp, fp, recall and precision, sorted
        by precision.
    """
    # If the column is a list, explode list into individual rows.
    # The whole column is inspected: sampling a fixed 100 rows raises on smaller
    # frames and makes the decision depend on the random draw.
    valueTypes = df[col].map(type)
    if len(valueTypes) > 0 and valueTypes.value_counts().idxmax() is list:
        df = df.explode(col)

    rows = []
    # groupby leaves out null keys, so rows without a value for `col` are not
    # stratified; it also avoids rescanning the frame once per key.
    for key, sub in df.groupby(col, sort=False):
        rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratifies the results by the combination of several columns.

    List-valued columns are exploded first, then the values of ``cols`` are joined
    with '-' into a ``combined`` column that is passed to ``stratifyByCol``.

    Args:
        df: DataFrame holding ``cols`` plus the columns read by ``getStats``.
        cols: List of column names; their values must be strings (or lists of
            strings) for the join to work.
        asc: Sort order for the ``precision`` column (descending by default).

    Returns:
        The DataFrame produced by ``stratifyByCol`` for the ``combined`` column.
    """
    for col in cols:
        # If the column is a list, explode list into individual rows.
        # The whole column is inspected: sampling a fixed 100 rows raises on smaller
        # frames and makes the decision depend on the random draw.
        valueTypes = df[col].map(type)
        if len(valueTypes) > 0 and valueTypes.value_counts().idxmax() is list:
            df = df.explode(col)
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    """Find ambiguous names.

    Names and CUIs are lower-cased and stripped before comparison. A name is
    ambiguous when it occurs with more than one distinct CUI.

    Args:
        df: DataFrame with string columns ``name`` and ``cui``.

    Returns:
        DataFrame with the columns ``name``, ``cuis`` (sorted list of the distinct
        CUIs seen for the name) and ``ambiguous`` (always True in the returned
        rows), restricted to the ambiguous names.
    """
    cuisByName = {}
    # Iterate the two columns directly; df.iloc[i][...] builds a Series per access.
    for name, cui in zip(df['name'], df['cui']):
        cuisByName.setdefault(name.lower().strip(), set()).add(cui.lower().strip())

    # Sort each CUI set so the output does not depend on set iteration order.
    namesToCuis = pd.DataFrame(
        [(name, sorted(cuis)) for name, cuis in cuisByName.items()],
        columns=['name','cuis'])
    namesToCuis['ambiguous'] = namesToCuis.cuis.map(len) > 1
    return namesToCuis[namesToCuis.ambiguous]

In [513]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','namePermutations']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# The derived columns hold lists, so comparing a column to the string 'Missing' is
# never true and the check could not fail. Compare each cell to the ['Missing']
# placeholder instead.
isMissing = lambda values: values == ['Missing']
# assert not (analysis.normalized & analysis.predTypes.map(isMissing)).any(), 'Predicted CUI missing ST'
assert not analysis.goldNames.map(isMissing).any(), 'Gold names missing'
# 'CUI-less' gold annotations have no CUI to look up in the semantic type map, so
# they are excluded here (as the previously disabled version of this check did).
assert not ((analysis.goldCui != 'CUI-less') & analysis.goldTypes.map(isMissing)).any(), 'Gold types missing'



Wall time: 8.56 s


In [510]:
# df = stratifyByCol(analysis, 'predTypes', asc=True)
# for p in df[df.precision<.8].predTypes:
#     print(f"'{p}',")

In [514]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
# Only the last expression of a cell is rendered automatically, so the first table
# has to be displayed explicitly or its result is computed and thrown away.
display(stratifyByCol(normalized, 'normalizingSource'))
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
0,ExactMatchSieve,4202,3973,229,0.95,0.95
6,PrepositionalTransformSieve,39,36,3,0.92,0.92
2,HyphenationSieve,17,15,2,0.88,0.88
4,RemoveStopwordsSieve,276,244,32,0.88,0.88
5,AmbiguitySieve,36,31,5,0.86,0.86
3,AbbreviationExpansionSieve,73,61,12,0.84,0.84
1,UmlsEndingSieve,43,32,11,0.74,0.74


In [515]:
# Inspect the wrong predictions made by one sieve / terminology source combination.
remove = ['normalized','normalizingSource','normalizingSieveName']#,'goldTypes','predTypes'
errors = (
    analysis
    # Normalized mentions whose predicted CUI differs from the gold CUI.
    .loc[(results.normalized==True) & (results.prediction != results.goldCui)]
    # Keep only the sieve and source under investigation.
    .loc[lambda d: (d.normalizingSieveName=='HyphenationSieve') & (d.normalizingSource=='standardTerminology')]
    # The filter columns are constant at this point, so hide them from the view.
    .drop(columns=remove, errors='ignore')
)
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

2


Unnamed: 0,name,prediction,goldCui,namePermutations,goldNames,predTypes,goldTypes
3273,the heel-shin-test,C0575094,C1288236,"the heel-shin-test,heel-shin-test,the heel-shin-test, nos,heel-shin-test, nos,heel-shin-test in the,heel-shin-test with the,heel-shin-test on the,heel-shin-test of the,in heel-shin-test,heel-shin-test in,with heel-shin-test,heel-shin-test with,on heel-shin-test,heel-shin-test on,of heel-shin-test,heel-shin-test of,heel-shin-test, nos in the,nos in the heel-shin-test,,heel-shin-test, nos with the,nos with the heel-shin-test,,heel-shin-test, nos on the,nos on the heel-shin-test,,heel-shin-test, nos of the,nos of the heel-shin-test,,nos in heel-shin-test,,nos with heel-shin-test,,nos on heel-shin-test,,nos of heel-shin-test,,the-heel-shin-test,the heel shin-test,the heel-shin test,heel shin-test,heel-shin test,the-heel-shin-test, nos,the heel-shin-test,-nos,the heel shin-test, nos,the heel-shin test, nos,heel-shin-test,-nos,heel shin-test, nos,heel-shin test, nos,heel-shin-test-in the,heel-shin-test in-the,heel shin-test in the,heel-shin test in the,heel-shin-test-with the,heel-shin-test with-the,heel shin-test with the,heel-shin test with the,heel-shin-test-on the,heel-shin-test on-the,heel shin-test on the,heel-shin test on the,heel-shin-test-of the,heel-shin-test of-the,heel shin-test of the,heel-shin test of the,in-heel-shin-test,in heel shin-test,in heel-shin test,heel-shin-test-in,heel shin-test in,heel-shin test in,with-heel-shin-test,with heel shin-test,with heel-shin test,heel-shin-test-with,heel shin-test with,heel-shin test with,on-heel-shin-test,on heel shin-test,on heel-shin test,heel-shin-test-on,heel shin-test on,heel-shin test on,of-heel-shin-test,of heel shin-test,of heel-shin test,heel-shin-test-of,heel shin-test of,heel-shin test of,heel-shin-test,-nos in the,heel-shin-test, nos-in the,heel-shin-test, nos in-the,heel shin-test, nos in the,heel-shin test, nos in the,nos-in the heel-shin-test,,nos in-the heel-shin-test,,nos in the-heel-shin-test,,nos in the heel shin-test,,nos in the heel-shin 
test,,heel-shin-test,-nos with the,heel-shin-test, nos-with the,heel-shin-test, nos with-the,heel shin-test, nos with the,heel-shin test, nos with the,nos-with the heel-shin-test,,nos with-the heel-shin-test,,nos with the-heel-shin-test,,nos with the heel shin-test,,nos with the heel-shin test,,heel-shin-test,-nos on the,heel-shin-test, nos-on the,heel-shin-test, nos on-the,heel shin-test, nos on the,heel-shin test, nos on the,nos-on the heel-shin-test,,nos on-the heel-shin-test,,nos on the-heel-shin-test,,nos on the heel shin-test,,nos on the heel-shin test,,heel-shin-test,-nos of the,heel-shin-test, nos-of the,heel-shin-test, nos of-the,heel shin-test, nos of the,heel-shin test, nos of the,nos-of the heel-shin-test,,nos of-the heel-shin-test,,nos of the-heel-shin-test,,nos of the heel shin-test,,nos of the heel-shin test,,nos-in heel-shin-test,,nos in-heel-shin-test,,nos in heel shin-test,,nos in heel-shin test,,nos-with heel-shin-test,,nos with-heel-shin-test,,nos with heel shin-test,,nos with heel-shin test,,nos-on heel-shin-test,,nos on-heel-shin-test,,nos on heel shin-test,,nos on heel-shin test,,nos-of heel-shin-test,,nos of-heel-shin-test,,nos of heel shin-test,,nos of heel-shin test,","['heel-shin test finding', 'heel-shin test finding (finding)']",['Finding'],['Finding']
6156,fsh,C0733758,C0202022,"fsh,fsh, nos,follicle-stimulating hormone,in fsh,fsh in,with fsh,fsh with,on fsh,fsh on,of fsh,fsh of,nos in fsh,,nos with fsh,,nos on fsh,,nos of fsh,,hormone in follicle-stimulating,hormone with follicle-stimulating,hormone on follicle-stimulating,hormone of follicle-stimulating,follicle-stimulating horm1,horm1 in follicle-stimulating,horm1 with follicle-stimulating,horm1 on follicle-stimulating,horm1 of follicle-stimulating,fsh,-nos,follicle-stimulating-hormone,follicle stimulating hormone,in-fsh,fsh-in,with-fsh,fsh-with,on-fsh,fsh-on,of-fsh,fsh-of,nos-in fsh,,nos in-fsh,,nos-with fsh,,nos with-fsh,,nos-on fsh,,nos on-fsh,,nos-of fsh,,nos of-fsh,,hormone-in follicle-stimulating,hormone in-follicle-stimulating,hormone in follicle stimulating,hormone-with follicle-stimulating,hormone with-follicle-stimulating,hormone with follicle stimulating,hormone-on follicle-stimulating,hormone on-follicle-stimulating,hormone on follicle stimulating,hormone-of follicle-stimulating,hormone of-follicle-stimulating,hormone of follicle stimulating,follicle-stimulating-horm1,follicle stimulating horm1,horm1-in follicle-stimulating,horm1 in-follicle-stimulating,horm1 in follicle stimulating,horm1-with follicle-stimulating,horm1 with-follicle-stimulating,horm1 with follicle stimulating,horm1-on follicle-stimulating,horm1 on-follicle-stimulating,horm1 on follicle stimulating,horm1-of follicle-stimulating,horm1 of-follicle-stimulating,horm1 of follicle stimulating","['follicle stimulating hormone level', 'follicle stimulating hormone measurement', 'follicle stimulating hormone measurement (procedure)', 'fsh measurement']","['Amino Acid, Peptide, or Protein', 'Pharmacologic Substance', 'Hormone']",['Laboratory Procedure']


In [511]:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
# Per-sieve breakdown of whatever `results` currently holds; the reload above is commented out.
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4271,4026,245,0.94,0.94,0.94
2,RemoveStopwordsSieve,282,247,35,0.88,0.94,0.94
3,UmlsEndingSieve,75,56,19,0.75,0.94,0.94
4,AbbreviationExpansionSieve,81,62,19,0.77,0.93,0.93
5,PrepositionalTransformSieve,44,40,4,0.91,0.93,0.93
6,Unknown,0,0,0,0,0.93,0.93
7,HyphenationSieve,17,15,2,0.88,0.93,0.93
8,Unknown,0,0,0,0,0.93,0.93
9,AmbiguitySieve,36,31,5,0.86,0.93,0.93
10,Total,6619,4477,329,-,0.68,0.93


In [516]:
# Re-read the results file before computing the per-sieve breakdown, so the table
# reflects the file contents at the time this cell runs.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4202,3973,229,0.95,0.95,0.95
2,RemoveStopwordsSieve,276,244,32,0.88,0.94,0.94
3,UmlsEndingSieve,43,32,11,0.74,0.94,0.94
4,AbbreviationExpansionSieve,73,61,12,0.84,0.94,0.94
5,PrepositionalTransformSieve,39,36,3,0.92,0.94,0.94
6,Unknown,0,0,0,0,0.94,0.94
7,HyphenationSieve,17,15,2,0.88,0.94,0.94
8,Unknown,0,0,0,0,0.94,0.94
9,AmbiguitySieve,36,31,5,0.86,0.94,0.94
10,Total,6619,4392,294,-,0.66,0.94


In [262]:
# Number of distinct values in the `filename` column of results.
len(set(results.filename))

50

In [192]:
# getAmbiguous(train)

In [11]:
# omissions = results[results.prediction.isnull() & results.goldNames.notnull()]
# omissions[['filename','name','namePermutations','goldCui','goldNames']].head(5)