In [1]:
import os
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [2]:
def readTerminology(file):
    "Reads a terminology file into two dictionaries."
    cuiName = {}
    nameCui = {}
    with open(file, encoding="ISO-8859-1") as f:
        for m in f.readlines():
            s = m.split('||')
            cui = s[0]
            names = s[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                if name not in nameCui:
                    nameCui[name] = []
                nameCui[name].append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    "Reads all .concept files from path into single dataframe."
    annotations = pd.DataFrame([])
    for file in os.listdir(path):
        if '.concept' in file:
            df = pd.read_table(f'{path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','cui'])
            df['file'] = [file for x in range(len(df))]
            annotations = pd.concat([annotations, df])
    annotations = annotations[['cui','name','file']]
    return annotations

def readCuiType(cuis):
    "Reads in a dictionary mapping cuis to semantic types."
    
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
        mrsty = mrsty[mrsty.cui.isin(cuis)]
    except:
        raise('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()')
    
    cuiType = {}
    for x in mrsty.iterrows():
        if x[1].cui not in cuiType:
            cuiType[x[1].cui] = []
        cuiType[x[1].cui].append(x[1].type)
    return cuiType

def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision

def sieveResults(results):
    "Returns a sieve-level analysis of results."
    levels = range(1,max(results.normalizingSieveLevel)+1)
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])
    
    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        sieves.loc[i] = [sieve, n, tp, fp, recall, 0, 0]
        sieves.loc[i,'agg_recall'] = round(sum(sieves.tp)/sum(sieves.n),2)
        sieves.loc[i,'agg_precision'] = round(sum(sieves.tp)/(sum(sieves.tp)+sum(sieves.fp)),2)
    
    # Total results
    n, tp, fp, recall, precision = getStats(results)
    sieves.loc[i+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def stratifyByCol(df, col, asc=False):
    "Stratifies the results by given column."    
    # If the column is a list, explode list into individual rows
    if (df.sample(10).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
        df = df.explode(col)
        
    rows = []
    for key in set(df[col]):
        if not pd.isnull(key):
            sub = df[df[col]==key]
            rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    for col in cols:
        # If the column is a list, explode list into individual rows
        if (df.sample(10).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
            df = df.explode(col)            
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    "Find ambiguous names"
    dfMap = {}
    for i in range(len(df)):
        name = df.iloc[i]['name'].lower().strip()
        cui = df.iloc[i]['cui'].lower().strip()

        if name not in dfMap:
            dfMap[name] = []

        dfMap[name] = list(set([cui] + dfMap[name]))

    namesToCuis = pd.DataFrame(dfMap.items(),columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    return namesToCuis[namesToCuis.ambiguous]

In [513]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
# train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','normalizingName','keyPhrase','filename','snippet']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
# assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
# assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'

Wall time: 8.4 s


In [514]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
stratifyByCol(normalized, 'normalizingSource')
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
5,HyphenationSieve,20,20,0,1.0,1.0
0,AbbreviationExpansionSieve,92,90,2,0.98,0.98
2,MeasurementSieve,185,181,4,0.98,0.98
4,SynonymSieve,65,64,1,0.98,0.98
3,SuffixationSieve,32,30,2,0.94,0.94
6,ExactMatchSieve,4137,3888,249,0.94,0.94
1,PrepositionalTransformSieve,49,45,4,0.92,0.92
7,RemoveStopwordsSieve,335,297,38,0.89,0.89


In [515]:
stratifyByCol(normalized[normalized.normalizingSieveName=='RemoveStopwordsSieve'], 'keyPhrase')

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
3,'s,3,3,0,1.0,1.0
6,patient,1,1,0,1.0,1.0
8,any,8,8,0,1.0,1.0
4,&apos;s,20,19,1,0.95,0.95
1,an,34,31,3,0.91,0.91
2,her,33,30,3,0.91,0.91
10,the,86,77,9,0.9,0.9
7,this,8,7,1,0.88,0.88
9,his,55,48,7,0.87,0.87
0,a,82,69,13,0.84,0.84


In [516]:
remove = ['normalized','normalizingSource']#,'normalizingSieveName','goldTypes','predTypes'
errors = analysis[(results.prediction != results.goldCui) & (results.normalized==True)]
# errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='RemoveStopwordsSieve')]
errors = errors[errors.normalizingSieveName=='ExactMatchSieve']
# errors = errors[errors.filename=='332803550']
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

249


Unnamed: 0,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes
6,ExactMatchSieve,rr,C1443397,CUI-less,,,0034,< 6 fl,['CUI-less'],['Health Care Activity'],['Missing']
60,ExactMatchSieve,cpr,C0007203,C0201657,,,0034,was mildl,"['c-reactive protein level', 'c-reactive protein measurement', 'c-reactive protein measurement (procedure)']",['Therapeutic or Preventive Procedure'],['Laboratory Procedure']
77,ExactMatchSieve,sedation,C0344106,C0235195,,,0034,rr < 6,"['[d]sedation', '[d]sedation (context-dependent category)', '[d]sedation (situation)', 'sedated', 'sedated (finding)', 'sedated state', 'under sedation']",['Therapeutic or Preventive Procedure'],['Finding']
83,ExactMatchSieve,hip x-ray,C0203262,C2959422,,,0034,brief res,"['radiographic imaging of bone of hip', 'x-ray of bone of hip', 'x-ray of bone of hip (procedure)']",['Diagnostic Procedure'],['Diagnostic Procedure']
99,ExactMatchSieve,enhancement,C1627358,C0443285,,,0034,. it was,"['radiolucent', 'radiolucent (qualifier value)']",['Therapeutic or Preventive Procedure'],['Qualitative Concept']
286,ExactMatchSieve,nebulizer,C0027524,C2919541,,,0094,is given,"['administration of medication using nebuliser mask', 'administration of medication using nebulizer mask', 'administration of medication using nebulizer mask (procedure)', 'nebuliser therapy using mask', 'nebulizer therapy using mask']",['Medical Device'],['Therapeutic or Preventive Procedure']
342,ExactMatchSieve,stabbing,C1455792,C0278145,,,0094,left-side,"['knifelike pain', 'stabbing pain', 'stabbing pain (finding)']",['Qualitative Concept'],['Sign or Symptom']
352,ExactMatchSieve,stabbing,C1455792,C0278145,,,0094,", sharp p","['knifelike pain', 'stabbing pain', 'stabbing pain (finding)']",['Qualitative Concept'],['Sign or Symptom']
367,ExactMatchSieve,hydrocodone,C0020264,C0717367,,,0094,5 mg with,"['acetaminophen / hydrocodone', 'acetaminophen and hydrocodone product', 'acetaminophen- and hydrocodone-containing product', 'hydrocodone and paracetamol product', 'hydrocodone- and paracetamol-containing product', 'product containing hydrocodone and paracetamol', 'product containing hydrocodone and paracetamol (medicinal product)']","['Organic Chemical', 'Pharmacologic Substance']",['Pharmacologic Substance']
412,ExactMatchSieve,laboratory studies,C0681827,C0022885,,,0174,which sho,"['general laboratory procedure', 'general laboratory procedure (procedure)', 'general laboratory procedure -retired-', 'general laboratory procedure, nos', 'investig.- lab.,general', 'investigation - lab.,general', 'lab. test - general', 'laboratory procedure', 'laboratory procedure (procedure)', 'laboratory procedure - general - nos', 'laboratory procedure - general - nos (context-dependent category)', 'laboratory procedure - general - nos (procedure)', 'laboratory procedure - general - nos (situation)', 'laboratory procedure nos', 'laboratory procedure nos (procedure)', 'laboratory procedures', 'laboratory procedures (procedure)', 'laboratory procedures -general', 'laboratory procedures -general (context-dependent category)', 'laboratory procedures -general (situation)', 'laboratory test', 'laboratory test (procedure)', 'laboratory test, nos', 'procedure, lab.-general', 'test, lab. - general']",['Laboratory Procedure'],['Laboratory Procedure']


In [510]:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,185,181,4,0.98,0.98,0.98
2,ExactMatchSieve,4137,3888,249,0.94,0.94,0.94
3,AbbreviationExpansionSieve,92,90,2,0.98,0.94,0.94
4,RemoveStopwordsSieve,335,297,38,0.89,0.94,0.94
5,SynonymSieve,65,64,1,0.98,0.94,0.94
6,SuffixationSieve,32,30,2,0.94,0.94,0.94
7,PrepositionalTransformSieve,49,45,4,0.92,0.94,0.94
8,HyphenationSieve,20,20,0,1,0.94,0.94
9,Total,6617,4615,300,-,0.7,0.94


In [511]:
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,185,181,4,0.98,0.98,0.98
2,ExactMatchSieve,4298,3908,390,0.91,0.91,0.91
3,AbbreviationExpansionSieve,92,88,4,0.96,0.91,0.91
4,RemoveStopwordsSieve,337,288,49,0.85,0.91,0.91
5,SynonymSieve,65,64,1,0.98,0.91,0.91
6,SuffixationSieve,33,30,3,0.91,0.91,0.91
7,PrepositionalTransformSieve,49,45,4,0.92,0.91,0.91
8,HyphenationSieve,20,20,0,1,0.91,0.91
9,Total,6619,4624,455,-,0.7,0.91


In [512]:
print('Exact match\t',455/4624)
print('Exact match\t',300/4615)
print('Abbreviation\t',290/4591)
print('Synonym  \t',296/4542)
print('Stopwords\t',296/4598)

Exact match	 0.09839965397923875
Exact match	 0.06500541711809317
Abbreviation	 0.0631670659986931
Synonym  	 0.06516952884191986
Stopwords	 0.06437581557198782


In [447]:
# getAmbiguous(train)
omissions = analysis[analysis.prediction.isnull()]
omissions = omissions[['Missing' not in x for x in omissions.goldNames]]
print(len(omissions),len(set(omissions.name)))
omissions[['filename','name','goldNames']].style.set_properties(subset=['goldNames'], **{'width': '1000px'})

1436 1286


Unnamed: 0,filename,name,goldNames
0,0034,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
9,0034,r leg pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
10,0034,right le pain,"['pain in right leg', 'pain in right leg (finding)', 'pain in right lower limb', 'pain in right lower limb (finding)']"
25,0034,any changes in bowel habits,"['[d]change in bowel habit', '[d]change in bowel habit (context-dependent category)', '[d]change in bowel habit (situation)', 'abnormal bowel habits', 'altered bowel function', 'altered bowel function (finding)', 'altered bowel habit', 'altered bowel habits', 'change in bowel habit', 'change in bowel habit (context-dependent category)', 'change in bowel habit (situation)', 'change in bowel pattern']"
26,0034,non-insulin-requiring diabetes mellitus,"['diabetes mellitus - adult onset', 'diabetes mellitus -adult onset', 'diabetes mellitus type 2', 'diabetes mellitus type 2 (disorder)', 'diabetes mellitus type ii', 'diabetes mellitus: [adult onset] or [noninsulin dependent]', 'diabetes mellitus: [adult onset] or [noninsulin dependent] (disorder)', 'maturity onset diabetes', 'maturity onset diabetes mellitus', 'ncdmm', 'non-insulin dependent diabetes mellitus', 'non-insulin-dependent diabetes mellitus', 'noninsulin dependent diab.mell', 'type 2 diabetes mellitus', 'type ii diabetes mellitus', 'type ii diabetes mellitus (disorder)']"
42,0034,organomegaly,"['abdominal organomegaly', 'abdominal organomegaly (disorder)']"
48,0034,neurologic examination,"['assessing neurological performance', 'assessing neurological status', 'nervous sys.exam.-gener', 'nervous system examination - general', 'nervous system-general exam.', 'neurological assessment', 'neurological assessment (procedure)', 'neurological assessment (regime/therapy)', 'neurological examination', 'neurological examination (procedure)', 'neurological examination, nos']"
49,0034,straight leg raise test,"['straight leg raise', 'straight leg raise test response', 'straight leg raise test response (observable entity)']"
52,0034,psychiatric examination,"['psychiatric interview and evaluation', 'psychiatric interview and evaluation (procedure)', 'psychiatric interview and evaluation, nos']"
53,0034,lumbar tenderness,"['lumbar spine - tender', 'lumbar spine - tender (finding)']"


In [475]:
df = analysis[analysis.normalizingSieveName=='UmlsEndingSieve']
df.keyPhrase = ['finding of' if x.split(' ')[0]=='finding' else x.split(' ')[-1] for x in df.normalizingName]
stratifyByCol(df, 'keyPhrase')
df[df.keyPhrase=='nos']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes
320,True,standardTerminology,UmlsEndingSieve,a dressing,C0013119,C0013119,"dressing, nos",nos,0094,over the,"[dressing - item, dressing - item (physical ob...",[Medical Device],[Medical Device]
365,True,standardTerminology,UmlsEndingSieve,solution,C0037633,C0037633,"solution, nos",nos,0094,. 5. neur,"[solution (substance), solution, nos]",[Substance],[Substance]
626,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0586322,"ulceration, nos",nos,0286,in the le,"[ulcer of big toe, ulcer of big toe (disorder)...",[Pathologic Function],[Disease or Syndrome]
1027,True,standardTerminology,UmlsEndingSieve,foot,C0016504,C0016504,"foot, nos",nos,0431,dragging,"[foot structure, foot structure (body structur...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1124,True,standardTerminology,UmlsEndingSieve,sternum,C0038293,C0038293,"sternum, nos",nos,0467,but not t,"[bone structure of sternum, bone structure of ...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1141,True,standardTerminology,UmlsEndingSieve,sclera,C0036410,C0036410,"sclera, nos",nos,0467,anicteric,"[sclera, nos, scleral structure, scleral struc...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1865,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0041582,"ulceration, nos",nos,622086964,after im,"[ulcer, ulcer (disorder), ulcer (morphologic a...",[Pathologic Function],[Pathologic Function]
2204,True,standardTerminology,UmlsEndingSieve,gallstone,C0008350,C0008350,"gallstone, nos",nos,0038,was noted,"[biliary calculus (disorder), biliary calculus...",[Disease or Syndrome],[Disease or Syndrome]
2273,True,standardTerminology,UmlsEndingSieve,his ace inhibitor,C0003015,C0003015,"ace inhibitor, nos",nos,0086,began . t,"[ace inhibitor product, ace inhibitor, nos, an...",[Pharmacologic Substance],[Pharmacologic Substance]
2400,True,standardTerminology,UmlsEndingSieve,an enterotomy,C0192579,C0192579,"enterotomy, nos",nos,0098,and was a,"[enterotomy, nos, incision of intestine, incis...",[Therapeutic or Preventive Procedure],[Therapeutic or Preventive Procedure]


In [448]:
df = omissions.explode('goldNames')
df = df[df.name+', nos'==df.goldNames]
df['new_prediction'] = [nameCui[x][0] for x in df.goldNames]
print(sum(df.new_prediction==df.goldCui),len(df))
df

41 41


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes,new_prediction
71,False,,,discharge,,C0012621,,discharge drainage,0034,from your,"discharge, nos",[Missing],[Body Substance],C0012621
97,False,,,back,,C1995000,,,0034,revealed,"back, nos",[Missing],[Body Location or Region],C1995000
104,False,,,pelvis,,C0030797,,,0034,and r hip,"pelvis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030797
318,False,,,drainage,,C0012621,,drainage discharge,0094,or tender,"drainage, nos",[Missing],[Body Substance],C0012621
365,False,,,solution,,C0037633,,tion,0094,. 5. neur,"solution, nos",[Missing],[Substance],C0037633
1027,False,,,foot,,C0016504,,,0431,dragging,"foot, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0016504
1124,False,,,sternum,,C0038293,,,0467,but not t,"sternum, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0038293
1141,False,,,sclera,,C0036410,,,0467,anicteric,"sclera, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0036410
1312,False,,,penis,,C0030851,,,0477,. he had,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851
1351,False,,,penis,,C0030851,,,0477,and pelvi,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851


In [181]:
cuiName['C0026266']

['mitral incompetence',
 'mitral insufficiency',
 'mitral regurgitation',
 'mitral regurgitation, nos',
 'mitral valve incompetence',
 'mitral valve incompetence, nos',
 'mitral valve insufficiency',
 'mitral valve insufficiency, nos',
 'mitral valve regurgitation',
 'mitral valve regurgitation (disorder)',
 'mitral valve regurgitation, nos']