In [1]:
import os
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [53]:
def readTerminology(file):
    "Reads a terminology file into two dictionaries."
    cuiName = {}
    nameCui = {}
    with open(file, encoding="ISO-8859-1") as f:
        for m in f.readlines():
            s = m.split('||')
            cui = s[0]
            names = s[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                if name not in nameCui:
                    nameCui[name] = []
                nameCui[name].append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    "Reads all .concept files from path into single dataframe."
    annotations = pd.DataFrame([])
    for file in os.listdir(path):
        if '.concept' in file:
            df = pd.read_table(f'{path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','cui'])
            df['file'] = [file for x in range(len(df))]
            annotations = pd.concat([annotations, df])
    annotations = annotations[['cui','name','file']]
    return annotations

def readCuiType(cuis):
    "Reads in a dictionary mapping cuis to semantic types."
    
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
        mrsty = mrsty[mrsty.cui.isin(cuis)]
    except:
        raise('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()')
    
    cuiType = {}
    for x in mrsty.iterrows():
        if x[1].cui not in cuiType:
            cuiType[x[1].cui] = []
        cuiType[x[1].cui].append(x[1].type)
    return cuiType

def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision

def sieveResults(results):
    "Returns a sieve-level analysis of results."
    levels = range(1,max(results.normalizingSieveLevel)+1)
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])
    
    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        sieves.loc[i] = [sieve, n, tp, fp, recall, 0, 0]
        sieves.loc[i,'agg_recall'] = round(sum(sieves.tp)/sum(sieves.n),2) if sum(sieves.n) > 0 else 0
        sieves.loc[i,'agg_precision'] = round(sum(sieves.tp)/(sum(sieves.tp)+sum(sieves.fp)),2) if sum(sieves.n) > 0 else 0
    
    # Total results
    n, tp, fp, recall, precision = getStats(results)
    sieves.loc[i+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def stratifyByCol(df, col, asc=False):
    "Stratifies the results by given column."    
    # If the column is a list, explode list into individual rows
    if (df.sample(10).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
        df = df.explode(col)
        
    rows = []
    for key in set(df[col]):
        if not pd.isnull(key):
            sub = df[df[col]==key]
            rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    for col in cols:
        # If the column is a list, explode list into individual rows
        if (df.sample(10).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
            df = df.explode(col)            
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    "Find ambiguous names"
    dfMap = {}
    for i in range(len(df)):
        name = df.iloc[i]['name'].lower().strip()
        cui = df.iloc[i]['cui'].lower().strip()

        if name not in dfMap:
            dfMap[name] = []

        dfMap[name] = list(set([cui] + dfMap[name]))

    namesToCuis = pd.DataFrame(dfMap.items(),columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    return namesToCuis[namesToCuis.ambiguous]

In [103]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'CUILESS2016'
cuiName, nameCui = readTerminology(f'../resources/n2c2_terminology.txt')
# train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','normalizingName','keyPhrase','filename','snippet']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
# assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
# assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'
sieveResults(results)

Wall time: 8.55 s


Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,Unknown,0,0,0,0,0.0,0.0
2,ExactMatchSieve,932,849,83,0.91,0.91,0.91
3,AbbreviationExpansionSieve,17,12,5,0.71,0.91,0.91
4,RemoveStopwordsSieve,1,1,0,1,0.91,0.91
5,SynonymSieve,25,22,3,0.88,0.91,0.91
6,SuffixationSieve,15,12,3,0.8,0.91,0.91
7,PrepositionalTransformSieve,7,7,0,1,0.91,0.91
8,HyphenationSieve,12,12,0,1,0.91,0.91
9,Total,1688,915,94,-,0.54,0.91


In [94]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
normalized['wordCount'] = [len(x.split()) for x in normalized.name]
stratifyByCol(normalized, 'normalizingSource')
stratifyByCol(normalized, 'normalizingSieveName')
stratifyByCol(normalized, 'wordCount')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,wordCount,n,tp,fp,recall,precision
3,4,10,10,0,1.0,1.0
4,5,9,9,0,1.0,1.0
1,2,320,301,19,0.94,0.94
0,1,625,555,70,0.89,0.89
2,3,46,41,5,0.89,0.89


In [95]:
stratifyByCol(normalized[normalized.normalizingSieveName=='AbbreviationExpansionSieve'], 'keyPhrase')
# nameCui['stepoff']

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
0,bs,1,1,0,1.0,1.0
1,nd,1,1,0,1.0,1.0
2,dm,1,1,0,1.0,1.0
3,wt,1,1,0,1.0,1.0
5,twi,1,1,0,1.0,1.0
8,abd,1,1,0,1.0,1.0
9,nt,1,1,0,1.0,1.0
10,hi,1,1,0,1.0,1.0
11,hoh,2,2,0,1.0,1.0
12,si,2,2,0,1.0,1.0


In [104]:
view = results[['name','goldNames','normalized']]
words = ['impair','reduce','limited','decreas','depress','diminish']
# words = ['rise','raise','rising','spik','elevat','increas','copious']
view = view[[any(token in x for token in words) for x in view.name]]
view = view[~view.normalized]
print(len(view))
view.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

15


Unnamed: 0,name,goldNames,normalized
301,decreased lung sounds,"decreased air entry,decreased breath sounds,decreased breath sounds (finding),decreased transmission of breath sounds,decreased bs,diminished breath sounds",False
676,decreased respiratory rate,"bradypnea,bradypnoea,slow breathing,slow respiration,slow respiration (finding)",False
824,decreased platelets,"decreased platelet count,platelet count below reference range,platelet count below reference range (finding)",False
876,igg level decreased,"igg - immunoglobulin g,immunoglobulin g (& [level]),immunoglobulin g (& [level]) (procedure),immunoglobulin g level,immunoglobulin g measurement,immunoglobulin g measurement (procedure)",False
877,igg level decreased,"igg - immunoglobulin g,immunoglobulin g (& [level]),immunoglobulin g (& [level]) (procedure),immunoglobulin g level,immunoglobulin g measurement,immunoglobulin g measurement (procedure)",False
995,decreased range of motion,"joint movement restrained,joint movement restricted,limitation of joint motion,limitation of joint movement,limitation of joint movement (finding),limited range of motion,range of joint movement reduced,restriction of joint motion",False
1037,decreased movement,"finding of movement,finding of movement (finding),observation of movement,movement",False
1043,decreasing white blood cell count,"decreased blood leucocyte number,decreased blood leukocyte number,decreased blood leukocyte number (finding)",False
1090,diminished breath sounds,"decreased air entry,decreased breath sounds,decreased breath sounds (finding),decreased transmission of breath sounds,decreased bs,decreased lung sounds",False
1096,ventricular systolic function depressed,"depression of left ventricular systolic function,depression of left ventricular systolic function (finding),left ventricular systolic function,lv systolic function depressed,left ventricular systolic function depressed,lv systolic function",False


In [68]:
remove = ['normalized','normalizingSource','snippet']#,'normalizingSieveName','goldTypes','predTypes'
errors = analysis[(results.prediction != results.goldCui) & (results.normalized==True)]
# errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='RemoveStopwordsSieve')]
errors = errors[errors.normalizingSieveName=='SynonymSieve']
# errors = errors[errors.filename=='332803550']
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

2


Unnamed: 0,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,goldNames,predTypes,goldTypes
395,SynonymSieve,hearing problems,C0260662,C1384666,hearing problem,problems problem,17697-021699,"['decreased hearing', 'decreased hearing (finding)', 'difficulty hearing', 'hard of hearing', 'hearing impaired', 'hearing impairment', 'hearing loss', 'hearing loss (disorder)', 'hearing loss (finding)', 'hearing loss, nos', 'hypoacusis', 'impaired hearing', 'loss of hearing']",['Finding'],['Disease or Syndrome']
1443,SynonymSieve,discharge,C1827072,C0406834,drainage,discharge drainage,13237-009098,"['discharge from wound', 'discharging wound', 'leaking wound', 'weeping wound', 'wound discharge', 'wound discharge (finding)', 'wound discharge finding', 'wound discharge finding (finding)', 'wound discharge observable', 'wound discharge observable (observable entity)', 'wound discharge observations', 'wound leak', 'wound oozing', 'wound, leaking', 'wound, leaking (morphologic abnormality)']",['Finding'],['Finding']


In [48]:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,1,0,1,0,0.0,0.0
2,ExactMatchSieve,1021,703,318,0.69,0.69,0.69
3,AbbreviationExpansionSieve,13,5,8,0.38,0.68,0.68
4,RemoveStopwordsSieve,2,1,1,0.5,0.68,0.68
5,SynonymSieve,5,2,3,0.4,0.68,0.68
6,SuffixationSieve,15,7,8,0.47,0.68,0.68
7,PrepositionalTransformSieve,16,11,5,0.69,0.68,0.68
8,HyphenationSieve,21,10,11,0.48,0.68,0.68
9,Total,3468,739,355,-,0.21,0.68


In [511]:
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,185,181,4,0.98,0.98,0.98
2,ExactMatchSieve,4298,3908,390,0.91,0.91,0.91
3,AbbreviationExpansionSieve,92,88,4,0.96,0.91,0.91
4,RemoveStopwordsSieve,337,288,49,0.85,0.91,0.91
5,SynonymSieve,65,64,1,0.98,0.91,0.91
6,SuffixationSieve,33,30,3,0.91,0.91,0.91
7,PrepositionalTransformSieve,49,45,4,0.92,0.91,0.91
8,HyphenationSieve,20,20,0,1,0.91,0.91
9,Total,6619,4624,455,-,0.7,0.91


In [69]:
# getAmbiguous(train)
omissions = analysis[analysis.prediction.isnull()]
omissions = omissions[['Missing' not in x for x in omissions.goldNames]]
print(len(omissions),len(set(omissions.name)))
omissions[['filename','name','goldNames']].style.set_properties(subset=['goldNames'], **{'width': '1000px'})

431 373


Unnamed: 0,filename,name,goldNames
0,00174-002042,suicidal ideations,"['feeling suicidal', 'feeling suicidal (finding)', 'suicidal ideation', 'suicidal thoughts', 'suicidal thoughts (finding)']"
5,00174-002042,moaning,"['groaning respiration', 'groaning respiration (finding)', 'moaning respiration']"
6,00174-002042,responsive to voice,"['responds to voice', 'responds to voice (finding)']"
7,00174-002042,responsive to pain,"['finding of response to pain', 'finding of response to pain (finding)']"
8,00174-002042,increased pressure,"['increased cerebrospinal fluid pressure', 'increased cerebrospinal fluid pressure (finding)', 'increased cerebrospinal fluid pressure (finding) [ambiguous]', 'increased intracranial press.', 'increased intracranial pressure', 'raised intracranial pressure', 'raised intracranial pressure (finding)']"
10,00174-002042,total bilirubin elevated,"['increased bilirubin level', 'increased bilirubin level (finding)', 'serum bilirubin raised', 'serum bilirubin raised (finding)', 'serum bilirubin raised (situation)']"
12,00174-002042,clamped jaw,"['[d] trismus', 'trismus', 'trismus (disorder)', 'trismus (finding)']"
14,00174-002042,elevated inr,"['inr raised', 'inr raised (finding)', 'international normalised ratio (inr) raised', 'international normalised ratio raised', 'international normalized ratio (inr) raised', 'international normalized ratio (inr) raised (finding)', 'international normalized ratio raised', 'international normalized ratio raised (finding)']"
16,00174-002042,elevated inr,"['inr raised', 'inr raised (finding)', 'international normalised ratio (inr) raised', 'international normalised ratio raised', 'international normalized ratio (inr) raised', 'international normalized ratio (inr) raised (finding)', 'international normalized ratio raised', 'international normalized ratio raised (finding)']"
20,00796-013231,luminal irregularities,"['luminal irregularities of coronary artery', 'luminal irregularities of coronary artery (finding)']"


In [475]:
df = analysis[analysis.normalizingSieveName=='UmlsEndingSieve']
df.keyPhrase = ['finding of' if x.split(' ')[0]=='finding' else x.split(' ')[-1] for x in df.normalizingName]
stratifyByCol(df, 'keyPhrase')
df[df.keyPhrase=='nos']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes
320,True,standardTerminology,UmlsEndingSieve,a dressing,C0013119,C0013119,"dressing, nos",nos,0094,over the,"[dressing - item, dressing - item (physical ob...",[Medical Device],[Medical Device]
365,True,standardTerminology,UmlsEndingSieve,solution,C0037633,C0037633,"solution, nos",nos,0094,. 5. neur,"[solution (substance), solution, nos]",[Substance],[Substance]
626,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0586322,"ulceration, nos",nos,0286,in the le,"[ulcer of big toe, ulcer of big toe (disorder)...",[Pathologic Function],[Disease or Syndrome]
1027,True,standardTerminology,UmlsEndingSieve,foot,C0016504,C0016504,"foot, nos",nos,0431,dragging,"[foot structure, foot structure (body structur...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1124,True,standardTerminology,UmlsEndingSieve,sternum,C0038293,C0038293,"sternum, nos",nos,0467,but not t,"[bone structure of sternum, bone structure of ...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1141,True,standardTerminology,UmlsEndingSieve,sclera,C0036410,C0036410,"sclera, nos",nos,0467,anicteric,"[sclera, nos, scleral structure, scleral struc...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1865,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0041582,"ulceration, nos",nos,622086964,after im,"[ulcer, ulcer (disorder), ulcer (morphologic a...",[Pathologic Function],[Pathologic Function]
2204,True,standardTerminology,UmlsEndingSieve,gallstone,C0008350,C0008350,"gallstone, nos",nos,0038,was noted,"[biliary calculus (disorder), biliary calculus...",[Disease or Syndrome],[Disease or Syndrome]
2273,True,standardTerminology,UmlsEndingSieve,his ace inhibitor,C0003015,C0003015,"ace inhibitor, nos",nos,0086,began . t,"[ace inhibitor product, ace inhibitor, nos, an...",[Pharmacologic Substance],[Pharmacologic Substance]
2400,True,standardTerminology,UmlsEndingSieve,an enterotomy,C0192579,C0192579,"enterotomy, nos",nos,0098,and was a,"[enterotomy, nos, incision of intestine, incis...",[Therapeutic or Preventive Procedure],[Therapeutic or Preventive Procedure]


In [448]:
df = omissions.explode('goldNames')
df = df[df.name+', nos'==df.goldNames]
df['new_prediction'] = [nameCui[x][0] for x in df.goldNames]
print(sum(df.new_prediction==df.goldCui),len(df))
df

41 41


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes,new_prediction
71,False,,,discharge,,C0012621,,discharge drainage,0034,from your,"discharge, nos",[Missing],[Body Substance],C0012621
97,False,,,back,,C1995000,,,0034,revealed,"back, nos",[Missing],[Body Location or Region],C1995000
104,False,,,pelvis,,C0030797,,,0034,and r hip,"pelvis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030797
318,False,,,drainage,,C0012621,,drainage discharge,0094,or tender,"drainage, nos",[Missing],[Body Substance],C0012621
365,False,,,solution,,C0037633,,tion,0094,. 5. neur,"solution, nos",[Missing],[Substance],C0037633
1027,False,,,foot,,C0016504,,,0431,dragging,"foot, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0016504
1124,False,,,sternum,,C0038293,,,0467,but not t,"sternum, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0038293
1141,False,,,sclera,,C0036410,,,0467,anicteric,"sclera, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0036410
1312,False,,,penis,,C0030851,,,0477,. he had,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851
1351,False,,,penis,,C0030851,,,0477,and pelvi,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851


In [181]:
cuiName['C0026266']

['mitral incompetence',
 'mitral insufficiency',
 'mitral regurgitation',
 'mitral regurgitation, nos',
 'mitral valve incompetence',
 'mitral valve incompetence, nos',
 'mitral valve insufficiency',
 'mitral valve insufficiency, nos',
 'mitral valve regurgitation',
 'mitral valve regurgitation (disorder)',
 'mitral valve regurgitation, nos']