In [1]:
import os
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [2]:
def readTerminology(file):
    "Reads a terminology file into two dictionaries."
    cuiName = {}
    nameCui = {}
    with open(file, encoding="ISO-8859-1") as f:
        for m in f.readlines():
            s = m.split('||')
            cui = s[0]
            names = s[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                if name not in nameCui:
                    nameCui[name] = []
                nameCui[name].append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    "Reads all .concept files from path into single dataframe."
    annotations = pd.DataFrame([])
    for file in os.listdir(path):
        if '.concept' in file:
            df = pd.read_table(f'{path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','cui'])
            df['file'] = [file for x in range(len(df))]
            annotations = pd.concat([annotations, df])
    annotations = annotations[['cui','name','file']]
    return annotations

def readCuiType(cuis):
    "Reads in a dictionary mapping cuis to semantic types."
    
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
        mrsty = mrsty[mrsty.cui.isin(cuis)]
    except:
        raise('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()')
    
    cuiType = {}
    for x in mrsty.iterrows():
        if x[1].cui not in cuiType:
            cuiType[x[1].cui] = []
        cuiType[x[1].cui].append(x[1].type)
    return cuiType

def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision

def sieveResults(results):
    "Returns a sieve-level analysis of results."
    levels = range(1,max(results.normalizingSieveLevel)+1)
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])
    
    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        sieves.loc[i] = [sieve, n, tp, fp, recall, 0, 0]
        sieves.loc[i,'agg_recall'] = round(sum(sieves.tp)/sum(sieves.n),2) if sum(sieves.n) > 0 else 0
        sieves.loc[i,'agg_precision'] = round(sum(sieves.tp)/(sum(sieves.tp)+sum(sieves.fp)),2) if sum(sieves.n) > 0 else 0
    
    # Total results
    n, tp, fp, recall, precision = getStats(results)
    sieves.loc[i+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def stratifyByCol(df, col, asc=False):
    "Stratifies the results by given column."    
    # If the column is a list, explode list into individual rows
    if (df.sample(10, replace=True).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
        df = df.explode(col)
        
    rows = []
    for key in set(df[col]):
        if not pd.isnull(key):
            sub = df[df[col]==key]
            rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    for col in cols:
        # If the column is a list, explode list into individual rows
        if (df.sample(10, replace=True).applymap(type).mode(0).astype(str) == "<class 'list'>")[col][0]:
            df = df.explode(col)            
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    "Find ambiguous names"
    dfMap = {}
    for i in range(len(df)):
        name = df.iloc[i]['name'].lower().strip()
        cui = df.iloc[i]['cui'].lower().strip()

        if name not in dfMap:
            dfMap[name] = []

        dfMap[name] = list(set([cui] + dfMap[name]))

    namesToCuis = pd.DataFrame(dfMap.items(),columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    return namesToCuis[namesToCuis.ambiguous]

In [12]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
dataset = 'CUILESS2016'
cuiName, nameCui = readTerminology(f'../resources/n2c2_terminology.txt')
# train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
cuis = list(set(list(results.goldCui) + list(results.prediction)))
# cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','normalizingName','keyPhrase','filename','snippet']
analysis = results[cols]
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
# analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
# analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
# assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
# assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'
sieveResults(results)

Wall time: 2.74 s


Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,Unknown,0,0,0,0,0.0,0.0
2,ExactMatchSieve,933,850,83,0.91,0.91,0.91
3,AbbreviationExpansionSieve,17,12,5,0.71,0.91,0.91
4,RemoveStopwordsSieve,1,1,0,1,0.91,0.91
5,SynonymSieve,27,24,3,0.89,0.91,0.91
6,SuffixationSieve,15,12,3,0.8,0.91,0.91
7,PrepositionalTransformSieve,18,18,0,1,0.91,0.91
8,HyphenationSieve,12,12,0,1,0.91,0.91
9,Total,1688,929,94,-,0.55,0.91


In [13]:
results[results.goldCui.str.contains('\|')]
# results.head()

Unnamed: 0,filename,name,keyPhrase,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,normalizingName,normalizingSieveLevel,goldNames,snippet


In [5]:
def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision
getStats(results)

(1688, 915, 94, 0.54, 0.91)

In [45]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
normalized['wordCount'] = [len(x.split()) for x in normalized.name]
stratifyByCol(normalized, 'normalizingSource')
stratifyByCol(normalized, 'normalizingSieveName')
# stratifyByCol(normalized, 'wordCount')
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normalized['wordCount'] = [len(x.split()) for x in normalized.name]


Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
3,PrepositionalTransformSieve,35,30,5,0.86,0.86
5,HyphenationSieve,10,8,2,0.8,0.8
6,ExactMatchSieve,1437,1048,389,0.73,0.73
2,AbbreviationExpansionSieve,27,15,12,0.56,0.56
4,SuffixationSieve,16,8,8,0.5,0.5
1,SynonymSieve,12,4,8,0.33,0.33
0,MeasurementSieve,1,0,1,0.0,0.0
7,RemoveStopwordsSieve,1,0,1,0.0,0.0


In [38]:
stratifyByCol(normalized[normalized.normalizingSieveName=='SuffixationSieve'], 'keyPhrase')

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
0,sions,2,1,1,0.5,0.5


In [30]:
view = results[['name','prediction','goldNames','normalized']]
words = ['impair','reduce','limited','decreas','depress','diminish']
words = ['rise','raise','rising','spik','elevat','increas','copious']
# words = ['creatinine']
view = view[[any(token in x for token in words) for x in view.name]]
view = view[~view.normalized & view.prediction.isnull()]
print(len(view))
view.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

0


Unnamed: 0,name,prediction,goldNames,normalized


In [47]:
remove = ['normalized','normalizingSource','snippet']#,'normalizingSieveName','goldTypes','predTypes'
errors = analysis[(results.prediction != results.goldCui) & (results.normalized==True)]
# errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='RemoveStopwordsSieve')]
# errors = errors[errors.normalizingSieveName=='SuffixationSieve']
# errors = errors[errors.filename=='332803550']
errors = errors.loc[:, ~errors.columns.isin(remove)]
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

426


Unnamed: 0,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,goldNames
3,ExactMatchSieve,murmur,C0018808,C0577820,,,16247-028319,"['heart murmur absent', 'heart murmur absent (context-dependent category)', 'heart murmur absent (finding)', 'heart murmur absent (situation)']"
12,ExactMatchSieve,left ventricular systolic function,C1299337,C3266756,,,16247-028319,"['normal left ventricular systolic function', 'normal left ventricular systolic function (finding)']"
28,ExactMatchSieve,sinus rhythm,C0232201,C1960142,,,14158-075452,"['ecg: sinus rhythm', 'electrocardiogram: sinus rhythm', 'electrocardiogram: sinus rhythm (finding)']"
33,ExactMatchSieve,thickening,C0205400,C0205400|C0015028,,,00587-400001,['Missing']
40,ExactMatchSieve,left ventricular function,C0080310,C1287715,,,12627-109059,"['finding of left ventricular function', 'finding of left ventricular function (finding)', 'left ventricular function - finding']"
45,ExactMatchSieve,right ventricular function,C0080311,C1287720,,,18318-102656,"['finding of right ventricular function', 'finding of right ventricular function (finding)', 'right ventricular function - finding']"
57,ExactMatchSieve,lad,C0497156,C0232297,,,16093-011230,"['left axis deviation', 'left axis deviation (finding)', 'left axis deviation by ekg']"
60,ExactMatchSieve,m,C0018808,C0577820,,,16093-011230,"['heart murmur absent', 'heart murmur absent (context-dependent category)', 'heart murmur absent (finding)', 'heart murmur absent (situation)']"
78,AbbreviationExpansionSieve,bp,C1271104,C0005823,blood pressure,bp,04525-003099,"['blood pressure, nos']"
85,ExactMatchSieve,jvp,C0558494,C0428897,,,04525-003099,"['jugular venous pressure', 'jugular venous pressure (observable entity)']"


In [48]:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,1,0,1,0,0.0,0.0
2,ExactMatchSieve,1021,703,318,0.69,0.69,0.69
3,AbbreviationExpansionSieve,13,5,8,0.38,0.68,0.68
4,RemoveStopwordsSieve,2,1,1,0.5,0.68,0.68
5,SynonymSieve,5,2,3,0.4,0.68,0.68
6,SuffixationSieve,15,7,8,0.47,0.68,0.68
7,PrepositionalTransformSieve,16,11,5,0.69,0.68,0.68
8,HyphenationSieve,21,10,11,0.48,0.68,0.68
9,Total,3468,739,355,-,0.21,0.68


In [511]:
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,MeasurementSieve,185,181,4,0.98,0.98,0.98
2,ExactMatchSieve,4298,3908,390,0.91,0.91,0.91
3,AbbreviationExpansionSieve,92,88,4,0.96,0.91,0.91
4,RemoveStopwordsSieve,337,288,49,0.85,0.91,0.91
5,SynonymSieve,65,64,1,0.98,0.91,0.91
6,SuffixationSieve,33,30,3,0.91,0.91,0.91
7,PrepositionalTransformSieve,49,45,4,0.92,0.91,0.91
8,HyphenationSieve,20,20,0,1,0.91,0.91
9,Total,6619,4624,455,-,0.7,0.91


In [31]:
# getAmbiguous(train)
omissions = analysis[analysis.prediction.isnull()]
omissions = omissions[['Missing' not in x for x in omissions.goldNames]]
print(len(omissions),len(set(omissions.name)))
omissions[['filename','name','goldNames']].style.set_properties(subset=['goldNames'], **{'width': '1000px'})

233 211


Unnamed: 0,filename,name,goldNames
18,06889-019975,radiating murmurs,"['[d]heart murmur', '[d]heart murmur (context-dependent category)', '[d]heart murmur (situation)', '[d]heart murmur nos', '[d]heart murmur nos (context-dependent category)', '[d]heart murmur nos (situation)', 'cardiac murmur', 'cardiac murmur, nos', 'finding of heart murmur', 'finding of heart murmur (finding)', 'heart murmur', 'heart murmur (finding)', 'heart murmur (observable entity)', 'heart murmur [d]', 'heart murmur [d] (finding)', 'heart murmur [d] (situation)', 'heart murmur, nos', 'murmur', 'murmur (finding)', 'murmur, nos', 'observation of heart murmur']"
20,06889-019975,irregular rate,"['heart beats irregular', 'heart irregular', 'heart irregular (finding)', 'irregular heart beat', 'irregular heart beat (finding)', 'irregular heart rate', 'irregular heart rate (disorder)', 'irregular heart rate (disorder) [ambiguous]', 'irregular heart rate (finding)', 'palpitations - irregular']"
28,06889-019975,hypothermic,"['body temperature below normal', 'body temperature below normal (finding)', 'decreased body temperature', 'decreased body temperature (finding)', 'hypothermia', 'hypothermia (finding)', 'hypothermia - disorder', 'state of hypothermia', 'temperature subnormal']"
32,17697-021699,deteriorated,"['general health deterioration', 'general health deterioration (finding)']"
33,17697-021699,hemiplegic left,"['left hemiplegia', 'left hemiplegia (disorder)']"
35,17697-021699,displacement,"['deviation', 'deviation, nos', 'displaced', 'displaced (qualifier value)', 'displacement (morphologic abnormality)', 'displacement, nos']"
38,17697-021699,cavernomas,"['[m]cavernous haemangioma', '[m]cavernous hemangioma', 'cavernous haemangioma', 'cavernous haemangioma (disorder)', 'cavernous hemangioma', 'cavernous hemangioma (disorder)', 'cavernous hemangioma (morphologic abnormality)', 'cavernous naevus', 'cavernous nevus', 'strawberry haemangioma', 'strawberry haemangioma (disorder)', 'strawberry hemangioma']"
39,17697-021699,flow abnormalities,"['cerebrospinal fluid flow finding', 'cerebrospinal fluid flow finding (finding)', 'csf (cerebrospinal fluid) flow finding']"
40,17697-021699,left sided neglect,"['neglect of left side of body', 'neglect of left side of body (finding)']"
41,17697-021699,left neglect,"['neglect of left side of body', 'neglect of left side of body (finding)']"


In [475]:
df = analysis[analysis.normalizingSieveName=='UmlsEndingSieve']
df.keyPhrase = ['finding of' if x.split(' ')[0]=='finding' else x.split(' ')[-1] for x in df.normalizingName]
stratifyByCol(df, 'keyPhrase')
df[df.keyPhrase=='nos']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes
320,True,standardTerminology,UmlsEndingSieve,a dressing,C0013119,C0013119,"dressing, nos",nos,0094,over the,"[dressing - item, dressing - item (physical ob...",[Medical Device],[Medical Device]
365,True,standardTerminology,UmlsEndingSieve,solution,C0037633,C0037633,"solution, nos",nos,0094,. 5. neur,"[solution (substance), solution, nos]",[Substance],[Substance]
626,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0586322,"ulceration, nos",nos,0286,in the le,"[ulcer of big toe, ulcer of big toe (disorder)...",[Pathologic Function],[Disease or Syndrome]
1027,True,standardTerminology,UmlsEndingSieve,foot,C0016504,C0016504,"foot, nos",nos,0431,dragging,"[foot structure, foot structure (body structur...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1124,True,standardTerminology,UmlsEndingSieve,sternum,C0038293,C0038293,"sternum, nos",nos,0467,but not t,"[bone structure of sternum, bone structure of ...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1141,True,standardTerminology,UmlsEndingSieve,sclera,C0036410,C0036410,"sclera, nos",nos,0467,anicteric,"[sclera, nos, scleral structure, scleral struc...","[Body Part, Organ, or Organ Component]","[Body Part, Organ, or Organ Component]"
1865,True,standardTerminology,UmlsEndingSieve,ulceration,C3887532,C0041582,"ulceration, nos",nos,622086964,after im,"[ulcer, ulcer (disorder), ulcer (morphologic a...",[Pathologic Function],[Pathologic Function]
2204,True,standardTerminology,UmlsEndingSieve,gallstone,C0008350,C0008350,"gallstone, nos",nos,0038,was noted,"[biliary calculus (disorder), biliary calculus...",[Disease or Syndrome],[Disease or Syndrome]
2273,True,standardTerminology,UmlsEndingSieve,his ace inhibitor,C0003015,C0003015,"ace inhibitor, nos",nos,0086,began . t,"[ace inhibitor product, ace inhibitor, nos, an...",[Pharmacologic Substance],[Pharmacologic Substance]
2400,True,standardTerminology,UmlsEndingSieve,an enterotomy,C0192579,C0192579,"enterotomy, nos",nos,0098,and was a,"[enterotomy, nos, incision of intestine, incis...",[Therapeutic or Preventive Procedure],[Therapeutic or Preventive Procedure]


In [448]:
df = omissions.explode('goldNames')
df = df[df.name+', nos'==df.goldNames]
df['new_prediction'] = [nameCui[x][0] for x in df.goldNames]
print(sum(df.new_prediction==df.goldCui),len(df))
df

41 41


Unnamed: 0,normalized,normalizingSource,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,snippet,goldNames,predTypes,goldTypes,new_prediction
71,False,,,discharge,,C0012621,,discharge drainage,0034,from your,"discharge, nos",[Missing],[Body Substance],C0012621
97,False,,,back,,C1995000,,,0034,revealed,"back, nos",[Missing],[Body Location or Region],C1995000
104,False,,,pelvis,,C0030797,,,0034,and r hip,"pelvis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030797
318,False,,,drainage,,C0012621,,drainage discharge,0094,or tender,"drainage, nos",[Missing],[Body Substance],C0012621
365,False,,,solution,,C0037633,,tion,0094,. 5. neur,"solution, nos",[Missing],[Substance],C0037633
1027,False,,,foot,,C0016504,,,0431,dragging,"foot, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0016504
1124,False,,,sternum,,C0038293,,,0467,but not t,"sternum, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0038293
1141,False,,,sclera,,C0036410,,,0467,anicteric,"sclera, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0036410
1312,False,,,penis,,C0030851,,,0477,. he had,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851
1351,False,,,penis,,C0030851,,,0477,and pelvi,"penis, nos",[Missing],"[Body Part, Organ, or Organ Component]",C0030851


In [181]:
cuiName['C0026266']

['mitral incompetence',
 'mitral insufficiency',
 'mitral regurgitation',
 'mitral regurgitation, nos',
 'mitral valve incompetence',
 'mitral valve incompetence, nos',
 'mitral valve insufficiency',
 'mitral valve insufficiency, nos',
 'mitral valve regurgitation',
 'mitral valve regurgitation (disorder)',
 'mitral valve regurgitation, nos']