In [72]:
import os
import pandas as pd
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Widen the notebook container so wide result tables fit on screen.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Cap the width of rendered table headers; as the final expression of the
# cell this HTML object is rendered as the cell output.
HTML("<style>.rendered_html th {max-width: 120px;}</style>")

In [73]:
def readTerminology(file):
    """Reads a terminology file into two dictionaries.

    Every non-blank line is expected to look like ``CUI||name1|name2|...``.

    Args:
        file: Path of the terminology file (decoded as ISO-8859-1).

    Returns:
        A ``(cuiName, nameCui)`` tuple. ``cuiName`` maps each CUI to the list
        of names on its line; ``nameCui`` maps each name to the list of CUIs
        whose lines contain it, in file order.
    """
    cuiName = {}
    nameCui = {}
    with open(file, encoding="ISO-8859-1") as f:
        # Iterate the handle lazily rather than loading every line up front.
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no entry; they used to raise IndexError on s[1].
            if not line.strip():
                continue
            s = line.split('||')
            cui = s[0]
            names = s[1].strip().split('|')
            cuiName[cui] = names

            for name in names:
                nameCui.setdefault(name, []).append(cui)
    return cuiName, nameCui

def readAnnotations(path):
    """Reads all .concept files from path into a single dataframe.

    Each file is parsed as ``||``-separated rows with the fields
    ``file_id, ix, type, name, cui``.

    Args:
        path: Directory to scan; every entry whose name contains '.concept'
            is read.

    Returns:
        A dataframe with the columns ``cui``, ``name`` and ``file`` (the name
        of the file each row came from). It is empty, but still has those
        columns, when the directory holds no matching file.
    """
    frames = []
    # os.listdir order is arbitrary; sorting keeps the row order reproducible.
    for file in sorted(os.listdir(path)):
        if '.concept' in file:
            # The separator is a regular expression (both pipes escaped), so it
            # is written as a raw string and parsed with the python engine,
            # which is the only engine that supports regex separators.
            df = pd.read_table(
                os.path.join(path, file),
                sep=r'\|\|',
                engine='python',
                header=None,
                names=['file_id', 'ix', 'type', 'name', 'cui'],
            )
            df['file'] = file
            frames.append(df)

    # Without this guard, selecting columns from an empty frame raises KeyError.
    if not frames:
        return pd.DataFrame(columns=['cui', 'name', 'file'])

    # Concatenate once instead of growing the frame inside the loop.
    annotations = pd.concat(frames)
    return annotations[['cui', 'name', 'file']]

def readCuiType(cuis):
    """Reads in a dictionary mapping cuis to semantic types.

    Args:
        cuis: Collection of CUIs to keep; rows of umls/mrsty.txt whose cui is
            not in this collection are ignored.

    Returns:
        A dict mapping each retained CUI to the list of values found in the
        ``type`` column for it, in file order.

    Raises:
        FileNotFoundError: If umls/mrsty.txt does not exist.
    """
    # Load UMLS semantic type mapping file.
    try:
        mrsty = pd.read_table('umls/mrsty.txt',sep='|',header=0,names=['cui','tui','stn','type','atui','cvf'])[['cui','tui','type']]
    except FileNotFoundError as err:
        # The original `raise('...')` raised a plain string, which itself fails
        # with TypeError and hides this hint. Only the missing-file case is
        # translated; any other error keeps its own message.
        raise FileNotFoundError('NOTE: Must have previously created umls/mrsty.txt by running "Load UMLS data.ipynb" to run readCuiType()') from err
    mrsty = mrsty[mrsty.cui.isin(cuis)]

    cuiType = {}
    # Walk the two needed columns directly instead of building a Series per
    # row with iterrows().
    for cui, semType in zip(mrsty['cui'], mrsty['type']):
        cuiType.setdefault(cui, []).append(semType)
    return cuiType

def getStats(df):
    """Summarises prediction quality for the given dataframe.

    Reads the ``prediction``, ``goldCui`` and ``normalized`` columns and
    returns ``(n, tp, fp, recall, precision)``: the row count, the number of
    rows whose prediction equals the gold CUI, the number of normalized rows
    whose prediction differs from it, tp/n and tp/(tp+fp). Both ratios are
    rounded to two decimals and are 0 when their denominator is 0.
    """
    total = len(df)
    hits = sum(df.prediction == df.goldCui)
    wrongButNormalized = (df.prediction != df.goldCui) & (df.normalized==True)
    misses = sum(wrongButNormalized)

    recall = 0
    if total > 0:
        recall = round(hits/total,2)

    precision = 0
    attempted = hits + misses
    if attempted > 0:
        precision = round(hits/attempted,2)

    return total, hits, misses, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    Args:
        results: Dataframe with the columns ``normalizingSieveLevel`` and
            ``normalizingSieveName`` plus those read by getStats().

    Returns:
        A dataframe indexed from 1 with one row per sieve level
        (1..max level) followed by a 'Total' row. Columns: ``sieve``, ``n``,
        ``tp``, ``fp``, ``sieve_acc`` (tp/n of that level, '-' for the total),
        ``agg_recall`` and ``agg_precision`` (computed over all levels up to
        and including the row; over all of ``results`` for the total).
    """
    columns = ['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # Series.max() yields NaN for empty or all-null input instead of raising,
    # which lets an input with no levels fall through to the 'Total' row only.
    maxLevel = results.normalizingSieveLevel.max()
    maxLevel = 0 if pd.isnull(maxLevel) else int(maxLevel)
    levels = range(1, maxLevel + 1)

    rows = []
    aggN = aggTp = aggFp = 0

    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, _ = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        # Running totals over the levels seen so far.
        aggN += n
        aggTp += tp
        aggFp += fp
        # NOTE(review): agg_recall divides by the rows handled by the sieves so
        # far, not by len(results) as the 'Total' row does -- confirm that this
        # is the intended denominator.
        # Guard both ratios: a leading level without rows used to divide by 0.
        aggRecall = round(aggTp/aggN,2) if aggN > 0 else 0
        aggPrecision = round(aggTp/(aggTp+aggFp),2) if (aggTp+aggFp) > 0 else 0
        rows.append([sieve, n, tp, fp, recall, aggRecall, aggPrecision])

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    rows.append(['Total', n, tp, fp, '-', recall, precision])

    # Index labels run 1..k for the levels and k+1 for the total; this no
    # longer relies on the loop variable, which is undefined when there are
    # no levels.
    return pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))

def stratifyByCol(df, col, asc=False):
    """Stratifies the results by given column.

    Args:
        df: Results dataframe holding ``col`` and the columns read by
            getStats().
        col: Column to group by. If it holds lists, each list element is
            treated as its own key.
        asc: Sort the output by precision ascending when True (default:
            descending).

    Returns:
        A dataframe with one row per non-null key and the columns
        ``col, n, tp, fp, recall, precision``, sorted by precision.
    """
    # If the column is a list, explode list into individual rows.
    # The column is inspected directly: the previous df.sample(10) check raised
    # ValueError on frames with fewer than 10 rows and made the decision depend
    # on a random draw. explode() leaves non-list values untouched.
    if df[col].map(lambda v: isinstance(v, list)).any():
        df = df.explode(col)

    rows = []
    for key in set(df[col]):
        if not pd.isnull(key):
            sub = df[df[col]==key]
            rows.append([key] + list(getStats(sub)))
    return pd.DataFrame(rows, columns=[col,'n','tp','fp', 'recall', 'precision']).sort_values('precision',ascending=asc)

def stratifyByCols(df, cols, asc=False):
    """Stratifies the results by the combination of several columns.

    The values of ``cols`` are joined with '-' into a ``combined`` column
    (so they must be strings once list columns are exploded), and the result
    is passed to stratifyByCol().

    Args:
        df: Results dataframe holding every column in ``cols``.
        cols: List of column names to combine.
        asc: Forwarded to stratifyByCol().

    Returns:
        The dataframe produced by stratifyByCol() for the ``combined`` column.
    """
    for col in cols:
        # If the column is a list, explode list into individual rows.
        # The column is inspected directly: the previous df.sample(10) check
        # raised ValueError on frames with fewer than 10 rows and depended on a
        # random draw.
        if df[col].map(lambda v: isinstance(v, list)).any():
            df = df.explode(col)
    return stratifyByCol(df.assign(combined=df[cols].agg('-'.join, axis=1)), 'combined',asc=asc)

def getAmbiguous(df):
    """Find ambiguous names.

    Names and CUIs are lower-cased and stripped before comparison.

    Args:
        df: Dataframe with string columns ``name`` and ``cui``.

    Returns:
        A dataframe with the columns ``name``, ``cuis`` (sorted list of the
        distinct CUIs seen for that name) and ``ambiguous`` (always True),
        restricted to names that map to more than one CUI.
    """
    dfMap = {}
    # Walk the two columns once instead of doing a df.iloc[i] lookup per field,
    # and collect into sets rather than rebuilding a list for every row.
    for rawName, rawCui in zip(df['name'], df['cui']):
        name = rawName.lower().strip()
        cui = rawCui.lower().strip()
        dfMap.setdefault(name, set()).add(cui)

    # Sorting gives a stable CUI order; list(set(...)) depended on set
    # iteration order.
    namesToCuis = pd.DataFrame(
        [(name, sorted(cuis)) for name, cuis in dfMap.items()],
        columns=['name','cuis'],
    )
    # An explicit bool dtype keeps the mask usable when the frame is empty.
    namesToCuis['ambiguous'] = pd.Series(
        [len(x) > 1 for x in namesToCuis.cuis],
        index=namesToCuis.index,
        dtype=bool,
    )
    return namesToCuis[namesToCuis.ambiguous]

In [167]:
%%time
# Setup: Load terminology into dictionary, train, test, and results
# `dataset` selects both the terminology file and the data directory below.
dataset = 'n2c2'
cuiName, nameCui = readTerminology(f'../resources/{dataset}_terminology.txt')
# train = readAnnotations(f'../{dataset}-data/train')
# test = readAnnotations(f'../{dataset}-data/test')
# Tab-separated results file; later cells read `results` and `analysis`.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')

# Load semantic type map
# Distinct gold and predicted CUIs; only consumed by the readCuiType call,
# which is currently commented out.
cuis = list(set(list(results.goldCui) + list(results.prediction)))
# cuiType = readCuiType(cuis)

# Create analysis dataframe
cols = ['normalized','normalizingSource','normalizingSieveName','name','prediction','goldCui','normalizingName','keyPhrase','filename']
analysis = results[cols]
# goldNames per row: ['CUI-less'] when the gold CUI is 'CUI-less', the
# terminology names when the gold CUI is in cuiName, otherwise ['Missing'].
analysis = analysis.assign(goldNames=[['CUI-less'] if c=='CUI-less' else cuiName[c] if c in cuiName else ['Missing'] for c in results.goldCui])
# analysis = analysis.assign(predTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.prediction])
# analysis = analysis.assign(goldTypes=[cuiType[c] if c in cuiType else ['Missing'] for c in results.goldCui])

# Sanity checks
# NOTE(review): the disabled checks below compare list-valued columns with the
# string 'Missing', and the second one lacks parentheses around the ==
# comparison (& binds tighter than ==) -- revisit before re-enabling.
# assert len(analysis[analysis.normalized & (analysis.predTypes=='Missing')])==0, 'Predicted CUI missing ST'
# assert len(analysis[(analysis.goldCui != 'CUI-less') & analysis.goldTypes=='Missing'])==0, 'Gold CUI missing ST'
# assert len(analysis[analysis.goldNames=='Missing'])==0, 'Gold names missing'
# assert len(analysis[analysis.goldTypes=='Missing'])==0, 'Gold types missing'

CPU times: user 1.59 s, sys: 293 ms, total: 1.88 s
Wall time: 1.96 s


In [168]:
# Stratify performance by column
normalized = analysis[analysis.normalized]
# A cell only renders its final expression, so the normalizingSource table was
# computed and silently discarded; display() shows both tables.
display(stratifyByCol(normalized, 'normalizingSource'))
display(stratifyByCol(normalized, 'normalizingSieveName'))
# stratifyByCols(normalized, ['normalizingSource','normalizingSieveName'])
# display(stratifyByCols(normalized, ['predTypes','goldTypes']))
# stratifyByCol(analysis, 'predTypes', asc=True)
# stratifyByCol(analysis, 'goldTypes', asc=True)

Unnamed: 0,normalizingSieveName,n,tp,fp,recall,precision
4,HyphenationSieve,21,21,0,1.0,1.0
3,SynonymSieve,44,43,1,0.98,0.98
5,AbbreviationExpansionSieve,63,62,1,0.98,0.98
0,ExactMatchSieve,4325,4071,254,0.94,0.94
2,SuffixationSieve,36,34,2,0.94,0.94
1,PrepositionalTransformSieve,53,49,4,0.92,0.92
6,RemoveStopwordsSieve,337,294,43,0.87,0.87


In [169]:
# Break down the rows normalized by RemoveStopwordsSieve by their keyPhrase value.
stratifyByCol(normalized[normalized.normalizingSieveName=='RemoveStopwordsSieve'], 'keyPhrase')

Unnamed: 0,keyPhrase,n,tp,fp,recall,precision
12,ly,15,13,2,0.87,0.87
0,sed,1,1,0,1.0,1.0
1,ness,1,1,0,1.0,1.0
2,ies,2,2,0,1.0,1.0
3,lent,1,1,0,1.0,1.0
4,red,1,1,0,1.0,1.0
5,dic,1,1,0,1.0,1.0
6,gent,2,2,0,1.0,1.0
7,ted,1,1,0,1.0,1.0
8,brady,1,1,0,1.0,1.0


In [170]:
# Columns hidden from the error listing.
remove = ['normalized','normalizingSource']#,'normalizingSieveName','goldTypes','predTypes'
# Errors: normalized rows whose prediction differs from the gold CUI, narrowed
# to a single sieve and stripped of the hidden columns.
# errors = errors[(errors.normalizingSource=='standardTerminology') & (errors.normalizingSieveName=='SuffixationSieve')]
errors = (
    analysis[(results.prediction != results.goldCui) & (results.normalized==True)]
    .loc[lambda frame: frame.normalizingSieveName=='RemoveStopwordsSieve']
    .drop(columns=remove, errors='ignore')
)
print(len(errors))
errors.style.set_properties(subset=['goldNames'], **{'width': '1000px'})

1


Unnamed: 0,normalizingSieveName,name,prediction,goldCui,normalizingName,keyPhrase,filename,goldNames
4330,SuffixationSieve,fairly,C2911689,CUI-less,fair,ly,314,['CUI-less']


In [171]:
# results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
# Sieve-by-sieve breakdown of the results currently held in memory.
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4325,4071,254,0.94,0.94,0.94
2,AbbreviationExpansionSieve,63,62,1,0.98,0.94,0.94
3,RemoveStopwordsSieve,337,294,43,0.87,0.94,0.94
4,SynonymSieve,44,43,1,0.98,0.94,0.94
5,SuffixationSieve,36,34,2,0.94,0.94,0.94
6,PrepositionalTransformSieve,53,49,4,0.92,0.94,0.94
7,HyphenationSieve,21,21,0,1.0,0.94,0.94
8,Total,6630,4574,305,-,0.69,0.94


In [173]:
# Re-read the results file from disk, then recompute the sieve-level table.
results = pd.read_csv(f'../{dataset}-data/output/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,4325,4071,254,0.94,0.94,0.94
2,AbbreviationExpansionSieve,63,62,1,0.98,0.94,0.94
3,RemoveStopwordsSieve,337,294,43,0.87,0.94,0.94
4,SynonymSieve,44,43,1,0.98,0.94,0.94
5,SuffixationSieve,36,34,2,0.94,0.94,0.94
6,PrepositionalTransformSieve,53,49,4,0.92,0.94,0.94
7,HyphenationSieve,21,21,0,1.0,0.94,0.94
8,Total,6630,4574,305,-,0.69,0.94


In [151]:
# getAmbiguous(train)
# Omissions: rows of the analysis frame that have no prediction.
omissions = analysis[analysis.prediction.isnull()]
# Drop rows whose goldNames list contains the 'Missing' placeholder, i.e. whose
# gold CUI was not found in the terminology.
omissions = omissions[['Missing' not in x for x in omissions.goldNames]]
# Number of omitted rows, then number of distinct mention strings among them.
print(len(omissions),len(set(omissions.name)))
omissions[['filename','name','goldNames']].style.set_properties(subset=['goldNames'], **{'width': '1000px'})

1492 1318


Unnamed: 0,filename,name,goldNames
16,974381789,antiarrhythmic agents,"['antiarrhythmic agent', 'antiarrhythmic agent (substance)', 'antiarrhythmic drug', 'antiarrhythmic drug (product)', 'antiarrhythmic drug (substance)', 'antiarrhythmic drug, nos', 'medicinal product acting as antiarrhythmic agent (product)']"
17,974381789,decreased exercise tolerance,"['impaired exercise tolerance', 'impaired exercise tolerance (finding)']"
19,974381789,dc electrical cardioversion,"['direct current cardioversion', 'direct current cardioversion (procedure)']"
25,974381789,cardizem cd,"['diltiazem hydrochloride', 'diltiazem hydrochloride (substance)', 'diltiazem hydrochloride 2', 'diltiazem hydrochloride 2 (product)', 'diltiazem hydrochloride 2 (substance)', 'diltiazem hydrochloride product', 'diltiazem hydrochloride product (product)', 'diltiazem hydrochloride product (substance)', 'latiazem hydrochloride']"
31,974381789,sulfa antibiotics,"['medicinal product containing sulfonamide and acting as antibacterial agent (product)', 'product containing sulfonamide and antibiotic', 'product containing sulfonamide and antibiotic (product)', 'substance with sulfonamide structure and antibacterial mechanism of action', 'substance with sulfonamide structure and antibacterial mechanism of action (substance)', 'sulfonamide -class of antibiotic-', 'sulfonamide -class of antibiotic- (product)', 'sulfonamide -class of antibiotic- (substance)', 'sulfonamide antibacterial', 'sulfonamide antibacterial agent', 'sulfonamide antibiotic product', 'sulfonamide, nos', 'sulfonilamide', 'sulfonilamide, nos', 'sulphonamide -class of antibiotic-', 'sulphonilamide']"
37,974381789,neck vein distension,"['dilation of jugular vein', 'distention of jugular vein', 'distention of jugular vein (finding)']"
57,974381789,qt prolongation,"['increased q-t interval', 'increased q-t interval (finding)', 'increased q-t interval -retired-', 'increased qt interval', 'prolonged qt interval', 'prolonged qt interval (finding)']"
58,974381789,pulmonary function tests,"['lung function test nos', 'lung function test nos (procedure)', 'lung function tests', 'pft, nos', 'pulmonary function test', 'pulmonary function test, nos']"
64,974381789,premature ventricular contractions,"['premature ventricular complex', 'ventricular ectopic beats', 'ventricular ectopic beats (disorder)', 'ventricular ectopic beats (finding)', 'ventricular ectopic complex', 'ventricular extrasystoles', 'ventricular premature beats', 'ventricular premature beats (disorder)', 'ventricular premature complex', 'ventricular premature complex (disorder)', 'ventricular premature contractions', 'ventricular premature depolarisation', 'ventricular premature depolarization', 'ventricular premature systoles']"
73,974381789,atrial fibrillation with a rapid ventricular rate,"['atrial fibrillation with rapid ventricular response', 'atrial fibrillation with rapid ventricular response (disorder)']"


In [83]:
# Ad-hoc lookup: rows whose mention text contains 'ies' (str.contains treats
# the pattern as a regular expression by default).
results[results.name.str.contains('ies')]
# results[results.normalizingName.notnull() & results.normalizingName.str.contains('tincture')]
# results[results.normalizingSieveName=='AbbreviationExpansionSieve']

Unnamed: 0,filename,name,keyPhrase,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,normalizingName,normalizingSieveLevel,goldNames
74,974381789,t-wave abnormalities,s,,False,,,C0438167,,0,"ecg: t wave abnormal,ecg: t wave abnormal (fin..."
194,0362,any further studies,s,,False,,,C0420266,,0,"further care referral nos,further care referra..."
216,0362,denies any drug allergies,s,,False,,,C0262581,,0,"no known drug allergies,no known drug allergie..."
235,0362,lower extremities,s,,False,,,C0230420,,0,"both lower extremities,both lower extremities ..."
343,0468,iron studies,,C0337439,True,ExactMatchSieve,trainTerminology,C0337439,,1,"iron level,iron measurement,iron measurement (..."
...,...,...,...,...,...,...,...,...,...,...,...
5844,0250,no known drug allergies,,C0262581,True,ExactMatchSieve,trainTerminology,C0262581,,1,"no known drug allergies,no known drug allergie..."
6185,0476,iron studies,,C0337439,True,ExactMatchSieve,trainTerminology,C0337439,,1,"iron level,iron measurement,iron measurement (..."
6372,498710998,her rehabilitation activities,ation,,False,,,C0034991,,0,"rehabilitation,rehabilitation care,rehabilitat..."
6389,498710998,abnormali-ties,s,,False,,,C0205161,,0,"abnormal,abnormal (modifier) (qualifier value)..."


In [None]:
# List every terminology name recorded for a single CUI.
cuiName['C0026266']

['mitral incompetence',
 'mitral insufficiency',
 'mitral regurgitation',
 'mitral regurgitation, nos',
 'mitral valve incompetence',
 'mitral valve incompetence, nos',
 'mitral valve insufficiency',
 'mitral valve insufficiency, nos',
 'mitral valve regurgitation',
 'mitral valve regurgitation (disorder)',
 'mitral valve regurgitation, nos']