In [1]:
import os
import pandas as pd

In [2]:
def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision

def sieveResults(results):
    "Returns a sieve-level analysis of results."
    levels = range(1,max(results.normalizingSieveLevel)+1)
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])
    
    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        sieves.loc[i] = [sieve, n, tp, fp, recall, 0, 0]
        sieves.loc[i,'agg_recall'] = round(sum(sieves.tp)/sum(sieves.n),2)
        sieves.loc[i,'agg_precision'] = round(sum(sieves.tp)/(sum(sieves.tp)+sum(sieves.fp)),2)
    
    # Total results
    n, tp, fp, recall, precision = getStats(results)
    sieves.loc[i+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def getAmbiguous(df):
    "Find ambiguous names"
    dfMap = {}
    for i in range(len(df)):
        name = df.iloc[i]['name'].lower().strip()
        cui = df.iloc[i]['cui'].lower().strip()

        if name not in dfMap:
            dfMap[name] = []

        dfMap[name] = list(set([cui] + dfMap[name]))

    namesToCuis = pd.DataFrame(dfMap.items(),columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    return namesToCuis[namesToCuis.ambiguous]

terminology = pd.read_table('../ncbi-data/TERMINOLOGY.txt',sep='\|\|',header=None, names=['cui','name'])
terminology['file'] = ""
def lookup(cui, dic):
    result = dic[dic.cui.str.contains(cui)].values
    name = result[0][1] if len(result) > 0 else "Not Found"
    targetCui = f'({result[0][0]})' if len(result) > 0 else ""
    file = f'file={result[0][2]}' if len(result) > 0 else ""
    print(f'{cui}:\t{name} {targetCui} {file}')



In [323]:
# NCBI results
pred_path = '../ncbi-data/output'
gold_path = '../ncbi-data/test'
results = pd.read_csv(f'{pred_path}/results.txt',sep='\t')
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,725,655,70,0.9,0.9,0.9
2,AbbreviationExpansionSieve,11,10,1,0.91,0.9,0.9
3,PrepositionalTransformSieve,1,1,0,1,0.9,0.9
4,Unknown,0,0,0,0,0.9,0.9
5,HyphenationSieve,5,1,4,0.2,0.9,0.9
6,AffixationSieve,5,5,0,1,0.9,0.9
7,DiseaseTermSynonymsSieve,14,10,4,0.71,0.9,0.9
8,StemmingSieve,3,2,1,0.67,0.9,0.9
9,PartialMatchSieve,9,1,8,0.11,0.89,0.89
10,Total,964,685,88,-,0.71,0.89


In [3]:
# N2C2 results
train_path = '../n2c2-data/train'
pred_path = '../n2c2-data/output'
gold_path = '../n2c2-data/test'
results = pd.read_csv(f'{pred_path}/results.txt',sep='\t')
results = results[[str(x)[0]!='D' for x in results.prediction]] # Filter DUI predictions
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,947,918,29,0.97,0.97,0.97
2,AbbreviationExpansionSieve,5,3,2,0.6,0.97,0.97
3,PrepositionalTransformSieve,3,2,1,0.67,0.97,0.97
4,Unknown,0,0,0,0,0.97,0.97
5,HyphenationSieve,7,7,0,1,0.97,0.97
6,AffixationSieve,14,9,5,0.64,0.96,0.96
7,DiseaseTermSynonymsSieve,8,0,8,0,0.95,0.95
8,StemmingSieve,11,5,6,0.45,0.95,0.95
9,PartialMatchSieve,45,20,25,0.44,0.93,0.93
10,Total,1698,964,76,-,0.57,0.93


In [5]:
results[results.goldCui=='C0002771']

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
30,34,analgesics,analgesics,,False,,,C0002771,"analgesics,in analgesics,analgesics in,with an...",0,"his pain medication,pain medications,her pain ..."
91,34,analgesics,analgesics,,False,,,C0002771,"analgesics,in analgesics,analgesics in,with an...",0,"his pain medication,pain medications,her pain ..."


In [464]:
getAmbiguous(train).head()

Unnamed: 0,name,cuis,ambiguous
15,pain,"[c0030193, c1960986, c1960997]",True
31,tender,"[c0234234, c0234233]",True
36,edema,"[c0522035, c0013604]",True
58,ca,"[c0201925, c0006826]",True
63,platelets,"[c0032181, c0443116]",True


In [419]:
lookup('C0559692', terminology)
lookup('C0559692',train) #C0559692	C1289919
results[results.goldCui=='C1289919']

C0559692:	Not Found  
C0559692:	intravenous fluid (C0559692) file=0098.concept


Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
542,250,intravenous fluid,,C0559692,True,ExactMatchSieve,trainTerminology,C1289919,intravenous fluid,1,iv fluids


In [392]:
# Error analysis
errors = results[(results.prediction != results.goldCui) & (results.normalized==True)].sort_values('normalizingSource')
errors = errors[['filename','name','namePermutations','goldNames','normalizingSource','normalizingSieveName','prediction','goldCui']]
print(errors.groupby('normalizingSource').count()['name'])
errors[errors.goldNames.notnull()]

normalizingSource
standardTerminology     6
trainTerminology       45
Name: name, dtype: int64


Unnamed: 0,filename,name,namePermutations,goldNames,normalizingSource,normalizingSieveName,prediction,goldCui
11,34,dm,dm,"diabetes mellitus,diabetes",standardTerminology,ExactMatchSieve,C538008,C0011849
734,318,f,"f,in f,f in,with f,f with,on f,f on,of f,f of,...",avf,standardTerminology,DiseaseTermSynonymsSieve,102510,C0449215
730,318,f,"f,in f,f in,with f,f with,on f,f on,of f,f of,...",avf,standardTerminology,DiseaseTermSynonymsSieve,102510,C0449215
1263,477,a pca,"a pca,pca in a,pca with a,pca on a,pca of a,a-...",pca,standardTerminology,StemmingSieve,C535506,C0078944
982,431,weakness,weakness,"muscle weakness,decreased strength,muscular we...",trainTerminology,ExactMatchSieve,C3714552,C0151786
1038,431,weakness,weakness,"muscle weakness,decreased strength,muscular we...",trainTerminology,ExactMatchSieve,C3714552,C0151786
1416,156406283,systolic,systolic,"an sbp,sbp,the systolic blood pressure",trainTerminology,ExactMatchSieve,C0232257,C0871470
1438,156406283,left,left,left arm,trainTerminology,ExactMatchSieve,C0205091,C0230330
1586,284487129,occlusion,occlusion,obstruction,trainTerminology,ExactMatchSieve,C0441597,C0028778
1610,284487129,catheterization,"catheterization,in catheterization,catheteriza...",a catheterization,trainTerminology,StemmingSieve,C0085590,C0007430


In [356]:
omissions = results[results.prediction.isnull() & results.goldNames.notnull()]
omissions[['filename','name','namePermutations','goldCui','goldNames']].head(5)

Unnamed: 0,filename,name,namePermutations,goldCui,goldNames
13,34,this problem,"this problem,problem in this,problem with this...",C0033213,problems
30,34,analgesics,"analgesics,in analgesics,analgesics in,with an...",C0002771,"his pain medication,pain medications,her pain ..."
91,34,analgesics,"analgesics,in analgesics,analgesics in,with an...",C0002771,"his pain medication,pain medications,her pain ..."
102,34,mildly,"mildly,in mildly,mildly in,with mildly,mildly ...",C2945599,"mild,gentle"
177,70,some effacement of the sulci,"some effacement of the sulci,some effacement i...",CUI-less,"study,associated mesenteric stranding,this stu..."


In [416]:
files = os.listdir(train_path)
train = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        df = pd.read_table(f'{train_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','cui'])
        df['file'] = [file for x in range(len(df))]
        train = pd.concat([train, df])
train = train[['cui','name','file']]
train.head()

  """


Unnamed: 0,cui,name,file
0,C0001339,ACUTE PANCREATITIS,0038.concept
1,C0001339,ACUTE PANCREATITIS,0038.concept
2,C0017168,GASTROESOPHAGEAL REFLUX DISEASE,0038.concept
3,C0017168,GERD,0038.concept
4,C0000737,abdominal pain,0038.concept


In [355]:
i = 0
files = os.listdir(pred_path)
pred = pd.DataFrame([])
gold = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        pred = pd.concat([pred, pd.read_table(f'{pred_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','mention','cui'])])
        gold = pd.concat([gold, pd.read_table(f'{gold_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','mention','cui'])])
        i += 1
print(i, len(pred),len(gold))
pred

  import sys
  


17 2059 2083


Unnamed: 0,file_id,ix,mention,cui
0,0034.concept,56|69,right le pain,"C565467,D018497,D018771,D001416,C564899,C00301..."
1,0034.concept,527|531,pain,D010146
2,0034.concept,534|542,headache,C0018681
3,0034.concept,545|556,temperature,C0204688
4,0034.concept,617|626,oxycodone,
...,...,...,...,...
74,974381789.concept,2273|2293,t-wave abnormalities,"D002869,D019465,D018376,C537398,C538400,C53731..."
75,974381789.concept,2258|2268,st segment,"C0520887,C565006,C567416,C537775,C538115,C5637..."
76,974381789.concept,2250|2257,diffuse,C0205219
77,974381789.concept,2652|2670,septal hypertrophy,"C566238,C566239,C563600,C0442004,C563565,C5677..."


In [27]:
df = pd.read_csv('test4.txt',sep='\t',header=None)
df.to_csv('test4_pipe.txt',sep='|',header=None,index=None)

In [18]:
df[1] = df[1].replace('"','')
df.iloc[57]

0                                                   58
1    Fast thinker. \nVery efficient to work. Gets a...
Name: 57, dtype: object