In [1]:
import os
import pandas as pd

In [55]:
def getStats(df):
    "Gets stats for given dataframe"
    n = len(df)
    tp = sum(df.prediction == df.goldCui)
    fp = sum((df.prediction != df.goldCui) & (df.normalized==True))
    recall = round(tp/n,2) if n > 0 else 0
    precision = round(tp/(tp+fp),2) if (tp+fp) > 0 else 0
    return n, tp, fp, recall, precision

def sieveResults(results):
    "Returns a sieve-level analysis of results."
    levels = range(1,max(results.normalizingSieveLevel)+1)
    sieves = pd.DataFrame([], columns=['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision'])
    
    # Results for each sieve
    for i in levels:
        df = results[results.normalizingSieveLevel==i]
        n, tp, fp, recall, precision = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"
        sieves.loc[i] = [sieve, n, tp, fp, recall, 0, 0]
        sieves.loc[i,'agg_recall'] = round(sum(sieves.tp)/sum(sieves.n),2)
        sieves.loc[i,'agg_precision'] = round(sum(sieves.tp)/(sum(sieves.tp)+sum(sieves.fp)),2)
    
    # Total results
    n, tp, fp, recall, precision = getStats(results)
    sieves.loc[i+1] = ['Total', n, tp, fp, '-', recall, precision]
    return sieves

def getAmbiguous(df):
    "Find ambiguous names"
    dfMap = {}
    for i in range(len(df)):
        name = df.iloc[i]['name'].lower().strip()
        cui = df.iloc[i]['cui'].lower().strip()

        if name not in dfMap:
            dfMap[name] = []

        dfMap[name] = list(set([cui] + dfMap[name]))

    namesToCuis = pd.DataFrame(dfMap.items(),columns=['name','cuis'])
    namesToCuis['ambiguous'] = [len(x) > 1 for x in namesToCuis.cuis]
    return namesToCuis[namesToCuis.ambiguous]

terminology = pd.read_table('../resources/n2c2_terminology.txt',sep='\|\|',header=None, names=['cui','name'])
terminology['file'] = ""
def lookup(cui, dic):
    dic = dic[dic.columns[:3]]
    dic.columns = ['cui','name','file']
    result = dic[dic.cui.str.contains(cui)].values
    name = result[0][1] if len(result) > 0 else "Not Found"
    targetCui = f'({result[0][0]})' if len(result) > 0 else ""
    file = f'file={result[0][2]}' if len(result) > 0 else ""
    print(f'{cui}:\t{name} {targetCui} {file}')



In [29]:
# # NCBI results
# pred_path = '../ncbi-data/output'
# gold_path = '../ncbi-data/test'
# results = pd.read_csv(f'{pred_path}/results.txt',sep='\t')
# sieveResults(results)

In [68]:
# N2C2 results
train_path = '../n2c2-data/train'
pred_path = '../n2c2-data/output'
gold_path = '../n2c2-data/test'
results = pd.read_csv(f'{pred_path}/results.txt',sep='\t')
# results = results[[str(x)[0]!='D' for x in results.prediction]] # Filter DUI predictions
sieveResults(results)#.to_csv('../results/n2c2_umls_sieve_level.csv',sep='|',header=None,index=None)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,1353,1264,89,0.93,0.93,0.93
2,AbbreviationExpansionSieve,7,4,3,0.57,0.93,0.93
3,PrepositionalTransformSieve,14,12,2,0.86,0.93,0.93
4,NumberReplacementSieve,1,0,1,0,0.93,0.93
5,HyphenationSieve,7,7,0,1,0.93,0.93
6,AffixationSieve,12,8,4,0.67,0.93,0.93
7,DiseaseTermSynonymsSieve,26,3,23,0.12,0.91,0.91
8,StemmingSieve,17,8,9,0.47,0.91,0.91
9,Total,2062,1306,131,-,0.63,0.91


In [31]:
train = pd.read_csv('train.txt',sep='|')
train[train.STY=='Quantitative Concept'][['CUI','name']].drop_duplicates()
qc = set(train[train.STY=='Quantitative Concept'].CUI)
results[results.goldCui.isin(qc)].head()

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
117,34,increased,,C0205217,True,ExactMatchSieve,trainTerminology,C0205217,increased,1,"augmented,increased,increased by"
229,70,changes,,C0443172,True,ExactMatchSieve,trainTerminology,C0443172,changes,1,"change in,changed,changed status,state changes..."
234,70,large,,C0549177,True,ExactMatchSieve,trainTerminology,C0549177,large,1,"big,great,large"
434,174,some,,C0205392,True,ExactMatchSieve,trainTerminology,C0205392,some,1,some
538,250,small,,C0700321,True,ExactMatchSieve,trainTerminology,C0700321,small,1,"little,small"


In [5]:
results[results.goldCui=='C0002771']

Unnamed: 0,filename,name,nameExpansion,prediction,normalized,normalizingSieveName,normalizingSource,goldCui,namePermutations,normalizingSieveLevel,goldNames
30,34,analgesics,analgesics,,False,,,C0002771,"analgesics,in analgesics,analgesics in,with an...",0,"his pain medication,pain medications,her pain ..."
91,34,analgesics,analgesics,,False,,,C0002771,"analgesics,in analgesics,analgesics in,with an...",0,"his pain medication,pain medications,her pain ..."


In [24]:
# getAmbiguous(train).head()

In [65]:
lookup('C0027524', terminology)
lookup('C2919541', terminology)
# lookup('C0232172',train)
# results[results.goldCui=='C0086439']

C0027524:	nebuliser|nebulizer (C0027524) file=
C2919541:	administration of medication using nebuliser mask|administration of medication using nebulizer mask|nebuliser therapy using mask|nebulizer therapy using mask (C2919541) file=


In [64]:
# Error analysis
errors = results[(results.prediction != results.goldCui) & (results.normalized==True)].sort_values('normalizingSource')
errors = errors[['filename','name','namePermutations','goldNames','normalizingSource','normalizingSieveName','prediction','goldCui']]
print(errors.groupby('normalizingSource').count()['name'])
errors[errors.normalizingSieveName=='ExactMatchSieve']

normalizingSource
standardTerminology    92
trainTerminology       39
Name: name, dtype: int64


Unnamed: 0,filename,name,namePermutations,goldNames,normalizingSource,normalizingSieveName,prediction,goldCui
932,390,hypokinesis,hypokinesis,"hypokinesis of cardiac wall,hypokinetic",standardTerminology,ExactMatchSieve,C0086439,C0232172
1451,156406283,thalamic,thalamic,"thalamic haemorrhage,thalamic hemorrhage,thala...",standardTerminology,ExactMatchSieve,C0430798,C0472376
1386,156406283,sore,sore,"sore pain,sore to touch,soreness,tenderness,te...",standardTerminology,ExactMatchSieve,C1455785,C0234233
1347,477,nebulizer,nebulizer,administration of medication using nebuliser m...,standardTerminology,ExactMatchSieve,C0027524,C2919541
1334,477,nebulizer,nebulizer,administration of medication using nebuliser m...,standardTerminology,ExactMatchSieve,C0027524,C2919541
...,...,...,...,...,...,...,...,...
1433,156406283,percodan,percodan,"aspirin / oxycodone,aspirin- and oxycodone-con...",trainTerminology,ExactMatchSieve,C2684258,C0717448
1440,156406283,left,left,"left upper extremity,left upper extremity stru...",trainTerminology,ExactMatchSieve,C0205091,C0230330
543,250,intravenous fluid,intravenous fluid,"fluid for intravenous administration,intraveno...",trainTerminology,ExactMatchSieve,C0559692,C1289919
1482,284487129,anterior myocardial infarction,anterior myocardial infarction,"acute anterior myocardial infarction,acute myo...",trainTerminology,ExactMatchSieve,C0340320,C2349195


In [356]:
omissions = results[results.prediction.isnull() & results.goldNames.notnull()]
omissions[['filename','name','namePermutations','goldCui','goldNames']].head(5)

Unnamed: 0,filename,name,namePermutations,goldCui,goldNames
13,34,this problem,"this problem,problem in this,problem with this...",C0033213,problems
30,34,analgesics,"analgesics,in analgesics,analgesics in,with an...",C0002771,"his pain medication,pain medications,her pain ..."
91,34,analgesics,"analgesics,in analgesics,analgesics in,with an...",C0002771,"his pain medication,pain medications,her pain ..."
102,34,mildly,"mildly,in mildly,mildly in,with mildly,mildly ...",C2945599,"mild,gentle"
177,70,some effacement of the sulci,"some effacement of the sulci,some effacement i...",CUI-less,"study,associated mesenteric stranding,this stu..."


In [416]:
files = os.listdir(train_path)
train = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        df = pd.read_table(f'{train_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','type','name','cui'])
        df['file'] = [file for x in range(len(df))]
        train = pd.concat([train, df])
train = train[['cui','name','file']]
train.head()

  """


Unnamed: 0,cui,name,file
0,C0001339,ACUTE PANCREATITIS,0038.concept
1,C0001339,ACUTE PANCREATITIS,0038.concept
2,C0017168,GASTROESOPHAGEAL REFLUX DISEASE,0038.concept
3,C0017168,GERD,0038.concept
4,C0000737,abdominal pain,0038.concept


In [355]:
i = 0
files = os.listdir(pred_path)
pred = pd.DataFrame([])
gold = pd.DataFrame([])
for file in files:
    if '.concept' in file:
        pred = pd.concat([pred, pd.read_table(f'{pred_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','mention','cui'])])
        gold = pd.concat([gold, pd.read_table(f'{gold_path}/{file}',sep='\|\|',header=None, names=['file_id','ix','mention','cui'])])
        i += 1
print(i, len(pred),len(gold))
pred

  import sys
  


17 2059 2083


Unnamed: 0,file_id,ix,mention,cui
0,0034.concept,56|69,right le pain,"C565467,D018497,D018771,D001416,C564899,C00301..."
1,0034.concept,527|531,pain,D010146
2,0034.concept,534|542,headache,C0018681
3,0034.concept,545|556,temperature,C0204688
4,0034.concept,617|626,oxycodone,
...,...,...,...,...
74,974381789.concept,2273|2293,t-wave abnormalities,"D002869,D019465,D018376,C537398,C538400,C53731..."
75,974381789.concept,2258|2268,st segment,"C0520887,C565006,C567416,C537775,C538115,C5637..."
76,974381789.concept,2250|2257,diffuse,C0205219
77,974381789.concept,2652|2670,septal hypertrophy,"C566238,C566239,C563600,C0442004,C563565,C5677..."


In [27]:
df = pd.read_csv('test4.txt',sep='\t',header=None)
df.to_csv('test4_pipe.txt',sep='|',header=None,index=None)

In [18]:
df[1] = df[1].replace('"','')
df.iloc[57]

0                                                   58
1    Fast thinker. \nVery efficient to work. Gets a...
Name: 57, dtype: object