In [162]:
import os
import pandas as pd

In [199]:
def getStats(df):
    """Compute count, hit/miss totals, recall and precision for a results frame.

    Reads the ``prediction``, ``goldCui`` and ``normalized`` columns of ``df``.

    Returns a tuple ``(n, tp, fp, recall, precision)`` where

    * ``n``         -- number of rows in ``df``;
    * ``tp``        -- rows whose prediction equals the gold CUI;
    * ``fp``        -- rows whose prediction differs from the gold CUI while
                       ``normalized`` is True;
    * ``recall``    -- ``tp / n`` rounded to 2 decimals (0 for an empty frame);
    * ``precision`` -- ``tp / (tp + fp)`` rounded to 2 decimals (0 when there
                       are neither hits nor misses).
    """
    total = len(df)

    is_hit = df.prediction == df.goldCui
    is_miss = (df.prediction != df.goldCui) & (df.normalized == True)
    # Cast back to plain Python ints so the returned counts are not numpy scalars.
    hits = int(is_hit.sum())
    misses = int(is_miss.sum())

    if total > 0:
        recall = round(hits / total, 2)
    else:
        recall = 0

    answered = hits + misses
    if answered > 0:
        precision = round(hits / answered, 2)
    else:
        precision = 0

    return total, hits, misses, recall, precision

def sieveResults(results):
    """Returns a sieve-level analysis of results.

    ``results`` must provide the columns ``normalizingSieveLevel`` and
    ``normalizingSieveName`` plus whatever ``getStats`` reads.

    The returned DataFrame has one row per sieve level from 1 up to the highest
    level present in ``results`` (indexed by that level), followed by a final
    'Total' row at index ``highest level + 1``. Columns:

    * ``sieve``         -- name of the sieve at that level, or "Unknown" when
                           no row was handled at that level;
    * ``n, tp, fp``     -- counts from ``getStats`` for that level's rows;
    * ``sieve_acc``     -- ``getStats`` recall for that level alone
                           ('-' on the Total row);
    * ``agg_recall``    -- cumulative tp / cumulative n over levels 1..i;
    * ``agg_precision`` -- cumulative tp / (cumulative tp + cumulative fp)
                           over levels 1..i.

    Note that the per-level cumulative ratios only cover rows with a level in
    1..i, whereas the Total row is computed by ``getStats`` over *all* rows of
    ``results`` (including any whose level is below 1).

    An empty ``results`` (or one with no usable level) yields just the Total row.
    """
    columns = ['sieve','n','tp','fp', 'sieve_acc', 'agg_recall', 'agg_precision']

    # Series.max() skips missing values and returns NaN when nothing is left;
    # int() also tolerates a float-typed level column (e.g. 10.0).
    top_level = results.normalizingSieveLevel.max() if len(results) > 0 else 0
    n_levels = 0 if pd.isna(top_level) else int(top_level)

    rows = []
    cum_n = cum_tp = cum_fp = 0

    # Results for each sieve
    for level in range(1, n_levels + 1):
        df = results[results.normalizingSieveLevel==level]
        n, tp, fp, recall, _ = getStats(df)
        sieve = df.normalizingSieveName.iloc[0] if n > 0 else "Unknown"

        cum_n += n
        cum_tp += tp
        cum_fp += fp
        # Guard the ratios: leading levels may have handled no rows at all.
        agg_recall = round(cum_tp/cum_n,2) if cum_n > 0 else 0
        agg_precision = round(cum_tp/(cum_tp+cum_fp),2) if (cum_tp+cum_fp) > 0 else 0
        rows.append([sieve, n, tp, fp, recall, agg_recall, agg_precision])

    # Total results
    n, tp, fp, recall, precision = getStats(results)
    rows.append(['Total', n, tp, fp, '-', recall, precision])

    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns, index=range(1, n_levels + 2))

In [195]:
# NCBI results
# Point the shared path variables at the NCBI run and load its tab-separated
# results table. The summary call below is left disabled, as in the original cell.
pred_path, gold_path = '../ncbi-data/output', '../ncbi-data/test'
results = pd.read_csv(
    f'{pred_path}/results.txt',
    sep='\t',
)
# sieveResults(results)

In [236]:
# N2C2 results
# Point the shared path variables at the N2C2 run and load its tab-separated
# results table.
pred_path, gold_path = '../n2c2-data/output', '../n2c2-data/test'
results = pd.read_csv(
    f'{pred_path}/results.txt',
    sep='\t',
)
# Filter DUI predictions: keep a row only when the text form of its prediction
# does not start with 'D' (missing predictions render as 'nan' and are kept).
not_dui = results.prediction.map(lambda cui: str(cui)[0] != 'D')
results = results[not_dui]
sieveResults(results)

Unnamed: 0,sieve,n,tp,fp,sieve_acc,agg_recall,agg_precision
1,ExactMatchSieve,970,933,37,0.96,0.96,0.96
2,AbbreviationExpansionSieve,5,3,2,0.6,0.96,0.96
3,PrepositionalTransformSieve,2,1,1,0.5,0.96,0.96
4,Unknown,0,0,0,0,0.96,0.96
5,HyphenationSieve,2,2,0,1,0.96,0.96
6,AffixationSieve,10,6,4,0.6,0.96,0.96
7,DiseaseTermSynonymsSieve,6,0,6,0,0.95,0.95
8,StemmingSieve,10,4,6,0.4,0.94,0.94
9,Unknown,0,0,0,0,0.94,0.94
10,PartialMatchSieve,36,15,21,0.42,0.93,0.93


In [238]:
# Error analysis
# results[[',' in str(x) for x in list(results.prediction)]] multiple CUIs

# Wrong answers only: rows flagged as normalized whose prediction is not the gold CUI.
errors = results.loc[(results.prediction != results.goldCui) & (results.normalized == True)]
errors = errors.sort_values('normalizingSource').loc[:, [
    'filename',
    'name',
    'namePermutations',
    'goldNames',
    'normalizingSource',
    'normalizingSieveName',
    'prediction',
    'goldCui',
]]
# Number of errors contributed by each normalizing source.
print(errors.groupby('normalizingSource')['name'].count())
# Show the errors that came from the standard terminology.
errors.loc[errors.normalizingSource == 'standardTerminology']

normalizingSource
normalizedNameToCuiListMap    22
standardTerminology            5
trainTerminology              29
Name: name, dtype: int64


Unnamed: 0,filename,name,namePermutations,goldNames,normalizingSource,normalizingSieveName,prediction,goldCui
1263,477,a pca,"[a pca, pca in a, pca with a, pca on a, pca of...","[a pca, pca in a, pca with a, pca on a, pca of...",standardTerminology,StemmingSieve,C535506,C0078944
11,34,dm,[dm],[dm],standardTerminology,ExactMatchSieve,C538008,C0011849
2052,974381789,short,"[short, in short, short in, with short, short ...","[short, in short, short in, with short, short ...",standardTerminology,DiseaseTermSynonymsSieve,C537327,C1806781
736,318,1,[1],[1],standardTerminology,ExactMatchSieve,C565162,C0449212
730,318,f,"[f, in f, f in, with f, f with, on f, f on, of...","[f, in f, f in, with f, f with, on f, f on, of...",standardTerminology,DiseaseTermSynonymsSieve,102510,C0449215


In [36]:
# Load every '.concept' file found in pred_path, together with the file of the
# same name under gold_path, into two stacked frames.
# NOTE: pred_path / gold_path come from whichever results cell was run last.
files = sorted(os.listdir(pred_path))  # os.listdir order is arbitrary; sort for reproducible row order
concept_columns = ['file_id','ix','mention','cui']
# A multi-character separator is interpreted by pandas as a regular expression:
# the pipes must be escaped (raw string, so '\|' is not an invalid str escape)
# and the python engine is requested explicitly to avoid the fallback warning.
read_options = dict(sep=r'\|\|', header=None, names=concept_columns, engine='python')

pred_frames = []
gold_frames = []
for file in files:
    if '.concept' in file:
        pred_frames.append(pd.read_table(f'{pred_path}/{file}', **read_options))
        gold_frames.append(pd.read_table(f'{gold_path}/{file}', **read_options))

# Number of concept files loaded.
i = len(pred_frames)
# Concatenate once (pd.concat rejects an empty list, hence the fallback).
# Per-file row indices are kept, as before.
pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame([])
gold = pd.concat(gold_frames) if gold_frames else pd.DataFrame([])
print(i, len(pred),len(gold))
pred

  import sys
  


17 2063 2083


Unnamed: 0,file_id,ix,mention,cui
0,0034.concept,56|69,right le pain,
1,0034.concept,527|531,pain,D010146
2,0034.concept,534|542,headache,C0018681
3,0034.concept,545|556,temperature,C0204688
4,0034.concept,617|626,oxycodone,
...,...,...,...,...
74,974381789.concept,2273|2293,t-wave abnormalities,
75,974381789.concept,2258|2268,st segment,
76,974381789.concept,2250|2257,diffuse,C0205219
77,974381789.concept,2652|2670,septal hypertrophy,


In [27]:
# Re-save the tab-separated, header-less test4.txt as a pipe-separated file,
# writing neither a header row nor the index column.
df = pd.read_table('test4.txt', header=None)
df.to_csv(
    'test4_pipe.txt',
    sep='|',
    header=False,
    index=False,
)

In [18]:
# Remove double-quote characters from the text in column 1.
# Series.replace('"', '') only swaps cells whose *entire* value is '"'; deleting
# the character inside longer strings needs the .str accessor. regex=False
# treats the pattern as a literal character.
df[1] = df[1].str.replace('"', '', regex=False)
# Inspect one row (position 57) to check the result.
df.iloc[57]

0                                                   58
1    Fast thinker. \nVery efficient to work. Gets a...
Name: 57, dtype: object