In [219]:
#Import all the libraries we need
import os
import json
import pandas as pd

#Collect the data package ids, in order of first appearance, from the second tab-separated column of the
#annotator results file. The id is also the name of the package's folder, which contains the text of the
#pdf as a txt file and the json result from GNRD.
packages = []
#NOTE: the handle is intentionally left open here; a later cell rewinds it with seek(0) and reads it again.
annotation_file = open('annotator_results.txt', 'r')
next(annotation_file) #skip the first line of the file
#dict keys keep insertion order, so this drops repeated ids without reordering them
packages.extend(dict.fromkeys(record.strip().split('\t')[1] for record in annotation_file))
#packages = next(os.walk('pdf_text'))[1]
#packages.remove('133')
print(len(packages))
'110' in packages
#Should output '215' and 'True' if working properly.

215


True

In [220]:
#Iterate over all the package folders and read the GNRD output stored in each one. The csv gives the list of
#names that GNRD found. The json gives metadata (the total word count) plus the verification details that are
#used to figure out which names were updated to current valid names by GNVerifier.
gnrd_results = {} #package id -> list of the values in the third column of the csv
gnrd_verifier = {} #package id -> set of verbatim names that were updated by GNVerifier
tot_words = {} #package id -> metadata['totalWords'] from the json
for p in packages:
    if p == '133':
        #package 133 is skipped, so it gets no entry in any of the three dictionaries
        continue
    package_dir = 'dryad-paper/' + p
    folder_files = os.listdir(package_dir)
    #each folder must hold exactly one json and one csv; the unpacking raises ValueError otherwise
    [json_file] = [x for x in folder_files if x.endswith(".json")]
    [csv_file] = [x for x in folder_files if x.endswith(".csv")]
    #the with blocks close each file as soon as it has been read, instead of leaking two handles per package
    with open(package_dir + '/' + json_file,'r') as jdata:
        result = json.load(jdata)
    with open(package_dir + '/' + csv_file,'r') as cdata:
        next(cdata) #skip the first line of the csv
        #NOTE(review): a plain split(',') mis-parses quoted fields that contain commas - confirm the GNRD
        #csv never quotes a field before the third column, or switch to the csv module
        names = [line.split(',')[2] for line in cdata]
    gnrd_results[p] = names
    tot_words[p] = result['metadata']['totalWords']
    #This loop looks for names updated by GNVerifier. We are assuming that a name has been updated
    #if the matchType = Exact and isSynonym = True. matchType = NoMatch is given for abbreviations.
    matches = set()
    for n in result['names']:
        verbatim_name = n['verbatim']
        match_type = n['verification']['matchType']
        if match_type == 'NoMatch':
            #bestResult is not looked up for NoMatch entries
            continue
        synonym = n['verification']['bestResult']['isSynonym']
        if match_type == 'Exact' and synonym == True:
            matches.add(verbatim_name)
    gnrd_verifier[p] = matches
print('completed successfully')

completed successfully


In [221]:
#Start every data package off with its own empty list of human annotator names
annotator_results = {p: [] for p in packages}

In [222]:
#Read the file that contains the names found by the human annotators. Go over every row of the file and fill the
#dictionary that uses the data package ID number as the key and a list of (name string, string type) tuples as
#the value; the string type is e.g. scientific or vernacular name. Rows whose source type is 'data' are skipped.
#The file is opened afresh (under its own name) so this cell does not rely on a handle that an earlier cell
#left open, and so the file is closed again as soon as the rows have been read.
with open('annotator_results.txt', 'r') as annotation_rows:
    next(annotation_rows) #skip the first line of the file
    for line in annotation_rows:
        row = line.strip().split('\t')
        source_type = row[2]
        if source_type == 'data':
            continue
        string = row[0]
        data_package = row[1]
        string_type = row[4]
        #the list is mutated in place, so there is no need to store it back into the dictionary
        annotator_results[data_package].append((string, string_type))
print('completed successfully')

completed successfully


In [223]:
#Let's compare the list of names returned by GNRD with the list of names returned by the annotators
#for now, let's ignore the vernacular names in the annotator data
#performance metrics for each document are returned in results.tsv
#false positives and false negatives are returned in errors.txt

result_columns = ['data_package','true_positives','false_positives','false_negatives','returned_results','annotator_results','precision',
                  'recall','F1','total_words']
#One dict per data package. The DataFrame is built once after the loop: DataFrame.append was removed in
#pandas 2.0, and growing a frame row by row copies it on every iteration.
result_rows = []

#the with block makes sure errors.txt is flushed and closed, even if a package fails part way through
with open('errors.txt','w') as error_file:
    error_file.write('data_package' + '\t' + 'name' + '\t' + 'error_type' + '\n')
    for p in packages:
        if p not in gnrd_results:
            #the GNRD cell skips some packages (e.g. '133'); report them instead of failing with a KeyError
            print('skipping ' + p + ': no GNRD results')
            continue
        gnrd_set = set(gnrd_results[p]) #name strings returned by GNRD
        human_string = []
        for n,s in annotator_results[p]: #(name string, string type)
            if s != 'vernacular': #lets ignore vernacular names for the moment
                if n.isupper():
                    n = n.title() #all-caps annotator strings are title-cased before comparing
                human_string.append(n)
        human_set = set(human_string)
        #now we have a set of name strings for gnrd and a set for the annotators
        #let's compare them
        g = len(gnrd_set) #total strings returned by GNRD
        h = len(human_set) #total strings returned by annotator
        overlap_set = gnrd_set.intersection(human_set) #strings that are in both the gnrd and annotator lists
        o = len(overlap_set)
        gnrd_only = gnrd_set.difference(human_set) #in gnrd list but not in annotator list - false positives
        human_only = human_set.difference(gnrd_set) #in annotator list but not in gnrd list - false negatives
        #sorted so that errors.txt comes out in the same order on every run (set order is arbitrary)
        for x in sorted(gnrd_only):
            error_file.write(p + '\t' + x + '\t' + 'false positive' + '\n')
        for x in sorted(human_only):
            error_file.write(p + '\t' + x + '\t' + 'false negative' + '\n')
        go = len(gnrd_only)
        ho = len(human_only)
        if go + o != g or ho + o != h: #adding a sanity check
            print('problem')
        if g == 0 or h == 0:
            #precision or recall would divide by zero, so all three metrics are reported as 'NULL'
            precision = 'NULL'
            recall = 'NULL'
            F1 = 'NULL'
        else:
            precision = o / g
            recall = o / h
            if precision == 0 and recall == 0:
                F1 = 'NULL' #the F1 formula would divide by zero
            else:
                F1 = 2*((precision*recall)/(precision+recall))
        result_rows.append({'data_package':p,'true_positives':o,'false_positives':go,'false_negatives':ho,
                            'returned_results':g,'annotator_results':h,
                            'precision':precision,'recall':recall,'F1':F1,'total_words':tot_words[p]})

df = pd.DataFrame(result_rows, columns=result_columns)
df.to_csv('results.tsv', sep = '\t')

In [224]:
#Now we have a dataframe that contains performance data for each pdf
#let's calculate performance overall
#Only the count columns are summed, and every total is looked up by column name. Summing the whole frame and
#picking totals by position is fragile: the precision/recall/F1 columns can hold 'NULL' strings next to
#floats, so whether they end up in the result (and shift the positions) depends on the data and on the
#pandas version, and recent pandas versions raise a TypeError for such columns instead.
count_columns = ['true_positives','false_positives','false_negatives','returned_results','annotator_results','total_words']
TOT = df[count_columns].sum(axis=0)
TP = TOT['true_positives']
RR = TOT['returned_results']
AR = TOT['annotator_results']
TW = TOT['total_words']
FP = TOT['false_positives']
FN = TOT['false_negatives']
print('True Positives ' + str(TP))
print('Returned Results ' + str(RR))
print('Annotator Results ' + str(AR))
print('Total Words ' + str(TW))
print('False Positives ' + str(FP))
print('False Negatives ' + str(FN))

True Positives 7952
Returned Results 8691
Annotator Results 9753
Total Words 1562799
False Positives 739
False Negatives 1801


In [225]:
#Overall precision, recall and F1, computed from the totals over all data packages
precision = TP / RR
recall = TP / AR
F1 = 2*((precision*recall)/(precision+recall))
print(F1)
#print joins its arguments with a single space, giving the same text as concatenating with str()
print('precision is', precision)
print('recall is', recall)

0.8622858382129691
precision is 0.9149695086871477
recall is 0.8153388700912539


In [226]:
#Total number of names that were updated by GNVerifier, added up over every data package.
tot_v = sum(len(updated_names) for updated_names in gnrd_verifier.values())
print(tot_v)

1254
