In [1]:
import ast

import pandas as pd

In [2]:
# Gold affiliation annotations published in the allenai/S2AFF GitHub repository.
GOLD_ANNOTATIONS_URL = 'https://github.com/allenai/S2AFF/raw/main/data/gold_affiliation_annotations.csv'
df = pd.read_csv(GOLD_ANNOTATIONS_URL)

In [3]:
# Build one record per annotated affiliation: the raw affiliation string,
# its dataset split, and the ROR ids extracted from its labels.
data = []
for _, row in df.iterrows():
    # `labels` is stored as text and must be parsed into a Python object.
    # The CSV is downloaded from the network, so parse it with
    # ast.literal_eval (literals only) instead of eval, which would execute
    # any code embedded in the file.
    labels = ast.literal_eval(row.labels)
    # Keep only the ROR labels and strip the URL prefix to get the bare id.
    rors = [
        label.replace('https://ror.org/', '')
        for label in labels
        if 'ror.org' in label
    ]
    # Rows without any ROR label are dropped.
    if rors:
        data.append({
            'label': row.original_affiliation,
            'split': row.split,
            'ror': rors,
        })

In [4]:
# Display the parsed records (each has 'label', 'split' and 'ror' keys).
data

[{'label': 'Chinese Academy of Sciences (CAS)',
  'split': 'train',
  'ror': ['034t30j35']},
 {'label': 'Laboratory of Immunologic and Inflammatory Diseases, Institute for Nutritional Sciences, Shanghai Institutes for Biological Sciences, Chinese Academy of Sciences; Graduate School of the Chinese Academy of Sciences, Shanghai, PR China.',
  'split': 'train',
  'ror': ['034t30j35', '011rf9d86']},
 {'label': 'CCAST (World Laboratory), P.O. Box 8730, Beijing 100080, People’s Republic of China',
  'split': 'train',
  'ror': ['02egfyg20', '034t30j35']},
 {'label': 'Inst. of Comput. Technol., Chinese Acad. of Sci., Beijing, China',
  'split': 'train',
  'ror': ['034t30j35', '0090r4d87']},
 {'label': 'Chinese academy of Sciences',
  'split': 'train',
  'ror': ['034t30j35']},
 {'label': 'National Center for Drug Screening, Shanghai Institute of Materia Medica, Shanghai Institutes for Biological Sciences, Chinese Academy of Sciences, Shanghai 201203, P. R. China',
  'split': 'train',
  'ror': 

In [5]:
import requests, json

def match(conditions, url="http://localhost:5004/match"):
    """POST a match query to the matcher service and return its JSON reply.

    Args:
        conditions: JSON-serialisable payload sent as the request body.
        url: Endpoint to query. Defaults to the local instance on port 5004,
            so existing single-argument calls behave as before.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    response = requests.post(url, json=conditions)
    # Fail loudly on an error status rather than decoding an error payload
    # and failing later with a confusing KeyError or JSON decoding error.
    response.raise_for_status()
    return response.json()


def compute_precision_recall(data, split = None):
    """Score the matcher against gold ROR ids and collect the mismatches.

    Each example with a non-empty 'ror' list is sent to `match`; the ids in
    the reply's 'results' are compared with the gold ids:

    * a returned id that is in the gold list counts as a true positive;
    * a returned id that is not in the gold list counts as a false positive;
    * a gold id that is not returned counts as a false negative.

    Args:
        data: Iterable of dicts with 'label', 'split' and 'ror' keys.
        split: If truthy, only examples whose 'split' equals this value are
            evaluated; otherwise every example is evaluated.

    Returns:
        A dict with:
          'res': {'precision': float, 'recall': float}. A metric is 0.0 when
              its denominator is zero (nothing was returned / nothing was
              expected) instead of raising ZeroDivisionError.
          'false_positive': examples that produced a false positive, appended
              once per offending id, so an example may appear several times.
          'false_negative': same, for false negatives.

    Progress is printed as the index of the example within `data`; indices
    are only printed for examples that pass the split filter.
    """
    index_prefix = 'matcher'
    match_type = 'ror'

    nb_TP, nb_FP, nb_FN = 0, 0, 0
    false_positive, false_negative = [], []
    for ix, d in enumerate(data):

        if split and d['split'] != split:
            continue

        if ix % 100 == 0:
            print(ix, end=',')

        expected = d.get(match_type)
        if not expected:
            # Nothing to compare against for this example.
            continue

        response = match({'query': d['label'], 'year':'2020', 'type': match_type, 'index_prefix': index_prefix})
        predicted = response['results']

        for x in predicted:
            if x in expected:
                nb_TP += 1
            else:
                nb_FP += 1
                false_positive.append(d)
        for x in expected:
            if x not in predicted:
                nb_FN += 1
                false_negative.append(d)

    # Guard the denominators: an empty dataset, an empty split or a matcher
    # that returned nothing would otherwise raise ZeroDivisionError.
    nb_predicted = nb_TP + nb_FP
    nb_expected = nb_TP + nb_FN
    precision = nb_TP / nb_predicted if nb_predicted else 0.0
    recall = nb_TP / nb_expected if nb_expected else 0.0
    metrics = {'precision' : precision, 'recall' : recall}
    return {'res': metrics, 'false_positive': false_positive, 'false_negative': false_negative}


In [11]:
# Evaluate on every example: no split is passed, so nothing is filtered out.
res = compute_precision_recall(data)
# Show only the precision/recall summary.
res['res']

0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,

{'precision': 0.9025875190258752, 'recall': 0.505541346973572}

In [9]:
# Evaluate only the examples whose 'split' field equals 'test'.
res_test = compute_precision_recall(data, 'test')
# Show only the precision/recall summary.
res_test['res']

300,500,800,1000,1500,1700,1900,

{'precision': 0.9119170984455959, 'recall': 0.565008025682183}

In [10]:
# Test-split examples for which the matcher returned an id that is not in the
# gold 'ror' list (an example is listed once per such id).
res_test['false_positive']

[{'label': 'Illinois Genetic Algorithms Laboratory, University of Illinois, Urbana, IL#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Department of General Engineering University of Illinois, Urbana, IL 61801, USA#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Department of General Engineering, Illinois Genetic Algorithms Laboratory, University of Illinois, Urbana, IL 61801, USA#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Illinois Genetic Algorithms Laboratory University of Illinois Urbana, IL 61801 USA deg@illigal.ge.uiuc.edu#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'DAMTP, CMS, University of Cambridge, Wilberforce Road, Cambridge CB3 0WA, UK and Department of Physics, University of Regina, Regina, SK, Canada S4S 0A2#TAB#',
  'split': 'test',
  'ror': ['013meh722']},
 {'label': 'Department of Physics, State University of New York at Stony Brook, Stony Brook, New York',
  'split': 'test',
  'ror': ['05qghxh33']},
 

In [20]:
# Inspect the first false-positive example recorded in `res`.
res['false_positive'][0]

{'label': 'The Laboratory of Photochemistry, Center for Molecular Science, Institute of Chemistry, Chinese Academy of Sciences, Beijing 100080, PR China',
 'split': 'train',
 'ror': ['034t30j35']}

In [None]:
# For the example above, the affiliation-matcher also returns 048y1rc66, which
# is not in the gold 'ror' list (hence the false positive) but seems OK.

In [21]:
# Test-split examples for which the matcher returned an id that is not in the
# gold 'ror' list (an example is listed once per such id).
res_test['false_positive']

[{'label': 'Inst. of Software, Peking Univ., Beijing, China',
  'split': 'test',
  'ror': ['02v51f717']},
 {'label': 'Illinois Genetic Algorithms Laboratory, University of Illinois, Urbana, IL#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Department of General Engineering University of Illinois, Urbana, IL 61801, USA#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Department of General Engineering, Illinois Genetic Algorithms Laboratory, University of Illinois, Urbana, IL 61801, USA#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Illinois Genetic Algorithms Laboratory University of Illinois Urbana, IL 61801 USA deg@illigal.ge.uiuc.edu#TAB#',
  'split': 'test',
  'ror': ['047426m28']},
 {'label': 'Department of Applied Mathematics and Theoretical Physics, Cambridge University, Wilberforce Road, CB3 0WA Cambridge, UK',
  'split': 'test',
  'ror': ['013meh722']},
 {'label': 'Department of Applied Mathematics and Theoretical Physics, Centre fo