In [8]:
import requests, json

def match(conditions):
    """Submit ``conditions`` to the local matcher API and return the decoded JSON reply.

    ``conditions`` is sent as the JSON body of a POST request to
    ``http://localhost:5004/match_api``.
    """
    endpoint = "http://localhost:5004/match_api"
    response = requests.post(endpoint, json=conditions)
    return response.json()

def get_annotated_data():
    """Download the annotated affiliation dataset, save a local copy and return it.

    The JSON document is fetched from the object-storage URL below and written,
    pretty-printed, to ``match_pubmed_affiliations_with_countries_v3.json`` in
    the current working directory.

    Returns:
        The decoded JSON payload.
    """
    url = 'https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/models/match_pubmed_affiliations_with_countries_v3.json'
    data = requests.get(url).json()
    # The context manager closes (and flushes) the file deterministically.
    # UTF-8 is requested explicitly because ensure_ascii=False writes raw
    # non-ASCII characters, which the platform default encoding may reject.
    with open('match_pubmed_affiliations_with_countries_v3.json', 'w', encoding='utf-8') as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)
    return data

def compute_precision_recall(match_type, index_prefix=''):
    """Evaluate the matcher against the annotated dataset for one annotation key.

    Every annotated record that has a non-empty ``match_type`` entry is sent to
    the matcher (with ``year`` fixed to ``'2020'``). Each returned result that
    is listed in the annotation counts as a true positive, each one that is not
    counts as a false positive, and each annotated value missing from the
    results counts as a false negative.

    Args:
        match_type: Annotation key to evaluate; also forwarded to the matcher
            as ``type``.
        index_prefix: Forwarded to the matcher as ``index_prefix``.

    Returns:
        A dict with:
          * ``'res'``: ``{'precision': float, 'recall': float}``; a metric
            whose denominator is zero is reported as ``0.0``.
          * ``'false_positive'``: records that produced a false positive (a
            record appears once per offending result).
          * ``'false_negative'``: records that produced a false negative (a
            record appears once per missed annotation).
    """
    data = get_annotated_data()

    nb_TP, nb_FP, nb_FN = 0, 0, 0
    false_positive, false_negative = [], []
    for ix, d in enumerate(data):
        # Lightweight progress trace every 100 records.
        if ix % 100 == 0:
            print(ix, end=',')
        expected = d.get(match_type)
        if not expected:
            # Records without an annotation for this key are not evaluated.
            continue
        query = {'query': d['label'], 'year': '2020', 'type': match_type, 'index_prefix': index_prefix}
        predicted = match(query)['results']
        for x in predicted:
            if x in expected:
                nb_TP += 1
            else:
                nb_FP += 1
                false_positive.append(d)
        for x in expected:
            if x not in predicted:
                nb_FN += 1
                false_negative.append(d)

    # Guard the divisions: with no evaluated record, or no prediction at all,
    # the denominators are zero and the original code raised ZeroDivisionError.
    nb_predicted = nb_TP + nb_FP
    nb_expected = nb_TP + nb_FN
    precision = nb_TP / nb_predicted if nb_predicted else 0.0
    recall = nb_TP / nb_expected if nb_expected else 0.0
    metrics = {'precision': precision, 'recall': recall}
    return {'res': metrics, 'false_positive': false_positive, 'false_negative': false_negative}


In [15]:
# Evaluate country matching against the annotated dataset, using the 'matcher' index prefix.
metrics_country = compute_precision_recall(match_type = 'country', index_prefix='matcher')

0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,

In [17]:
# Precision / recall summary for the country evaluation.
metrics_country['res']

{'precision': 0.9991119005328597, 'recall': 0.9361348034116913}

In [18]:
# Annotated records for which the matcher returned a country that is not in the annotation.
metrics_country['false_positive']

[{'label': 'Northern Ireland Public Health Research Network, School of Health Sciences, Ulster University, Newtownabbey, UK.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.454053.3', 'grid.12641.30'],
  'country': ['gb']},
 {'label': 'Harvard Medical School, Boston, Massachusetts, Beth Israel Deaconess Medical Center, Boston, Massachusetts.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.471403.5', 'grid.239395.7'],
  'country': ['us']},
 {'label': 'Department of Radiation Oncology, Emory University, Atlanta, Georgia.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.189967.8'],
  'country': ['us']},
 {'label': 'Beth Israel Deaconess Medical Center, Harvard Medical School, Boston, Massachusetts (E.R.G.).',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.471403.5', 'grid.239395.7'],
  'country': ['us']}]

In [10]:
# Same evaluation for the 'grid' annotation key; the last expression displays precision / recall.
metrics_grid = compute_precision_recall(match_type = 'grid', index_prefix='matcher')
metrics_grid['res']

0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,

{'precision': 0.6086804340217011, 'recall': 0.6748156771439658}

In [7]:
# Annotated records for which the matcher returned a 'grid' value that is not in the annotation.
metrics_grid['false_positive']

[{'label': 'Laboratory of Clinical Immunology and Microbiology, Division of Intramural Research, National Institute of Allergy and Infectious Diseases, National Institutes of Health, Bethesda, MD, USA.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.419681.3'],
  'country': ['us']},
 {'label': 'Treatment and Research Center for Infectious Diseases, The Fifth Medical Center of PLA General Hospital, National Clinical Research Center for Infectious Diseases, Beijing, China.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.414252.4'],
  'country': ['cn']},
 {'label': 'Riga Stradins University, Riga, Latvia.',
  'rnsr': [],
  'siren': [],
  'grid': ['grid.17330.36'],
  'country': ['lv']},
 {'label': "Department of Diabetes, School of Life Course Sciences, King's College London, London, UK, Bariatric and Metabolic Surgery, King's College Hospital, London, UK. Electronic address: francesco.rubino@kcl.ac.uk.",
  'rnsr': [],
  'siren': [],
  'grid': ['grid.13097.3c'],
  'country': ['gb']},
 {'la

In [6]:
# Same evaluation for the 'rnsr' annotation key; the last expression displays precision / recall.
metrics_rnsr = compute_precision_recall(match_type = 'rnsr', index_prefix='matcher')
metrics_rnsr['res']

0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,

{'precision': 0.9909365558912386, 'recall': 0.800976800976801}