In [1]:
import urllib.request, urllib.error, urllib.parse
import json
import os
import pandas as pd
from tqdm import tqdm

# Base URL of the BioPortal REST service queried by this notebook.
REST_URL = "https://services.bioportal.lirmm.fr"
# API key used to authenticate requests. Set the BIOPORTAL_API_KEY environment variable to
# supply your own key without editing the notebook; the literal is kept only as a
# backward-compatible fallback (NOTE(review): credentials should not live in source control).
API_KEY = os.environ.get("BIOPORTAL_API_KEY") or "97be3b10-804a-4c98-9407-05caf1629ebb"
ONTO_SELECTED = "&ontologies=CISP-2,SNOMED35VF,CIF,WHO-ARTFRE,STY,ATCFRE,CIM-11,MEDLINEPLUS,MTHMSTFRE,MSHFRE,MDRFRE" #All French UMLS and SNOMED 3.5 ontologies
OPTIONS_1 = "&longest_only=true&exclude_numbers=false&whole_word_only=true&exclude_synonyms=false&expand_mappings=false&fast_context=false&certainty=false" #Match longest only because we want the whole concept, not the parts
OPTIONS_2 = "&temporality=false&experiencer=false&negation=false&lemmatize=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false"
#We wanted to use fast_context, but it ran into several problems parsing punctuation. We'll leave this for future work.
# Query-string suffix appended verbatim to every annotator request.
PREFERENCE_STRING = ONTO_SELECTED + OPTIONS_1 + OPTIONS_2

def get_json(url):
    """Fetch ``url`` with the API key attached and return the decoded JSON payload.

    Args:
        url: Fully-formed request URL (any query string must already be encoded).

    Returns:
        The Python object produced by ``json.loads`` on the response body.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` is a subclass).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    # Read inside a context manager so the response is closed deterministically;
    # this function is called once per sentence, so leaked handles would accumulate.
    with opener.open(url) as response:
        return json.loads(response.read())

In [None]:
dir_path = os.getcwd()
# Load the gold sentences, one per line, stripped of surrounding whitespace. The context
# manager closes the file even if reading raises.
with open(os.path.join(dir_path, "../wmt22gold.txt"), "r", encoding = "utf8") as f:
    fr_sent = [line.strip() for line in f]

#Get annotations
# annotations_fr[i] holds the (prefLabel, occurrence count) pairs found in sentence i.
annotations_fr = []
for sentence in tqdm(fr_sent): #Use the ICL VPN for this - it seems to get privileged access
    #The annotator uses "%20" to break up words, and therefore encounters problems with sentences which contain the % symbol.
    #Downloading and querying the ontologies locally (involving SPARQL) was judged too time-consuming, so we work around it instead:
    #"%" is replaced with "pour cent" (the French translation) to preserve meaning whenever it is present.
    #The surrounding spaces ensure that input such as "95%" is parsed correctly on the receiving end.
    sentence = sentence.replace("%", " pour cent ")
    annotations_per_sentence = []
    annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(sentence) + PREFERENCE_STRING)
    #The same words may be annotated by several ontologies; such duplicates must not be recorded twice.
    #Because we take the longest annotated sequence, any duplicates will "hit" the same subsequence, so the
    #(from, to) offsets of each result's first annotation serve as the de-duplication key.
    found = set()
    for result in annotations:
        tag_loc = (result["annotations"][0]["from"], result["annotations"][0]["to"])
        if tag_loc not in found:
            #Record the concept label together with its number of occurrences across the sentence
            info = (result["annotatedClass"]["prefLabel"], len(result["annotations"]))
            annotations_per_sentence.append(info)
            found.add(tag_loc)
    annotations_fr.append(annotations_per_sentence)

In [61]:
#Generate results
# Flatten the per-sentence annotation lists into (sentence index, concept, count) rows;
# a sentence's position in annotations_fr doubles as its ID.
flat_rows = [
    (sentence_idx, info[0], info[1])
    for sentence_idx, sentence_infos in enumerate(annotations_fr)
    for info in sentence_infos
]
sent_IDs = [row[0] for row in flat_rows]
concepts = [row[1] for row in flat_rows]
counts = [row[2] for row in flat_rows]
term_list = pd.DataFrame(data = {"sent_ID" : sent_IDs, "concept" : concepts, "count" : counts})
# Written as tab-separated text with a header row and without the DataFrame index.
term_list.to_csv("gold_onto_concepts.txt", sep = "\t", header = True, index = False)

In [67]:
#Now, repeat this workflow for all predictions we've gathered so far
#(the original note expected 16 files per directory, 32 in total - TODO confirm against the directory contents)
from os import listdir
from os.path import isfile, join
base_path = "../predictions/opus_en_fr_base/"
big_path = "../predictions/opus_en_fr_big/"
# Prefix every directory entry with its directory and keep regular files only.
base_files = list(filter(isfile, (join(base_path, entry) for entry in listdir(base_path))))
big_files = list(filter(isfile, (join(big_path, entry) for entry in listdir(big_path))))
# Base-model predictions first, then big-model predictions.
all_filenames = base_files + big_files

In [72]:
# For every prediction file: annotate each sentence, then write one concept table per file.
for filename in all_filenames:
    # Mirror the prediction path under results/ontology_annotations. Only the first
    # "predictions" (the directory component of these relative paths) is rewritten, so a
    # file name that happens to contain the word is left intact.
    destination = filename.replace("predictions", "results/ontology_annotations", 1).replace("_pred", "_onto_concepts")
    # to_csv does not create missing directories, so make sure the target exists up front
    # rather than failing after several minutes of API calls.
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok = True)
    # The context manager closes the file even if reading raises.
    with open(os.path.join(dir_path, filename), "r", encoding = "utf8") as f:
        fr_sent = [line.strip() for line in f]
    annotations_fr = []
    for sentence in tqdm(fr_sent):
        # "%" is spelled out in French (with padding spaces) because the annotator has problems with the % symbol.
        sentence = sentence.replace("%", " pour cent ")
        annotations_per_sentence = []
        annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(sentence) + PREFERENCE_STRING)
        # (from, to) of each result's first annotation is the key used to drop results that
        # cover the same words as an earlier result.
        found = set()
        for result in annotations:
            tag_loc = (result["annotations"][0]["from"], result["annotations"][0]["to"])
            if tag_loc not in found:
                info = (result["annotatedClass"]["prefLabel"], len(result["annotations"])) #Take a count of concepts across the sentence
                annotations_per_sentence.append(info)
                found.add(tag_loc)
        annotations_fr.append(annotations_per_sentence)
    # Flatten into parallel columns; the sentence's position in the file is its ID.
    sent_IDs = []
    concepts = []
    counts = []
    for i, sentence_annotations in enumerate(annotations_fr):
        for annotation_info in sentence_annotations:
            sent_IDs.append(i)
            concepts.append(annotation_info[0])
            counts.append(annotation_info[1])
    term_list = pd.DataFrame(data = {"sent_ID" : sent_IDs, "concept" : concepts, "count" : counts})
    term_list.to_csv(destination, sep = "\t", header = True, index = False)

100%|██████████| 588/588 [03:26<00:00,  2.85it/s]
100%|██████████| 588/588 [03:41<00:00,  2.66it/s]
100%|██████████| 588/588 [03:14<00:00,  3.03it/s]
100%|██████████| 588/588 [03:01<00:00,  3.23it/s]
100%|██████████| 588/588 [03:07<00:00,  3.14it/s]
100%|██████████| 588/588 [03:02<00:00,  3.22it/s]
100%|██████████| 588/588 [04:00<00:00,  2.44it/s]
100%|██████████| 588/588 [03:48<00:00,  2.57it/s]
100%|██████████| 588/588 [04:06<00:00,  2.38it/s]
100%|██████████| 588/588 [06:10<00:00,  1.59it/s]
100%|██████████| 588/588 [04:47<00:00,  2.05it/s]
100%|██████████| 588/588 [03:19<00:00,  2.95it/s]
100%|██████████| 588/588 [03:07<00:00,  3.14it/s]
100%|██████████| 588/588 [03:10<00:00,  3.09it/s]
100%|██████████| 588/588 [03:09<00:00,  3.10it/s]
100%|██████████| 588/588 [03:07<00:00,  3.13it/s]
100%|██████████| 588/588 [03:14<00:00,  3.02it/s]
100%|██████████| 588/588 [03:15<00:00,  3.00it/s]
100%|██████████| 588/588 [03:08<00:00,  3.12it/s]
100%|██████████| 588/588 [03:05<00:00,  3.17it/s]


In [5]:
#Now, repeat this workflow for the TFIDF model predictions
from os import listdir
from os.path import isfile, join
supp_path = "../predictions/"
# Keep only regular files located directly in supp_path (sub-directories are skipped) - 4 files
supp_files = list(filter(isfile, (join(supp_path, entry) for entry in listdir(supp_path))))
# Input paths in the following cells are resolved against the current working directory.
dir_path = os.getcwd()

In [6]:
# Show the collected TFIDF prediction file paths (a bare last expression is rendered as the cell's output).
supp_files

['../predictions/opus_base_AoN_tfidf_wce_unsampled_pred.txt',
 '../predictions/opus_base_simple_tfidf_wce_pred.txt',
 '../predictions/opus_big_fine_tfidf_wce_unsampled_pred.txt',
 '../predictions/opus_big_simple_tfidf_wce_unsampled_pred.txt']

In [7]:
# For every TFIDF prediction file: annotate each sentence, then write one concept table per file.
for filename in supp_files:
    # Mirror the prediction path under results/ontology_annotations. Only the first
    # "predictions" (the directory component of these relative paths) is rewritten, so a
    # file name that happens to contain the word is left intact.
    destination = filename.replace("predictions", "results/ontology_annotations", 1).replace("_pred", "_onto_concepts")
    # to_csv does not create missing directories, so make sure the target exists up front
    # rather than failing after several minutes of API calls.
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok = True)
    # The context manager closes the file even if reading raises.
    with open(os.path.join(dir_path, filename), "r", encoding = "utf8") as f:
        fr_sent = [line.strip() for line in f]
    annotations_fr = []
    for sentence in tqdm(fr_sent):
        # "%" is spelled out in French (with padding spaces) because the annotator has problems with the % symbol.
        sentence = sentence.replace("%", " pour cent ")
        annotations_per_sentence = []
        annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(sentence) + PREFERENCE_STRING)
        # (from, to) of each result's first annotation is the key used to drop results that
        # cover the same words as an earlier result.
        found = set()
        for result in annotations:
            tag_loc = (result["annotations"][0]["from"], result["annotations"][0]["to"])
            if tag_loc not in found:
                info = (result["annotatedClass"]["prefLabel"], len(result["annotations"])) #Take a count of concepts across the sentence
                annotations_per_sentence.append(info)
                found.add(tag_loc)
        annotations_fr.append(annotations_per_sentence)
    # Flatten into parallel columns; the sentence's position in the file is its ID.
    sent_IDs = []
    concepts = []
    counts = []
    for i, sentence_annotations in enumerate(annotations_fr):
        for annotation_info in sentence_annotations:
            sent_IDs.append(i)
            concepts.append(annotation_info[0])
            counts.append(annotation_info[1])
    term_list = pd.DataFrame(data = {"sent_ID" : sent_IDs, "concept" : concepts, "count" : counts})
    term_list.to_csv(destination, sep = "\t", header = True, index = False)

100%|██████████| 588/588 [02:50<00:00,  3.44it/s]
100%|██████████| 588/588 [02:40<00:00,  3.66it/s]
100%|██████████| 588/588 [02:41<00:00,  3.63it/s]
100%|██████████| 588/588 [02:45<00:00,  3.56it/s]
