Aim: utilise multiple sources to achieve rich, accurate annotations of life science literature.

Objectives:
    1) For each annotated term, return a tuple (term, class), where class is one of the
       following:
            chemical
            biomolecule
            drug
            pathway_process
            anatomy
            species
            disease
            technique / methodology
    2) Design and implement a model to discern between 'meaningful' terms and terms to
       ignore (i.e, stopwords). A threshold could be determined by testing differing
       amounts of most common words from a corpus of lay text. 
    3) Some system to detect and deal with ambiguity.

Sources:
    Pubtator
    Bio2RDF
    DBpedia
    Wikipedia
    Wordnet
    


In [27]:
# get pubtator annotations
import requests
import json
import pandas as pd
from IPython.display import display
base = 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept'
retmode = 'json'
pmid = '27864336'
url = '{0}/{1}/{2}' .format(base, pmid, retmode)
r = requests.get(url).text
j = json.loads(r)
text = j['text']
pbt_ann = []
# pbt_to_class = {
#     'gene': 'biomolecule',
#     'species': 'species',
#     'chemical': 'biomolecule',
#     'disease': 'disease',
# }
for a in j['denotations']:
    c = a['obj'].split(':')[0].lower()
    try:
#         c = pbt_to_class[c]
        pass
    except:
        print('Unmapped class: ', c)
    strt = a['span']['begin']
    end = a['span']['end']
    s = text[strt:end]
    pbt_ann.append((s, c))
pbt_ann = set(pbt_ann)
pbt = pd.DataFrame(list(pbt_ann), columns=['term', 'pubtator_class']).set_index('term')
display(pbt)

Unnamed: 0_level_0,pubtator_class
term,Unnamed: 1_level_1
mouse,species
BMPR1a,gene
Id2,gene
BMP4,gene
BMPR2,gene
BMPR1b,gene
Smad1/5,gene
Gata4,gene
Noggin,gene
Id1,chemical


In [195]:
# get DBpedia Spotlight annotations
base = "http://spotlight.sztaki.hu:2222/rest/annotate?text={0}&confidence={1}&support={2}"
headers = {"Accept": "application/json"}
text = text
confidence = 0.4
support = 5
url = base.format(text, confidence, support)
r = requests.get(url, headers=headers).text
j = json.loads(r)
annotations = j['Resources'] # dbpedia resources form the 'S' component of SVO triples.

from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper('http://dbpedia.org/sparql')
sparql.setReturnFormat(JSON)

def checkDBPredirect(uri):
    q = 'SELECT * WHERE {{ {0} <http://dbpedia.org/ontology/wikiPageRedirects> ?redirect}}'.format(uri)
    sparql.setQuery(q)
    results = sparql.query().convert()
    try:
        uri = '<{0}>' .format(results['results']['bindings'][0]['redirect']['value'])
    except:
        pass
    return uri

dbp_to_class = { # map dbpedia types to our standard classes
    'biomolecule': 'biomolecule',
    'protein': 'biomolecule',
    'species': 'species',
    'eukaryote': 'species',
    'animal': 'species',
    'mammal': 'species',
    'disease': 'disease',
}

dbp_ann = {}
for a in annotations:
    term = a['@surfaceForm']
    types = a['@types'].lower().replace('dbpedia:', '').split(',')
    uri = '<{0}>' .format(a['@URI'])
    uri = checkDBPredirect(uri)
    try:
        m = list(set([dbp_to_class[t] for t in types]))
        if len(m) > 1:
            print('More than one class assigned for term: ' + term)
    except:
        m = ['NO CLASS']
    dbp_ann[term] = {
        'orig_types': a['@types'],
        'dbpedia_class': m[0],
        'resource': uri,   
    }
dbp = pd.DataFrame(dbp_ann).transpose()

In [191]:
# create lists of dbpedia URIs to assign dbpedia annotations without classes
class_props = {
    'method': [
        'rdf:type yago:WikicatBiologicalTechniquesAndTools',
        'rdf:type yago:WikicatLaboratoryTechniques',
        'rdf:type yago:WikicatMolecularBiologyTechniques',
        'rdf:type yago:Method105660268',
        'rdf:type yago:Invention105633385',
        'rdf:type yago:Technique105665146',
        'rdf:type yago:WikicatBiochemistryMethods',
        'rdf:type yago:WikicatProteinMethods',
        'dct:subject dbc:Laboratory_techniques',
        'dct:subject dbc:Molecular_biology_techniques',
        'dct:subject dbc:Protein_methods',
    ],
    'pathway_process': [
        'dct:subject <http://dbpedia.org/resource/Category:Cellular_processes>',
        'rdf:type yago:WikicatCellularProcesses',
    ]
}

base = '''
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX dct: <http://purl.org/dc/terms/subject>
    PREFIX yago: <http://dbpedia.org/class/yago/>
    PREFIX dbc: <http://dbpedia.org/resource/>
    SELECT ?resource WHERE {{
        ?resource {0}
    }}
'''

class_URIs = {}
for c, pr in class_props.items():
    uris = []
    for p in pr:
        q = base.format(p)
        sparql.setQuery(q)
        r = sparql.query().convert()
        uris += ['<{0}>' .format(d['resource']['value']) for d in r['results']['bindings']]
    class_URIs[c] = uris

for c, ur in class_URIs.items():
    print(len(ur), c)

9744 method
65 pathway_process


In [201]:
# assign classes
classless = dbp.loc[dbp['dbpedia_class']=='NO CLASS']['resource'].values

for c, ur in class_URIs.items():
    m = map(lambda x: x if x in ur else False, classless)
    f = filter(lambda x: x, m)
    for uri in f:
        idx = dbp.loc[dbp['resource']==uri].index[0]
        dbp.loc[idx, 'dbpedia_class'] = c
# display(dbp)

classless = dbp.loc[dbp['dbpedia_class']=='NO CLASS']['resource'].values
print(classless)

['<http://dbpedia.org/resource/Madame_Bovary>'
 '<http://dbpedia.org/resource/Cell_(biology)>'
 '<http://dbpedia.org/resource/Cellular_differentiation>'
 '<http://dbpedia.org/resource/Gamete>'
 '<http://dbpedia.org/resource/Gene>'
 '<http://dbpedia.org/resource/Germ_cell>'
 '<http://dbpedia.org/resource/Germ_cell>'
 '<http://dbpedia.org/resource/Induced_pluripotent_stem_cell>'
 '<http://dbpedia.org/resource/Ligand>'
 '<http://dbpedia.org/resource/Phosphorylation>'
 '<http://dbpedia.org/resource/Cell_potency>'
 '<http://dbpedia.org/resource/Protein>'
 '<http://dbpedia.org/resource/Receptor_(biochemistry)>'
 '<http://dbpedia.org/resource/Spermatogenesis>'
 '<http://dbpedia.org/resource/Stem_cell>'
 '<http://dbpedia.org/resource/Transcription_factor>']


In [202]:
# join dfs
df = pbt.join([dbp])
display(df)

Unnamed: 0,pubtator_class,dbpedia_class,orig_types,resource
BMP4,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/Bone_morphogeneti...
BMPR1a,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR1A>
BMPR1b,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR1B>
BMPR2,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR2>
EB,,NO CLASS,"Schema:CreativeWork,DBpedia:Work,DBpedia:Writt...",<http://dbpedia.org/resource/Madame_Bovary>
Gata4,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/GATA4>
Id1,chemical,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID1>
Id1,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID1>
Id2,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID2>
Noggin,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/Noggin_(protein)>


In [224]:
# use GO annotations for class assignment
classless = list(df.loc[df['dbpedia_class']=='NO CLASS'].index)
sparql = SPARQLWrapper('http://pubmed.bio2rdf.org/sparql')
sparql.setReturnFormat(JSON)
base = '''
SELECT DISTINCT ?concept ?type WHERE {{
?concept rdf:type <http://bio2rdf.org/go_vocabulary:Resource> .
?concept dcterms:title "{0}"@en . 
?concept <http://bio2rdf.org/obo_vocabulary:namespace> ?type
}}
'''
goa = {}
for t in classless:
    q = base.format(t)
    sparql.setQuery(q)
    r = sparql.query().convert()
    try:
        goa[t] = {
            'goa_uri': r['results']['bindings'][0]['concept']['value'],
            'goa_class': r['results']['bindings'][0]['type']['value'],
        }
    except: pass
goa = pd.DataFrame(goa).transpose()
display(goa)

Unnamed: 0,goa_class,goa_uri
phosphorylation,biological_process,http://bio2rdf.org/go:0016310
spermatogenesis,biological_process,http://bio2rdf.org/go:0007283


In [225]:
goa_to_class = {
    'biological_process': 'pathway_process',
}
df = df.join(goa)
display(df)

Unnamed: 0,pubtator_class,dbpedia_class,orig_types,resource,goa_class,goa_uri
BMP4,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/Bone_morphogeneti...,,
BMPR1a,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR1A>,,
BMPR1b,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR1B>,,
BMPR2,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/BMPR2>,,
EB,,NO CLASS,"Schema:CreativeWork,DBpedia:Work,DBpedia:Writt...",<http://dbpedia.org/resource/Madame_Bovary>,,
Gata4,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/GATA4>,,
Id1,chemical,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID1>,,
Id1,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID1>,,
Id2,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/ID2>,,
Noggin,gene,biomolecule,"DBpedia:Biomolecule,DBpedia:Protein",<http://dbpedia.org/resource/Noggin_(protein)>,,


In [226]:
classless = list(df.loc[df['dbpedia_class']=='NO CLASS'].index)
print(classless)

['EB', 'cells', 'differentiation', 'gametes', 'genes', 'germ cell', 'germ cells', 'iPS cells', 'ligand', 'phosphorylation', 'pluripotent', 'proteins', 'receptors', 'spermatogenesis', 'stem', 'transcription']
