### Aims
##### Utilise multiple sources to create rich, accurate annotation of life science literature.
    Harness known annotation tools (pubtator, dbpedia spotlight).
    Implement incremental capacities to reduce future search needs.
    Test thresholds of word commonality to distinguish stop words.
    Detect and deal with amibuity. 


### Sources:
    1. Pubtator
    2. DBpedia, DBpedia spotlight
    3. BIO2RDF: GOA
    4. Wikipedia
    5. Wordnet
    

### Objectives:
    1. Map terms to classes:
            chemical
            biomolecule
            drug
            process
            anatomy
            species
            disease
            method

In [1]:
# get pubtator annotations
import requests
import json
import pandas as pd
import numpy as np
from IPython.display import display
base = 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept'
retmode = 'json'
pmid = '27864336'
url = '{0}/{1}/{2}' .format(base, pmid, retmode)
r = requests.get(url).text
j = json.loads(r)
text = j['text']
pbt_ann = []
# pbt_to_class = {
#     'gene': 'biomolecule',
#     'species': 'species',
#     'chemical': 'biomolecule',
#     'disease': 'disease',
# }
for a in j['denotations']:
    c = a['obj'].split(':')[0].lower()
    try:
#         c = pbt_to_class[c]
        pass
    except:
        print('Unmapped class: ', c)
    strt = a['span']['begin']
    end = a['span']['end']
    s = text[strt:end]
    pbt_ann.append((s, c))
pbt_ann = set(pbt_ann)
pbt = pd.DataFrame(list(pbt_ann), columns=['term', 'pubtator_class']).set_index('term')
# display( pbt)

In [2]:
# get DBpedia Spotlight annotations
base = "http://spotlight.sztaki.hu:2222/rest/annotate?text={0}&confidence={1}&support={2}"
headers = {"Accept": "application/json"}
text = text
confidence = 0.1
support = 5
url = base.format(text, confidence, support)
r = requests.get(url, headers=headers).text
j = json.loads(r)
annotations = j['Resources'] # dbpedia resources form the 'S' component of SVO triples.

from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper('http://dbpedia.org/sparql')
sparql.setReturnFormat(JSON)

def checkDBPredirect(uri):
    q = 'SELECT * WHERE {{ {0} <http://dbpedia.org/ontology/wikiPageRedirects> ?redirect}}'.format(uri)
    sparql.setQuery(q)
    results = sparql.query().convert()
    try:
        uri = '<{0}>' .format(results['results']['bindings'][0]['redirect']['value'])
    except:
        pass
    return uri

dbp_to_class = { # map dbpedia types to our standard classes
    'biomolecule': 'biomolecule',
    'protein': 'biomolecule',
    'species': 'species',
    'eukaryote': 'species',
    'animal': 'species',
    'mammal': 'species',
    'disease': 'disease',
}

dbp_ann = {}
for a in annotations:
    term = a['@surfaceForm']
    types = a['@types'].lower().replace('dbpedia:', '').split(',')
    uri = '<{0}>' .format(a['@URI'])
    uri = checkDBPredirect(uri)
    try:
        m = list(set([dbp_to_class[t] for t in types]))
        if len(m) > 1:
            print('More than one class assigned for term: ' + term)
    except:
        m = [np.nan]
    dbp_ann[term] = {
#         'dbp_spl_types': a['@types'],
        'dbpedia_class': m[0],
        'resource': uri,   
    }
dbp = pd.DataFrame(dbp_ann).transpose()

In [3]:
# create lists of dbpedia URIs to assign dbpedia annotations without classes
class_props = {
    'method': [
        'rdf:type yago:WikicatBiologicalTechniquesAndTools',
        'rdf:type yago:WikicatLaboratoryTechniques',
        'rdf:type yago:WikicatMolecularBiologyTechniques',
        'rdf:type yago:Method105660268',
        'rdf:type yago:Invention105633385',
        'rdf:type yago:Technique105665146',
        'rdf:type yago:WikicatBiochemistryMethods',
        'rdf:type yago:WikicatProteinMethods',
        'dct:subject dbc:Laboratory_techniques',
        'dct:subject dbc:Molecular_biology_techniques',
        'dct:subject dbc:Protein_methods',
    ],
    'pathway_process': [
        'dct:subject <http://dbpedia.org/resource/Category:Cellular_processes>',
        'rdf:type yago:WikicatCellularProcesses',
    ]
}

base = '''
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX dct: <http://purl.org/dc/terms/subject>
    PREFIX yago: <http://dbpedia.org/class/yago/>
    PREFIX dbc: <http://dbpedia.org/resource/>
    SELECT ?resource WHERE {{
        ?resource {0}
    }}
'''

class_URIs = {}
for c, pr in class_props.items():
    uris = []
    for p in pr:
        q = base.format(p)
        sparql.setQuery(q)
        r = sparql.query().convert()
        uris += ['<{0}>' .format(d['resource']['value']) for d in r['results']['bindings']]
    class_URIs[c] = uris

for c, ur in class_URIs.items():
    print(len(ur), c)

9744 method
65 pathway_process


In [4]:
# use other dbpedia data to assign classes
classless = dbp.loc[dbp['dbpedia_class'].isnull()]['resource'].values

for c, ur in class_URIs.items():
    m = map(lambda x: x if x in ur else False, classless)
    f = filter(lambda x: x, m)
    for uri in f:
        idx = dbp.loc[dbp['resource']==uri].index[0]
        dbp.loc[idx, 'dbpedia_class'] = c
# # display( dbp)

In [5]:
# join dfs
df = pbt.join([dbp])
# display( df)

In [6]:
# use GO annotations for class assignment
sparql = SPARQLWrapper('http://pubmed.bio2rdf.org/sparql')
sparql.setReturnFormat(JSON)

base = '''
SELECT DISTINCT ?concept ?type WHERE {{
?concept rdf:type <http://bio2rdf.org/go_vocabulary:Resource> .
?concept dcterms:title "{0}"@en . 
?concept <http://bio2rdf.org/obo_vocabulary:namespace> ?type
}}
'''
goa = {}
for t in df.index:
    q = base.format(t)
    sparql.setQuery(q)
    r = sparql.query().convert()
    try:
        goa[t] = {
            'goa_uri': r['results']['bindings'][0]['concept']['value'],
            'goa_class': r['results']['bindings'][0]['type']['value'],
        }
    except: pass
goa = pd.DataFrame(goa).transpose()
# display( goa)

In [7]:
df = df.join(goa)
# display( df)

In [13]:
# get meta data for terms with classes
df = df.loc[
    (df['pubtator_class'].notnull()) |
    (df['dbpedia_class'].notnull()) |
    (df['goa_class'].notnull())
]
display(df)
# get wikipedia info

Unnamed: 0,pubtator_class,dbpedia_class,resource,goa_class,goa_uri
BMP4,gene,biomolecule,<http://dbpedia.org/resource/Bone_morphogeneti...,,
BMPR1a,gene,biomolecule,<http://dbpedia.org/resource/BMPR1A>,,
BMPR1b,gene,biomolecule,<http://dbpedia.org/resource/BMPR1B>,,
BMPR2,gene,biomolecule,<http://dbpedia.org/resource/BMPR2>,,
Gata4,gene,biomolecule,<http://dbpedia.org/resource/GATA4>,,
Id1,chemical,biomolecule,<http://dbpedia.org/resource/ID1>,,
Id1,gene,biomolecule,<http://dbpedia.org/resource/ID1>,,
Id2,gene,biomolecule,<http://dbpedia.org/resource/ID2>,,
Noggin,gene,biomolecule,<http://dbpedia.org/resource/Noggin_(protein)>,,
PCR,,method,<http://dbpedia.org/resource/Polymerase_chain_...,,
