In [1]:
import os
# Change the working directory to the parent folder.
# NOTE(review): presumably so the project-local packages imported in the next
# cell resolve — confirm.
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves
# up one more directory level each time.
os.chdir("..")

In [2]:
"""
Iterate over the PubMed articles that mention infectious diseases from the
disease ontology.
"""
import rdflib
from pylru import lrudecorator
import pubcrawler.article as pubcrawler
from annotator.keyword_annotator import KeywordAnnotator
from annotator.annotator import AnnoDoc
import re
import json
import pymongo

In [3]:
# Fetch the Disease Ontology OWL file and parse it (as RDF/XML) into an
# rdflib graph that the query helpers further down read from.
DOID_OWL_URL = "http://purl.obolibrary.org/obo/doid.owl"

print("Loading disease ontology...")
disease_ontology = rdflib.Graph()
disease_ontology.parse(DOID_OWL_URL, format="xml")
print("disease ontology loaded")

Loading disease ontology...
disease ontology loaded


In [4]:
# Display the graph's repr as a quick check that the load cell produced an
# rdflib Graph object.
disease_ontology

<Graph identifier=Nbda20d900ed74f759fe6397bf8111869 (<class 'rdflib.graph.Graph'>)>

In [5]:
def get_annotation_keywords():
    """
    Collect the distinct labels and synonyms of the infectious-agent diseases.

    Queries the notebook-level ``disease_ontology`` graph for every class
    under obo:DOID_0050117 and gathers each rdfs:label together with the
    narrow, related and exact synonyms. Trailing " (...)" and " [...]" notes
    are stripped from every label before de-duplication.

    Returns:
        list of str: the unique cleaned labels, in arbitrary (set) order.

    Raises:
        AssertionError: if a label is empty after the notes are stripped.
    """
    def remove_parenthetical_notes(label):
        # Strip a whitespace-prefixed "(...)" note first, then a "[...]" note.
        # Both patterns are greedy: everything up to the last closing
        # bracket on the line is removed.
        for note_pattern in (r"\s\(.*\)", r"\s\[.*\]"):
            label = re.sub(note_pattern, "", label)
        assert(len(label) > 0)
        return label

    rows = disease_ontology.query("""
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity ?label
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
    }
    """)
    # Column 1 of each result row is ?label; a set drops the duplicates.
    unique_labels = {remove_parenthetical_notes(str(row[1])) for row in rows}
    return list(unique_labels)

In [6]:
# Dump the annotation keywords to a text file, one keyword per line.
# - UTF-8 is requested explicitly so non-ASCII labels do not depend on the
#   platform's default encoding.
# - Plain "w" is enough because the file is only written, never read back here.
# - The keywords are sorted so repeated runs produce an identical file
#   (get_annotation_keywords returns them in arbitrary order).
with open("annotation_keywords", "w", encoding="utf-8") as f:
    for item in sorted(get_annotation_keywords()):
        f.write("{}\n".format(item))

In [7]:
def str_escape(s):
    """
    Escape ``s`` for embedding inside a double-quoted string literal.

    The value is serialised with ``json.dumps`` (which backslash-escapes
    quotes, backslashes, control characters and non-ASCII characters) and the
    surrounding quote characters are then dropped.

    Returns:
        str: the escaped text without the enclosing quotes.
    """
    quoted = json.dumps(s)
    # Drop the first and last character, i.e. the quotes json.dumps adds.
    return quoted[1:-1]

In [8]:
@lrudecorator(500)
def resolve_keyword(keyword):
    """
    Look up the ontology entities whose label or synonym matches ``keyword``.

    Builds a SPARQL query against the notebook-level ``disease_ontology``
    graph, restricted to classes under obo:DOID_0050117. A label matches when
    it equals the keyword (case-insensitively), optionally followed by
    whitespace-separated "(...)" or "[...]" notes.

    A message is printed when there is no match or more than one match.
    Results are memoised by the ``lrudecorator(500)`` wrapper.

    Args:
        keyword (str): the text to resolve, e.g. an annotated span's text.

    Returns:
        list: the query result rows (each row holds the matching ?entity);
        empty when nothing matched.
    """
    # Optional trailing notes such as " (disease)" or " [obsolete]".
    # Raw string: the backslashes are regex syntax, not Python escapes; a
    # non-raw literal here triggers invalid-escape-sequence warnings.
    notes_suffix = r"(\s[\[\(].*[\]\)])*"
    # re.escape makes the keyword literal inside the regex; str_escape then
    # makes each piece safe inside the double-quoted SPARQL string literal.
    query = """
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^(""" + str_escape(re.escape(keyword)) + str_escape(notes_suffix) + """)$", "i")
    }
    """
    qres = list(disease_ontology.query(query))
    if len(qres) != 1:
        # Printed as ASCII bytes with XML character references so the
        # message is safe on any console encoding.
        printable_keyword = keyword.encode('ascii', 'xmlcharrefreplace')
        if len(qres) == 0:
            print("no match for", printable_keyword)
        else:
            print("multiple matches for", printable_keyword)
            print(qres)
    return qres

In [9]:
# Sanity check: resolve one known disease label and display the returned rows.
resolve_keyword("ebola virus disease")

[(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_4325'))]

In [10]:
def iterate_infectious_disease_articles(collection, update_collection = False, no_reannotation = False, counts = None):
    """
    Yield the research articles in ``collection`` that mention an infectious disease.

    Every document returned by ``collection.find()`` is wrapped in a
    ``pubcrawler.Article``. Articles whose type is "research-article" and
    whose body text is non-empty are annotated with a ``KeywordAnnotator``
    built from ``get_annotation_keywords()``. Each keyword span is resolved
    through ``resolve_keyword``.

    Args:
        collection: object exposing ``find()`` that iterates article documents.
        update_collection: accepted for interface compatibility; currently unused.
        no_reannotation: accepted for interface compatibility; currently unused.
        counts: optional dict. When given, it is reset and then kept up to
            date with the keys ``total_article_count``,
            ``article_with_body_count`` and
            ``infectious_disease_article_count`` as iteration proceeds, so the
            caller can read the tallies afterwards. Because this is a
            generator, the dict is first touched when iteration starts.

    Yields:
        tuple: ``(article, infectious_diseases)`` where ``article`` is the raw
        document and ``infectious_diseases`` is a list of
        ``(span text, resolve_keyword(span text))`` pairs, one per keyword
        span found in the body. Articles with no spans are skipped.
    """
    keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())
    if counts is None:
        counts = {}
    counts.update(
        total_article_count=0,
        article_with_body_count=0,
        infectious_disease_article_count=0,
    )
    for article in collection.find():
        counts["total_article_count"] += 1
        pc_article = pubcrawler.Article(article)
        if pc_article.article_type() != "research-article":
            continue
        body = pc_article.get_text_from_tags('body')
        if len(body) == 0:
            continue
        counts["article_with_body_count"] += 1
        anno_doc = AnnoDoc(body)
        anno_doc.add_tier(keyword_annotator)
        # One entry per annotated span, so repeated mentions repeat here.
        infectious_diseases = [
            (disease.text, resolve_keyword(disease.text))
            for disease in anno_doc.tiers['keywords'].spans
        ]
        if len(infectious_diseases) > 0:
            counts["infectious_disease_article_count"] += 1
            yield article, infectious_diseases

In [31]:
# Connect to the 'pmc' database on the local MongoDB server and print, for each
# matching article in the `articlesubset` collection, its id and the resolved
# disease mentions, followed by a blank separator line.
db = pymongo.MongoClient('localhost')['pmc']
article_matches = iterate_infectious_disease_articles(db.articlesubset)
for article, infectious_diseases in article_matches:
    print(article['_id'], infectious_diseases, end="\n\n")

BMC_Infect_Dis_2013_Jul_22_13_337 [('candidiasis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_1508'),)]), ('candidiasis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_1508'),)])]

Tuberc_Res_Treat_2014_Nov_19_2014_195287 [('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_399'),)]), ('tuberculosis', [(rdflib.term.URIRef('http://purl.obolibrary.

In [32]:
# NOTE(review): total_article_count is only ever assigned as a local variable
# inside iterate_infectious_disease_articles, never at notebook level, so this
# cell raises NameError (as the recorded output shows).
total_article_count

NameError: name 'total_article_count' is not defined

In [None]:
# NOTE(review): article_with_body_count is only ever assigned as a local
# variable inside iterate_infectious_disease_articles; no notebook-level
# definition is visible, so this cell would raise NameError when run.
article_with_body_count

In [None]:
# NOTE(review): infectious_disease_article_count is only ever assigned as a
# local variable inside iterate_infectious_disease_articles; no notebook-level
# definition is visible, so this cell would raise NameError when run.
infectious_disease_article_count