In [1]:
import os
# Move the working directory up one level.
# NOTE(review): presumably this makes the project packages imported below
# (pubcrawler, annotator) and the relative output paths resolve from the
# repository root -- confirm. The path is relative, so re-running this cell
# moves up one more level each time.
os.chdir("..")

In [2]:
"""
Iterate over the PubMED articles that mention infectious diseases from the
disease ontology.
"""
import rdflib
from pylru import lrudecorator
import pubcrawler.article as pubcrawler
from annotator.keyword_annotator import KeywordAnnotator
from annotator.annotator import AnnoDoc
import re
import json
import pymongo

In [33]:
# Fetch the Disease Ontology OWL file and parse it (RDF/XML) into an in-memory
# rdflib graph. The SPARQL helpers in this notebook read the graph through the
# global name `disease_ontology`.
print("Loading disease ontology...")
disease_ontology = rdflib.Graph()
disease_ontology.parse("http://purl.obolibrary.org/obo/doid.owl", format="xml")
print("disease ontology loaded")

Loading disease ontology...
disease ontology loaded


In [4]:
disease_ontology

<Graph identifier=Nbda20d900ed74f759fe6397bf8111869 (<class 'rdflib.graph.Graph'>)>

In [58]:
def get_annotation_keywords():
    """
    Collect the keyword strings used to annotate article text.

    Queries the global `disease_ontology` graph for every class reachable from
    obo:DOID_0050117 through rdfs:subClassOf* and gathers each class's
    rdfs:label together with its narrow, related and exact synonyms.

    Returns:
        list of str: the distinct labels with parenthetical ``(...)`` and
        bracketed ``[...]`` notes removed, sorted so that repeated runs yield
        the keywords in the same order.

    Raises:
        AssertionError: if a label is empty once its notes have been removed.
    """
    qres = disease_ontology.query("""
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity ?label
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
    }
    """)

    def remove_parenthetical_notes(label):
        # Both patterns are greedy: everything from the first " (" / " [" up to
        # the last matching closer on the line is dropped.
        label = re.sub(r"\s\(.*\)", "", label)
        label = re.sub(r"\s\[.*\]", "", label)
        assert len(label) > 0, "label is empty after removing notes"
        return label

    # A set removes duplicates; sorting replaces the arbitrary set iteration
    # order with a reproducible one.
    return sorted({remove_parenthetical_notes(str(row[1])) for row in qres})

In [59]:
# Dump the annotation keywords to a plain-text file, one keyword per line.
with open("annotation_keywords", "w+") as f:
    f.writelines("{}\n".format(keyword) for keyword in get_annotation_keywords())

In [60]:
def str_escape(s):
    """
    Escape `s` so it can be placed inside a double-quoted string literal.

    The value is serialized with `json.dumps`; the first and last characters
    of that serialization (the enclosing double quotes for a string input) are
    then dropped, leaving only the escaped contents.
    """
    quoted = json.dumps(s)
    return quoted[1:-1]

In [61]:
@lrudecorator(500)
def resolve_keyword(keyword):
    """
    Look up the disease ontology entities whose label or synonym is `keyword`.

    The keyword is matched case-insensitively against the rdfs:label and the
    narrow/related/exact synonyms of every class reachable from
    obo:DOID_0050117 in the global `disease_ontology` graph. A label may carry
    trailing ``(...)`` or ``[...]`` notes after the keyword and still match.

    The function is wrapped by pylru's `lrudecorator(500)`.
    NOTE(review): presumably this memoizes results for up to 500 keywords, in
    which case the returned list is shared between calls -- confirm.

    Args:
        keyword (str): text to resolve; it is regex-escaped before matching.

    Returns:
        list: the SPARQL result rows (each binding ?entity). Empty when
        nothing matched; a message is printed for zero or multiple matches.
    """
    # Optional trailing notes such as " (disease)" or " [obsolete]". This is a
    # raw string so the regex backslashes are not treated as (invalid) Python
    # escape sequences.
    note_suffix = r"(\s[\[\(].*[\]\)])*"
    query = """
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^(""" + str_escape(re.escape(keyword)) + str_escape(note_suffix) + """)$", "i")
    }
    """
    qres = list(disease_ontology.query(query))
    if len(qres) == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif len(qres) > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(qres)
    return qres

In [62]:
resolve_keyword("ebola virus disease")

[(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_4325'))]

In [10]:
def iterate_infectious_disease_articles(collection, update_collection = False, no_reannotation = False):
    """
    Yield research articles whose body text mentions infectious disease keywords.

    Args:
        collection: MongoDB-style collection; only `find(filter)` is called on it.
        update_collection: currently unused; kept so existing calls keep working.
        no_reannotation: when true, only documents that do not yet have an
            `annotations.disease-ontology-keywords` field are fetched, so
            articles that were already annotated are skipped.

    Yields:
        tuple: `(article, infectious_diseases)` where `article` is the raw
        document from the collection and `infectious_diseases` is a list of
        `(matched_text, resolve_keyword(matched_text))` pairs, one per keyword
        span found in the article body. Articles that are not of type
        "research-article", have an empty body, or contain no keyword are
        not yielded.
    """
    keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())

    # An empty filter selects every document in the collection.
    query = {}
    if no_reannotation:
        query["annotations.disease-ontology-keywords"] = {"$exists": False}

    for article in collection.find(query):
        pc_article = pubcrawler.Article(article)
        if pc_article.article_type() != "research-article":
            continue
        body = pc_article.get_text_from_tags('body')
        if not body:
            continue
        anno_doc = AnnoDoc(body)
        anno_doc.add_tier(keyword_annotator)
        infectious_diseases = [
            (disease.text, resolve_keyword(disease.text))
            for disease in anno_doc.tiers['keywords'].spans
        ]
        if infectious_diseases:
            yield article, infectious_diseases

In [3]:
# Open a MongoDB client for 'localhost' and select the `pmc` database.
# `articles` is its `articlesubset` collection, which the cells below query
# and update.
db = pymongo.MongoClient('localhost')['pmc']
articles = db.articlesubset

In [388]:
# Cursor over the articles that do not have a `meta` field.
cursor = articles.find({'meta': {'$exists': False}})

In [389]:
cursor.count()

9966

In [387]:
cursor.next()

{'_id': 'Clin_Med_Insights_Cardiol_2015_Apr_19_8(Suppl_3)_49-59',
 'annotations': {'disease-ontology-keywords': None},
 'meta': {'article-ids': {'doi': '10.4137/CMC.S17068',
   'pmc': '4405090',
   'pmid': '25983559',
   'publisher-id': 'cmc-suppl_3-2014-049'},
  'article-type': 'review-article',
  'keywords': ['myocarditis',
   'myocardial infarct',
   'inflammasome',
   'macrophages',
   'autoantibodies',
   'sex hormones']},
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="review-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">Clin Med Insights Cardiol</journal-id><journal-id journal-id-type="iso-abbrev">Clin Med Insights Cardiol</journal-id><journal-id journal-id-type="publisher-id">Clinical Medicine Insights: Cardiology</journal

In [None]:
# Notebook-level annotator built from the ontology keywords. The function
# write_article_meta_to_mongo reads this global, so this cell has to run
# before that function is called.
keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())

In [305]:
def annotated_keyword_list_to_dict(keyword_list):
    """
    Map each annotated keyword to the URI of its first resolved entity.

    Args:
        keyword_list: iterable of `(keyword, matches)` pairs, where `matches`
            is the list of result rows returned for that keyword; each row
            exposes the matched term as `.entity`.

    Returns:
        dict: keyword -> `matches[0].entity.toPython()`. Only the first
        occurrence of a keyword is used. Keywords whose `matches` list is
        empty (nothing was resolved) are left out instead of raising
        IndexError.
    """
    keyword_dict = {}
    for keyword, matches in keyword_list:
        if keyword in keyword_dict or not matches:
            continue
        keyword_dict[keyword] = matches[0].entity.toPython()
    return keyword_dict

In [356]:
def write_article_meta_to_mongo(article):
    """
    Annotate one article and store its metadata and disease keywords in MongoDB.

    Reads the notebook-level globals `keyword_annotator` (annotation tier) and
    `articles` (target collection).

    Args:
        article: raw article document. Its `_id` selects the document that is
            updated; the document is also passed to `pubcrawler.Article`.

    Side effects:
        Prints the keyword mapping and issues one `update_one` with `$set`,
        which replaces the document's `meta` field (article ids, article type,
        keywords) and its `annotations` field. Within `annotations`,
        `disease-ontology-keywords` is None when no keyword span was found in
        the article body.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    anno_doc.add_tier(keyword_annotator)
    infectious_diseases = [
        (disease.text, resolve_keyword(disease.text))
        for disease in anno_doc.tiers['keywords'].spans
    ]
    if infectious_diseases:
        disease_ontology_keywords = annotated_keyword_list_to_dict(infectious_diseases)
    else:
        disease_ontology_keywords = None
    print(disease_ontology_keywords)

    meta = {
        'article-ids': pc_article.pub_ids(),
        'article-type': pc_article.article_type(),
        # 'pub-dates': pc_article.pub_dates()
        # Need to fix stuff with dates in Mongo
        'keywords': pc_article.keywords()
    }
    annotations = {
        'disease-ontology-keywords': disease_ontology_keywords
    }
    # Update the document for the article that was passed in, rather than a
    # fixed scratch id.
    articles.update_one(
        {'_id': article['_id']},
        {'$set': {'meta': meta, 'annotations': annotations}},
    )

In [364]:
# Scratch fixture: take the next article from the cursor, re-key it as "test",
# and replace any previously stored "test" document with it. The statement
# order matters: the old document must be deleted before the insert.
x = cursor.next()
x['_id'] = "test"
articles.delete_one({'_id': "test"})
articles.insert_one(x)

<pymongo.results.InsertOneResult at 0x11af04bd0>

In [365]:
x = articles.find_one({'_id': 'test'})
x

{'_id': 'test',
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">J Pathol</journal-id><journal-id journal-id-type="publisher-id">path</journal-id><journal-title-group><journal-title>The Journal of Pathology</journal-title></journal-title-group><issn pub-type="ppub">0022-3417</issn><issn pub-type="epub">1096-9896</issn><publisher><publisher-name>John Wiley &#x00026; Sons, Ltd.</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">20229506</article-id><article-id pub-id-type="pmc">3262968</article-id><article-id pub-id-type="doi">10.1002/path.2696</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subje

In [366]:
write_article_meta_to_mongo(x)

None


In [367]:
x = articles.find_one({'_id': 'test', 'annotations.disease-ontology-keywords': {'$exists': True}})
x

{'_id': 'test',
 'annotations': {'disease-ontology-keywords': None},
 'meta': {'article-ids': {'doi': '10.1002/path.2696',
   'pmc': '3262968',
   'pmid': '20229506'},
  'article-type': 'research-article',
  'keywords': ['p53',
   'high-grade pelvic serous carcinoma',
   'ovarian cancer',
   'DNA sequence analysis',
   'array-based genomic hybridization',
   'histopathology',
   'clinical outcome',
   'BRCA']},
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">J Pathol</journal-id><journal-id journal-id-type="publisher-id">path</journal-id><journal-title-group><journal-title>The Journal of Pathology</journal-title></journal-title-group><issn pub-type="ppub">0022-3417</issn><issn

In [375]:
articles.delete_one({'_id': "test"})

<pymongo.results.DeleteResult at 0x11ae012d0>

In [16]:
# Print the id and the (keyword, resolved entities) pairs of every article the
# generator yields, with an empty line after each article.
for article, infectious_diseases in iterate_infectious_disease_articles(db.articlesubset):
    print(article['_id'], infectious_diseases)
    print("")

NameError: name 'iterate_infectious_disease_articles' is not defined

In [10]:
total_article_count

NameError: name 'total_article_count' is not defined

In [11]:
article_with_body_count

NameError: name 'article_with_body_count' is not defined

In [12]:
infectious_disease_article_count

NameError: name 'infectious_disease_article_count' is not defined