In [1]:
import os
os.chdir("..")

In [2]:
"""
Iterate over the PubMED articles that mention infecious diseases from the
disease ontology.
"""
import rdflib
from pylru import lrudecorator
import pubcrawler.article as pubcrawler
from annotator.keyword_annotator import KeywordAnnotator
from annotator.annotator import AnnoDoc
import re
import json
import pymongo

In [3]:
print("Loading disease ontology...")
disease_ontology = rdflib.Graph()
disease_ontology.parse(
    "http://purl.obolibrary.org/obo/doid.owl",
    format="xml"
)
print("disease ontology loaded")

Loading disease ontology...
disease ontology loaded


In [4]:
disease_ontology

<Graph identifier=N282c4c34511b47809860361834f167da (<class 'rdflib.graph.Graph'>)>

In [5]:
def get_annotation_keywords():
    qres = disease_ontology.query("""
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity ?label
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
    }
    """)
    def remove_parenthetical_notes(label):
        label = re.sub(r"\s\(.*\)","", label)
        label = re.sub(r"\s\[.*\]","", label)
        assert(len(label) > 0)
        return label
    return list(set([remove_parenthetical_notes(str(r[1])) for r in qres]))

In [6]:
with open("annotation_keywords", "w+") as f:
    for item in get_annotation_keywords():
        f.write("{}\n".format(item))

In [7]:
def str_escape(s):
    return json.dumps(s)[1:-1]

In [8]:
@lrudecorator(500)
def resolve_keyword(keyword):
    query = """
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^(""" + str_escape(re.escape(keyword)) + str_escape("(\s[\[\(].*[\]\)])*") + """)$", "i")
    }
    """
    qres = list(disease_ontology.query(query))
    if len(qres) == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif len(qres) > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(qres)
    return qres

In [44]:
def annotated_keywords_to_dict_list(keywords):
    seen_keys = []
    keyword_list = []
    for keyword_entity in keywords:
        keyword, uri = keyword_entity
        if keyword in seen_keys:
            continue
        else:
            keys.append(keyword)
            keyword_dict = {
                "keyword": keyword,
                "uri": uri[0].entity.toPython()
            }
            keyword_list.append(keyword_dict)
    return(keyword_list)

In [50]:
def write_article_meta_to_mongo(article, collection):
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    anno_doc.add_tier(keyword_annotator)
    infectious_diseases = [
        (disease.text, resolve_keyword(disease.text))
        for disease in anno_doc.tiers['keywords'].spans
    ]
    disease_ontology_keywords = None if len(infectious_diseases) == 0 else annotated_keywords_to_dict_list(infectious_diseases)
    collection.update_one({'_id': article['_id']},
        {
        '$set':
            {
            'meta':
                {
                'article-ids': pc_article.pub_ids(),
                'article-type': pc_article.article_type(),
                # 'pub-dates': pc_article.pub_dates()
                # Need to fix stuff with dates in Mongo
                'keywords': pc_article.keywords()
                },
            'annotations':
                {
                'disease-ontology-keywords': disease_ontology_keywords
                }
            },
        })

In [46]:
def iterate_infectious_disease_articles(collection):
    query = {}
    if args.no_reannotation:
        query = {'meta': {'$exists': False}}
    total_articles = collection.count(query)
    processed_articles = 0
    for article in collection.find(query):
        processed_articles += 1
        print("Processing article {} of {} ({:.2}%)...".format(processed_articles, total_articles, processed_articles / total_articles), end="")
        write_article_meta_to_mongo(article, collection=collection)
        print(" Done!")

In [11]:
db = pymongo.MongoClient('localhost')['pmc']
articles = db.articlesubset

In [59]:
cursor = articles.find({'meta': {'$exists': False}})

In [60]:
cursor.count()

10001

In [15]:
x = cursor.next()

In [16]:
x

{'_id': 'PLoS_One_2016_Jan_25_11(1)_e0147962',
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">PLoS One</journal-id><journal-id journal-id-type="iso-abbrev">PLoS ONE</journal-id><journal-id journal-id-type="publisher-id">plos</journal-id><journal-id journal-id-type="pmc">plosone</journal-id><journal-title-group><journal-title>PLoS ONE</journal-title></journal-title-group><issn pub-type="epub">1932-6203</issn><publisher><publisher-name>Public Library of Science</publisher-name><publisher-loc>San Francisco, CA USA</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">26807734</article-id><article-i

In [17]:
x['_id'] = "test"
articles.delete_one({'_id': "test"})
articles.insert_one(x)

<pymongo.results.InsertOneResult at 0x111c873a8>

In [18]:
x = articles.find_one({'_id': 'test'})
x

{'_id': 'test',
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">PLoS One</journal-id><journal-id journal-id-type="iso-abbrev">PLoS ONE</journal-id><journal-id journal-id-type="publisher-id">plos</journal-id><journal-id journal-id-type="pmc">plosone</journal-id><journal-title-group><journal-title>PLoS ONE</journal-title></journal-title-group><issn pub-type="epub">1932-6203</issn><publisher><publisher-name>Public Library of Science</publisher-name><publisher-loc>San Francisco, CA USA</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">26807734</article-id><article-id pub-id-type="pmc">4726549</ar

In [19]:
keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())

In [51]:
write_article_meta_to_mongo(x, articles)

In [52]:
x = articles.find_one({'_id': 'test', 'annotations.disease-ontology-keywords': {'$exists': True}})
x

{'_id': 'test',
 'annotations': {'disease-ontology-keywords': [{'keyword': 'Western equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10843'},
   {'keyword': 'St. Louis encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10845'},
   {'keyword': 'Japanese encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10844'},
   {'keyword': 'chikungunya',
    'uri': 'http://purl.obolibrary.org/obo/DOID_0050012'},
   {'keyword': 'Rift Valley fever',
    'uri': 'http://purl.obolibrary.org/obo/DOID_1328'},
   {'keyword': 'yellow fever',
    'uri': 'http://purl.obolibrary.org/obo/DOID_9682'},
   {'keyword': 'St. Louis encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10845'},
   {'keyword': 'Western equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10843'},
   {'keyword': 'Venezuelan equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_9584'},
   {'keyword': 'Eastern equine encephalitis',
    'uri': 'http://pur

In [61]:
articles.delete_one({'_id': "test"})

<pymongo.results.DeleteResult at 0x1145052d0>

In [16]:
for article, infectious_diseases in iterate_infectious_disease_articles(db.articlesubset):
    print(article['_id'], infectious_diseases)
    print("")

NameError: name 'iterate_infectious_disease_articles' is not defined

In [10]:
total_article_count

NameError: name 'total_article_count' is not defined

In [11]:
article_with_body_count

NameError: name 'article_with_body_count' is not defined

In [12]:
infectious_disease_article_count

NameError: name 'infectious_disease_article_count' is not defined

In [57]:
def strip_article_meta(collection):
    collection.update_many({'meta': {'$exists': True}},
        {
        '$unset':
            {
            'meta': "",
            'annotations': ""
            }
        })

In [58]:
strip_article_meta(articles)