In [1]:
import os
# Change the working directory to the parent of the current one.
# NOTE(review): presumably so the project-local imports below (pubcrawler,
# annotator) resolve from the repository root — confirm. The path is relative,
# so re-running this cell moves up one more level each time.
os.chdir("..")

In [77]:
"""
Iterate over the PubMED articles that mention infectious diseases from the
disease ontology.
"""
import rdflib
from pylru import lrudecorator
import pubcrawler.article as pubcrawler
from annotator.keyword_annotator import KeywordAnnotator
from annotator.geoname_annotator import GeonameAnnotator
from annotator.geoname_annotator import GeoSpan
from annotator.annotator import AnnoDoc
import re
import json
import pymongo

In [3]:
# Parse the Disease Ontology OWL file (format="xml") into an rdflib graph.
# The source is given as a URL, so this cell needs network access; the
# resulting `disease_ontology` graph is queried by the functions below.
print("Loading disease ontology...")
disease_ontology = rdflib.Graph()
disease_ontology.parse(
    "http://purl.obolibrary.org/obo/doid.owl",
    format="xml"
)
print("disease ontology loaded")

Loading disease ontology...
disease ontology loaded


In [245]:
def get_annotation_keywords():
    """
    Collect the keywords used to spot infectious diseases in article text.

    Queries the global ``disease_ontology`` graph for the label and the
    narrow/related/exact synonyms of every (transitive) subclass of
    obo:DOID_0050117, strips trailing " (...)" and " [...]" notes from each
    one and returns the distinct results as a list (in no particular order).
    """
    sparql = """
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity ?label
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
    }
    """
    rows = disease_ontology.query(sparql)

    def remove_parenthetical_notes(label):
        # Drop a whitespace-prefixed "(...)" note first, then a "[...]" one.
        for note_pattern in (r"\s\(.*\)", r"\s\[.*\]"):
            label = re.sub(note_pattern, "", label)
        assert(len(label) > 0)
        return label

    unique_labels = set()
    for row in rows:
        # row[1] is the ?label column of the SELECT above.
        unique_labels.add(remove_parenthetical_notes(str(row[1])))
    return list(unique_labels)

def str_escape(s):
    """
    Escape ``s`` for embedding inside a double-quoted string literal.

    The escaping is whatever ``json.dumps`` applies; the surrounding quote
    characters it adds are removed again.
    """
    quoted = json.dumps(s)
    return quoted[1:len(quoted) - 1]

@lrudecorator(500)
def resolve_keyword(keyword):
    """
    Look up the disease-ontology entities whose label or synonym is ``keyword``.

    The match is a case-insensitive regex over the labels and the
    narrow/related/exact synonyms of the subclasses of obo:DOID_0050117. The
    keyword is matched literally and may be followed by any number of
    whitespace-prefixed "(...)" / "[...]" notes.

    Results are memoised by ``lrudecorator(500)``.

    Returns the list of query result rows (each exposes ``.entity``). The
    list is empty when nothing matched; both that case and the
    multiple-match case are reported on stdout.
    """
    # Raw string: the backslashes belong to the regex. The previous non-raw
    # literal produced the same value only through invalid escape sequences,
    # which newer Python versions warn about.
    optional_notes = r"(\s[\[\(].*[\]\)])*"
    # re.escape makes the keyword literal for the regex; str_escape then makes
    # both parts safe inside the double-quoted SPARQL string.
    label_pattern = str_escape(re.escape(keyword)) + str_escape(optional_notes)
    query = """
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        # only resolve diseases by infectious agent
        ?entity rdfs:subClassOf* obo:DOID_0050117
        ; oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^(""" + label_pattern + """)$", "i")
    }
    """
    qres = list(disease_ontology.query(query))
    if len(qres) == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif len(qres) > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(qres)
    return qres

""" This gets the list of tuples returned by the function below and transforms
it into a list of dicts, appropriate for dumping into a Mongo document. """
def annotated_keywords_to_dict_list(keywords):
    """
    Turn (keyword, matches) tuples into a list of dicts for a Mongo document.

    Parameters:
        keywords: iterable of ``(keyword, matches)`` pairs, where ``matches``
            is the list of result rows returned by ``resolve_keyword``.

    Returns:
        A list of ``{"keyword": ..., "uri": ...}`` dicts, one per distinct
        keyword in first-seen order. The URI comes from the first match.
        Keywords that could not be resolved (empty ``matches``) are left out
        instead of raising IndexError.
    """
    seen_keys = set()  # set membership keeps de-duplication linear
    keyword_list = []
    for keyword, matches in keywords:
        if keyword in seen_keys:
            continue
        seen_keys.add(keyword)
        if len(matches) == 0:
            # resolve_keyword returns an empty list when nothing matched;
            # there is no URI to store for such a keyword.
            continue
        keyword_list.append({
            "keyword": keyword,
            "uri": matches[0].entity.toPython()
        })
    return keyword_list

"""
Currently, this writes the following set of metadata to the appropriate
mongo document:

- meta
    - article-ids
    - article-type
    - keywords
- annotations
    - disease-ontology-keywords
"""

def write_article_meta_to_mongo(article, collection):
    """
    Annotate one article and ``$set`` the result on its Mongo document.

    Parameters:
        article: the stored article document; passed to ``pubcrawler.Article``
            and its ``_id`` selects the document to update.
        collection: the collection whose ``update_one`` is called.

    Sets ``meta`` (article-ids, article-type, keywords) and
    ``annotations.disease-ontology-keywords``. The latter is None when the
    keyword annotator found no spans. Uses the global ``keyword_annotator``.
    """
    parsed = pubcrawler.Article(article)
    doc = AnnoDoc(parsed.body)
    doc.add_tier(keyword_annotator)

    # Pair every annotated span's text with its ontology lookup result.
    resolved = []
    for span in doc.tiers['keywords'].spans:
        resolved.append((span.text, resolve_keyword(span.text)))

    if len(resolved) == 0:
        ontology_keywords = None
    else:
        ontology_keywords = annotated_keywords_to_dict_list(resolved)

    selector = {'_id': article['_id']}
    # 'pub-dates' (parsed.pub_dates()) is left out for now:
    # need to fix stuff with dates in Mongo.
    meta = {
        'article-ids': parsed.pub_ids(),
        'article-type': parsed.article_type(),
        'keywords': parsed.keywords()
    }
    annotations = {'disease-ontology-keywords': ontology_keywords}
    collection.update_one(
        selector,
        {'$set': {'meta': meta, 'annotations': annotations}}
    )

def iterate_articles(collection):
    """
    Annotate every article in ``collection`` that has no ``meta`` field yet.

    Each matching document is passed to ``write_article_meta_to_mongo`` and a
    progress line ("Processing article i of n (p%)... Done!") is printed per
    article.

    Parameters:
        collection: the collection to read from and write back to; must
            provide ``count(query)`` and ``find(query)``.
    """
    query = {'meta': {'$exists': False}}
    # NOTE(review): Collection.count was deprecated in PyMongo 3.7 and removed
    # in 4.0 (count_documents replaces it) — kept because the rest of the
    # notebook uses the same legacy API; confirm the installed version.
    total_articles = collection.count(query)
    for processed_articles, article in enumerate(collection.find(query), start=1):
        # The "%" presentation type scales the fraction by 100; the previous
        # "{:.2}%" printed the raw fraction (e.g. "0.1%" for 1 of 10).
        print("Processing article {} of {} ({:.1%})...".format(processed_articles, total_articles, processed_articles / total_articles), end="")
        write_article_meta_to_mongo(article, collection=collection)
        print(" Done!")

def strip_article_info(collection):
    """
    Remove everything this notebook adds from every document in ``collection``.

    Issues one ``update_many`` with an empty filter that ``$unset``s the
    'meta', 'annotations', 'all_geonames' and 'culled_geonames' fields.
    """
    generated_fields = ['meta', 'annotations', 'all_geonames', 'culled_geonames']
    # The values given to $unset are placeholders; an empty string is used.
    unset_spec = dict.fromkeys(generated_fields, "")
    collection.update_many({}, {'$unset': unset_spec})

In [5]:
def extract_disease_ontology_keywords(article):
    """
    Find the disease-ontology keywords mentioned in an article's body.

    Runs the global ``keyword_annotator`` over the body, resolves each
    annotated span's text with ``resolve_keyword`` and returns
    ``{'annotations': {'disease-ontology-keywords': ...}}``. The value is the
    list built by ``annotated_keywords_to_dict_list``, or None when the
    annotator produced no spans.
    """
    parsed = pubcrawler.Article(article)
    doc = AnnoDoc(parsed.body)
    doc.add_tier(keyword_annotator)

    # Pair every annotated span's text with its ontology lookup result.
    resolved = []
    for span in doc.tiers['keywords'].spans:
        resolved.append((span.text, resolve_keyword(span.text)))

    if len(resolved) == 0:
        ontology_keywords = None
    else:
        ontology_keywords = annotated_keywords_to_dict_list(resolved)

    annotations = {'disease-ontology-keywords': ontology_keywords}
    return {'annotations': annotations}

def extract_meta(article):
    """
    Extract the publication metadata of an article.

    Returns ``{'meta': {...}}`` holding the article's ids, its article type
    and its keywords, as reported by ``pubcrawler.Article``.
    """
    parsed = pubcrawler.Article(article)
    # 'pub-dates' (parsed.pub_dates()) is left out for now:
    # need to fix stuff with dates in Mongo.
    fields = {}
    fields['article-ids'] = parsed.pub_ids()
    fields['article-type'] = parsed.article_type()
    fields['keywords'] = parsed.keywords()
    return {'meta': fields}



# def process_single_article(article, )

In [230]:
def extract_geonames(article):
    """
    Find candidate and culled geonames for an article's body.

    Uses the global ``geoname_annotator``: candidates and their feature
    vectors come from ``get_candidate_geonames`` / ``extract_features``, each
    candidate is scored with the weights below, candidates scoring above 50
    are expanded into one ``GeoSpan`` per span, and those are reduced with
    ``cull_geospans``.

    Returns a dict with two keys:
        'all_geonames': every candidate dict, without 'spans' and
            'alternateLocations', with its 'score' and its feature vector
            under 'annie_features'.
        'culled_geonames': ``to_dict()`` of every culled geospan, after the
            same two properties were removed from its geoname.

    NOTE: the candidate dicts returned by the annotator are modified in place
    (a 'score' and 'annie_features' are added, the omitted properties popped).
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)


    candidate_locations = geoname_annotator.get_candidate_geonames(anno_doc)
    features = geoname_annotator.extract_features(candidate_locations)

    # Weight given to each feature when scoring a candidate location.
    feature_weights = dict(
        population_score=2.0,
        synonymity=1.0,
        num_spans_score=0.4,
        short_span_score=(-5),
        NEs_contained=1.2,
        # Distinctness is probably more effective when combined
        # with other features
        distinctness=1.0,
        max_span_score=1.0,
        # close_locations=0.8,
        # closest_location=0.8,
        # containment_level=0.8,
        cannonical_name_used=0.5,
        feature_code_score=0.6,
    )
    # candidate_locations and features are zipped pairwise, so they are
    # expected to be index-aligned.
    for location, feature in zip(candidate_locations, features):
        location['score'] = feature.score(feature_weights)
    # Only candidates scoring above 50 are turned into geospans.
    culled_locations = [location
        for location in candidate_locations
        if location['score'] > 50]
    geo_spans = []
    for location in culled_locations:
        # Copy the dict so we don't need to return a custom class.
        # (Shallow copy: popping keys from the candidate dicts further down
        # does not remove them from this copy.)
        location = dict(location)
        for span in location['spans']:
            # TODO: Adjust scores to give geospans that exactly match
            # a corresponding geoname a bonus.
            geo_span = GeoSpan(
                span.start, span.end, anno_doc, location
            )
            geo_spans.append(geo_span)
    culled_geospans = geoname_annotator.cull_geospans(geo_spans)
    # Disabled earlier approach, kept for reference: it also stripped the
    # omitted properties from nested parentLocation dicts.
#     props_to_omit = ['spans', 'alternatenames', 'alternateLocations']
#     for geospan in culled_geospans:
#         # The while loop removes the properties from the parentLocations.
#         # There will probably only be one parent location.
#         cur_location = geospan.geoname
#         while True:
#             if all([
#                 prop not in cur_location
#                 for prop in props_to_omit
#             ]):
#                 break
#             for prop in props_to_omit:
#                 cur_location.pop(prop)
#             if 'parentLocation' in cur_location:
#                 cur_location = cur_location['parentLocation']
#             else:
#                 break


    # Properties removed from every geoname dict before it is returned.
    # NOTE(review): presumably because they are not BSON-serialisable (the
    # notebook shows 'alternateLocations' as a set) — confirm.
    props_to_omit = ['spans', 'alternateLocations']
    # Get candidate geonameids and feature vectors
    all_geonames = []
    for location, feature in zip(candidate_locations, features):
        # No copy here: geoname_dict is the candidate dict itself, so the
        # pops and the added key below change candidate_locations in place.
        geoname_dict = location
        for prop in props_to_omit:
            geoname_dict.pop(prop, None)
        geoname_dict['annie_features'] = feature.to_dict()
        all_geonames.append(geoname_dict)



    culled_geonames = []
    for geospan in culled_geospans:
        # Strip the same properties from the geospan's geoname before
        # serialising the geospan itself.
        geoname = geospan.geoname
        for prop in props_to_omit:
            geoname.pop(prop, None)
        culled_geonames.append(geospan.to_dict())



    return({'all_geonames': all_geonames, 'culled_geonames': culled_geonames})

In [231]:
extract_geonames(x)

{'all_geonames': [{'admin1 code': '03',
   'admin2 code': '',
   'admin3 code': '',
   'admin4 code': '',
   'alternatenames': ['1',
    'Farg`ona Ikinchi',
    'Farg‘ona Ikinchi',
    'Stantsiya Fergana Vtoraya'],
   'annie_features': {'NEs_contained': 0.0,
    'cannonical_name_used': 100,
    'distinctness': 100.0,
    'feature_code_score': 0,
    'max_span_score': 0,
    'num_spans_score': 50,
    'population_score': 0,
    'short_span_score': 100,
    'synonymity': 10},
   'asciiname': '1',
   'cc2': '',
   'country code': 'UZ',
   'dem': '586',
   'elevation': 0,
   'feature class': 'S',
   'feature code': 'RSTN',
   'geonameid': '10908325',
   'latitude': 40.38861,
   'longitude': 71.80753,
   'modification date': '2015-11-10',
   'name': '1',
   'population': 0,
   'score': -54.7108473672578,
   'timezone': 'Asia/Tashkent'},
  {'admin1 code': '03',
   'admin2 code': '',
   'admin3 code': '',
   'admin4 code': '',
   'alternatenames': ['Atlas', 'Ատլաս'],
   'annie_features': {'NE

In [187]:
# Check how long it takes to look up a single geoname in the database:
# slow without an index, blazing fast with one.
# NOTE(review): the chained assignment also rebinds the global `db` to the
# 'geonames' database, while other cells bind `db` to 'pmc'. Any later cell
# that reads `db` sees whichever assignment ran last — confirm this is intended.
geonames = db = pymongo.MongoClient()['geonames']
allCountries = geonames.allCountries

In [195]:
allCountries.find_one({'geonameid': foo[1]['geonameid']})

{'_id': ObjectId('57c2419c979e067ddc99cfd2'),
 'admin1 code': '03',
 'admin2 code': '',
 'admin3 code': '',
 'admin4 code': '',
 'alternatenames': ['Atlas', 'Ատլաս'],
 'asciiname': 'Atlas',
 'cc2': '',
 'country code': 'AM',
 'dem': '855',
 'elevation': 0,
 'feature class': 'L',
 'feature code': 'FLD',
 'geonameid': '11224076',
 'latitude': 40.11108,
 'longitude': 44.11249,
 'modification date': '2016-08-05',
 'name': 'Atlas',
 'population': 0,
 'timezone': 'Asia/Yerevan'}

In [176]:
bar = []
for feature in foo:
    bar.append(feature.to_dict())
bar

[{'NEs_contained': 0.0,
  'cannonical_name_used': 100,
  'distinctness': 100.0,
  'feature_code_score': 0,
  'max_span_score': 0,
  'num_spans_score': 50,
  'population_score': 0,
  'short_span_score': 100,
  'synonymity': 10},
 {'NEs_contained': 100.0,
  'cannonical_name_used': 100,
  'distinctness': 3.3333333333333335,
  'feature_code_score': 0,
  'max_span_score': 40,
  'num_spans_score': 25,
  'population_score': 0,
  'short_span_score': 0,
  'synonymity': 10},
 {'NEs_contained': 100.0,
  'cannonical_name_used': 100,
  'distinctness': 3.3333333333333335,
  'feature_code_score': 0,
  'max_span_score': 40,
  'num_spans_score': 25,
  'population_score': 0,
  'short_span_score': 0,
  'synonymity': 0},
 {'NEs_contained': 100.0,
  'cannonical_name_used': 100,
  'distinctness': 3.3333333333333335,
  'feature_code_score': 0,
  'max_span_score': 40,
  'num_spans_score': 25,
  'population_score': 0,
  'short_span_score': 0,
  'synonymity': 0},
 {'NEs_contained': 100.0,
  'cannonical_name_use

In [130]:
y = dict(y)
y.pop('alternateLocations')

set()

In [131]:
y

{'admin1 code': '03',
 'admin2 code': '',
 'admin3 code': '',
 'admin4 code': '',
 'alternatenames': ['1',
  'Farg`ona Ikinchi',
  'Farg‘ona Ikinchi',
  'Stantsiya Fergana Vtoraya'],
 'asciiname': '1',
 'cc2': '',
 'country code': 'UZ',
 'dem': '586',
 'elevation': 0,
 'feature class': 'S',
 'feature code': 'RSTN',
 'geonameid': '10908325',
 'latitude': 40.38861,
 'longitude': 71.80753,
 'modification date': '2015-11-10',
 'name': '1',
 'population': 0,
 'spans': {1147-1148:1, 759-760:1},
 'timezone': 'Asia/Tashkent'}

In [107]:
bar

{'admin1 code': '03',
 'admin2 code': '',
 'admin3 code': '',
 'admin4 code': '',
 'alternateLocations': set(),
 'alternatenames': ['1',
  'Farg`ona Ikinchi',
  'Farg‘ona Ikinchi',
  'Stantsiya Fergana Vtoraya'],
 'asciiname': '1',
 'cc2': '',
 'country code': 'UZ',
 'dem': '586',
 'elevation': 0,
 'feature class': 'S',
 'feature code': 'RSTN',
 'geonameid': '10908325',
 'latitude': 40.38861,
 'longitude': 71.80753,
 'modification date': '2015-11-10',
 'name': '1',
 'population': 0,
 'spans': {1147-1148:1, 759-760:1},
 'timezone': 'Asia/Tashkent'}

In [91]:
for key, value in extract_geonames(x).spans[0].geoname.items():
    print(key, value)

alternateLocations 

KeyboardInterrupt: 

In [78]:
super(GeoSpan, extract_geonames(x).spans[0])

<super: annotator.geoname_annotator.GeoSpan, 809-815:Oxford>

In [49]:
combine_extracted_info(x, extract_geonames)

{'annotations': {'disease-ontology-keywords': None},
 'meta': {'article-ids': {'coden': 'ACSEBH',
   'doi': '10.1107/S1600536810008433',
   'pii': 'S1600536810008433',
   'pmc': '2984023',
   'pmid': '21580644',
   'publisher-id': 'bh2274'},
  'article-type': 'research-article',
  'keywords': []}}

In [14]:
"""
Takes an article and an iterable of extraction functions.
Returns a combined dict, ready to be used in a MongoDB '$set' command.
"""
def combine_extracted_info(article, *args):
    """
    Run every extraction function on ``article`` and merge the results.

    Parameters:
        article: the article document handed to each extractor.
        *args: extraction functions; each takes the article and returns a dict.

    Returns one dict ready for a MongoDB '$set'. The merge is a shallow
    ``dict.update`` in argument order, so a later extractor replaces any
    top-level key an earlier one also returned.
    """
    merged = {}
    for partial in (extractor(article) for extractor in args):
        merged.update(partial)
    return merged

In [15]:
# Connect to the 'pmc' database with the default MongoClient() settings,
# build the keyword annotator from the ontology keywords, and keep a handle
# on the 'articlesubset' collection that the rest of the notebook works on.
db = pymongo.MongoClient()['pmc']
keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())
articles = db.articlesubset

In [16]:
cursor = articles.find({'meta': {'$exists': False}})
cursor.count()

10000

In [17]:
# Create a scratch document to experiment on: take the next article from the
# cursor, give the copy the fixed _id "test", replace any earlier "test"
# document in the collection with it, then read it back.
x = cursor.next()
x['_id'] = "test"
articles.delete_one({'_id': "test"})
articles.insert_one(x)
x = articles.find_one({'_id': 'test'})
x

{'_id': 'test',
 'nxml': '\n<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">Acta Crystallogr Sect E Struct Rep Online</journal-id><journal-id journal-id-type="publisher-id">Acta Cryst. E</journal-id><journal-title-group><journal-title>Acta Crystallographica Section E: Structure Reports Online</journal-title></journal-title-group><issn pub-type="epub">1600-5368</issn><publisher><publisher-name>International Union of Crystallography</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">21580644</article-id><article-id pub-id-type="pmc">2984023</article-id><article-id pub-id-type="publisher-id">bh2274</article-id><article-id pub-id-type="doi">10.1107/S1600

In [23]:
args = (extract_meta, extract_disease_ontology_keywords)

In [38]:
extract_geonames(x).__dict__

{'spans': [809-815:Oxford, 935-941:Oxford, 1322-1328:Oxford, 1382-1388:Oxford]}

In [20]:
to_write = combine_extracted_info(x, extract_geonames)
to_write

TypeError: 'AnnoTier' object is not iterable

In [91]:
articles.update_one({'_id': x['_id']}, {'$set': to_write})

<pymongo.results.UpdateResult at 0x11bc76f78>

In [92]:
x = articles.find_one({'_id': 'test'})
x

{'_id': 'test',
 'annotations': {'disease-ontology-keywords': [{'keyword': 'pertussis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_1116'}]},
 'meta': {'article-ids': {'pmc': '3742127',
   'pmid': '23946635',
   'publisher-id': '182',
   'publisher-manuscript': '2013MOLVIS0058'},
  'article-type': 'research-article',
  'keywords': []},
 'nxml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">Mol Vis</journal-id><journal-id journal-id-type="iso-abbrev">Mol. Vis</journal-id><journal-id journal-id-type="publisher-id">MV</journal-id><journal-title-group><journal-title>Molecular Vision</journal-title></journal-title-group><issn pub-type="epub">1090-0535</issn><publisher><publisher-name>

Yes! This works. Now, to do it with a bulk write.

In [114]:
# Annotate up to 100 articles that have no 'meta' yet and write the results
# back with a single unordered bulk operation.
args = (extract_meta, extract_disease_ontology_keywords)
bulk = articles.initialize_unordered_bulk_op()
cursor = articles.find({'meta': {'$exists': False}}, limit = 100)
for article in cursor:
    # Extract from the article being iterated. This used to pass the scratch
    # document `x`, which wrote x's metadata onto every one of the articles.
    to_write = combine_extracted_info(article, *args)
    bulk.find({'_id': article['_id']}).update({'$set': to_write})
bulk.execute()

74.77401326599988

Timings for bulk
14.206271923000031

In [115]:
# Same work as the bulk cell, but with one update_one round trip per article.
args = (extract_meta, extract_disease_ontology_keywords)
cursor = articles.find({'meta': {'$exists': False}}, limit = 100)
for article in cursor:
    # Extract from and update the article being iterated. This used to read
    # and update the scratch document `x` on every pass.
    to_write = combine_extracted_info(article, *args)
    articles.update_one({'_id': article['_id']}, {'$set': to_write})

76.03849305499989

In [159]:
articles.count({'meta': {'$exists': False}})

10000

In [154]:
"""
Notes:
- no_reannotation should now be the NAME of the field you don't want to reannotate
"""
def write_article_info(collection, *args, limit=0, no_reannotation=None):
    """
    Run the given extraction functions over articles and store the results.

    Parameters:
        collection: the collection to read articles from and update.
        *args: extraction functions, forwarded to ``combine_extracted_info``.
        limit: passed to ``find``; the default 0 applies no limit.
        no_reannotation: NAME of a field; when given, only documents that do
            not have that field yet are processed.

    For every selected article the combined extraction result is written with
    ``$set`` and a progress line ("Processing article i of n (p%)... Done!")
    is printed.
    """
    query = {}
    if no_reannotation is not None:
        query = {no_reannotation: {'$exists': False}}
    cursor = collection.find(query, limit=limit)
    # with_limit_and_skip=True makes the total respect `limit`.
    total_articles = cursor.count(with_limit_and_skip=True)
    for processed_articles, article in enumerate(cursor, start=1):
        # The "%" presentation type scales the fraction by 100; the previous
        # "{:.2}%" printed the raw fraction (e.g. "0.1%" for 1 of 10).
        print("Processing article {} of {} ({:.1%})...".format(processed_articles, total_articles, processed_articles / total_articles), end="")
        to_write = combine_extracted_info(article, *args)
        collection.update_one({'_id': article['_id']}, {'$set': to_write})
        print(" Done!")

In [155]:
# `extract_metab` is not defined anywhere in this notebook; the extractor
# defined above is `extract_meta`.
write_article_info(articles, extract_meta, extract_disease_ontology_keywords, limit=10, no_reannotation='meta')

Processing article 1 of 10 (0.1%)... Done!
Processing article 2 of 10 (0.2%)... Done!
Processing article 3 of 10 (0.3%)... Done!
Processing article 4 of 10 (0.4%)... Done!
Processing article 5 of 10 (0.5%)... Done!
Processing article 6 of 10 (0.6%)... Done!
Processing article 7 of 10 (0.7%)... Done!
Processing article 8 of 10 (0.8%)... Done!
Processing article 9 of 10 (0.9%)... Done!
Processing article 10 of 10 (1.0%)... Done!


In [6]:
db = pymongo.MongoClient()['pmc']
keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())
articles = db.articlesubset

In [11]:
db = pymongo.MongoClient()['pmc']

articles = db.articlesubset

In [140]:
cursor = articles.find({'meta': {'$exists': False}})

In [141]:
type(cursor)

pymongo.cursor.Cursor

In [142]:
cursor.count()

9883

In [10]:
keyword_annotator = KeywordAnnotator(keywords=get_annotation_keywords())
geoname_annotator = GeonameAnnotator()

In [51]:
# write_article_meta_to_mongo(x, articles)

In [52]:
x = articles.find_one({'_id': 'test', 'annotations.disease-ontology-keywords': {'$exists': True}})
x

{'_id': 'test',
 'annotations': {'disease-ontology-keywords': [{'keyword': 'Western equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10843'},
   {'keyword': 'St. Louis encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10845'},
   {'keyword': 'Japanese encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10844'},
   {'keyword': 'chikungunya',
    'uri': 'http://purl.obolibrary.org/obo/DOID_0050012'},
   {'keyword': 'Rift Valley fever',
    'uri': 'http://purl.obolibrary.org/obo/DOID_1328'},
   {'keyword': 'yellow fever',
    'uri': 'http://purl.obolibrary.org/obo/DOID_9682'},
   {'keyword': 'St. Louis encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10845'},
   {'keyword': 'Western equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_10843'},
   {'keyword': 'Venezuelan equine encephalitis',
    'uri': 'http://purl.obolibrary.org/obo/DOID_9584'},
   {'keyword': 'Eastern equine encephalitis',
    'uri': 'http://pur

In [158]:
articles.delete_one({'_id': "test"})

<pymongo.results.DeleteResult at 0x11bc63990>

In [None]:
iterate_articles(db.articlesubset)

In [246]:
strip_article_info(articles)

In [248]:
articles.count({'all_geonames': {'$exists': True}})

0