In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pymongo
from collections import defaultdict

In [221]:
import pandas as pd
import numpy as np

In [4]:
from IPython.display import clear_output

In [5]:
from pubcrawler.article import Article

In [6]:
client = pymongo.MongoClient("localhost", 27017)
articles = client.pmc.articles

In [7]:
# Make sure we have a text index
articles.create_index([("extracted_text", pymongo.TEXT,)])

'extracted_text_text'

In [141]:
import time
from statistics import mean
from IPython.display import clear_output
class Reporter:
    def __init__(self, interval, total):
        self.start = time.time()
        self.this_time = self.start
        self.interval = interval
        self.total = total
        self.times_per_batch = []
    
    def report(self, idx):
        idx += 1
        if idx % self.interval is not 0:
            return
        self.last_time = self.this_time
        self.this_time = time.time()
        time_per = (self.this_time - self.last_time) / self.interval
        self.times_per_batch.append(time_per)
        est_time_left = (self.total - idx) * mean(self.times_per_batch[-10:])
        elapsed = time.time() - self.start

        output = "Processed {0} articles ({1:.1f}%) in {2:.0f}m{3:.0f}s; about {4:.0f}m{5:.0f}s left.".format(
            idx,
            idx/self.total * 100,
            elapsed // 60,
            elapsed % 60,
            est_time_left // 60,
            est_time_left % 60)
        clear_output(wait=True)
        print(output)

## Search for articles matching terms

In [84]:
terms = [
    'field work',
    'fieldwork',
    'field study',
    'study site'
]

In [93]:
by_term = dict()
by_id = defaultdict(list)
for term in terms:
    results = articles.find(
        { '$text': { '$search': '"' + term + '"' } },
        { '_id': '_id', 'score': { '$meta': "textScore" } }
    )
    result_set = {result["_id"] for result in results}
    by_term[term] = result_set
    [by_id[result].append(term) for result in result_set]

In [86]:
all_matches = set.union(*by_term.values())

In [91]:
[(term, len(ids)) for term, ids in by_term.items()]

[('field work', 68),
 ('fieldwork', 95),
 ('field study', 53),
 ('study site', 318)]

### Add an article.text_matches field

In [94]:
for term in terms:
    results = articles.update_many(
        filter={ '$text': { '$search': '"' + term + '"' } },
        update={"$addToSet": {"text_matches": term}}
    )

In [95]:
articles.create_index("text_matches")

'text_matches_1'

This is an alternative version of the original search that uses the created index.

In [96]:
by_term = dict()
by_id = defaultdict(list)
for term in terms:
    results = articles.find(
        { "text_matches": term },
        { '_id': '_id'}
    )
    result_set = {result["_id"] for result in results}
    by_term[term] = result_set
    [by_id[result].append(term) for result in result_set]

In [97]:
all_matches = set.union(*by_term.values())

In [98]:
[(term, len(ids)) for term, ids in by_term.items()]

[('field work', 68),
 ('fieldwork', 95),
 ('field study', 53),
 ('study site', 318)]

This query will then allow us to find "one of" for all those text queries, *within* a Mongo pipeline.

In [57]:
art = articles.find_one(
    { "text_matches": { "$in": terms } }
)

In [58]:
art = articles.aggregate([
    { "$match": { "text_matches": { "$in": terms } } },
    { "$sample": { "size": 1 } }
]).next()

In [59]:
print(art["extracted_text"])




Introduction

Long-term cognitive impairment has been reported in up to 65% of individuals with moderate-severe traumatic brain injury (TBI) with adverse effects on independence, homemaking tasks, interpersonal relationships, leisure, employment, and other aspects of life.1 In spite of limited evidence of efficacy, prescribing pharmacological agents to improve chronic cognitive dysfunction after TBI is common practice. The dopaminergic agent and N-methyl-D-aspartate (NMDA) antagonist, amantadine (approved by the U.S. Food and Drug Administration for influenza prevention and Parkinson's disease), is commonly used for this purpose.2 Amantadine has a relatively benign side-effect profile (assuming adequate renal function) compared to other agents. Although previous studies are limited by small sample size and design flaws, preliminary evidence suggests some cognitive benefit from amantadine. Notably, there is strong evidence that amantadine improves rate of recovery acutely in those wi

In [36]:
article = Article(art["xml"])

In [136]:
article.article_type()

'research-article'

In [175]:
article.soup.find("front").find("journal-meta").find("journal-title").get_text()

'PLoS ONE'

In [176]:
article.journal_title()

'PLoS ONE'

## DataFrame scratch

In [107]:
foo = pd.DataFrame()

In [117]:
pd.DataFrame({"one": [10], "two": ["dog"]})

Unnamed: 0,one,two
0,10,dog


In [119]:
pd.DataFrame.from_records([{"one": 10, "two": "dog"}])

Unnamed: 0,one,two
0,10,dog


In [112]:
pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],ignore_index=True)

Unnamed: 0,A
0,0
1,1
2,2
3,3
4,4


In [None]:
pd.concat([foo])

## Count article types and other metadata

TODO: Check if an article's `_id` is in `result_set` and add those to a different default_dict

In [186]:
articles.count_documents({})

20000

In [185]:
cursor = articles.aggregate([{ "$sample": { "size": articles.count_documents({}) } }])

OperationFailure: Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.

In [187]:
matches = list(articles.find({}))

In [190]:
id_types = defaultdict(int)
article_types = defaultdict(int)
keywords = defaultdict(int)

reporter = Reporter(25, len(matches))

rows = []

for idx, record in enumerate(matches):
    reporter.report(idx)

    row = {}
    article = Article(record["xml"])

    row["id"] = record["id"]
    row["keywords"] = article.keywords()
    row["id_types"] = list(article.pub_ids().keys())
    row["article_type"] = article.article_type()
    row["has_body"] = True if article.soup.body else False
    row["text_matches"] = record.get("text_matches")
    row["article_title"] = article.article_title()
    row["journal_title"] = article.journal_title()
    row["text_length"] = len(record["extracted_text"])

    rows.append(row)

Processed 20000 articles (100.0%) in 16m52s; about 0m0s left.


In [191]:
article_df = pd.DataFrame.from_records(rows)

In [282]:
article_df["any_matches"] = [False if row is None else True for row in article_df["text_matches"]]
article_df.groupby(["any_matches"]).size()

any_matches
False    19519
True       481
dtype: int64

In [356]:
article_df.to_csv("article_sample.csv")

In [285]:
article_df.groupby(["any_matches", "article_type"]).size()
art_types = article_df.groupby(["any_matches", "article_type"]).size().reset_index()
art_types.columns = ["any_matches", "article_type", "count"]
art_types.sort_values(by = ["any_matches", "count"], ascending=False)

Unnamed: 0,any_matches,article_type,count
39,True,research-article,446
40,True,review-article,15
32,True,brief-report,7
38,True,protocol,4
33,True,case-report,3
37,True,other,3
34,True,data-paper,1
35,True,editorial,1
36,True,letter,1
28,False,research-article,14138


In [364]:
keep_vars = ["id", "any_matches"]
unnest_var = "keywords"

def unnest(data, unnest_var, keep_vars):
    all_vars = keep_vars + [unnest_var]
    nested = article_df.loc[:, keep_vars + [unnest_var]]
    lens = [len(item) if item is not None else 1 for item in nested[unnest_var]]
    unnested_dict = {var: np.repeat([nested[var].values], lens) for var in keep_vars}
    unnested_dict[unnest_var] = np.hstack(nested[unnest_var])
    unnested = pd.DataFrame(unnested_dict)
    return(unnested)

In [365]:
keywords = unnest(article_df, "keywords", ["id", "any_matches"])

In [336]:
keywords.to_csv("keywords.csv")

In [366]:
id_types = unnest(article_df, "id_types", ["id", "any_matches"])

In [361]:
id_types.to_csv("id_types.csv")

In [367]:
text_matches = unnest(article_df, "text_matches", ["id"])

In [369]:
text_matches.to_csv("text_matches.csv")

### Old version

In [131]:
id_types = defaultdict(int)
article_types = defaultdict(int)
keywords = defaultdict(int)

reporter = Reporter(10, len(matches))

for idx, doc in enumerate(matches):
    if idx is not 0 and idx % reporter.interval is 0:
        reporter.report(idx)
    article = Article(doc["xml"])
    
    kwds = article.keywords()
    if kwds:
        for kwd in kwds:
            keywords[kwd] += 1
    
    ids = article.pub_ids().keys()
    if len(ids) is 0:
        id_types["none"] += 1
    else:
        for item in ids:
            id_types[item] += 1
    
    article_type = article.article_type()
    if article_type:
        article_types[article_type] += 1



Processed 240 articles (96.0%) in 0m12s; about 0m1s left.


In [105]:
keywords = pd.DataFrame(list(keywords.items()), columns=["keyword", "count"])

In [106]:
keywords.sort_values("count", ascending=False)

Unnamed: 0,keyword,count
43,inflammation,6
587,oxidative stress,5
159,epidemiology,5
183,Obesity,4
279,Prostate cancer,4
1220,breast cancer,4
647,SNP,4
1833,cytotoxicity,4
45,mitochondria,4
294,depression,4


In [97]:
id_types

defaultdict(int,
            {'pmid': 18771,
             'pmc': 20000,
             'publisher-id': 13829,
             'doi': 17408,
             'pii': 1117,
             'pmc-scan': 1198,
             'manuscript': 206,
             'coden': 219,
             'art-access-id': 208,
             'other': 134,
             'publisher-manuscript': 22,
             'sici': 27,
             'medline': 4})

In [98]:
article_types

defaultdict(int,
            {'research-article': 14584,
             'abstract': 507,
             'brief-report': 318,
             'retraction': 12,
             'other': 748,
             'review-article': 1294,
             'editorial': 315,
             'case-report': 899,
             'correction': 245,
             'news': 82,
             'discussion': 42,
             'letter': 302,
             'meeting-report': 67,
             'product-review': 31,
             'reply': 9,
             'article-commentary': 92,
             'addendum': 3,
             'book-review': 306,
             'protocol': 31,
             'obituary': 29,
             'rapid-communication': 15,
             'methods-article': 13,
             'in-brief': 17,
             'systematic-review': 11,
             'introduction': 9,
             'oration': 2,
             'books-received': 3,
             ' case-report': 1,
             'data-paper': 8,
             'report': 2,
             'announcement'

## Look at the text of a single document matching a search result

In [None]:
article_df