In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pymongo
from collections import defaultdict

In [3]:
import pandas as pd
import numpy as np

In [4]:
from IPython.display import clear_output

In [5]:
from pubcrawler.article import Article
from bs4 import BeautifulSoup

In [6]:
from reporter import Reporter

In [7]:
client = pymongo.MongoClient("localhost", 27017)
articles = client.pmc.articles

In [8]:
# Load the search terms, one per line. Blank lines are skipped so that an
# empty string never ends up in the `$in` queries built from `terms`.
with open("terms") as f:
    terms = [line.strip() for line in f if line.strip()]

In [19]:
art = articles.find_one()

In [17]:
# To sample a random article:
art = articles.aggregate([
    { "$sample": { "size": 1 } }
]).next()

In [32]:
terms

['field work',
 'fieldwork',
 'field study',
 'field site',
 'field area',
 'study site',
 'study location',
 'study area',
 'research site',
 'research location',
 'sampling site',
 'sampling location',
 'sampling area']

In [39]:
def view_one_article(query=None):
    """Print a summary of one randomly sampled article selected by ``query``.

    Parameters
    ----------
    query : dict, optional
        A single aggregation pipeline stage (typically a ``$match``). It is
        placed ahead of a ``{"$sample": {"size": 1}}`` stage. When omitted,
        articles whose ``text_matches`` contain any of the notebook-level
        ``terms`` are sampled.

    Returns
    -------
    None
        The PMC ID, title, metadata, text matches and extracted text are
        printed. If the pipeline yields no document, a short notice is
        printed instead.
    """
    if query is None:
        # Built at call time so that changes made to `terms` after this cell
        # was run are honoured (a default argument would be frozen at `def`).
        query = { "$match": { "text_matches": { "$in": terms } } }

    pipeline = [
        query,
        { "$sample": { "size": 1 } }
    ]
    # next(..., None) avoids a bare StopIteration when nothing matches.
    article = next(articles.aggregate(pipeline), None)
    if article is None:
        print("No article matched the query.")
        return

    # `.get` returns None when a field is absent; fall back to an empty
    # string so `.strip()` cannot fail on a record without extracted text.
    extracted_text = article.get("extracted_text") or ""

    print("""
PMC ID

{}


TITLE

{}


METADATA

{}


TEXT MATCHES

{}


EXTRACTED TEXT

{}""".format(article.get("_id"),
             article.get("article_title"),
             article.get("article_meta"),
             article.get("text_matches"),
             extracted_text.strip()))

In [43]:
problematic_terms = terms[-3:]

In [53]:
view_one_article({ "$match": { "text_matches": { "$in": problematic_terms } } })


PMC ID

3538743


TITLE

None


METADATA

None


TEXT MATCHES

['study site', 'study area', 'sampling location', 'sampling area']


EXTRACTED TEXT

Introduction

Comparisons of persisting versus shorter-term effects of a given long-term disturbance are less common than might be expected; grazing management has provided a good laboratory for such studies, because of detailed, long-term stock use records, the presence of de facto long-term exclosures, and an understanding among managers that long- and short-term grazing effects may differ [1]–[4]. These studies have shown both similarities between long- and short-term effects as well as divergent effects [5], [6]. A significant period of time may be required for indirect effects or other subtle, slow, or complex processes to operate in ecosystems [2], [3], [7] resulting in, for example, short-term increases in nutrients due to grazing eventually transitioning to long-term nutrient decreases [2], [8], which in turn may influence arthropo

In [31]:
# To sample an article with text matches from our terms:
art = articles.aggregate([
    { "$match": { "text_matches": { "$in": terms } } },
    { "$sample": { "size": 1 } }
]).next()

# `.get` returns None when the field is absent; use an empty string instead
# so the `.strip()` below cannot raise on a record without extracted text.
art_text = art.get("extracted_text") or ""

# Print the ID, metadata, matches, and extracted text of your article
print("""
PMC ID

{}


TITLE

{}


METADATA

{}


TEXT MATCHES

{}


EXTRACTED TEXT

{}""".format(art.get("_id"),
             art.get("article_title"),
             art.get("article_meta"),
             art.get("text_matches"),
             art_text.strip()))


PMC ID

3543354


TITLE

None


METADATA

None


TEXT MATCHES

['study area']


EXTRACTED TEXT

Introduction

The transfer of energy and recycling of nutrients via the decomposition of organic matter is a central unifying process that links all organisms to the functioning of ecosystems [1]. However, dead organic matter varies enormously in its spatial and temporal distribution [2], and this determines the magnitude of its contribution to nutrient cycling [3] and the diversity and dynamics of its consumers [4]. Animal carrion is the most nutrient-rich form of dead organic matter [5], and recent reviews have highlighted the overlooked role of carrion in food webs [6], [7], [8], and driving variation in biodiversity and ecological processes in landscapes [5], [9]. Widespread changes to the population dynamics of large vertebrates, through loss of top predators [10] or hunting and harvesting of wild herbivores [11], are affecting the distribution and input of carrion resources in some te

In [24]:
# Parse the sampled article's raw XML so its front matter can be inspected.
# NOTE(review): no parser is named here, so BeautifulSoup chooses one itself;
# consider passing it explicitly so results do not depend on what is installed.
soup = BeautifulSoup(art["xml"])

In [33]:
soup.front.find("article-id", attrs={"pub-id-type": "pmc"}).get_text()

'3573865'

In [136]:
article.article_type()

'research-article'

In [175]:
article.soup.find("front").find("journal-meta").find("journal-title").get_text()

'PLoS ONE'

In [14]:
article.journal_title()

'Methods in Ecology and Evolution'

In [16]:
article.article_title()

'Advances in multiplex PCR: balancing primer efficiencies and improving detection success'

## DataFrame scratch

In [107]:
foo = pd.DataFrame()

In [117]:
pd.DataFrame({"one": [10], "two": ["dog"]})

Unnamed: 0,one,two
0,10,dog


In [119]:
pd.DataFrame.from_records([{"one": 10, "two": "dog"}])

Unnamed: 0,one,two
0,10,dog


In [112]:
pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],ignore_index=True)

Unnamed: 0,A
0,0
1,1
2,2
3,3
4,4


In [None]:
pd.concat([foo])

## Count article types and other metadata

TODO: Check whether each article's `_id` is in `result_set`, and tally those articles in a separate `defaultdict`.

In [25]:
articles.count_documents({})

211

In [15]:
cursor = articles.aggregate([{ "$sample": { "size": articles.count_documents({}) } }])

KeyboardInterrupt: 

In [None]:
count = articles.count_documents({})
cursor = articles.find({})

In [46]:
# NOTE(review): `matches` is not defined in any cell visible in this notebook;
# it has to be a sized collection of article records (it is passed to `len`).
reporter = Reporter(25, len(matches))

# One summary dict per article; collected here and turned into a DataFrame
# in a single step afterwards.
rows = []

for idx, record in enumerate(matches):
    reporter.report(idx)

    article = Article(record["xml"])

    row = {
        "id": record["_id"],
        "keywords": article.keywords(),
        "id_types": list(article.pub_ids().keys()),
        "article_type": article.article_type(),
        "has_body": bool(article.soup.body),
        # `.get`: records without search hits yield None here.
        "text_matches": record.get("text_matches"),
        "article_title": article.article_title(),
        "journal_title": article.journal_title(),
        "text_length": len(record["extracted_text"]),
    }

    rows.append(row)

[F[KProcessed 175 articles (82.9%) in 0m8s; about 0m2s left.


KeyboardInterrupt: 

In [47]:
article_df = pd.DataFrame.from_records(rows)

In [282]:
# Flag articles that carry any search-term hits. `notna()` is vectorised and
# treats both None and NaN as "no matches" (an `is None` test would count a
# NaN as a match).
article_df["any_matches"] = article_df["text_matches"].notna()
article_df.groupby(["any_matches"]).size()

any_matches
False    19519
True       481
dtype: int64

In [356]:
article_df.to_csv("article_sample.csv")

In [285]:
# Count articles per (any_matches, article_type) pair. The grouping is
# computed once and the size column is named directly via `reset_index`.
art_types = (
    article_df
    .groupby(["any_matches", "article_type"])
    .size()
    .reset_index(name="count")
)
art_types.sort_values(by = ["any_matches", "count"], ascending=False)

Unnamed: 0,any_matches,article_type,count
39,True,research-article,446
40,True,review-article,15
32,True,brief-report,7
38,True,protocol,4
33,True,case-report,3
37,True,other,3
34,True,data-paper,1
35,True,editorial,1
36,True,letter,1
28,False,research-article,14138


In [364]:
keep_vars = ["id", "any_matches"]
unnest_var = "keywords"

def unnest(data, unnest_var, keep_vars):
    """Expand a list-valued column into one row per list element.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``keep_vars`` and ``unnest_var``.
    unnest_var : str
        Name of the column whose cells are lists (or ``None``).
    keep_vars : list of str
        Columns to carry along; each value is repeated once per element of
        the corresponding ``unnest_var`` cell.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``keep_vars + [unnest_var]`` and a fresh
        default index. A ``None`` cell produces a single row whose
        ``unnest_var`` value is missing; an empty list produces no rows.
    """
    # Work on the frame that was passed in, not on a notebook-level global.
    nested = data.loc[:, keep_vars + [unnest_var]]

    # Normalise every cell to a list so that lengths and flattened values
    # always agree; None is kept as a single placeholder element.
    items = [[None] if item is None else list(item) for item in nested[unnest_var]]

    # Explicit integer dtype keeps np.repeat happy when `data` has no rows.
    lens = np.asarray([len(item) for item in items], dtype=int)

    unnested_dict = {var: np.repeat(nested[var].values, lens) for var in keep_vars}
    unnested_dict[unnest_var] = [value for item in items for value in item]

    return pd.DataFrame(unnested_dict)

In [369]:
# Expand each list-valued column of `article_df` into long format and write
# the result to its own CSV file.

# One row per (article, keyword).
keywords = unnest(article_df, unnest_var="keywords", keep_vars=["id", "any_matches"])
keywords.to_csv("keywords.csv")

# One row per (article, identifier type).
id_types = unnest(article_df, unnest_var="id_types", keep_vars=["id", "any_matches"])
id_types.to_csv("id_types.csv")

# One row per (article, matched search term).
text_matches = unnest(article_df, unnest_var="text_matches", keep_vars=["id"])
text_matches.to_csv("text_matches.csv")

### Old version

In [131]:
# Tally identifier types, article types and keywords over every record in
# `matches`; each mapping goes from a label to the number of times it was seen.
id_types = defaultdict(int)
article_types = defaultdict(int)
keywords = defaultdict(int)

reporter = Reporter(10, len(matches))

for idx, doc in enumerate(matches):
    # Integers are compared by value: `is` only appears to work because of
    # CPython's small-int cache and triggers a SyntaxWarning on Python 3.8+.
    if idx != 0 and idx % reporter.interval == 0:
        reporter.report(idx)
    article = Article(doc["xml"])

    kwds = article.keywords()
    if kwds:
        for kwd in kwds:
            keywords[kwd] += 1

    ids = article.pub_ids().keys()
    if len(ids) == 0:
        # Articles without any identifier are counted under "none".
        id_types["none"] += 1
    else:
        for item in ids:
            id_types[item] += 1

    article_type = article.article_type()
    if article_type:
        article_types[article_type] += 1



Processed 240 articles (96.0%) in 0m12s; about 0m1s left.


In [105]:
# Turn the keyword -> count mapping into a two-column DataFrame.
# NOTE(review): this rebinds `keywords` from the mapping to the DataFrame, so
# re-running this cell without re-running the counting loop would operate on
# the DataFrame rather than on the counts.
keywords = pd.DataFrame(list(keywords.items()), columns=["keyword", "count"])

In [106]:
keywords.sort_values("count", ascending=False)

Unnamed: 0,keyword,count
43,inflammation,6
587,oxidative stress,5
159,epidemiology,5
183,Obesity,4
279,Prostate cancer,4
1220,breast cancer,4
647,SNP,4
1833,cytotoxicity,4
45,mitochondria,4
294,depression,4


In [97]:
id_types

defaultdict(int,
            {'pmid': 18771,
             'pmc': 20000,
             'publisher-id': 13829,
             'doi': 17408,
             'pii': 1117,
             'pmc-scan': 1198,
             'manuscript': 206,
             'coden': 219,
             'art-access-id': 208,
             'other': 134,
             'publisher-manuscript': 22,
             'sici': 27,
             'medline': 4})

In [98]:
article_types

defaultdict(int,
            {'research-article': 14584,
             'abstract': 507,
             'brief-report': 318,
             'retraction': 12,
             'other': 748,
             'review-article': 1294,
             'editorial': 315,
             'case-report': 899,
             'correction': 245,
             'news': 82,
             'discussion': 42,
             'letter': 302,
             'meeting-report': 67,
             'product-review': 31,
             'reply': 9,
             'article-commentary': 92,
             'addendum': 3,
             'book-review': 306,
             'protocol': 31,
             'obituary': 29,
             'rapid-communication': 15,
             'methods-article': 13,
             'in-brief': 17,
             'systematic-review': 11,
             'introduction': 9,
             'oration': 2,
             'books-received': 3,
             ' case-report': 1,
             'data-paper': 8,
             'report': 2,
             'announcement'

## Look at the text of a single document matching a search result

In [23]:
# Destructive: removes the `text_matches` field from every document in the
# collection (the empty filter selects all of them).
articles.update_many({}, { "$unset": { "text_matches": "" } })

<pymongo.results.UpdateResult at 0x10d353408>