In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pymongo
from collections import defaultdict

In [3]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [8]:
from IPython.display import clear_output

In [9]:
from pubcrawler.article import Article
from bs4 import BeautifulSoup

In [10]:
from reporter import Reporter

In [11]:
client = pymongo.MongoClient("localhost", 27017)
articles = client.pmc.articles

In [12]:
with open("terms") as f:
    terms = [line.strip() for line in f.readlines()]

In [13]:
art = articles.find_one()

In [14]:
# To sample a random article:
art = articles.aggregate([
    { "$sample": { "size": 1 } }
]).next()

In [15]:
all_articles = list(articles.find({}))

In [105]:
corpus = [article["extracted_text"] for article in all_articles]

In [137]:
terms

['field work',
 'fieldwork',
 'field study',
 'field site',
 'field area',
 'study site',
 'study location',
 'study area',
 'research site',
 'research location',
 'sampling site',
 'sampling location',
 'sampling area']

In [178]:
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True)
tfidf = transformer.fit_transform(counts)
word_features = vectorizer.get_feature_names()

In [179]:
for i in range(10):
    word_indexes = tfidf.getrow(i).todense().A1.argsort()[-10:][::-1]
    print(word_indexes)
    for i in word_indexes:
        print(word_features[i])

[322204 256677 233299 461838 561712 971313 610883 971311 778468 562046]
citi
botswana
batswana
fhb
hrdc
seminars
jfm
seminar
nonissues
hru
[ 790005 1101959 1073438  809599  447313  282914 1122152  243340  252286
  437990]
ob
uw
treadmill
ow
exercise
calories
walking
bicycle
bmi
ergometer
[221560 327330 816278 351788 957664 661510 458111 837499 584310 816496]
avb
cmr
pacemaker
cs
sarcoidosis
lge
fdg
pet
implantation
pacing
[980339 698890 585305 577284 312817 480478 469401 687615 497705 407725]
shlomi
mba
inactivereactions
iems
checkmodelconsistency
fva
fluxes
mahadevan
generic
duarte
[660260 314954 371466 248354 519958 725121 217678 456655 809702 696136]
leukemia
childhood
daycare
birth
greaves
miscarriage
attendance
father
ownership
maternal
[ 762901  627716  988483  799108 1100513  878864  479228 1070437  266491
 1016587]
nemesis
ketcher
sketch
openbabel
users
project
functionality
trajectory
build
structure
[ 953659  541212 1089785  932509  990020  740154  152392  220204  220195
  18

In [107]:
x = "background of infectious disease mosquito sick"

In [123]:
y = vectorizer.transform([x])
z = transformer.transform(y)
z2 = z.todense().A1.argsort()[-10:][::-1]
print(z2)
for i in z2:
    print(word_features[i])

[736358 982377 588862 227412 394134 391707 391705 391736 391735 391734]
mosquito
sick
infectious
background
disease
dimiker
dimidschstein
diminutesished
diminuted
diminuta


## Get the tf-idf scores for documents matching a certain term

In [139]:
term = "field area"

In [145]:
all_articles[0]["text_matches"]

['study site']

In [151]:
match_index = []
for i, article in enumerate(all_articles):
    if article.get("text_matches"):
        if term in article.get("text_matches"):
            match_index.append(i)

In [155]:
print(corpus[match_index[0]])




Introduction

A growing body of research on the communicative behaviour of non-human primates has demonstrated that their vocalisations can convey a considerably rich amount of information that is meaningful to receivers (e.g. [1]). For instance, field experiments with various primate species have shown that acoustically distinct alarm calls can inform listeners about specific types of dangers (e.g. [2]–[5]). In some species, there is evidence that signallers produce strings of acoustically variable calls composed in context-specific ways (e.g. [6]–[8]). For example, black-and-white Colobus monkeys (Colobus polykomos, C. guereza) produce two types of vocalisations to predators, which are arranged in event-specific sequences that are seemingly meaningful to others [8].

Food discovery is another event type during which some primates produce highly context-specific vocalisations. Since food is often patchily distributed and seasonally dispersed, food calls can provide listeners with a

In [160]:
i = 593

In [161]:
article_tfidf = tfidf.getrow(i)

In [173]:
dense = article_tfidf.todense()

In [175]:
dense.

array([0.01425685, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [157]:
for i in match_index:
    word_indexes = tfidf.getrow(i).todense().A1.argsort()[-10:][::-1]
    article_tfidf = tfidf.getrow(i)
    
    print(word_indexes)
    for i in word_indexes:
        print(word_features[i])

[[0.01425685 0.         0.         ... 0.         0.         0.        ]]
[632189 198563 471176 855415 471461 255138 282795 892066 973154 587331]
kiwi
apple
food
playback
foraging
bonobos
calls
py
sequences
individuals
[[0.        0.0027204 0.        ... 0.        0.        0.       ]]
[ 711305  309924  429585  576144 1020736  694238  444644  995491  734764
  414854]
methanogens
ch4
enceladus
icy
subsurface
mars
europa
solar
moons
earth
[[0.01539038 0.02097378 0.         ... 0.         0.         0.        ]]
[1013019  347431  251864 1030135  739649  350460  399969  739722 1013021
  348150]
stifle
cr
bm
synovial
msc
crp
dogs
mscs
stifles
crcl
[[0.         0.00298952 0.         ... 0.         0.         0.        ]]
[1048546 1016386  888307  886893  879455  745420  255664  449177  594449
  689388]
tert
stromal
pts
pt
promoter
mutation
borderline
expression
intermediate
malignant
[[0. 0. 0. ... 0. 0. 0.]]
[ 748798  323378  462078 1038662  915717  748805 1152022  217778  972059
  972220]


In [43]:
Y = vectorizer2.fit_transform(corpus)

In [45]:
Y.toarray()

array([[0.00484436, 0.0078168 , 0.00484436, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00906395, 0.        ,
        0.00906395],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01458545,
        0.        ],
       [0.        , 0.01433153, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [48]:
vectorizer2.get_feature_names()

['0001',
 '001',
 '003',
 '004',
 '01',
 '04',
 '040',
 '041',
 '043',
 '05',
 '051',
 '057',
 '060',
 '073',
 '076',
 '08',
 '080',
 '094',
 '10',
 '100',
 '1000',
 '101112',
 '1083',
 '11',
 '1187',
 '12',
 '121',
 '124',
 '13',
 '1360',
 '137',
 '14',
 '144',
 '1472',
 '15',
 '16',
 '163',
 '17',
 '177',
 '178',
 '18',
 '1827',
 '18f',
 '19',
 '190',
 '1905',
 '195',
 '1964',
 '1970',
 '1980',
 '1991',
 '1997',
 '1999',
 '20',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2013',
 '21',
 '211',
 '214',
 '22',
 '2235',
 '23',
 '24',
 '243',
 '25',
 '2500',
 '26',
 '27',
 '2732',
 '2766',
 '277',
 '28',
 '29',
 '30',
 '303',
 '304',
 '31',
 '314',
 '32',
 '33',
 '34',
 '342',
 '349',
 '35',
 '352',
 '358',
 '36',
 '37',
 '374',
 '3742',
 '38',
 '3844',
 '385',
 '39',
 '392',
 '40',
 '4000',
 '41',
 '42',
 '43',
 '431',
 '435',
 '44',
 '448',
 '454',
 '46',
 '460',
 '46073',
 '47',
 '471',
 '48',
 '4806',
 '484',
 '49',
 '50',
 '507

In [50]:
vectorizer2.vocabulary_

{'background': 418,
 'many': 1611,
 'countries': 752,
 'in': 1366,
 'the': 2526,
 'global': 1210,
 'south': 2386,
 'are': 349,
 'hard': 1250,
 'pressed': 1984,
 'to': 2558,
 'identify': 1330,
 'country': 753,
 'personnel': 1901,
 'with': 2742,
 'adequate': 259,
 'training': 2579,
 'human': 1312,
 'subjects': 2452,
 'research': 2176,
 'ethics': 996,
 'participate': 1863,
 'as': 361,
 'investigators': 1469,
 'staff': 2408,
 'or': 1816,
 'members': 1649,
 'of': 1783,
 'review': 2206,
 'bodies': 477,
 'international': 1452,
 'partners': 1870,
 'have': 1255,
 'attempted': 391,
 'address': 257,
 'this': 2543,
 'need': 1731,
 'by': 495,
 'incorporating': 1380,
 'short': 2324,
 'courses': 755,
 'and': 323,
 'workshops': 2755,
 'into': 1460,
 'their': 2527,
 'capacity': 510,
 'building': 491,
 'programs': 2018,
 'supporting': 2479,
 'host': 1300,
 'initiatives': 1416,
 'implement': 1351,
 'efforts': 932,
 'own': 1844,
 'while': 2729,
 'such': 2459,
 'generally': 1191,
 'share': 2316,
 'common':

In [52]:
vectorizer2.vocabulary_.get("background")

418

In [54]:
vectorizer2.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [56]:
X = vectorizer2.inverse_transform(Y)

In [61]:
x = X[0]

In [62]:
y = Y[0]

In [67]:
pd.DataFrame(zip(x, y))

Unnamed: 0,0,1
0,background,"(0, 418)\t0.006488649324705781\n (0, 1611)\..."


## Code for viewing documents

In [17]:
terms

['field work',
 'fieldwork',
 'field study',
 'field site',
 'field area',
 'study site',
 'study location',
 'study area',
 'research site',
 'research location',
 'sampling site',
 'sampling location',
 'sampling area']

In [25]:
def print_article(article):
    print("""
PMC ID

{}


TITLE

{}


METADATA

{}


TEXT MATCHES

{}


EXTRACTED TEXT

{}""".format(article.get("_id"),
             article.get("article_title"),
             article.get("article_meta"),
             article.get("text_matches"),
             article.get("extracted_text").strip()))

In [18]:
def view_one_article(query={ "$match": { "text_matches": { "$in": terms } } }):
    query = [
        query,
        { "$sample": { "size": 1 } }
    ]
    article = articles.aggregate(query).next()
    print("""
PMC ID

{}


TITLE

{}


METADATA

{}


TEXT MATCHES

{}


EXTRACTED TEXT

{}""".format(article.get("_id"),
             article.get("article_title"),
             article.get("article_meta"),
             article.get("text_matches"),
             article.get("extracted_text").strip()))

In [31]:
terms

['field work',
 'fieldwork',
 'field study',
 'field site',
 'field area',
 'study site',
 'study location',
 'study area',
 'research site',
 'research location',
 'sampling site',
 'sampling location',
 'sampling area']

In [46]:
view_one_article()


PMC ID

6210248


TITLE

The Effect of Different Habitat Types and Ontogenetic Stages on the Diet Shift of a Critically Endangered Fish Species, Coreius guichenoti (Sauvage and Dabry de Thiersant, 1874)


METADATA

{'has_body': True, 'article_type': 'research-article'}


TEXT MATCHES

['study area', 'sampling site']


EXTRACTED TEXT

1. Introduction

The identification and protection of fish critical habitats are central to the active management of species at risk [1,2]. Generally, the spawning grounds and nursery, migration, and rearing areas on which fish species depend directly to complete their life history cycles, are identified as the critical habitats for fish species [1]. However, many fish species use different critical habitats within different life history stages [3,4]. The distance between critical habitats may extend to several hundred or thousands of kilometers. Due to environmental heterogeneity in different habitats, the fish species may exhibit plastic dietary pattern

In [66]:
problematic_terms = terms[-3:]

In [76]:
view_one_article({ "$match": { "text_matches": { "$in": problematic_terms } } })


PMC ID

6002991


TITLE

Decadal stability in genetic variation and structure in the intertidal seaweed Fucus serratus (Heterokontophyta: Fucaceae)


METADATA

{'has_body': True, 'article_type': 'research-article'}


TEXT MATCHES

['sampling site', 'sampling location']


EXTRACTED TEXT

Background

Understanding temporal stability of genetic structure and diversity is crucial for the utility of temporal snapshots in conservation management and to infer how climate-induced range shifts might affect the future distribution and adaptive potential of species. In trailing edge populations, effective population size and genetic diversity are considered major keys to adaptive potential and subsequent persistence under climate change [1, 2]. In contrast, the evolutionary potential and survival of low-diversity leading edge populations [3] may be either enhanced or impaired by the ‘surfing’ of new mutations that can rapidly increase in frequency over iterated founder events, depending on wheth

In [53]:
phrase = """
"Following optic nerve injury, few axons grow beyond the lesion, but we find these axons branch and form loops proximal to the lesion"
"""

In [54]:
cursor = articles.find({'$text': {'$search': phrase}})

In [55]:
print_article(cursor.next())


PMC ID

5575138


TITLE

3D Visualization of Individual Regenerating Retinal Ganglion Cell Axons Reveals Surprisingly Complex Growth Paths


METADATA

{'has_body': True, 'article_type': 'research-article'}


TEXT MATCHES

['field area']


EXTRACTED TEXT

Significance Statement

Retinal ganglion cells (RGCs) are viewed as being incapable of mounting lengthy axon regeneration. Using whole tissue immunolabeling, we establish a technique to visualize and trace the entire paths of small populations of genetically labeled RGC axons as they regenerate. Following optic nerve injury, few axons grow beyond the lesion, but we find these axons branch and form loops proximal to the lesion. A regeneration inducing treatment further exacerbates branching and tortuous growth, while only modestly increasing the number of RGC axons that successfully grow beyond the lesion. Our study demonstrates extensive and circuitous RGC axon elongation both in pre- and post-lesion regions, highlighting the need to 

In [24]:
view_one_article({ "$match": {'$text': {'$search': phrase}}})


PMC ID

5063180


TITLE

Linking quality of care and training costs: cost‐effectiveness in health professions education


METADATA

{'has_body': True, 'article_type': 'research-article'}


TEXT MATCHES

None


EXTRACTED TEXT

Introduction

Health professions education involves training and certifying care‐provider groups in specific procedures. However, there are considerable associated costs that have been estimated globally to amount to more than 80 billion Euros per year.1 Because some training is usually more effective than no training2 but often associated with considerable monetary and time costs, identifying the most cost‐effective strategy can be challenging.3 Nonetheless, many institutions have to balance the need for training new health care providers in performing certain types of procedures against the costs associated with training.4 Cost‐effectiveness analyses are suitable for these types of decisions; however, only a few studies have attempted to link training costs to 

In [31]:
# To sample an article with text matches from our terms:
art = articles.aggregate([
    { "$match": { "text_matches": { "$in": terms } } },
    { "$sample": { "size": 1 } }
]).next()

# Print the ID, metadata, matches, and extracted text of your article
print("""
PMC ID

{}


TITLE

{}


METADATA

{}


TEXT MATCHES

{}


EXTRACTED TEXT

{}""".format(art.get("_id"),
             art.get("article_title"),
             art.get("article_meta"),
             art.get("text_matches"),
             art.get("extracted_text").strip()))


PMC ID

3543354


TITLE

None


METADATA

None


TEXT MATCHES

['study area']


EXTRACTED TEXT

Introduction

The transfer of energy and recycling of nutrients via the decomposition of organic matter is a central unifying process that links all organisms to the functioning of ecosystems [1]. However, dead organic matter varies enormously in its spatial and temporal distribution [2], and this determines the magnitude of its contribution to nutrient cycling [3] and the diversity and dynamics of its consumers [4]. Animal carrion is the most nutrient-rich form of dead organic matter [5], and recent reviews have highlighted the overlooked role of carrion in food webs [6], [7], [8], and driving variation in biodiversity and ecological processes in landscapes [5], [9]. Widespread changes to the population dynamics of large vertebrates, through loss of top predators [10] or hunting and harvesting of wild herbivores [11], are affecting the distribution and input of carrion resources in some te

In [24]:
soup = BeautifulSoup(art["xml"])

In [33]:
soup.front.find("article-id", attrs={"pub-id-type": "pmc"}).get_text()

'3573865'

In [136]:
article.article_type()

'research-article'

In [175]:
article.soup.find("front").find("journal-meta").find("journal-title").get_text()

'PLoS ONE'

In [14]:
article.journal_title()

'Methods in Ecology and Evolution'

In [16]:
article.article_title()

'Advances in multiplex PCR: balancing primer efficiencies and improving detection success'

## DataFrame scratch

In [107]:
foo = pd.DataFrame()

In [117]:
pd.DataFrame({"one": [10], "two": ["dog"]})

Unnamed: 0,one,two
0,10,dog


In [119]:
pd.DataFrame.from_records([{"one": 10, "two": "dog"}])

Unnamed: 0,one,two
0,10,dog


In [112]:
pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],ignore_index=True)

Unnamed: 0,A
0,0
1,1
2,2
3,3
4,4


In [None]:
pd.concat([foo])

## Count article types and other metadata

TODO: Check if an article's `_id` is in `result_set` and add those to a different default_dict

In [25]:
articles.count_documents({})

211

In [15]:
cursor = articles.aggregate([{ "$sample": { "size": articles.count_documents({}) } }])

KeyboardInterrupt: 

In [None]:
count = articles.count_documents({})
cursor = articles.find({})

In [46]:
reporter = Reporter(25, len(matches))

rows = []

for idx, record in enumerate(matches):
    reporter.report(idx)

    row = {}
    article = Article(record["xml"])

    row["id"] = record["_id"]
    row["keywords"] = article.keywords()
    row["id_types"] = list(article.pub_ids().keys())
    row["article_type"] = article.article_type()
    row["has_body"] = True if article.soup.body else False
    row["text_matches"] = record.get("text_matches")
    row["article_title"] = article.article_title()
    row["journal_title"] = article.journal_title()
    row["text_length"] = len(record["extracted_text"])

    rows.append(row)

[F[KProcessed 175 articles (82.9%) in 0m8s; about 0m2s left.


KeyboardInterrupt: 

In [47]:
article_df = pd.DataFrame.from_records(rows)

In [282]:
article_df["any_matches"] = [False if row is None else True for row in article_df["text_matches"]]
article_df.groupby(["any_matches"]).size()

any_matches
False    19519
True       481
dtype: int64

In [356]:
article_df.to_csv("article_sample.csv")

In [285]:
article_df.groupby(["any_matches", "article_type"]).size()
art_types = article_df.groupby(["any_matches", "article_type"]).size().reset_index()
art_types.columns = ["any_matches", "article_type", "count"]
art_types.sort_values(by = ["any_matches", "count"], ascending=False)

Unnamed: 0,any_matches,article_type,count
39,True,research-article,446
40,True,review-article,15
32,True,brief-report,7
38,True,protocol,4
33,True,case-report,3
37,True,other,3
34,True,data-paper,1
35,True,editorial,1
36,True,letter,1
28,False,research-article,14138


In [364]:
keep_vars = ["id", "any_matches"]
unnest_var = "keywords"

def unnest(data, unnest_var, keep_vars):
    all_vars = keep_vars + [unnest_var]
    nested = article_df.loc[:, keep_vars + [unnest_var]]
    lens = [len(item) if item is not None else 1 for item in nested[unnest_var]]
    unnested_dict = {var: np.repeat([nested[var].values], lens) for var in keep_vars}
    unnested_dict[unnest_var] = np.hstack(nested[unnest_var])
    unnested = pd.DataFrame(unnested_dict)
    return(unnested)

In [369]:
keywords = unnest(article_df, "keywords", ["id", "any_matches"])

keywords.to_csv("keywords.csv")

id_types = unnest(article_df, "id_types", ["id", "any_matches"])

id_types.to_csv("id_types.csv")

text_matches = unnest(article_df, "text_matches", ["id"])

text_matches.to_csv("text_matches.csv")

### Old version

In [131]:
id_types = defaultdict(int)
article_types = defaultdict(int)
keywords = defaultdict(int)

reporter = Reporter(10, len(matches))

for idx, doc in enumerate(matches):
    if idx is not 0 and idx % reporter.interval is 0:
        reporter.report(idx)
    article = Article(doc["xml"])
    
    kwds = article.keywords()
    if kwds:
        for kwd in kwds:
            keywords[kwd] += 1
    
    ids = article.pub_ids().keys()
    if len(ids) is 0:
        id_types["none"] += 1
    else:
        for item in ids:
            id_types[item] += 1
    
    article_type = article.article_type()
    if article_type:
        article_types[article_type] += 1



Processed 240 articles (96.0%) in 0m12s; about 0m1s left.


In [105]:
keywords = pd.DataFrame(list(keywords.items()), columns=["keyword", "count"])

In [106]:
keywords.sort_values("count", ascending=False)

Unnamed: 0,keyword,count
43,inflammation,6
587,oxidative stress,5
159,epidemiology,5
183,Obesity,4
279,Prostate cancer,4
1220,breast cancer,4
647,SNP,4
1833,cytotoxicity,4
45,mitochondria,4
294,depression,4


In [97]:
id_types

defaultdict(int,
            {'pmid': 18771,
             'pmc': 20000,
             'publisher-id': 13829,
             'doi': 17408,
             'pii': 1117,
             'pmc-scan': 1198,
             'manuscript': 206,
             'coden': 219,
             'art-access-id': 208,
             'other': 134,
             'publisher-manuscript': 22,
             'sici': 27,
             'medline': 4})

In [98]:
article_types

defaultdict(int,
            {'research-article': 14584,
             'abstract': 507,
             'brief-report': 318,
             'retraction': 12,
             'other': 748,
             'review-article': 1294,
             'editorial': 315,
             'case-report': 899,
             'correction': 245,
             'news': 82,
             'discussion': 42,
             'letter': 302,
             'meeting-report': 67,
             'product-review': 31,
             'reply': 9,
             'article-commentary': 92,
             'addendum': 3,
             'book-review': 306,
             'protocol': 31,
             'obituary': 29,
             'rapid-communication': 15,
             'methods-article': 13,
             'in-brief': 17,
             'systematic-review': 11,
             'introduction': 9,
             'oration': 2,
             'books-received': 3,
             ' case-report': 1,
             'data-paper': 8,
             'report': 2,
             'announcement'

## Look at the text of a single document matching a search result

In [23]:
articles.update_many(
    filter={},
    update={ "$unset": { "text_matches": "" } }
)

<pymongo.results.UpdateResult at 0x10d353408>