In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import clear_output

In [3]:
from pubcrawler.article import Article

In [105]:
import os
import pymongo
from bson.objectid import ObjectId
from collections import defaultdict
from random import sample, seed

In [159]:
import timeit

## Create a list of files

In [5]:
# Root directory of the local PMC Open Access bulk download. Set the
# PMC_OA_BULK_PATH environment variable to point the notebook at another copy
# of the data; otherwise the original external-drive location is used.
pmc_path = os.environ.get(
    "PMC_OA_BULK_PATH",
    "/Volumes/Transcend/datasets/pmc/2019-04-03/oa_bulk/",
)

In [6]:
# Recursively gather the full path of every file under `pmc_path` whose name
# ends in "xml", in the order os.walk yields them.
all_files = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(pmc_path)
    for filename in filenames
    if filename.endswith("xml")
]

## Counting coverage of different ID types from a sample of 1000 articles

In [16]:
# Draw 1000 distinct file paths at random. No seed is set in the cells above,
# so a re-run picks a different subset.
files = sample(all_files, 1000)

In [17]:
# Tally, across the sampled files, how often each publication-ID type and each
# article type occurs.
id_types = defaultdict(int)
article_types = defaultdict(int)

for idx, filename in enumerate(files):
    # Refresh the progress line every 25 files. Compare with `==`: `is` tests
    # object identity, which is not a reliable way to compare integers.
    if idx % 25 == 0:
        clear_output(wait=True)
        print("Processed {0} articles ({1:.1f}% done).".format(idx, idx/len(files) * 100))
    with open(filename, "r") as f:
        article = Article(f.read())

    ids = article.pub_ids().keys()
    if len(ids) == 0:
        # Articles carrying no identifiers at all are counted under "none".
        id_types["none"] += 1
    else:
        for item in ids:
            id_types[item] += 1

    # Articles whose type is falsy are left out of the article-type tally.
    article_type = article.article_type()
    if article_type:
        article_types[article_type] += 1

# Remove the last progress line once the loop has finished.
clear_output()

In [18]:
# Display how many sampled articles carried each ID type.
id_types

defaultdict(int,
            {'pmid': 923,
             'pmc': 1000,
             'doi': 850,
             'publisher-id': 691,
             'coden': 10,
             'pii': 46,
             'pmc-scan': 66,
             'other': 8,
             'art-access-id': 7,
             'manuscript': 13,
             'publisher-manuscript': 2})

In [19]:
# Display how many sampled articles fell under each article type.
article_types

defaultdict(int,
            {'research-article': 716,
             'other': 40,
             'meeting-report': 4,
             'case-report': 52,
             'review-article': 59,
             'letter': 13,
             'brief-report': 19,
             'abstract': 28,
             'correction': 14,
             'book-review': 20,
             'editorial': 19,
             'obituary': 2,
             'article-commentary': 3,
             'retraction': 3,
             'methods-article': 2,
             'news': 2,
             'reply': 1,
             'product-review': 1,
             'protocol': 2})

It's clear that the PMC ID is the most broadly and consistently applied identifier, which makes sense because we're dealing with PMC articles.

In [83]:
# Spot-check one randomly chosen file: print its article type, whether
# `article.soup.body` is truthy, and the extracted text (printed either way).
with open(sample(all_files, 1)[0], "r") as f:
    xml = f.read()
    article = Article(xml)
    print(article.article_type())
    print("BODY" if article.soup.body else "NO BODY")
    print(article.extract_text())

research-article
BODY



Introduction

A prominent feature of schizophrenia is patients’ social exclusion. Patients have smaller social networks, less satisfactory interpersonal relationships and greater unemployment than healthy people or patients with other psychiatric disorders [1], [2]. A central and debilitating feature of schizophrenia, which may contribute to patients’ social exclusion, is patients’ difficulty interacting with others. These social deficits are poorly understood. Patients display poor performance on ‘off-line’ assessments of social cognition, which investigates the ability to discriminate facial expressions in pictures, attribute emotional states to the protagonists in short narratives and infer intentions in abstract problem solving contexts [3]. However, how patients’ social deficits manifest during their ‘on-line’ social interactions with others remains largely unexplored.

Early psychiatrists described feeling an intuitive ‘lack of rapport’ when interacting w

## Load articles into a MongoDB collection

In [84]:
# Connect to the MongoDB server on this machine at port 27017.
client = pymongo.MongoClient(host="localhost", port=27017)

In [85]:
# Handle to the `articles` collection in the `pmc` database.
articles = client["pmc"]["articles"]

In [198]:
# Seed the RNG with a fixed string before drawing the 10,000-file sample, so
# the draw can be repeated on a later run.
# NOTE(review): repeatability also depends on `all_files` being in the same
# order each time; confirm os.walk ordering is stable for this volume.
seed("2019-04-04")
files = sample(all_files, 10000)

In [199]:
# Rebuild the `articles` collection from scratch: one document per sampled
# file, keyed by PMC ID, holding the raw XML and the text extracted from it.
articles.drop()
missing_pmc_id = []
for file in files:
    # Read the file and close it before the slower parse and database write.
    with open(file, "r") as f:
        xml = f.read()
    article = Article(xml)
    pmc_id = article.pub_ids().get("pmc")
    if pmc_id is None:
        # Without a PMC ID the document would be stored under `_id: None`, and
        # the next such file would abort the load with a duplicate-key error.
        missing_pmc_id.append(file)
        continue
    articles.insert_one({
        "_id": pmc_id,
        "xml": xml,
        "extracted_text": article.extract_text()
    })

# Report skipped files so that gaps in the collection are visible.
if missing_pmc_id:
    print("Skipped {0} file(s) with no PMC ID.".format(len(missing_pmc_id)))

In [172]:
# Fetch a single document from the collection to inspect what was stored.
articles.find_one()

{'_id': '24489775',
 'xml': '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">PLoS One</journal-id><journal-id journal-id-type="iso-abbrev">PLoS ONE</journal-id><journal-id journal-id-type="publisher-id">plos</journal-id><journal-id journal-id-type="pmc">plosone</journal-id><journal-title-group><journal-title>PLoS ONE</journal-title></journal-title-group><issn pub-type="epub">1932-6203</issn><publisher><publisher-name>Public Library of Science</publisher-name><publisher-loc>San Francisco, USA</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">24489775</article-id><article-id pub-id-type="pmc">3904948</article-id><article-id pub-id-type="publisher-id">P

Using `collection.update_one()`

In [173]:
# Write each sampled file to the collection as an upsert keyed by PMC ID:
# existing documents have their fields overwritten, missing ones are created.
for file in files:
    # Read the file and close it before the slower parse and database write.
    with open(file, "r") as f:
        xml = f.read()
    article = Article(xml)
    pmc_id = article.pub_ids().get("pmc")
    if pmc_id is None:
        # A missing PMC ID would make every such file upsert onto the single
        # `_id: None` document, each one silently overwriting the last.
        print("Skipping {0}: no PMC ID.".format(file))
        continue
    articles.update_one(
        filter={"_id": pmc_id},
        update={
            "$set": {
                "xml": xml,
                "extracted_text": article.extract_text()
            }
        },
        upsert=True)

## Full-text search from Python

TODO: Figure out how to use a boolean query

In [204]:
# Build a text index on `extracted_text`; the `$text` queries below need it.
articles.create_index([("extracted_text", pymongo.TEXT,)])

'extracted_text_text'

In [235]:
# Search string handed to `$text`. The inner double quotes are part of the
# string itself; MongoDB's `$text` operator reads a quoted span as a phrase.
study_site_query = '"field study"'

In [236]:
# Count the documents whose text index matches the search string.
articles.count_documents({"$text": {"$search": study_site_query}})

8404

In [224]:
# Run the text search, attach each hit's text score as `score`, and sort the
# cursor by that score.
text_score = {"$meta": "textScore"}
results = articles.find(
    {"$text": {"$search": study_site_query}},
    {"score": text_score},
).sort([("score", text_score)])

In [225]:
# Iterate the whole cursor and keep only each hit's extracted text, in the
# cursor's order.
result_list = [result["extracted_text"] for result in results]

In [230]:
# Print the extracted text of the third hit (index 2).
print(result_list[2])




1. Introduction

The current treatments for GBMs include surgery, fractionated radiotherapy (FR), and temozolomide. This approach provides a definite effect as patients receiving this multimodal treatment have a median survival of approximately 15 months [1, 2], compared with 3 months if no treatment is given. Surgical debulking reduces symptoms and provides tissue for diagnosis, but infiltrative tumor growth makes complete removal impossible. Conventional radiotherapy improves survival [3] but is associated with noteworthy toxicity due to the high doses delivered to the surrounding brain tissue. Thus, patients surviving more than 12 months often exhibit significant cognitive deficits. Due to the improved survival of GBM patients in recent years with two year survival rates of 26.5%, more people will live to experience these side effects [4]. As such, optimized radiation modalities to reduce toxicity are warranted. Future treatments may include lower dose FR combined with hypofracti

## Basic code for walking with a limit

However, this isn't usable, because we actually need to *randomly* sample the files.

In [86]:
# Walk the directory tree and print the PMID of the first `i` files whose name
# ends in "xml", stopping once the budget is used up.
# NOTE: the walk's `files` loop variable rebinds the notebook-level `files`
# list that holds the random sample.
i = 10
for path, dirs, files in os.walk(pmc_path):
    if i <= 0:
        break
    # Slice by the remaining budget so one directory cannot overshoot it.
    for name in files[:i]:
        if not name.endswith("xml"):
            continue
        i -= 1
        with open(os.path.join(path, name), "r") as handle:
            article = Article(handle.read())
        print(article.pub_ids().get("pmid"))
print("Stopped.")

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
Stopped.


In [85]:
## Counting how many have PMIDs.

In [105]:
# Scratch cell: create an empty dict.
foo = dict()

In [107]:
# Scratch check: show the number of keys in `foo`.
len(foo.keys())

0

In [113]:
# Tally publication-ID types over the first `i` files that os.walk yields with
# a name ending in "xml". This is not a random sample, and it replaces the
# `id_types` tally built earlier in the notebook.
id_types = defaultdict(int)

i = 2500
for path, dirs, files in os.walk(pmc_path):
    if i <= 0:
        break
    # Slice by the remaining budget so one directory cannot overshoot it.
    for name in files[:i]:
        if name.endswith("xml"):
            # Print the remaining budget every 25 files. Compare with `==`:
            # `is` tests object identity, not numeric equality.
            if i % 25 == 0:
                print(i)
            i -= 1
            # Use a separate name for the handle so the loop variable is not
            # rebound to a file object.
            with open(os.path.join(path, name), "r") as f:
                article = Article(f.read())
            ids = article.pub_ids().keys()
            if len(ids) == 0:
                # Articles with no identifiers at all are counted as "none".
                id_types["none"] += 1
            else:
                for item in ids:
                    id_types[item] += 1

print(id_types)

2500
2475
2450
2425
2400
2375
2350
2325
2300
2275
2250
2225
2200
2175
2150
2125
2100
2075
2050
2025
2000
1975
1950
1925
1900
1875
1850
1825
1800
1775
1750
1725
1700
1675
1650
1625
1600
1575
1550
1525
1500
1475
1450
1425
1400
1375
1350
1325
1300
1275
1250
1225
1200
1175
1150
1125
1100
1075
1050
1025
1000
975
950
925
900
875
850
825
800
775
750
725
700
675
650
625
600
575
550
525
500
475
450
425
400
375
350
325
300
275
250
225
200
175
150
125
100
75
50
25
defaultdict(<class 'int'>, {'pmid': 2484, 'pmc': 2500, 'doi': 2500, 'publisher-id': 2476, 'art-access-id': 4, 'manuscript': 11, 'coden': 2002, 'pii': 1995})
