In [225]:
#!pip install pymed
#!pip install entrezpy

In [14]:
!pip install biopython #required

Collecting biopython
  Downloading biopython-1.78-cp37-cp37m-macosx_10_9_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 146 kB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.78


In [1]:
import Bio

In [2]:
Bio.__version__

'1.79.dev0'

In [224]:
from Bio import Entrez
import xml.etree.ElementTree as ET 
import re
from bs4 import BeautifulSoup 
import pandas as pd

In [212]:
Entrez.email = 'jpzhangv@gmail.com'

In [229]:
search_query = '("neurofibromatoses"[MeSH Terms] OR "neurofibromatoses"[All Fields] OR "neurofibromatosis"[All Fields]) OR NF1[All Fields] OR  NF2[All Fields] AND "open access"[filter]'

In [230]:
search_results = Entrez.read(Entrez.esearch(db="pmc", term=search_query, retmax=10, usehistory="y"))

In [245]:
search_results["Count"]

'16733'

In [236]:
handle = Entrez.efetch(db="pmc", rettype="full", retmode="xml", retstart=0, retmax=int(search_results["Count"]), webenv=search_results["WebEnv"], query_key=search_results["QueryKey"])

In [54]:
#record = Entrez.read(handle, validate=False)

In [237]:
text = handle.read()
type(text)

bytes

In [238]:
#len(text)

In [223]:
text[:1000]

b'<?xml version="1.0" ?>\n<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">\n<pmc-articleset><article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article">\n  <?properties open_access?>\n  <front>\n    <journal-meta>\n      <journal-id journal-id-type="nlm-ta">JCI Insight</journal-id>\n      <journal-id journal-id-type="iso-abbrev">JCI Insight</journal-id>\n      <journal-id journal-id-type="publisher-id">JCI Insight</journal-id>\n      <journal-title-group>\n        <journal-title>JCI Insight</journal-title>\n      </journal-title-group>\n      <issn pub-type="epub">2379-3708</issn>\n      <publisher>\n        <publisher-name>American Society for Clinical Investigation</publisher-name>\n      </publisher>\n    </journal-meta>\n    <article-meta>\n      <article-id pub-id-type="pmid">32960816</article-id>\n      <article-id pub-id-ty

In [198]:
text[-2000:]

b'tes, one was derived from a neurofibromatosis patient and the other from an A-T heterozygote. When SV40 DNA was employed as the transforming agent for the latter, the transformation rate was no longer raised.</p>\n        <sec sec-type="scanned-figures">\n          <title>Images</title>\n          <fig id="F1">\n            <label>Fig. 1</label>\n            <graphic xlink:href="brjcancer00296-0052-a" xlink:role="586"/>\n          </fig>\n          <fig id="F2">\n            <label>Fig. 2</label>\n            <graphic xlink:href="brjcancer00296-0053-a" xlink:role="587"/>\n          </fig>\n        </sec>\n      </abstract>\n    </article-meta>\n  </front>\n  <body>\n    <supplementary-material content-type="scanned-pages">\n      <graphic xlink:href="brjcancer00296-0049.tif" xlink:title="scanned-page" xlink:role="583" mimetype="image" mime-subtype="tiff"/>\n      <graphic xlink:href="brjcancer00296-0050.tif" xlink:title="scanned-page" xlink:role="584" mimetype="image" mime-subtype="t

In [255]:
#text[89000:109000]

In [22]:
text.count(b'article-id pub-id-type="pmc"')

1200

In [28]:
text.count(b'<abstract>')

921

In [63]:
text.count(b'<article-title>')

75296

In [26]:
text.count(b'article-title')

150637

## Parse the raw XML data

In [239]:
%time
pmc_paper_data = BeautifulSoup(text, "xml") 

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 17.9 µs


In [30]:
set([tag.name for tag in pmc_paper_data.find_all()])

{'abstract',
 'ack',
 'addr-line',
 'address',
 'aff',
 'alt-text',
 'alt-title',
 'alternatives',
 'annotation',
 'anonymous',
 'app',
 'app-group',
 'array',
 'article',
 'article-categories',
 'article-id',
 'article-meta',
 'article-title',
 'article-version',
 'attrib',
 'author-comment',
 'author-notes',
 'award-group',
 'award-id',
 'back',
 'bio',
 'body',
 'bold',
 'boxed-text',
 'break',
 'caption',
 'chapter-title',
 'citation',
 'city',
 'col',
 'colgroup',
 'collab',
 'comment',
 'compound-subject',
 'compound-subject-part',
 'conf-date',
 'conf-loc',
 'conf-name',
 'conference',
 'contrib',
 'contrib-group',
 'contrib-id',
 'copyright-holder',
 'copyright-statement',
 'copyright-year',
 'corresp',
 'country',
 'counts',
 'custom-meta',
 'custom-meta-group',
 'date',
 'date-in-citation',
 'day',
 'def',
 'def-item',
 'def-list',
 'degrees',
 'disp-formula',
 'disp-quote',
 'edition',
 'element-citation',
 'elocation-id',
 'email',
 'equation-count',
 'etal',
 'ext-link',
 

In [70]:
len(pmc_paper_data.find_all('article'))

1200

In [71]:
len(pmc_paper_data.find_all('abstract'))

1282

In [59]:
#pmc_paper_data.find_all('article-id', attrs={'pub-id-type' : 'pmc'})
#./pmc-articleset/article/ [@pub-id-type="pmc"]

In [60]:
all_pmc_ids = [tag.text for tag in pmc_paper_data.find_all('article-id', attrs={'pub-id-type' : 'pmc'})]

## Extract useful information from the parsed XML data

In [240]:
all_article = pmc_paper_data.find_all('article')

In [244]:
len(all_article)

10000

In [None]:
all_article[0].find_all('')

In [241]:
pmc_papers_ls = []
for article in all_article:
    pmc_id = article.find("article-id", attrs={'pub-id-type' : 'pmc'}).getText()
    title = article.find("article-title").getText()
    author_ls = set([s.getText().strip().replace("\n", " ") for ss in article.find_all("contrib", attrs = {'contrib-type': "author"}) for s in ss.find_all("name") if s])
    authors = ', '.join(author_ls)
    affliations = ', '.join([re.sub("n\d+|\d+|\n", "", t.text)  for t in article.find_all("aff") if t is not None])
    keywords = ', '.join([t.text.strip().replace("\n", " ")  for t in article.find_all("kwd")])
    abstract_node = article.find("abstract")
    if abstract_node and abstract_node.find("p"):
        abstract = abstract_node.find("p").getText().strip()
    else:
        abstract = ''
        
    pmc_papers_ls.append({'pmc_id': pmc_id, 'title': title, 'authors': authors, 'affliations': affliations, 'keywords': keywords, 'abstract': abstract})
    #print(keywords)
    #print("\n")

In [242]:
pmc_papers_df = pd.DataFrame(pmc_papers_ls)

In [243]:
pmc_papers_df.shape

(10000, 6)

In [247]:
pmc_papers_df['keywords'] = pmc_papers_df['keywords'].map(lambda x: x.replace('\n', ''))

In [248]:
pmc_papers_df.head()

Unnamed: 0,pmc_id,title,authors,affliations,keywords,abstract
0,7643456,Discernment between candidate mechanisms for K...,"Stites Edward C., Rossman Kent L., McFall Thom...","grid.. Integrative Biology Laboratory, Salk ...","KRAS, GTPase, EGFR, Cancer, Targeted therapy",Phase three clinical trial evidence suggests t...
1,7643332,Psychometric properties of satisfaction with t...,"Mirghafourvand Mojgan, Mohammad-Alizadeh-Chara...","grid..f Students’ Research Committee, Tabriz...","Satisfaction, Childbirth education, Validity, ...",Childbirth preparation classes can reduce preg...
2,7641497,A Rare Incidence of Metachronous Neurovascular...,"Chandrasekaran Deepak, Azariah Emmanuel D, Chi...","Oral and Maxillofacial Surgery, Sri Ramachandr...","mesenchymal tumor, solitary, intraosseous, juv...",Neurofibroma is an uncommon benign tumor arisi...
3,7640792,A Child with Enlarged Extremities – A Case of ...,"Sundareswaran N, Gopinath G, Gunasekaran K","From the Department of Neurology, Government M...","Enlarged extremities, focal gigantism, macroda...","Macrodystrophia lipomatosa (ML) is a rare, non..."
4,7609672,Primary pancreatic glomus tumor invading into ...,"Sasaki Yu, Taki Yoshiro, Tamaki Ichiro, Sasano...","grid..cDepartment of Surgery, Kansai Electric ...","Glomus tumor, Glomangiomyoma, Pancreas, Immuno...",Glomus tumors are subcutaneous tumors arising ...


In [249]:
pmc_papers_df.drop_duplicates('pmc_id').to_csv("../data/pmc_papers.csv")