In [34]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each execution, so edits to the library are picked up without a restart.
%load_ext autoreload
%autoreload 2
import sys
# Put a relative project path at the front of the module search path.
# NOTE(review): presumably makes the local scrapemed checkout importable; this
# depends on the directory the notebook is launched from — confirm.
sys.path.insert(0, "../../scrapemed")

import scrapemed.paper as paper
import pandas as pd
import lxml
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment; PMC_EMAIL is
# read from the environment in a later cell via os.getenv.
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

## Download a Paper from PMC

In [35]:
# Specify the credentials and the PMCID of the paper to load.

# PMC ID of the example article used throughout this notebook.
PMCID = 7067710
# Contact email read from the environment. os.getenv returns None when
# PMC_EMAIL is not set, so define it (e.g. in a .env file) before running this cell.
email = os.getenv("PMC_EMAIL")

# Build the Paper object that every later cell inspects through `p`.
# NOTE(review): download=False presumably skips saving a local copy of the
# article — confirm against Paper.from_pmc.
p = paper.Paper.from_pmc(PMCID, email, download=False)





## Natural Language Querying and Paper Embeddings

In [36]:
p.query("absorption")

Vectorizing Paper (This may take a little while)...


Insert of existing embedding ID: pmcid-7067710-chunk-0
Insert of existing embedding ID: pmcid-7067710-chunk-1
Insert of existing embedding ID: pmcid-7067710-chunk-2
Insert of existing embedding ID: pmcid-7067710-chunk-3
Insert of existing embedding ID: pmcid-7067710-chunk-4
Insert of existing embedding ID: pmcid-7067710-chunk-5
Insert of existing embedding ID: pmcid-7067710-chunk-6
Insert of existing embedding ID: pmcid-7067710-chunk-7
Insert of existing embedding ID: pmcid-7067710-chunk-8
Insert of existing embedding ID: pmcid-7067710-chunk-9
Insert of existing embedding ID: pmcid-7067710-chunk-10
Insert of existing embedding ID: pmcid-7067710-chunk-11
Insert of existing embedding ID: pmcid-7067710-chunk-12
Insert of existing embedding ID: pmcid-7067710-chunk-13
Insert of existing embedding ID: pmcid-7067710-chunk-14
Insert of existing embedding ID: pmcid-7067710-chunk-15
Insert of existing embedding ID: pmcid-7067710-chunk-16
Insert of existing embedding ID: pmcid-7067710-chunk-17
In

Done Vectorizing Paper! Natural language query with Paper.query() now available.


{'Match on pmcid-7067710-chunk-26': '...similar to its monocomponent constituents when administered separately or coadministered, indicating no drug–drug interactions and no formulation effects. Similar to previous findings for the individual components, the rates of absorption of ibuprofen and acetaminophen from the FDC were slightly delayed in the presence of food. Overall, adolescents had similar exposures to acetaminophen and ibuprofen...'}

In [37]:
# Ask a natural-language question and request two matches (n_results=2).
# NOTE(review): n_before / n_after presumably control how much neighboring text
# is included around each match — confirm against Paper.query.
p.query("what were the participants ages?", n_results=2, n_before=1, n_after=3)

{'Match on pmcid-7067710-chunk-302': '...of the FDC to adolescents in Study 3 are summarized in Table\xa07. Results are presented for all subjects and for the age groups 12–14\xa0years and 15–17\xa0years separately. The overall ibuprofen exposure (AUC values) following administration of the FDC was similar for both the younger and older age groups. However, the younger group had a C_max that was approximately 23% higher and occurred 1\xa0h earlier relative to...',
 'Match on pmcid-7067710-chunk-92': '...for this study.\n\n    Subjects included in the two adult studies (i.e. Studies 1 and 2) were healthy males and females 12–17\xa0years of age, inclusive, with at least one painful condition (e.g. headache,...'}

## Peek at the Full Text

In [38]:
p.full_text()[0:100]

'Abstract: \n\nSECTION: Introduction:\n\nA fixed-dose combination (FDC) of ibuprofen and acetaminophen ha'

## Peek at the relational DataFrame representation of the Paper

In [39]:
pd.DataFrame(p.to_relational()).info()

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, PMCID to Figures
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       30 non-null     object
dtypes: object(1)
memory usage: 496.0+ bytes


## Look at a bunch of other Paper metadata

In [40]:
p.title

'Phase I Pharmacokinetic Study of Fixed-Dose Combinations of Ibuprofen and Acetaminophen in Healthy Adult and Adolescent Populations'

In [41]:
# Widen pandas' column display so the long Affiliations strings are readable.
# This is a global option: it stays in effect for every later cell in the session.
pd.options.display.max_colwidth = 200
p.authors

Unnamed: 0,Contributor_Type,First_Name,Last_Name,Email_Address,Affiliations
0,Author,Sanela,Tarabar,Sanela.Tarabar@pfizer.com,"[Aff1: Pfizer New Haven Clinical Research Unit, New Haven, CT USA, Aff6: Clinical Research and Development, KS1, 1 Portland Street, Cambridge, MA 02139 USA]"
1,Author,Debra,Kelsh,,"[Aff2: Altasciences/Vince and Associates Clinical Research, Overland Park, KS USA]"
2,Author,Bradley,Vince,,"[Aff2: Altasciences/Vince and Associates Clinical Research, Overland Park, KS USA]"
3,Author,Rina,Leyva,,"[Aff3: Pfizer Consumer Healthcare, Madison, NJ USA]"
4,Author,Dongweon,Song,,"[Aff4: Pfizer Inc., Collegeville, PA USA]"
5,Author,Kyle,Matschke,,"[Aff4: Pfizer Inc., Collegeville, PA USA]"
6,Author,David E.,Kellstein,,"[Aff5: Pfizer Consumer Healthcare, Madison, NJ USA]"
7,Author,Suzanne,Meeves,,"[Aff5: Pfizer Consumer Healthcare, Madison, NJ USA]"
8,Author,Mario,Cruz-Rivera,,"[Aff5: Pfizer Consumer Healthcare, Madison, NJ USA]"


In [42]:
print(p.non_author_contributors)

No non-author contributors were found after parsing this paper.


In [43]:
p.journal_id

{'nlm-ta': 'Drugs R D', 'iso-abbrev': 'Drugs R D'}

In [44]:
p.issn

{'ppub': '1174-5886', 'epub': '1179-6901'}

In [45]:
p.journal_title

'Drugs in R&D'

In [46]:
p.publisher_location

'Cham'

In [47]:
p.publisher_name

'Springer International Publishing'

In [48]:
p.article_id

{'pmid': '32130679',
 'pmc': '7067710',
 'publisher-id': '293',
 'doi': '10.1007/s40268-020-00293-5'}

In [49]:
p.article_types

['Original Research Article']

In [50]:
p.article_categories

'No extra article categories found. Check paper.article_types for header categories.'

In [51]:
p.published_date

{'epub': datetime.datetime(2020, 3, 4, 0, 0),
 'pmc-release': datetime.datetime(2020, 3, 4, 0, 0),
 'ppub': datetime.datetime(2020, 3, 1, 0, 0)}

In [52]:
p.volume

'20'

In [53]:
p.issue

'1'

In [54]:
p.fpage

'23'

In [55]:
p.lpage

'37'

In [56]:
p.permissions

{'Copyright Statement': '© The Author(s) 2020',
 'License Type': 'OpenAccess',
 'License Text': "Open AccessThis article is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License, which permits any non-commercial use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or other third party material in this article are included in the article's Creative Commons licence, unless indicated otherwise in a credit line to the material. If material is not included in the article's Creative Commons licence and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder.To view a copy of this licence, visit [External URI:]http://creativecommons.org/licenses/by-nc/4.0/."}

In [57]:
p.copyright

'© The Author(s) 2020'

In [58]:
p.license

'OpenAccess'

In [59]:
p.funding

['Pfizer Consumer Healthcare']

In [60]:
p.footnote

'The authors David E. Kellstein, Suzanne Meeves and Mario Cruz-Rivera were Employees of Pfizer Consumer Healthcare, Madison, NJ, USA at the time this research was conducted.'

In [61]:
for ack in p.acknowledgements:
    print(ack)

Acknowledgements 
       Medical writing support was provided by John H. Simmons, MD, of Peloton Advantage, LLC, an OPEN Health company, and was funded by Pfizer. On 1 August 2019, PCH became part of GSK Consumer Healthcare. The authors would like to thank Zhongwei Zhou, lead programmer, and all the programmers who supported these studies, as well as the study participants.


## More Detailed Investigation of Ref Map and Texts

In [62]:

#look at DATAREF index 1, a citation (if using the provided example PMCID article)
p.ref_map[1]

{'Authors': ['A Dickman'],
 'Title': 'Choosing over-the-counter analgesics',
 'Source': 'Pharm J.',
 'Year': '2008',
 'Volume': '281',
 'FirstPage': '631',
 'LastPage': None,
 'DOI': None,
 'PMID': None}

In [63]:
#look at datarefs filtered by citations
p.citations[0:1]

[{'Authors': ['A Dickman'],
  'Title': 'Choosing over-the-counter analgesics',
  'Source': 'Pharm J.',
  'Year': '2008',
  'Volume': '281',
  'FirstPage': '631',
  'LastPage': None,
  'DOI': None,
  'PMID': None}]

In [64]:
#look at an example table filtered out of the ref map
p.tables[0]

Unnamed: 0,0
0,The pharmacokinetic profile of a fixed-dose combination (FDC) of ibuprofen 250 mg and acetaminophen 500 mg was found to be similar to its individual components administered separately or coadminis...
1,"Similar to what has been previously reported for the individual ingredients, food delayed the absorption of both components but had no effect on overall exposure; exposure to both components in ad..."
2,"Since the efficacy of this FDC has been shown to be superior to the same doses of individual components and it is generally well tolerated, this new FDC may provide another analgesic treatment opt..."


In [65]:
#look at an example figure filtered out from the ref map
p.figures[0]

{'Label': 'Fig.\xa01',
 'Caption': 'Median plasma ibuprofen concentration over time following a single oral dose: a Study 1; b Study 2; c Study 3. Note that plasma concentrations for IBU 200\xa0mg in Study 1 are dose-normalized. APAP acetaminophen, FDC fixed-dose combination, IBU ibuprofen',
 'Link': '40268_2020_293_Fig1_HTML'}

In [66]:
# Iterate over the elements of the paper body and print each one's cleaned text
# followed by the same text with its [MHTML::dataref::N] reference placeholders.

# Skip the loop entirely when p.body is falsy.
if p.body:
    for par in p.body:
        print("Text: " + par.text)
        print()
        print("Text with Refs: " + par.text_with_refs)

Text: SECTION: Key Points:




Text with Refs: SECTION: Key Points:

[MHTML::dataref::0]

Text: SECTION: Introduction:

Ibuprofen and acetaminophen are among the most widely used non-prescription over-the-counter (OTC) analgesic/antipyretic drugs, both in the US and globally [1, 2]. The efficacy of these agents for the treatment of mild-to-moderate acute pain and fever in the OTC setting is well established [2–5]. Ibuprofen is a nonsteroidal anti-inflammatory drug (NSAID) that inhibits the cyclooxygenase (COX)-1 and -2 isoenzymes and hence the synthesis of pro-inflammatory prostaglandins, whereas acetaminophen is believed to act through inhibition of a subclass of COX enzyme isoforms in the central nervous system [6]. Additionally, acetaminophen has been reported to have effects on descending inhibitory serotonergic pain pathways to inhibit the l-arginine nitric oxide pathway; effects on cannabinoid receptors may also be operant [7]. Both ibuprofen and acetaminophen are associated with