### Raw experiments

In [5]:
import requests
import json
url = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{0}/unicode'
idx = '36520504'

resp = requests.get(url.format(idx))

In [6]:
data = resp.json()

In [7]:
print(json.dumps(data, indent=4))

{
    "source": "PMC",
    "date": "20230516",
    "key": "pmc.key",
    "infons": {},
    "documents": [
        {
            "id": "10183809",
            "infons": {
                "license": "CC BY-NC-ND"
            },
            "passages": [
                {
                    "offset": 0,
                    "infons": {
                        "alt-title": "Utilizing NOS Targeting Agents in Cancer",
                        "article-id_doi": "10.1158/1078-0432.CCR-22-2791",
                        "article-id_pmc": "10183809",
                        "article-id_pmid": "36520504",
                        "article-id_publisher-id": "CCR-22-2791",
                        "fpage": "1855",
                        "issue": "10",
                        "license": "This open access article is distributed under the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) license.",
                        "lpage": "1868",
                       

In [12]:
output = {}
text_type = data['documents'][0]['passages'][0]['text']
curr_section_type = data['documents'][0]['passages'][0]['infons']['section_type']
for passage in data['documents'][0]['passages']:
    if passage['infons']['section_type'] == 'TITLE':
        continue
    
    section_type = passage['infons']['section_type']
    if section_type not in output.keys():
        print('Creating new section...')
        output[section_type] = {}
        text_type = passage['infons']['type']
        output[section_type][text_type] = []
        
    # In the event where the section has shown up before
    elif section_type != curr_section_type:
        text_type = passage['infons']['type']
        print(output[section_type].keys())
        
    print('*'*100)
    print(passage['infons']['section_type'])
    print(passage['infons']['type'])
    print(passage['text'])
    
    if 'title' in passage['infons']['type'].lower():
        output[section_type][passage['text']] = []
        text_type = passage['text']
        print('text type changed here to:', text_type)
    else:
        #print(text_type)
        if text_type not in output[section_type].keys():
            output[section_type][text_type] = []
        output[section_type][text_type].append([passage['text']])
        
    curr_section_type = section_type

Creating new section...
****************************************************************************************************
ABSTRACT
abstract_title_1
Abstract
text type changed here to: Abstract
****************************************************************************************************
ABSTRACT
abstract
Utilizing targeted therapies capable of reducing cancer metastasis, targeting chemoresistant and self-renewing cancer stem cells, and augmenting the efficacy of systemic chemo/radiotherapies is vital to minimize cancer-associated mortality. Targeting nitric oxide synthase (NOS), a protein within the tumor microenvironment, has gained interest as a promising therapeutic strategy to reduce metastatic capacity and augment the efficacy of chemo/radiotherapies in various solid malignancies. Our review highlights the influence of nitric oxide (NO) in tumor progression and cancer metastasis, as well as promising preclinical studies that evaluated NOS inhibitors as anticancer therapie

In [13]:
output['TABLE'].keys()

dict_keys(['table_caption'])

In [14]:
print(json.dumps(output, indent=4))

{
    "ABSTRACT": {
        "abstract_title_1": [],
        "Abstract": [
            [
                "Utilizing targeted therapies capable of reducing cancer metastasis, targeting chemoresistant and self-renewing cancer stem cells, and augmenting the efficacy of systemic chemo/radiotherapies is vital to minimize cancer-associated mortality. Targeting nitric oxide synthase (NOS), a protein within the tumor microenvironment, has gained interest as a promising therapeutic strategy to reduce metastatic capacity and augment the efficacy of chemo/radiotherapies in various solid malignancies. Our review highlights the influence of nitric oxide (NO) in tumor progression and cancer metastasis, as well as promising preclinical studies that evaluated NOS inhibitors as anticancer therapies. Lastly, we highlight the prospects and outstanding challenges of using NOS inhibitors in the clinical setting."
            ]
        ]
    },
    "INTRO": {
        "title_1": [],
        "Introduction": [


In [4]:
class PubmedJSONParser():
    def __init__(self, url, pmc_id):
        self.pmc_id = pmc_id
        self.url = url.format(self.pmc_id)
        print('*'*100)
        print('Parsing data from Pubmed')
        print('*'*100)
        self.fetch_json()
        self.format_json()
        self.clean_formatted_json()
        
    def fetch_json(self):
        print('Fetching data from PMC API...', end='')
        resp = requests.get(self.url)
        if resp.status_code == '404':
            raise ValueError('404 response from PMC API. Is the PMC id given correct?')
        self.data = resp.json()
        print('Done!')
        
    def format_json(self):
        print('Formatting data...', end='')
        output = {}
        text_type = self.data['documents'][0]['passages'][0]['text']
        for passage in self.data['documents'][0]['passages']:
            if passage['infons']['section_type'] == 'TITLE':
                continue

            section_type = passage['infons']['section_type']
            if section_type not in output.keys():
                output[section_type] = {}
                text_type = passage['infons']['type']
                output[section_type][text_type] = []
            #print('*'*100)
            #print(passage['infons']['section_type'])
            #print(passage['infons']['type'])
            #print(passage['text'])

            if 'title' in passage['infons']['type'].lower():
                output[section_type][passage['text']] = []
                text_type = passage['text']
                #print('text type changed here to:', text_type)
            else:
                output[section_type][text_type].append(passage['text'])
                
        self.output = output
        print('Done!')
        
    def clean_formatted_json(self):
        print('Cleaning formatted JSON...', end='')
        for section, val in self.output.items():
            keys_to_remove = []
            for title, out in val.items():
                if len(out) == 0:
                    keys_to_remove.append(title)
                    
            for key in keys_to_remove:
                del val[key]
        print('Done!')            

In [5]:
parser = PubmedJSONParser('https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{0}/unicode', '33301246')

****************************************************************************************************
Parsing data from Pubmed
****************************************************************************************************
Fetching data from PMC API...

NameError: name 'requests' is not defined

In [43]:
parser.output

{'ABSTRACT': {'Background': ['Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infection and the resulting coronavirus disease 2019 (Covid-19) have afflicted tens of millions of people in a worldwide pandemic. Safe and effective vaccines are needed urgently.'],
  'Methods': ['In an ongoing multinational, placebo-controlled, observer-blinded, pivotal efficacy trial, we randomly assigned persons 16 years of age or older in a 1:1 ratio to receive two doses, 21 days apart, of either placebo or the BNT162b2 vaccine candidate (30 μg per dose). BNT162b2 is a lipid nanoparticle–formulated, nucleoside-modified RNA vaccine that encodes a prefusion stabilized, membrane-anchored SARS-CoV-2 full-length spike protein. The primary end points were efficacy of the vaccine against laboratory-confirmed Covid-19 and safety.'],
  'Results': ['A total of 43,548 participants underwent randomization, of whom 43,448 received injections: 21,720 with BNT162b2 and 21,728 with placebo. There were 8 cas

In [1]:
config = {
    'pmc_id':'36520504',
    'pmc_url':'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{0}/unicode',
    'raw_text_path':'pmc_json/'
}

In [2]:
from pubmed_json_parser import PubmedJSONParser
parser = PubmedJSONParser(config)

****************************************************************************************************
Parsing data from Pubmed
****************************************************************************************************
Fetching data from PMC API...Done!
Formatting data...Done!
Cleaning formatted JSON...Done!
Saving formatted JSON to pmc_json/...Done!


In [9]:
parser.url.format(config['pmc_id'])

'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/30088834/unicode'