Resources
- [xml module documentation](https://docs.python.org/3/library/xml.etree.elementtree.html)
- [NCBI APIs documentation](https://www.ncbi.nlm.nih.gov/home/develop/api/)
- [MeSH (Medical Subject Headings) Descriptor Search](https://meshb.nlm.nih.gov/search)
- [MeSH Documentation](https://pubmed.ncbi.nlm.nih.gov/help/#mhda)
- [Pubmed user guide](https://pubmed.ncbi.nlm.nih.gov/help/#mesh-subheadings)

Notes 

In [1]:
import requests
import xml.etree.ElementTree as ET
import webbrowser
import pandas as pd
import datetime
import os

In [2]:
# Query EInfo without parameters, collect every <DbName> under <DbList>,
# and show the names alphabetically in a one-column dataframe
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi')
dom = ET.fromstring(r.text)
dbs = dom.find('DbList')
dbs_names = sorted(db.text for db in dbs.findall('DbName'))
pd.DataFrame(dbs_names, columns=['dbs'])

Unnamed: 0,dbs
0,annotinfo
1,assembly
2,biocollections
3,bioproject
4,biosample
5,biosystems
6,blastdbinfo
7,books
8,cdd
9,clinvar


### Helper Functions

In [3]:
def get_article_index(articles, PMID):
    # Find the index of a specific article in our articles based on PMID
    # Params:
    #    articles: array of articles with type xml.etree.ElementTree.Element
    #    PMID: string
    # Returns:
    #    index of the first article whose MedlineCitation/PMID text equals PMID,
    #    or -1 when no article matches
    # Example:
    #    idx = get_article_index(articles, '34163160')
    for i, article in enumerate(articles):
        pmid_node = article.find('MedlineCitation/PMID')
        # a record without a PMID element can never match; skip it instead of
        # failing on `.text` of None
        if pmid_node is not None and pmid_node.text == PMID:
            return i
    return -1

In [4]:
# Base URL shared by the E-utilities endpoints used below
_EUTILS_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

# Columns of the dataframe returned by get_article_biomarker, in output order
_ARTICLE_COLUMNS = ['title', 'pmid', 'language', 'date', 'mesh major', 'keyword', 'mesh minor',
                    'mesh qualifier', 'chemical', 'background', 'objective',
                    'method', 'result', 'discussion', 'conclusion']

# (abstract column, keywords searched for in a section heading). Checked in this
# order and the first hit wins, so a heading containing both "method" and
# "result" is filed under method.
_SECTION_KEYWORDS = (
    ('background', ('background',)),
    ('objective', ('objective', 'aim')),
    ('method', ('method',)),
    ('result', ('result',)),
    ('discussion', ('discussion',)),
    ('conclusion', ('conclusion',)),
)


def _element_text(element):
    # Return all text inside element, descending into nested inline tags
    # (e.g. <i>, <sub>) in document order; the element's own tail is excluded
    return ''.join(element.itertext())


def _store_section(fields, heading, text):
    # File text under the first abstract column whose keyword occurs in heading;
    # a heading matching no keyword is dropped
    heading = heading.lower().strip()
    for column, keywords in _SECTION_KEYWORDS:
        if any(keyword in heading for keyword in keywords):
            # NOTE(review): a later section that maps to the same column
            # replaces the earlier one instead of being appended to it
            fields[column] = text
            return


def _parse_bold_sections(section, fields):
    # Split an unlabeled abstract that marks its sections with <b>Heading</b>:
    # each <b> child opens a section whose text is the <b> tail plus every
    # following sibling up to the next <b>. Children before the first <b>
    # (and the text preceding it) are ignored.
    heading = None
    chunks = []
    for child in section:
        if child.tag == 'b':
            if heading is not None:
                _store_section(fields, heading, ''.join(chunks))
            heading = child.text
            # the heading text may sit in an inner tag, e.g. <b><i>Methods</i></b>
            if heading is None:
                heading = ''.join(inner.text or '' for inner in child)
            chunks = [child.tail or '']
        elif heading is not None:
            chunks.append(_element_text(child) + (child.tail or ''))
    if heading is not None:
        _store_section(fields, heading, ''.join(chunks))


def _parse_abstract(abstract):
    # Split an <Abstract> element into the six abstract columns
    # Returns a dict with one (possibly empty) string per abstract column
    fields = {column: '' for column, _ in _SECTION_KEYWORDS}
    sections = abstract.findall('AbstractText')
    for section in sections:
        label = section.get('Label')
        category = section.get('NlmCategory')
        if label is not None or category is not None:
            # prefer NlmCategory; use Label when the category is missing, or
            # when it is UNASSIGNED and a Label is available
            if category is None or (category == 'UNASSIGNED' and label is not None):
                heading = label
            else:
                heading = category
            _store_section(fields, heading, _element_text(section))
        # the abstract section doesn't have a label and that's the only abstract section
        elif len(sections) == 1:
            if section.find('b') is not None:
                _parse_bold_sections(section, fields)
            else:
                fields['background'] = _element_text(section)
    return fields


def _parse_metadata(article):
    # Extract the non-abstract columns from a <MedlineCitation> element
    revised = article.find('DateRevised')
    date = datetime.datetime(int(revised.find('Year').text),
                             int(revised.find('Month').text),
                             int(revised.find('Day').text))

    mesh_major, mesh_minor, mesh_qualifier = [], [], []
    # compare with None: an Element without children is falsy, and truth-testing
    # elements is deprecated
    mesh_headings = article.find('MeshHeadingList')
    if mesh_headings is not None:
        for mesh_heading in mesh_headings:
            for item in mesh_heading:
                if item.tag == 'QualifierName':
                    mesh_qualifier.append(item.text)
                elif item.get('MajorTopicYN') == 'Y':
                    mesh_major.append(item.text)
                else:
                    mesh_minor.append(item.text)

    keyword_list = article.find('KeywordList')
    keywords = [] if keyword_list is None else [keyword.text for keyword in keyword_list]

    chemicals = []
    chemical_list = article.find('ChemicalList')
    if chemical_list is not None:
        for chemical in chemical_list:
            for item in chemical:
                if item.tag == 'NameOfSubstance':
                    chemicals.append(item.text)

    return {'title': _element_text(article.find('Article/ArticleTitle')),
            'pmid': article.find('PMID').text,
            'language': article.find('Article/Language').text,
            'date': date,
            'mesh major': mesh_major,
            'keyword': keywords,
            'mesh minor': mesh_minor,
            # de-duplicate while keeping first-seen order so the result is deterministic
            'mesh qualifier': list(dict.fromkeys(mesh_qualifier)),
            'chemical': chemicals}


def _fetch_articles(term):
    # Run an esearch for term on pubmed and efetch the stored result set
    # Returns the list of <PubmedArticle> elements in the efetch response
    # Raises requests.HTTPError on an HTTP error status, RuntimeError when the
    # esearch response lacks the history identifiers
    payload = {'db': 'pubmed',
               'term': term,
               'datetype': 'edat',
               'usehistory': 'y'}
    r = requests.get(_EUTILS_URL + 'esearch.fcgi', params=payload)
    r.raise_for_status()
    dom = ET.fromstring(r.text)
    web_env = dom.findtext('WebEnv')
    query_key = dom.findtext('QueryKey')
    if not web_env or not query_key:
        raise RuntimeError('esearch response did not include WebEnv/QueryKey for term: ' + term)

    # efetch to get details about the articles
    payload = {'db': 'pubmed',
               'query_key': query_key,
               'WebEnv': web_env,
               'rettype': 'xml',
               'retstart': 0}
    r = requests.get(_EUTILS_URL + 'efetch.fcgi', params=payload)
    r.raise_for_status()
    return ET.fromstring(r.text).findall('PubmedArticle')


def get_article_biomarker(biomarker):
    # Get the title, keyword list, chemical list, and abstract of all the articles
    # related to the specified biomarker and MDD, and return them as a dataframe
    #
    # Params:
    #     biomarker: the biomarker to be searched for (appended to the
    #                'Major depressive disorder[MH] ' search term)
    # Returns:
    #     pandas DataFrame with one row per article that has an abstract and the
    #     columns listed in _ARTICLE_COLUMNS; articles without an abstract are skipped
    rows = []
    for entry in _fetch_articles('Major depressive disorder[MH] ' + biomarker):
        article = entry.find('MedlineCitation')
        abstract = article.find('Article/Abstract')
        if abstract is None:
            continue
        row = _parse_metadata(article)
        row.update(_parse_abstract(abstract))
        rows.append(row)
    # build the frame once: DataFrame.append copied the frame on every call and
    # no longer exists in pandas 2.x
    return pd.DataFrame(rows, columns=_ARTICLE_COLUMNS)

In [5]:
def display_abstract(background='', objective='', method='', result='', discussion='', conclusion=''):
    # Print every abstract section header in a fixed order, each followed by
    # its text when that text is not empty
    # Params: string
    labelled_sections = (
        ('[Background]', background),
        ('[Objective]', objective),
        ('[Method]', method),
        ('[Result]', result),
        ('[Discussion]', discussion),
        ('[Conclusion]', conclusion),
    )
    for header, text in labelled_sections:
        print(header)
        if len(text) > 0:
            print(text)

In [6]:
def browse_articles(articles, idx=0):
    # Print the title, metadata (keywords, MeSH terms, chemicals), and abstract of articles[idx]
    # Params:
    #     articles: array of articles with type xml.etree.ElementTree.Element
    #     idx: index of the article to show, default to the beginning (i.e. 0)
    # Returns nothing and does not advance idx: the calling cell increments its
    # own idx, so rerunning that cell iterates through the articles
    article = articles[idx].find('MedlineCitation')
    abstract = article.find('Article/Abstract')
    # itertext() joins text split up by inline tags (e.g. <i>, <sub>) in document order
    title = ''.join(article.find('Article/ArticleTitle').itertext())
    PMID = article.find('PMID').text
    revised = article.find('DateRevised')
    date = datetime.datetime(int(revised.find('Year').text),
                             int(revised.find('Month').text),
                             int(revised.find('Day').text))
    language = article.find('Article/Language').text
    # minor relevant MeSH terms
    meshMinorList = []
    # major relevant MeSH terms
    meshMajorList = []
    # qualifier relevant MeSH terms
    meshQualifierList = set()
    keyWordList = []
    chemicalList = []
    # compare with None: an Element without children is falsy, and truth-testing
    # elements is deprecated
    mesh_headings = article.find('MeshHeadingList')
    if mesh_headings is not None:
        for meshHeading in mesh_headings:
            for item in meshHeading:
                if item.tag == 'QualifierName':
                    meshQualifierList.add(item.text)
                elif item.get('MajorTopicYN') == 'Y':
                    meshMajorList.append(item.text)
                else:
                    meshMinorList.append(item.text)
    keyword_list = article.find('KeywordList')
    if keyword_list is not None:
        for keyword in keyword_list:
            keyWordList.append(keyword.text)
    chemical_list = article.find('ChemicalList')
    if chemical_list is not None:
        for chemical in chemical_list:
            for item in chemical:
                if item.tag == 'NameOfSubstance':
                    chemicalList.append(item.text)

    print('Title:', title)
    print('PMID:', PMID)
    print("Language:", language)
    print(date.strftime("Date Revised: %m/%d/%Y"))
    print("Keywords:", keyWordList)
    print("MeSH Major:", meshMajorList)
    print("MeSH Minor:", meshMinorList)
    print("MeSH Qualifier:", meshQualifierList)
    print("Chemicals:", chemicalList)
    print('='*50 + 'Abstract' + '='*50)

    if abstract is None:
        print("No abstract available for this article")
        return

    fields = {'background': '', 'objective': '', 'method': '',
              'result': '', 'discussion': '', 'conclusion': ''}

    def store(heading, text):
        # file text under the first matching section; unmatched headings are dropped
        heading = heading.lower().strip()
        if 'background' in heading:
            fields['background'] = text
        elif 'objective' in heading or 'aim' in heading:
            fields['objective'] = text
        elif 'method' in heading:
            fields['method'] = text
        elif 'result' in heading:
            fields['result'] = text
        elif 'discussion' in heading:
            fields['discussion'] = text
        elif 'conclusion' in heading:
            fields['conclusion'] = text

    abstract_sections = abstract.findall('AbstractText')
    for section in abstract_sections:
        abstract_text = ''.join(section.itertext())
        label = section.get('Label')
        category = section.get('NlmCategory')
        if label is not None or category is not None:
            # prefer NlmCategory; use Label when the category is missing, or
            # when it is UNASSIGNED and a Label is available
            if category is None or (category == 'UNASSIGNED' and label is not None):
                heading = label
            else:
                heading = category
            store(heading, abstract_text)
        # the abstract section doesn't have a label and that's the only abstract section
        elif len(abstract_sections) == 1:
            if section.find('b') is None:
                fields['background'] = abstract_text
                continue
            # parse articles that use <b> tags for abstract sections: each <b>
            # opens a section made of its tail plus the siblings up to the next <b>
            heading = None
            chunks = []
            for child in section:
                if child.tag == 'b':
                    if heading is not None:
                        store(heading, ''.join(chunks))
                    heading = child.text
                    # the heading text may sit in an inner tag
                    if heading is None:
                        heading = ''.join(inner.text or '' for inner in child)
                    chunks = [child.tail or '']
                elif heading is not None:
                    chunks.append(''.join(child.itertext()) + (child.tail or ''))
            if heading is not None:
                store(heading, ''.join(chunks))
    display_abstract(**fields)
    print()

### Cancer Study using MeSH

Get the number of articles related to cancer on pubmed in the past 3 days

In [7]:
# ESearch on pubmed for 'cancer[MAJR]' limited by reldate=3 on the 'edat' date
# field (the "past 3 days" above). usehistory='y' asks Entrez to keep the result
# set on its history server; WebEnv and QueryKey identify it for later cells.
payload = {'db': 'pubmed',
           'term': 'cancer[MAJR]',
           'datetype':'edat',
           'usehistory': 'y',
           'reldate': 3}
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=payload)
r.raise_for_status()
dom = ET.fromstring(r.text)
# findtext returns None for a missing element, so the check below can actually
# report the problem (.find(...).text would fail first with AttributeError)
web_env = dom.findtext('WebEnv')
query_key = dom.findtext('QueryKey')
if not web_env or not query_key:
    raise RuntimeError('esearch response did not include WebEnv/QueryKey')

In [8]:
# <Count> in the esearch response holds the number of matching records
print(f"# of results on pubmed related to cancer: {dom.find('Count').text}")

# of results on pubmed related to cancer: 43


Get the metadata summaries (ESummary) for the results of the cancer search above

In [9]:
# ESummary for the result set stored on the history server by the esearch above.
# 'db' is passed explicitly: the E-utilities reference lists it as a required
# parameter, also when query_key/WebEnv are used.
payload = {'db': 'pubmed',
           'query_key': query_key,
           'WebEnv': web_env,
           'version': 2.0} 
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=payload)
r.raise_for_status()
dom = ET.fromstring(r.text)

Get the article summary on pubmed regarding cancer

In [10]:
# EFetch the records of the stored cancer result set (query_key + WebEnv)
# and parse the XML response into `dom`
payload = dict(db='pubmed',
               query_key=query_key,
               WebEnv=web_env,
               rettype='xml',
               retstart=0)
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi', params=payload)
dom = ET.fromstring(r.text)

In [11]:
# idx to update the article for the next cell
# every <PubmedArticle> element of the efetch response
articles = dom.findall('PubmedArticle')
# position of the article the next cell will display
idx = 0

In [12]:
# show the current article, then step idx forward (wrapping to 0 after the
# last article) so rerunning this cell shows the next one
browse_articles(articles, idx=idx)
idx = (idx + 1) % len(articles)

Title: Laparoscopic lateral lymph node dissection in two fascial spaces for locally advanced lower rectal cancer.
PMID: 34239276
Language: eng
Date Revised: 07/12/2021
Keywords: ['Cardinal ligament', 'Fascial anatomy', 'Lateral lymph node dissection', 'Locally advanced low rectal cancer', 'Vesicohypogastric fascia', 'Visceral fascia']
MeSH Major: ['Laparoscopy', 'Rectal Neoplasms']
MeSH Minor: ['Dissection', 'Humans', 'Lymph Node Excision', 'Lymph Nodes']
MeSH Qualifier: {'adverse effects', 'surgery'}
Chemicals: []
[Background]
The procedure for lateral lymph node (LLN) dissection (LLND) is complicated and can result in complications. We developed a technique for laparoscopic LLND based on two fascial spaces to simplify the procedure.
[Objective]
To clarify the anatomical basis of laparoscopic LLND in two fascial spaces and to evaluate its efficacy and safety in treating locally advanced low rectal cancer (LALRC).
[Method]
Cadaveric dissection was performed on 24 pelvises, and the fasc

Uncomment the cell below to view the hierarchy of the XML file in a new browser tab

In [13]:
# webbrowser.open(r.url);

### Depression & Biomarker Study using MeSH

Get the number of articles regarding major depressive disorder and leptin on pubmed

In [14]:
# ESearch on pubmed for 'Major depressive disorder[MH] leptin'. usehistory='y'
# asks Entrez to keep the result set on its history server; WebEnv and QueryKey
# identify it for later cells.
payload = {'db': 'pubmed',
           'term': 'Major depressive disorder[MH] leptin',
           'datetype':'edat',
           'usehistory': 'y'}
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=payload)
r.raise_for_status()
dom = ET.fromstring(r.text)
# findtext returns None for a missing element, so the check below can actually
# report the problem (.find(...).text would fail first with AttributeError)
web_env = dom.findtext('WebEnv')
query_key = dom.findtext('QueryKey')
if not web_env or not query_key:
    raise RuntimeError('esearch response did not include WebEnv/QueryKey')

In [15]:
# <Count> in the esearch response holds the number of matching records
print(f"# of results on pubmed related to depression and biomarker: {dom.find('Count').text}")

# of results on pubmed related to depression and biomarker: 117


Get the article summary 

In [16]:
# EFetch the records of the stored depression/leptin result set
# (query_key + WebEnv) and parse the XML response into `dom`
payload = dict(db='pubmed',
               query_key=query_key,
               WebEnv=web_env,
               rettype='xml',
               retstart=0)
r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi', params=payload)
dom = ET.fromstring(r.text)

In [17]:
# idx to update the article for the next cell
# every <PubmedArticle> element of the efetch response
articles = dom.findall('PubmedArticle')
# position of the article the next cell will display
idx = 0

In [18]:
# show the current article, then step idx forward (wrapping to 0 after the
# last article) so rerunning this cell shows the next one
browse_articles(articles, idx=idx)
idx = (idx + 1) % len(articles)

Title: Plasma leptin correlates with anthranilic acid in schizophrenia but not in major depressive disorder.
PMID: 32855025
Language: eng
Date Revised: 01/22/2021
Keywords: []
MeSH Major: ['Depressive Disorder, Major', 'Schizophrenia']
MeSH Minor: ['Humans', 'Insulin', 'Leptin', 'Plasma', 'Proteomics', 'ortho-Aminobenzoates']
MeSH Qualifier: {'drug therapy'}
Chemicals: ['Insulin', 'Leptin', 'ortho-Aminobenzoates', 'anthranilic acid']
No abstract available for this article


### Begin scraping PubMed articles for the different biomarker candidates

In [19]:
# Biomarker search terms; each is combined with the MDD MeSH term inside
# get_article_biomarker
biomarkers = ['BDNF', 'IL-1', 'IL-6', 'TNF', 'Malondialdehyde', 'Superoxide dismutase', 'corticosteroids', 'leptin']
# biomarker name -> dataframe of the matching articles
df_dict = {}
for biomarker in biomarkers:
    # print the name before the fetch so progress is visible while it runs
    print(biomarker)
    df_dict[biomarker] = get_article_biomarker(biomarker)

BDNF
IL-1
IL-6
TNF
Malondialdehyde
Superoxide dismutase
corticosteroids
leptin


Get the number of articles for each biomarker

In [20]:
# one line per biomarker: its name and the row count of its dataframe
for key, frame in df_dict.items():
    print(f"{key}: {frame.shape[0]}")

BDNF: 1104
IL-1: 270
IL-6: 651
TNF: 402
Malondialdehyde: 65
Superoxide dismutase: 79
corticosteroids: 3098
leptin: 109


Take a look at the corticosteroids results

In [21]:
# first rows of the dataframe stored under the "corticosteroids" key
df_dict["corticosteroids"].head()

Unnamed: 0,title,pmid,language,date,mesh major,keyword,mesh minor,mesh qualifier,chemical,background,objective,method,result,discussion,conclusion
0,The Role of HPA Axis and Allopregnanolone on t...,34071053,eng,2021-06-21,[],"[PTSD, brexanolone, depression, hypothalamus-p...","[Adaptation, Physiological, Animals, Antidepre...","[psychology, physiopathology, drug therapy, me...","[Antidepressive Agents, GABA-A Receptor Agonis...","Under stressful conditions, the hypothalamic-p...",,,,,
1,TNIP2 mediates GRβ-promoted inflammation and i...,33932528,eng,2021-06-23,"[Depressive Disorder, Major, Receptors, Glucoc...","[Glucocorticoid receptor β, Major depressive d...","[Adaptor Proteins, Signal Transducing, Glucoco...",[metabolism],"[Adaptor Proteins, Signal Transducing, Glucoco...","In depression, continual activation of the hyp...",,,,,
2,The effects of Pythagorean Self-Awareness Inte...,33894540,eng,2021-07-05,"[Depressive Disorder, Major]","[Cortisol, Depression, Major depressive disord...","[Adult, Anxiety, Depression, Humans, Hydrocort...",[therapy],[Hydrocortisone],Stress plays an important role in major depres...,,,,,
3,Potassium-Titanyl-Phosphate (KTP) Laser Photoc...,33753714,eng,2021-05-14,"[Depressive Disorder, Major, Hemangioma, Laser...",[],"[Humans, Light Coagulation, Male, Middle Aged,...","[therapeutic use, surgery]","[Phosphates, Potassium]","BACKGROUND Hemangiomas are relatively rare, sl...",,,,,
4,"Role of insulin-like growth factor 1, sex and ...",33731067,eng,2021-04-26,"[Depressive Disorder, Major]","[Cortisol, DHEAS, Estradiol, IGF1, Testosterone]","[Dehydroepiandrosterone Sulfate, Humans, Hydro...",[],"[Testosterone, Dehydroepiandrosterone Sulfate,...",Hormones of the hypothalamic-pituitary-gonadal...,,"Serum estradiol, testosterone, cortisol, DHEAS...",Patients had significantly lower estradiol lev...,,Estradiol may affect the pathogenesis and seve...


### Export the data

In [22]:
# list the entries of the current working directory
os.listdir()

['.ipynb_checkpoints', 'data', 'Data mining for papers.ipynb']

In [23]:
# Write one pickle per biomarker dataframe into ./data, creating the folder if needed
os.makedirs("data", exist_ok=True)
for key, frame in df_dict.items():
    # an existing pickle is overwritten so the export reflects the current df_dict
    frame.to_pickle(os.path.join("data", f"{key}.pkl"))