In [None]:
"""
Author: Wen-Jou Chang
Baylor College of Medicine

This notebook performs a systematic search of PubMed articles related to various disease categories.

The pipeline:
1. Searches PubMed for articles in different disease categories
2. Analyzes articles for:
    - Full text availability
    - Mentions of methylation probes or Illumina-related keywords in main text/tables
    - Supplementary data availability
    - Extracted CG probe mentions
3. Collects and organizes files by disease category
4. Extracts probes from supplementary files, then unifies all probes into one file per category
"""

In [None]:
"""
Initialization
"""

import os
import json
import re
import warnings
from collections import defaultdict as dd
from http.client import IncompleteRead
from pandas.errors import EmptyDataError 

import aiohttp
import asyncio
import nest_asyncio
import numpy as np
import pandas as pd
from Bio import Entrez

# All relative paths used below (pubmed_search/, full_text/, probe/, ...) resolve against this
# directory. "YOUR DIRECTORY" is a placeholder and must be replaced before running.
os.chdir("YOUR DIRECTORY")

# MeSH headings searched for each disease category; one category may span several headings.
obesity = ["Obesity"]
cancer = ["Neoplasms"]
cardiovascular = ["Cardiovascular Diseases"]
digestive = ["Digestive System Diseases"]
endocrine = ["Endocrine System Diseases"]
hematological = ["Hemic and Lymphatic Diseases"]
immune = ["Immune System Diseases"]
metabolic = ["Metabolic Diseases"]
neurological = ["Mental Disorders", "Nervous System Diseases"]
urogenital = ["Urogenital Diseases"]
respiratory = ["Respiratory Tract Diseases"]

# Parallel lists: categories[i] holds the MeSH headings for category_names[i].
# The names are also used as file-name stems for every per-category output.
categories = [obesity, cancer, cardiovascular, digestive, endocrine, hematological, immune, metabolic, neurological, urogenital, respiratory]
category_names = ["obesity", "cancer", "cardiovascular", "digestive", "endocrine", "hematological", "immune", "metabolic", "neurological", "urogenital", "respiratory"]
C = len(categories)  # number of disease categories

In [None]:
"""
Get articles from pubmed in each disease category.
We performed our search on January 16th, 2024; studies published thereafter are not included in our analysis.
"""
def search(query):
    """Run a PubMed ``esearch`` for ``query`` and return the parsed result.

    Args:
        query: PubMed query string, passed as the ``term`` argument.

    Returns:
        The record parsed by ``Entrez.read``; callers read ``results['IdList']``.
        Up to 10,000 IDs are requested (``retmax``), sorted by relevance.
    """
    Entrez.email = 'u239646@bcm.edu'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='10000',
                            retmode='xml',
                            term=query,)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()
    return results

def fetch_details(id_list):
    """Fetch and parse the PubMed records for ``id_list`` via ``efetch``.

    The request is retried once when the connection is cut short
    (``http.client.IncompleteRead``); a second failure is propagated.

    Args:
        id_list: Iterable of PMID strings.

    Returns:
        The record parsed by ``Entrez.read`` (XML validation disabled); callers
        read ``results['PubmedArticle']``.

    Raises:
        IncompleteRead: If the retry is also truncated.
    """
    ids = ','.join(id_list)
    Entrez.email = 'u239646@bcm.edu'
    last_attempt = 1  # i.e. one retry after the first try
    for attempt in range(last_attempt + 1):
        try:
            handle = Entrez.efetch(db='pubmed',
                                   retmode='xml',
                                   id=ids)
            try:
                return Entrez.read(handle, validate=False)
            finally:
                # Close the handle on success and on failure alike.
                handle.close()
        except IncompleteRead:
            if attempt == last_attempt:
                raise

def safe_get(data, keys, default=""):
    """Walk ``data`` along ``keys`` and return the value found, or ``default``.

    Args:
        data: Nested structure of mappings and/or sequences.
        keys: Sequence of dict keys / list indices applied one after another.
        default: Value returned when any step of the path is missing.

    Returns:
        The nested value, or ``default`` when a key is absent (``KeyError``),
        an index is out of range (``IndexError``, e.g. index 0 of an empty
        author list) or an intermediate value is not subscriptable
        (``TypeError``).
    """
    try:
        for key in keys:
            data = data[key]
        return data
    except (KeyError, IndexError, TypeError):
        return default

def process_mesh_terms(mesh_list):
    """Flatten MeSH headings into one ``" | "``-separated string.

    Each heading contributes ``Descriptor/Qualifier`` once per qualifier, or
    just ``Descriptor`` when it has no qualifiers.

    Args:
        mesh_list: Iterable of mappings with ``"DescriptorName"`` and
            ``"QualifierName"`` (a possibly empty list) entries.

    Returns:
        The joined string; empty when ``mesh_list`` is empty.
    """
    formatted = []
    for heading in mesh_list:
        descriptor = heading["DescriptorName"]
        qualifiers = heading["QualifierName"]
        if len(qualifiers) == 0:
            formatted.append(descriptor)
            continue
        formatted.extend(descriptor + "/" + qualifier for qualifier in qualifiers)
    return " | ".join(formatted)

def pad_keywords(kw_list):
    """Turn MeSH headings into a PubMed query fragment.

    A single heading becomes ``"X"[MeSH Terms]``; any other number of headings
    is OR-ed together inside one pair of parentheses.

    Args:
        kw_list: List of MeSH heading strings.

    Returns:
        The query fragment as a string.
    """
    if len(kw_list) == 1:
        return f'"{kw_list[0]}"[MeSH Terms]'
    alternatives = '"[MeSH Terms] OR "'.join(kw_list)
    return '("' + alternatives + '"[MeSH Terms])'

def full_search_term(kw):
    """Wrap a disease fragment into the complete PubMed query.

    The query requires the "DNA Methylation" MeSH term together with ``kw``,
    excludes records tagged "animals" but not "humans", and applies the
    "pubmed pmc" filter.

    Args:
        kw: Query fragment for the disease category (see ``pad_keywords``).

    Returns:
        The full query string.
    """
    topic = f'("DNA Methylation"[MeSH Terms] AND {kw})'
    animal_only = '("animals"[MeSH Terms] NOT "humans"[MeSH Terms])'
    pmc_filter = '"pubmed pmc"[Filter]'
    return f'{topic} NOT {animal_only} AND {pmc_filter}'

# Column order of the CSV written by get_papers.
_PAPER_COLUMNS = ["Mesh Term", "PMID", "PMCID", "Last Name", "Year", "Journal",
                  "ISO_journal", "Title", "Abstract", "Publication Type"]


def _paper_record(paper):
    """Flatten one parsed ``PubmedArticle`` entry into a dict keyed by output column."""
    article = ['MedlineCitation', 'Article']
    journal = article + ['Journal']
    # "Research Support ..." entries describe funding, not the kind of article.
    publication_types = {
        str(pub_type)
        for pub_type in safe_get(paper, article + ['PublicationTypeList'], [])
        if not str(pub_type).startswith("Research Support")
    }
    article_ids = safe_get(paper, ['PubmedData', 'ArticleIdList'], [])
    return {
        "Mesh Term": process_mesh_terms(safe_get(paper, ['MedlineCitation', 'MeshHeadingList'], [])),
        "PMID": safe_get(paper, ['MedlineCitation', 'PMID']),
        # First article ID starting with "PMC", NaN when there is none.
        "PMCID": next((i for i in article_ids if i.startswith("PMC")), np.nan),
        "Last Name": safe_get(paper, article + ['AuthorList', 0, 'LastName']),
        "Year": safe_get(paper, journal + ['JournalIssue', 'PubDate', 'Year']),
        "Journal": safe_get(paper, journal + ['Title']),
        "ISO_journal": safe_get(paper, journal + ['ISOAbbreviation']),
        "Title": safe_get(paper, article + ['ArticleTitle']),
        "Abstract": " ".join(map(str, safe_get(paper, article + ['Abstract', 'AbstractText'], []))),
        # Sorted so the written value does not depend on set iteration order.
        "Publication Type": ";".join(sorted(publication_types)),
    }


def get_papers(query, output_name, file_name=None):
    """Collect PubMed metadata for one disease category and write it to CSV.

    Args:
        query: List of MeSH headings for the category; ignored when
            ``file_name`` is given.
        output_name: Path of the CSV file to write.
        file_name: Optional path to a text file with one PMID per line. When
            given, these IDs are fetched instead of running a search.

    The CSV always contains the columns listed in ``_PAPER_COLUMNS``, including
    "Publication Type", whatever the number of records. Fewer than 10,000 IDs
    are fetched in a single request; larger lists are fetched in batches of
    5,000 with the batch range printed as progress.
    """
    if file_name:
        with open(file_name, 'r') as file:
            id_list = [line.strip() for line in file if line.strip()]
    else:
        query = full_search_term(pad_keywords(query))
        results = search(query)
        id_list = results['IdList']

    count = len(id_list)
    batched = count >= 10000
    step = 5000 if batched else max(count, 1)

    records = []
    for start in range(0, count, step):
        stop = min(start + step, count)
        if batched:
            print(start, stop)
        papers = fetch_details(id_list[start:stop])
        records.extend(_paper_record(paper) for paper in papers['PubmedArticle'])

    df = pd.DataFrame(records, columns=_PAPER_COLUMNS)
    df.to_csv(output_name, index=False)

# Categories 1 (cancer) and 7 (metabolic) read their PMIDs from a text file produced with
# esearch -db pubmed -query "[query term]" | efetch -format uid > output.txt for records >= 10000
for i, (mesh_headings, name) in enumerate(zip(categories, category_names)):
    id_file = f"pubmed_search/{name}.txt" if i in (1, 7) else None
    get_papers(mesh_headings, f"pubmed_search/{name}.csv", id_file)

In [None]:
"""
Get full texts if available. 
"""


async def fetch_full_text(session, pmid):
    """Download the BioC JSON full text of one article.

    Args:
        session: HTTP session whose ``get(url)`` returns an async context
            manager yielding a response with an awaitable ``text()``.
        pmid: Article identifier inserted into the request URL.

    Returns:
        The response body, or ``""`` when the body starts with ``"[Error]"``.
    """
    endpoint = f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/ascii'
    async with session.get(endpoint) as response:
        body = await response.text()
    return "" if body.startswith("[Error]") else body

async def process_probes(pmids):
    """Fetch the full text of every article in ``pmids`` concurrently.

    Args:
        pmids: Iterable of article identifiers.

    Returns:
        List of ``fetch_full_text`` results, in the same order as ``pmids``.
    """
    async with aiohttp.ClientSession() as http:
        pending = []
        for pmid in pmids:
            pending.append(fetch_full_text(http, pmid))
        full_texts = await asyncio.gather(*pending)
    return full_texts

# nest_asyncio lets run_until_complete be called while an event loop is already running.
nest_asyncio.apply()

# For each category: read the PubMed search result, download every full text and
# store it next to the article's PMID / PMCID.
for i in range(C):
    name = category_names[i]
    df = pd.read_csv(f"pubmed_search/{name}.csv")
    pmids = df["PMID"].tolist()

    loop = asyncio.get_event_loop()
    passages = loop.run_until_complete(process_probes(pmids))

    df2 = pd.DataFrame({"PMID": pmids, "PMCID": df["PMCID"], "Full Text": passages})
    df2.to_csv(f"full_text/{name}_full_text.csv", index=False)
    print(f"{name} done")


In [None]:
"""
Get probes only from tables in the main text.
"""

warnings.filterwarnings('ignore')

# Integer tags for the parts of a table, and the lookup from the passage
# "type" string to its tag.
CAPTION, TABLE, FOOTNOTE, TITLE_CAPTION = 1, 2, 3, 4
table_key = dict(
    table_caption=CAPTION,
    table=TABLE,
    table_footnote=FOOTNOTE,
    table_title_caption=TITLE_CAPTION,
)

def fetch_probe_from_table(pmid, text, pmcid):
    """Collect probe IDs (``cg`` followed by 8 digits) from the tables of one article.

    Nothing is returned. One entry per distinct probe is appended, in sorted
    order, to the module-level lists ``probe_id``, ``probe_pmid`` and
    ``probe_pmcid``, which the caller must create beforehand.

    Only passages whose ``section_type`` is ``"TABLE"`` and whose ``type`` is
    ``"table"`` are searched, using their ``xml`` field. Captions and footnotes
    are not searched. Table passages without an ``id`` are skipped.

    Args:
        pmid: PubMed ID recorded with every probe found.
        text: BioC JSON string of the article, or a float (NaN) when no full
            text is available.
        pmcid: PMC ID recorded with every probe found.
    """
    # A float here is the NaN pandas produces for an empty "Full Text" cell.
    if isinstance(text, float):
        return
    passages = json.loads(text)["documents"][0]["passages"]

    # table id -> XML fragments of that table's "table" passages
    xml_by_table = dd(list)
    for passage in passages:
        infons = passage["infons"]
        if infons["section_type"] != "TABLE":
            continue
        try:
            table_id = infons['id']
        except KeyError:
            # Table passage without an id: cannot be attributed to a table.
            continue
        if infons["type"] == "table":
            xml_by_table[table_id].append(infons["xml"])

    probe_pattern = r'cg\d{8}'
    probes = set()
    for fragments in xml_by_table.values():
        table_xml = " ".join(fragment for fragment in fragments if fragment)
        probes.update(re.findall(probe_pattern, table_xml))

    # Sorted so the output rows are the same on every run.
    for probe in sorted(probes):
        probe_id.append(probe)
        probe_pmid.append(pmid)
        probe_pmcid.append(pmcid)
        
# For each category, scan every stored full text for probes inside tables and
# write one (probe, pmid, pmcid) row per distinct probe and article.
for cat in category_names:
    data = pd.read_csv(f"full_text/{cat}_full_text.csv")
    # Fresh accumulators; fetch_probe_from_table appends to these module-level lists.
    probe_id, probe_pmid, probe_pmcid = [], [], []

    for _, row in data.iterrows():
        fetch_probe_from_table(row["PMID"], row["Full Text"], row["PMCID"])

    df2 = pd.DataFrame({"probeId": probe_id, "pmid": probe_pmid, "pmcid": probe_pmcid})
    df2.to_csv(f"probe_main_table/{cat}_main_probes.csv", index=False)


In [None]:
"""
Get probes from main text.
"""

warnings.filterwarnings('ignore')

def fetch_probe_from_text(pmid, text, pmcid):
    """Collect probe IDs (``cg`` followed by 8 digits) from all text passages of one article.

    Nothing is returned. One entry per distinct probe is appended, in sorted
    order, to the module-level lists ``probe_id``, ``probe_pmid`` and
    ``probe_pmcid``, which the caller must create beforehand.

    Args:
        pmid: PubMed ID recorded with every probe found.
        text: BioC JSON string of the article, or a float (NaN) when no full
            text is available.
        pmcid: PMC ID recorded with every probe found.
    """
    # A float here is the NaN pandas produces for an empty "Full Text" cell.
    if isinstance(text, float):
        return
    passages = json.loads(text)["documents"][0]["passages"]
    # Passages with a missing or empty "text" are ignored.
    full_text = " ".join(p.get("text") for p in passages if p.get("text"))

    probe_pattern = r'cg\d{8}'
    probes = set(re.findall(probe_pattern, full_text))

    # Sorted so the output rows are the same on every run.
    for probe in sorted(probes):
        probe_id.append(probe)
        probe_pmid.append(pmid)
        probe_pmcid.append(pmcid)
        
# For each category, scan every stored full text for probe IDs and write one
# (probe, pmid, pmcid) row per distinct probe and article.
for cat in category_names:
    data = pd.read_csv(f"full_text/{cat}_full_text.csv")
    # Fresh accumulators; fetch_probe_from_text appends to these module-level lists.
    probe_id, probe_pmid, probe_pmcid = [], [], []

    for _, row in data.iterrows():
        fetch_probe_from_text(row["PMID"], row["Full Text"], row["PMCID"])

    df2 = pd.DataFrame({"probeId": probe_id, "pmid": probe_pmid, "pmcid": probe_pmcid})
    df2.to_csv(f"probe_main_text/{cat}_main_probes.csv", index=False)


In [None]:

"""
Identify papers mentioning Illumina-related keywords: "450K", "HM450", "EPIC", "450k", "HumanMethylation450", "850K", "850k".
"""
# One set of PMCIDs per category (same order as category_names), filled by the loop below.
download_supplementary_subset = []
def contains_keywords(input_string, keywords):
    """Return True when at least one of ``keywords`` is a substring of ``input_string``.

    The match is case-sensitive. An empty ``keywords`` gives False.
    """
    return any(keyword in input_string for keyword in keywords)

# The keyword list is the same for every article, so it is defined once.
keywords = ["450K", "HM450", "EPIC", "450k", "HumanMethylation450", "850K", "850k"]

for cat in category_names:
    df = pd.read_csv(f"full_text/{cat}_full_text.csv")
    kw_illumina = set()
    for _, row in df.iterrows():
        text = row["Full Text"]
        if type(text) is float:
            # NaN: no full text stored for this article
            continue
        passages = json.loads(text)["documents"][0]["passages"]
        # Reference-list passages are left out of the keyword search.
        paragraphs = [
            p.get("text")
            for p in passages
            if p.get("text") and p["infons"]["section_type"] != "REF"
        ]
        text = " ".join(paragraphs)
        is_illumina_study = contains_keywords(text, keywords)
        if is_illumina_study:
            kw_illumina.add(row["PMCID"])

    # Articles to download supplements for: those naming a probe ID in the
    # main text plus those matching a keyword.
    df = pd.read_csv(f"probe_main_text/{cat}_main_probes.csv")
    mention_cg_id = set(df["pmcid"].to_list())
    download_supplementary_subset.append(mention_cg_id | kw_illumina)
    

In [None]:
"""
Get supplementary file download links for targeted papers from PMCOA.
"""

async def fetch_link(session, pmcid):
    """Look up the download link of one article in the OA service response.

    Args:
        session: HTTP session whose ``get(url)`` returns an async context
            manager yielding a response with an awaitable ``text()``.
        pmcid: Identifier passed as the ``id`` query parameter.

    Returns:
        The ``href`` value of the first ``<link ...>`` element in the response
        body, or ``None`` when there is no such element.
    """
    url = f"https://ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmcid}"
    async with session.get(url) as response:
        data = await response.text()
    found = re.search(r'<link[^>]*href="([^"]+)"', data)
    if found is None:
        return None
    return found.group(1)


async def process_link(pmids):
    """Resolve the download link of every identifier in ``pmids`` concurrently.

    Args:
        pmids: Iterable of identifiers, each passed to ``fetch_link`` (the
            caller in this notebook passes PMCIDs).

    Returns:
        List of ``fetch_link`` results, in the same order as ``pmids``.
    """
    async with aiohttp.ClientSession() as http:
        pending = []
        for identifier in pmids:
            pending.append(fetch_link(http, identifier))
        links = await asyncio.gather(*pending)
    return links

# nest_asyncio lets run_until_complete be called while an event loop is already running.
nest_asyncio.apply()

# Resolve download links for every category. The loop starts at index 0 so that
# a fresh run writes a link file for all categories, not only for index 4 onwards.
# download_supplementary_subset[i] is the PMCID set for category_names[i].
for i, cat in enumerate(category_names):
    pmcids = list(download_supplementary_subset[i])

    loop = asyncio.get_event_loop()

    passages = loop.run_until_complete(process_link(pmcids))

    df2 = pd.DataFrame({"PMCID": pmcids, "Download Link": passages})
    df2.to_csv(f"download_links/{cat}_link.csv", index=False)


In [None]:
# The commands in this section were run on the cluster as Slurm job scripts;
# `cat` is the category name.
# NOTE(review): the notebook writes download_links/${cat}_link.csv, while wget below reads
# ${cat}_link.txt (one URL per line); the conversion between the two is not shown here.

# download all supplementary files based on link (-c resumes partial downloads)
wget -i "${cat}_link.txt" -c

# unzip: extract each archive, move the archive away, then flatten
# "<archive>/<file>" into "<archive>-<file>" and remove the emptied folder
for file in *.tar.gz; do
   tar -xvzf "$file" -C . && mv "$file" ../tar_archive
   subfolder_name=$(basename "$file" .tar.gz)
   for file1 in "$subfolder_name"/*; do
       mv "$file1" "${file1/\//-}"
   done
   rm -d "$subfolder_name"
done

# reorganize files based on file extension: one folder per extension
exts=$(ls | sed 's/^.*\.//' | sort -u)
for ext in $exts
do
    echo Processing "$ext"
    mkdir -p "$ext"
    mv -vn *."$ext" "$ext"/
done

# unzip nested zipped files: first run handle_zip.py then delete empty folders
python3 handle_zip.py "${cat}"
find . -empty -type d -delete

# optional: convert Excel files (.xls/.xlsx) to CSV format. While this adds some processing overhead initially,
# it significantly speeds up subsequent file access and analysis
# One CSV is written per sheet, named "<workbook>-<sheet>.csv".
while IFS= read -r file; do
    while IFS= read -r sheet; do
        in2csv --sheet "$sheet" "$file" > "${file%.*}-${sheet}.csv" # need to install in2csv first
    done < <(in2csv -n "$file")
done < <(find . -name '*.xls' -o -name '*.xlsx')

# extract probes from supplementary files
python3 get_probe_supplementary.py "${cat}"


In [None]:
"""
Combine probes scraped from different file extensions into one for each category.
"""

cat_probes_dict = []
def combine_probes(input_cat):
    """Merge all probes found for one category and keep only research articles.

    Reads ``probe_main_text/{input_cat}_main_probes.csv`` plus every existing
    ``probe/{prefix}_{ext}_probes.csv`` (``prefix`` is ``input_cat`` up to the
    first underscore), de-duplicates the rows, joins them on ``pmcid`` with
    the filtered articles of ``pubmed_search/{input_cat}.csv`` and writes
    ``probe/{input_cat}_all_probes.csv``.

    An article is kept when its publication types include "Journal Article"
    and none of the types in ``skip``. Articles with an empty
    "Publication Type" are dropped.

    NOTE(review): the search CSV must contain a "Category Mesh Term" column.
    ``get_papers`` in this notebook writes "Mesh Term" only, so that column is
    presumably added by a step not shown here - confirm.

    Args:
        input_cat: Category name used as the file-name stem.
    """
    initial_df = pd.read_csv(f"probe_main_text/{input_cat}_main_probes.csv")
    initial_df = initial_df[["pmcid", "probeId"]]
    dfs = [initial_df]
    prefix = input_cat.split('_')[0]
    for ext in ["csv", "xls", "xlsx", "xlsb", "xlsm", "failed"]:
        path = f"probe/{prefix}_{ext}_probes.csv"
        if not os.path.exists(path):
            continue
        try:
            dfs.append(pd.read_csv(path))
        except EmptyDataError:
            # File exists but has no content: nothing to add.
            continue
    result_df = pd.concat(dfs, ignore_index=True).drop_duplicates()
    pubmed_paper = pd.read_csv(f"pubmed_search/{input_cat}.csv")

    # skip non-research papers
    # An empty "Publication Type" cell is read back as NaN; treat it as "no types".
    pub_types = pubmed_paper["Publication Type"].fillna("").apply(
        lambda x: [y.strip() for y in x.split(";")]
    )
    skip = ["Meta-Analysis", "Review", "Systematic Review", "Retracted Publication", "Comment", "Video-Audio Media", "Validation Study", "Historical Article"]
    is_research = pub_types.apply(
        lambda x: "Journal Article" in x and not any(skip_type in x for skip_type in skip)
    ).astype(bool)

    pubmed_paper = pubmed_paper.loc[is_research, ["Category Mesh Term", "PMCID"]]
    pubmed_paper = pubmed_paper.rename(columns={"Category Mesh Term": "Filtered Mesh Term", "PMCID": "pmcid"})
    output = pd.merge(pubmed_paper, result_df, how="inner", on="pmcid")
    output.to_csv(f"probe/{input_cat}_all_probes.csv", index=False)