In [9]:
!pip install biopython




In [12]:
import pandas as pd
import requests
import csv
import argparse
from typing import List, Dict, Optional
from Bio import Entrez


Entrez.email = "your-email@example.com" 


NON_ACADEMIC_KEYWORDS = ["Pharma", "Biotech", "Inc", "Ltd", "Corporation", "Company"]







In [13]:
def fetch_pubmed_papers(query: str, max_results: int = 50) -> List[Dict]:
    """Fetches papers from PubMed based on a given query."""
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    pmids = record["IdList"]
    
    papers = []
    for pmid in pmids:
        paper = fetch_paper_details(pmid)
        if paper:
            papers.append(paper)
    
    return papers

In [14]:
def fetch_paper_details(pmid: str) -> Optional[Dict]:
    """Fetches detailed paper metadata from PubMed."""
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    
    try:
        article = records["PubmedArticle"][0]["MedlineCitation"]["Article"]
        title = article.get("ArticleTitle", "N/A")
        pub_date = article.get("Journal", {}).get("JournalIssue", {}).get("PubDate", "N/A")
        authors = article.get("AuthorList", [])
        
        non_academic_authors, affiliations = extract_non_academic_authors(authors)
        corresponding_email = extract_corresponding_email(authors)
        
        if non_academic_authors:
            return {
                "PubmedID": pmid,
                "Title": title,
                "Publication Date": pub_date,
                "Non-academic Author(s)": ", ".join(non_academic_authors),
                "Company Affiliation(s)": ", ".join(affiliations),
                "Corresponding Author Email": corresponding_email,
            }
    except Exception:
        return None

In [15]:
def extract_non_academic_authors(authors: List[Dict]) -> (List[str], List[str]):
    """Extracts authors affiliated with non-academic institutions."""
    non_academic_authors = []
    company_affiliations = []
    
    for author in authors:
        affiliation = author.get("AffiliationInfo", [{}])[0].get("Affiliation", "")
        if any(keyword in affiliation for keyword in NON_ACADEMIC_KEYWORDS):
            non_academic_authors.append(author.get("LastName", "") + " " + author.get("ForeName", ""))
            company_affiliations.append(affiliation)
    
    return non_academic_authors, company_affiliations

In [16]:
def extract_corresponding_email(authors: List[Dict]) -> str:
    """Extracts the email of the corresponding author if available."""
    for author in authors:
        if "@" in author.get("Affiliation", ""):
            return author.get("Affiliation", "").split()[-1]
    return "N/A"

In [17]:
def save_to_csv(papers: List[Dict], filename: str):
    """Saves the fetched papers to a CSV file."""
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["PubmedID", "Title", "Publication Date", "Non-academic Author(s)", "Company Affiliation(s)", "Corresponding Author Email"])
        writer.writeheader()
        writer.writerows(papers)

In [19]:
def main():
    parser = argparse.ArgumentParser(description="Fetch research papers from PubMed.")
    parser.add_argument("query", type=str, help="Search query for PubMed")
    parser.add_argument("-f", "--file", type=str, help="Output CSV filename", default="output.csv")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug mode")
    args = parser.parse_args()
    
    if args.debug:
        print(f"Fetching papers for query: {args.query}")
    
    papers = fetch_pubmed_papers(args.query)
    if papers:
        save_to_csv(papers, args.file)
        print(f"Results saved to {args.file}")
    else:
        print("No relevant papers found.")




In [21]:
import sys
sys.argv = ["notebook", "cancer therapy", "-f", "output.csv"] 

main() 

Results saved to output.csv


In [22]:
import pandas as pd

df = pd.read_csv(output_file)
df.head()


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40053899,Real-World Analysis Evaluating Treatment Eligi...,"{'Year': '2025', 'Month': 'Mar'}",Eipe Thomas,"Department of Clinical Pharmacology, Advanced ...",
1,40053698,Prospective Evaluation of Structure-Based Simu...,"{'Year': '2025', 'Month': 'Mar', 'Day': '07'}","Rangwala Aziz M, Bluck Joseph P, Christ Clara ...","Department of Pharmacological Sciences, Stony ...",
2,40053692,Radiation Dose-Volume Effects on Negative Tumo...,"{'Year': '2025', 'Month': 'Mar', 'Day': '07'}",Xu Yang,Enhance Human Health Through Pharma Technology...,
3,40053689,Bioenergetics of human spermatozoa in patients...,"{'Year': '2025', 'Month': 'Mar', 'Day': '07'}","Simonik Ondrej, Bryndova Barbora, Sur Vishma P...","Laboratory of Reproductive Biology, Institute ...",
4,40053572,Hafnium-Doped Prussian Blue Nanoparticles with...,"{'Year': '2025', 'Month': 'Mar', 'Day': '07'}","Kuang Ye, Chen Yufang, Liu Xinying, Liu Baohui...",Fujian Key Laboratory of Drug Target Discovery...,
