<a href="https://colab.research.google.com/github/deekshi1230/deekshita/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!poetry run get-papers-list "cancer research" -f results.csv

/bin/bash: line 1: poetry: command not found


In [4]:
import requests
import csv
import re
from typing import List, Dict, Optional

# NCBI E-utilities endpoints: ESearch is queried for matching IDs,
# ESummary for the per-ID summary records.
PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

# Lower-case substrings that mark an affiliation as academic; an affiliation
# containing any of them is NOT reported as non-academic.
EXCLUDE_WORDS = [
    "university",
    "college",
    "hospital",
    "institute",
    "school",
    "lab",
    "research center",
]

def fetch_pubmed_papers(query: str, max_results: int = 10, timeout: float = 30.0) -> List[Dict]:
    """Search PubMed for *query* and return the matching paper records.

    The search endpoint is asked for up to ``max_results`` IDs, which are
    then handed to ``fetch_paper_details`` to obtain the per-paper records.

    Args:
        query: Search term sent to the PubMed search endpoint.
        max_results: Maximum number of IDs requested (``retmax``).
        timeout: Seconds to wait for the search request before giving up.
            It bounds the search request issued here only.

    Returns:
        Whatever ``fetch_paper_details`` returns for the IDs found (an
        empty list when the search yields no IDs).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": max_results,
    }
    # Without an explicit timeout, requests waits indefinitely on a
    # stalled connection.
    response = requests.get(PUBMED_API_URL, params=params, timeout=timeout)
    response.raise_for_status()
    paper_ids = response.json().get("esearchresult", {}).get("idlist", [])

    return fetch_paper_details(paper_ids)

def fetch_paper_details(paper_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """Fetch the summary record for each PubMed ID in *paper_ids*.

    Args:
        paper_ids: PubMed IDs to look up. An empty list short-circuits
            without issuing any request.
        timeout: Seconds to wait for the summary request before giving up.

    Returns:
        One record per requested ID that appears in the response's
        ``result`` mapping, in the order the IDs were given. IDs missing
        from the response are silently dropped.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    if not paper_ids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(paper_ids),
        "retmode": "json",
    }
    # Without an explicit timeout, requests waits indefinitely on a
    # stalled connection.
    response = requests.get(PUBMED_SUMMARY_URL, params=params, timeout=timeout)
    response.raise_for_status()
    paper_data = response.json().get("result", {})

    # The membership check skips requested IDs absent from the response.
    # NOTE(review): summary author entries may not carry affiliation/email
    # fields that downstream code reads -- confirm against the API docs.
    return [paper_data[pid] for pid in paper_ids if pid in paper_data]

def is_non_academic(affiliation: str) -> bool:
    """Return True if *affiliation* contains none of the academic keywords.

    The check is a case-insensitive substring match against
    ``EXCLUDE_WORDS``. A missing, empty, or whitespace-only affiliation
    returns False: with no affiliation text there is nothing to classify,
    and reporting such authors as non-academic would flag every author
    whose record lacks affiliation data.

    Args:
        affiliation: Free-text affiliation string (``None`` is tolerated).

    Returns:
        True when the affiliation has text and no keyword occurs in it.
    """
    if not affiliation or not affiliation.strip():
        return False

    # Lower-case once rather than once per keyword.
    normalized = affiliation.lower()
    return not any(word in normalized for word in EXCLUDE_WORDS)

def extract_relevant_data(papers: List[Dict]) -> List[Dict]:
    """Build one output row per paper that has a non-academic author.

    For each paper, every author with affiliation text is classified via
    ``is_non_academic``. Authors with a missing, ``None``, or blank
    affiliation are skipped: they cannot be classified, and treating them
    as non-academic would flag every author lacking affiliation data.

    Args:
        papers: Paper records as dicts; the keys read are ``uid``,
            ``title``, ``pubdate`` and ``authors`` (a list of dicts with
            optional ``name``, ``affiliation`` and ``email``).

    Returns:
        A list of row dicts with the keys ``PubMedID``, ``Title``,
        ``Publication Date``, ``Non-academic Author(s)``,
        ``Company Affiliation(s)`` and ``Corresponding Author Email``.
        Papers without any non-academic author produce no row.
    """
    results = []

    for paper in papers:
        non_academic_authors = []
        company_affiliations = []
        corresponding_author_email = "N/A"

        # `or []` / `or ""` guard against keys present with a None value.
        for author in paper.get("authors") or []:
            affiliation = author.get("affiliation") or ""
            if not affiliation.strip():
                continue  # no affiliation data: cannot be classified
            if not is_non_academic(affiliation):
                continue

            non_academic_authors.append(author.get("name") or "")
            company_affiliations.append(affiliation)

            # Keep the first e-mail found among the non-academic authors.
            email = author.get("email") or ""
            if email and corresponding_author_email == "N/A":
                corresponding_author_email = email

        if not non_academic_authors:
            continue

        results.append({
            "PubMedID": paper.get("uid", "N/A"),
            "Title": paper.get("title", "N/A"),
            "Publication Date": paper.get("pubdate", "N/A"),
            "Non-academic Author(s)": ", ".join(non_academic_authors),
            "Company Affiliation(s)": ", ".join(company_affiliations),
            "Corresponding Author Email": corresponding_author_email,
        })

    return results

def save_to_csv(data: List[Dict], filename: str):
    """Write *data* to *filename* as CSV, one row per dict.

    The header is taken from the keys of the first row. When *data* is
    empty nothing is written and a notice is printed instead.
    """
    if len(data) == 0:
        print("No non-academic authors found.")
        return

    columns = list(data[0])
    with open(filename, "w", newline="", encoding="utf-8") as handle:
        csv_out = csv.DictWriter(handle, fieldnames=columns)
        csv_out.writeheader()
        for row in data:
            csv_out.writerow(row)

    print(f"Results saved to {filename}")
