In [28]:
import requests
import json
import pandas as pd
import numpy as np

The PubMed API has two main endpoints:

- ESearch — search for papers, get IDs
- EFetch — use IDs to get the actual conten

# Search: air pollution and health


In [2]:
response = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params={
        "db": "pubmed",
        "term": "AQI cancer",
        "retmode": "json",
        "retmax": 100
    }
)

result = response.json()
print(json.dumps(result, indent=2))
print()
print("======BREAK=======")
print()

ids = result["esearchresult"]["idlist"]  # your list of IDs
id_string = ",".join(ids)  # turns list into "123,456,789,..."


response2 = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
    params={
        "db": "pubmed",
        "id": id_string,
        "retmode": "xml"
    }
)

print(response2.text)
print()
print("======BREAK=======")
print()


import xml.etree.ElementTree as ET

root = ET.fromstring(response2.text)

papers = []
for article in root.findall(".//PubmedArticle"):
    # Find the title
    title = article.find(".//ArticleTitle").text
    # Find all abstract sections and combine them
    abstract_parts = article.findall(".//AbstractText")
    abstract = " ".join([part.text for part in abstract_parts if part.text])
    papers.append({"title": title, "abstract": abstract})

print(len(papers))
print(papers[0])
print(papers[1])
print(papers[2])
print()
print("======BREAK=======")
print()


{
  "header": {
    "type": "esearch",
    "version": "0.3"
  },
  "esearchresult": {
    "count": "44",
    "retmax": "44",
    "retstart": "0",
    "idlist": [
      "41178424",
      "40955053",
      "40781578",
      "40724190",
      "40511528",
      "39843581",
      "39738395",
      "39642465",
      "39067059",
      "38839810",
      "38311242",
      "37985855",
      "37806430",
      "37598151",
      "37454985",
      "37361469",
      "37361164",
      "37304093",
      "36705824",
      "36126422",
      "35803593",
      "35664109",
      "35312179",
      "34110590",
      "33644766",
      "33481402",
      "33196993",
      "33082960",
      "32950635",
      "30910154",
      "30086519",
      "29965442",
      "29048397",
      "28551742",
      "27556839",
      "26799652",
      "26579661",
      "26271645",
      "25901100",
      "25684495",
      "25624787",
      "25040992",
      "15204801",
      "6546898"
    ],
    "translationset": [
      {
        "

In [3]:
print("TITLE:", title)
print("\nABSTRACT:", abstract[:500])

TITLE: Metabolism of 4'-(9-acridinylamino)methanesulfon-m-anisidide by rat liver microsomes.

ABSTRACT: 4'-(9-Acridinylamino)methanesulfon-m-anisidide (m-AMSA) is metabolized by a hepatic microsomal enzyme system composed of rat liver microsomes, a reduced nicotinamide adenine dinucleotide phosphate-generating system, cytosolic protein (or glutathione), and oxygen. Omission of any one of the components, or incubation under an atmosphere of CO or N2, results in inhibition of the reaction. Also, the addition of inhibitors of microsomal metabolism (alpha-naphthoflavone, metyrapone, or SKF 525-A) decr


In [4]:
search = pd.DataFrame(papers)
print(search.head())
search.to_csv('AQI_cancer', index=False)

                                               title  \
0  Socioeconomic impact on quality of care in pel...   
1  Adverse Health Consequences of Poor Air Qualit...   
2  Efficiency Analysis Based on Two-Stage Undesir...   
3  The Impact of Air Quality on Patient Mortality...   
4  Impact of short-term air pollution exposure on...   

                                            abstract  
0  Several European studies have shown health-rel...  
1  Literatures shows that poor air quality index ...  
2  In previous studies exploring the causes of lu...  
3                                                     
4  The effects of short-term air pollution exposu...  


In [5]:
df1 = pd.read_csv("/Users/chandlershortlidge/Desktop/Ironhack/end-to-end-project/notebooks/air_pollution_lung_health")
df2 = pd.read_csv("/Users/chandlershortlidge/Desktop/Ironhack/end-to-end-project/notebooks/AQI_cancer")
df3 = pd.read_csv("/Users/chandlershortlidge/Desktop/Ironhack/end-to-end-project/notebooks/particulate_matter_asthma")
df4 = pd.read_csv("/Users/chandlershortlidge/Desktop/Ironhack/end-to-end-project/notebooks/PM2.5_respiratory_disease")

# Concat dataframes

In [6]:
# concat the searched dataframes
# ignore_index = concat: 0, 1, 2, 3, 4, 5, 6, 7...  
full_dataset = pd.concat([df1, df2, df3, df4], ignore_index=True)
# drop duplicates
full_dataset = full_dataset.drop_duplicates(subset='title')
print(len(full_dataset))

312


# Filter out empty abstracts

In [7]:
full_dataset = full_dataset[full_dataset["abstract"] != ' ']
print(len(full_dataset))

312


In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(full_dataset["abstract"].tolist())

In [20]:
def search_papers(query, top_n=3):
    # encode the query
    user_embedding = model.encode([query])
    # compute cosine similarity against embeddings
    similarities = cosine_similarity(user_embedding, embeddings)
    # get top N indices
    top_indices = similarities[0].argsort()[-3:][::-1]
    # return those papers from papers_df
    return full_dataset.iloc[top_indices][["title", "abstract"]]




In [26]:
search_papers("asthma and air pollution")


Unnamed: 0,title,abstract
202,Correlation Between Air Quality and the Exacer...,Background Asthma is a chronic respiratory dis...
200,Air Quality Index as a Predictor of Respirator...,"The Mon Valley near Pittsburgh, Pennsylvania, ..."
172,Study on the health impacts of childhood asthm...,Asthma is one of the major disease burdens in ...


In [22]:
search_papers("PM2.5 danger")

Unnamed: 0,title,abstract
337,circ_0000554 promotes macrophage M2 polarizati...,"Environmental pollution, such as fine particul..."
62,Polycyclic nitroaromatic compounds in HULIS as...,Identifying the toxic components in PM
298,DALYs-Based Health Risk Assessment and Key Inf...,The health risks of PM


In [23]:
search_papers("children air polution")

Unnamed: 0,title,abstract
155,Individual and combined effects of indoor home...,Children encounter multiple indoor and outdoor...
76,Ambient air pollutant mixture and lung functio...,Ambient air pollutants such as particulate mat...
328,Urbanization and childhood asthma.,Childhood asthma is one of the major public he...


In [27]:
search_papers("AQI danger")

Unnamed: 0,title,abstract
131,[Particle Size Distribution and Human Health R...,Under a condition of good air quality (AQI:55-...
116,Evaluation of the health risk using multi-poll...,Air pollution imposes a significant burden on ...
117,Critical air pollutant assessments and health ...,The aim of this study was to evaluate changes ...


In [29]:
full_dataset.to_csv('papers.csv', index=False)
np.save('embeddings.npy', embeddings)

In [30]:
test = np.load('embeddings.npy')
print(test.shape)

(312, 384)


file embeddings.npy


In [31]:
file embeddings.npy


SyntaxError: invalid syntax (38841769.py, line 1)