### Scraper - PubMed

In [1]:
from headers import fer_datasets, fer_datasets_LONG, topics

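### TODO: API
Look into the OpenAlex API as a keyword-based source of papers:

- Keywords entity documentation: https://docs.openalex.org/api-entities/keywords
- Python client (pyalex): https://github.com/J535D165/pyalex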

### First step: search PubMed and save paper metadata to `fer_papers.csv`

In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time

def search_pubmed(query, start_date, end_date, retmax=100, timeout=30):
    """Run a PubMed search through the NCBI E-utilities ``esearch`` endpoint.

    Args:
        query: Search expression, sent as the ``term`` parameter.
        start_date: Lower bound of the publication-date filter (``mindate``),
            e.g. ``"2020/01/01"``.
        end_date: Upper bound of the publication-date filter (``maxdate``).
        retmax: Maximum number of PMIDs returned in the response. Matches
            beyond this limit are not fetched; this function does not page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": retmax,
        "datetype": "pdat",  # filter on publication date
        "mindate": start_date,
        "maxdate": end_date,
        "usehistory": "y",
        "retmode": "json"
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()


from urllib.parse import urlparse

def check_paper_availability(doi, timeout=30):
    """Resolve a DOI through doi.org and classify where it lands.

    Args:
        doi: DOI string such as ``"10.1000/xyz"``. An empty value or the
            ``"N/A"`` placeholder is treated as "no DOI" and is not looked up.
        timeout: Seconds to wait for the resolver/publisher before giving up.

    Returns:
        A ``(status, final_url)`` tuple. ``status`` is a human-readable label;
        ``final_url`` is the URL after redirects, or ``None`` when the DOI is
        missing, not found, answered with an unexpected status, or the request
        failed.
    """
    # Do not spend a network round-trip on a placeholder that cannot resolve.
    if not doi or doi.strip().upper() == "N/A":
        return "No DOI available", None

    base_url = f"https://doi.org/{doi}"
    headers = {
        # Browser-like User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(
            base_url, headers=headers, allow_redirects=True, timeout=timeout
        )
    except requests.RequestException as e:
        # Timeouts, DNS failures, too many redirects, ... are reported as a
        # status string so one bad DOI does not stop a batch run.
        return f"Error: {str(e)}", None

    if response.status_code == 404:
        return "Not found", None
    if response.status_code != 200:
        return f"Unexpected status code: {response.status_code}", None

    final_url = response.url
    # Compare case-insensitively; the URL itself is returned unchanged.
    lowered = final_url.lower()
    if 'pdf' in lowered:
        return "Direct PDF available", final_url
    if 'sciencedirect.com' in lowered:
        return "ScienceDirect (may require access)", final_url
    if 'pubmed.ncbi.nlm.nih.gov' in lowered:
        return "PubMed (may have free full text)", final_url
    if 'springer.com' in lowered:
        return "Springer (may require access)", final_url
    return "Available, but may require access", final_url

def fetch_paper_details(pmid, timeout=30):
    """Fetch title, abstract and DOI link for one PubMed record via ``efetch``.

    Args:
        pmid: PubMed identifier of the record to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, abstract, pdf_link)`` tuple of strings. Any value that
        cannot be found in the record is ``"N/A"``. ``pdf_link`` is the
        ``https://doi.org/<doi>`` resolver URL, not necessarily a direct PDF.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml"
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')

    title_elem = soup.find('ArticleTitle')
    title = title_elem.text if title_elem is not None else "N/A"

    # Structured abstracts are split into several <AbstractText Label="...">
    # sections; keep all of them rather than only the first one.
    abstract_elem = soup.find('Abstract')
    sections = abstract_elem.find_all('AbstractText') if abstract_elem is not None else []
    if sections:
        parts = []
        for section in sections:
            label = section.get('Label')
            text = section.text.strip()
            parts.append(f"{label}: {text}" if label else text)
        abstract = " ".join(parts)
    else:
        # No <Abstract> wrapper: fall back to the first <AbstractText> anywhere.
        first_text = soup.find('AbstractText')
        abstract = first_text.text if first_text is not None else "N/A"

    doi_elem = soup.find('ELocationID', EIdType="doi")
    if doi_elem is None:
        # Some records carry the DOI only in the record's own ArticleIdList.
        # Search only the first ArticleIdList under PubmedData so that DOIs of
        # cited references (nested further down) are not picked up by mistake.
        pubmed_data = soup.find('PubmedData')
        id_list = pubmed_data.find('ArticleIdList') if pubmed_data is not None else None
        if id_list is not None:
            doi_elem = id_list.find('ArticleId', IdType="doi")
    doi = doi_elem.text.strip() if doi_elem is not None else ""
    if not doi:
        doi = "N/A"

    pdf_link = f"https://doi.org/{doi}" if doi != "N/A" else "N/A"

    return title, abstract, pdf_link

def _quote_phrase(keyword):
    """Wrap a multi-word keyword in double quotes so it is searched as a phrase."""
    keyword = keyword.strip()
    # Leave keywords alone that already carry quotes or a [field] tag.
    if any(ch.isspace() for ch in keyword) and '"' not in keyword and '[' not in keyword:
        return f'"{keyword}"'
    return keyword


def main():
    """Search PubMed for the configured keywords and dump results to CSV.

    Builds ``(<kw1> OR <kw2> ...) AND bias`` from ``fer_datasets``,
    ``fer_datasets_LONG`` and ``topics`` (assumed to be lists of strings --
    they are imported from ``headers``), searches publications dated
    2020-01-01 to 2024-12-31, then fetches each hit and writes one row per
    paper to ``fer_papers.csv`` (columns: PMID, Title, Abstract, PDF Link).

    Only the first page of results is processed (``search_pubmed``'s default
    ``retmax``). Papers whose fetch fails with a network/HTTP error are logged
    and skipped instead of aborting the whole run.

    Raises:
        ValueError: If no non-empty keyword is configured.
    """
    all_keywords = fer_datasets + fer_datasets_LONG + topics
    # Unquoted multi-word keywords inside an OR chain are not searched as a
    # single phrase, which broadens the query; quote them explicitly.
    terms = [_quote_phrase(keyword) for keyword in all_keywords if keyword and keyword.strip()]
    if not terms:
        raise ValueError("No search keywords configured")
    query = f"({' OR '.join(terms)}) AND bias"

    start_date = "2020/01/01"
    end_date = "2024/12/31"

    search_results = search_pubmed(query, start_date, end_date)
    # An error payload may lack these keys; treat that as "no results".
    pmids = search_results.get('esearchresult', {}).get('idlist', [])

    with open('fer_papers.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['PMID', 'Title', 'Abstract', 'PDF Link'])

        for index, pmid in enumerate(pmids):
            if index:
                time.sleep(1)  # Be respectful to the API
            try:
                title, abstract, pdf_link = fetch_paper_details(pmid)
            except requests.RequestException as exc:
                # One flaky request should not discard the rest of the run.
                print(f"Skipped PMID: {pmid} ({exc})")
                continue
            csvwriter.writerow([pmid, title, abstract, pdf_link])
            print(f"Processed PMID: {pmid}")

if __name__ == "__main__":
    main()

Processed PMID: 39394531
Processed PMID: 39394414
Processed PMID: 39393278
Processed PMID: 39392855
Processed PMID: 39392114
Processed PMID: 39391725
Processed PMID: 39391427
Processed PMID: 39390567
Processed PMID: 39390240
Processed PMID: 39388221
Processed PMID: 39388401
Processed PMID: 39386055
Processed PMID: 39385758
Processed PMID: 39385116
Processed PMID: 39384606
Processed PMID: 39384092
Processed PMID: 39382792
Processed PMID: 39381788
Processed PMID: 39381374
Processed PMID: 39381037
Processed PMID: 39381030
Processed PMID: 39380003


KeyboardInterrupt: 

### Second step: resolve each DOI and record availability in `fer_papers_availability.csv`

In [4]:
import requests
import csv
from urllib.parse import urlparse

def check_paper_availability(doi, timeout=30):
    """Resolve a DOI through doi.org and classify where it lands.

    Args:
        doi: DOI string such as ``"10.1000/xyz"``. An empty value or the
            ``"N/A"`` placeholder is treated as "no DOI" and is not looked up.
        timeout: Seconds to wait for the resolver/publisher before giving up.

    Returns:
        A ``(status, final_url)`` tuple. ``status`` is a human-readable label;
        ``final_url`` is the URL after redirects, or ``None`` when the DOI is
        missing, not found, answered with an unexpected status, or the request
        failed.
    """
    # Do not spend a network round-trip on a placeholder that cannot resolve.
    if not doi or doi.strip().upper() == "N/A":
        return "No DOI available", None

    base_url = f"https://doi.org/{doi}"
    headers = {
        # Browser-like User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(
            base_url, headers=headers, allow_redirects=True, timeout=timeout
        )
    except requests.RequestException as e:
        # Timeouts, DNS failures, too many redirects, ... are reported as a
        # status string so one bad DOI does not stop a batch run.
        return f"Error: {str(e)}", None

    if response.status_code == 404:
        return "Not found", None
    if response.status_code != 200:
        return f"Unexpected status code: {response.status_code}", None

    final_url = response.url
    # Compare case-insensitively; the URL itself is returned unchanged.
    lowered = final_url.lower()
    if 'pdf' in lowered:
        return "Direct PDF available", final_url
    if 'sciencedirect.com' in lowered:
        return "ScienceDirect (may require access)", final_url
    if 'pubmed.ncbi.nlm.nih.gov' in lowered:
        return "PubMed (may have free full text)", final_url
    if 'springer.com' in lowered:
        return "Springer (may require access)", final_url
    return "Available, but may require access", final_url

def _doi_from_link(link):
    """Extract the DOI from a ``https://doi.org/<doi>`` link; ``None`` if absent."""
    link = (link or "").strip()
    if not link or link.upper() == "N/A":
        return None
    lowered = link.lower()
    for prefix in ("https://doi.org/", "http://doi.org/",
                   "https://dx.doi.org/", "http://dx.doi.org/"):
        if lowered.startswith(prefix):
            # Slice the prefix off instead of URL-parsing: DOIs may contain
            # ';', '?' or '#', which urlparse would split off the path.
            return link[len(prefix):].strip('/') or None
    # Anything else (e.g. a bare DOI): keep the previous path-based extraction.
    return urlparse(link).path.strip('/') or None


def main():
    """Annotate every paper in ``fer_papers.csv`` with its availability.

    Reads ``fer_papers.csv``, takes the DOI from each row's ``PDF Link``
    column, resolves it with ``check_paper_availability`` and writes all
    original columns plus ``Availability Status`` and ``Final URL`` to
    ``fer_papers_availability.csv``. Rows without a DOI are written with the
    status ``"No DOI available"`` and are not looked up.
    """
    with open('fer_papers.csv', 'r', newline='', encoding='utf-8') as infile, \
         open('fer_papers_availability.csv', 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        # fieldnames is None for an empty input file.
        fieldnames = list(reader.fieldnames or []) + ['Availability Status', 'Final URL']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            doi = _doi_from_link(row.get('PDF Link'))
            if doi is None:
                status, final_url = "No DOI available", None
            else:
                status, final_url = check_paper_availability(doi)
            row['Availability Status'] = status
            row['Final URL'] = final_url
            writer.writerow(row)
            print(f"Processed DOI: {doi if doi is not None else 'N/A'}")

if __name__ == "__main__":
    main()

Processed DOI: 10.1007/s10143-024-03028-1
Processed DOI: 10.1038/s41598-024-75295-3
Processed DOI: 10.1016/j.jflm.2024.102766
Processed DOI: 10.1371/journal.pone.0312121
Processed DOI: 10.1002/14651858.CD000143.pub2
Processed DOI: 10.1016/j.isci.2024.110933
Processed DOI: 10.7759/cureus.69115
Processed DOI: 10.1186/s12903-024-05006-x
Processed DOI: N/A
Processed DOI: 10.7554/eLife.95160
Processed DOI: 10.1371/journal.pone.0309172
Processed DOI: N/A
Processed DOI: 10.1002/pld3.70005
Processed DOI: 10.1186/s12885-024-13027-6
Processed DOI: N/A
Processed DOI: 10.1016/j.pdpdt.2024.104361
Processed DOI: 10.1007/s40122-024-00657-8
Processed DOI: 10.34172/jlms.2024.41
Processed DOI: N/A
Processed DOI: 10.3389/fonc.2024.1381217
Processed DOI: 10.3389/fnume.2024.1379647
Processed DOI: 10.1186/s12967-024-05701-x
