# **Python script to download latest DOME Registry contents, related full text papers & provide DOME Registry entries metadata read out (20241220)**
1. DOME Registry contents will be downloaded by API call providing the json file of DOME Registry data
2. DOME Registry data json will be flattened and converted into TSV for working with entries data (row based data)
3. DOME Registry TSV will be checked and used to produce a metadata readout file (+ graphs)
4. DOME Registry DOIs of articles will be converted to PMCIDs and Europe PMC IDs for full text retrieval 
5. DOME Registry entries will be downloaded as full text PDF files using EPMC API
6. DOME Registry supplementary files will be downloaded using EPMC API
7. DOME Registry title and abstracts enriched in TSV from EPMC to support data analysis
8. Metadata and graphs produced on available DOME Registry articles retrieval

#### To do: dockerise & put into simple run script vs jupyter notebook


## 1. Latest DOME Registry contents will be downloaded by DOME Registry API call providing the .json file of DOME Registry data for the given day 

In [4]:
# 1. Use the DOME API to download all entries of the DOME Registry and store them in a json file 
import os
from datetime import datetime
import requests

# Registry search endpoint used to fetch the public entries as JSON.
# NOTE(review): the query string asks for `limit=250` entries in a single call -
# confirm the registry holds no more than that, or add paging.
url = "https://registry.dome-ml.org/api/review?skip=0&limit=250&text=%20&public=true&sort=publication.year&asc=true"

# Folder that stores every dated JSON snapshot
json_folder_path = "DOME_Registry_JSON_Files"
if not os.path.exists(json_folder_path):
    os.makedirs(json_folder_path)
    print('Created folder for storing DOME Registry JSON files')
else:
    print('Folder already exists for storing DOME Registry JSON files, files will be stored here')

# One snapshot per calendar day: the ISO date is part of the file name.
# These names are set up front so later cells can rely on them even when the
# download fails.
current_date = datetime.now().strftime('%Y-%m-%d')
file_name = f"DOME_Registry_Contents_{current_date}.json"
json_file_path = os.path.join(json_folder_path, file_name)

# Stays None when no request is needed (today's file is already on disk)
response = None

if os.path.exists(json_file_path):
    # Check the disk first so an existing snapshot never costs a network call
    print(f"File already exists for today's date: {json_file_path}")
    print('Skipping download. Delete the file manually if you want to re-download.')
    print('Continuing with existing file...')
    print(f"Using file: {json_file_path}")
else:
    print('Downloading new file...')
    try:
        # A timeout keeps the notebook from hanging on an unresponsive server
        response = requests.get(url, headers={'accept': '*/*'}, timeout=60)
    except requests.RequestException as error:
        print(f"Failed to retrieve the data. Request error: {error}")
        # Set json_file_path to None to prevent errors in subsequent cells
        json_file_path = None
    else:
        if response.status_code == 200:
            # Save the content to a file
            with open(json_file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"DOME Registry data downloaded and saved to '{json_file_path}'")
            print(f"Using file: {json_file_path}")
        else:
            print(f"Failed to retrieve the data. Status code: {response.status_code}")
            # Set json_file_path to None to prevent errors in subsequent cells
            json_file_path = None

print("Block 1 complete.")

Folder already exists for storing DOME Registry JSON files, files will be stored here
Downloading new file...
DOME Registry data downloaded and saved to 'DOME_Registry_JSON_Files/DOME_Registry_Contents_2025-11-20.json'
Using file: DOME_Registry_JSON_Files/DOME_Registry_Contents_2025-11-20.json
Block 1 complete.


## 2. DOME Registry data .json file will be flattened and converted into TSV for easier working with entries data (row and column based data format)

In [5]:
# 2. Produce DOME Registry contents metadata .tsv file and data visualisation
import json

# 2.1 Read the DOME Registry contents JSON file to confirm it parses as expected
# Pass verbose=True to the function to also pretty-print the contents when debugging

# Function to read in and (optionally) pretty-print the JSON DOME Registry file
def pretty_print_json(file_name, *, verbose=False):
    """Check that a JSON file can be read, optionally pretty-printing it.

    Args:
        file_name: Path of the JSON file to open.
        verbose: When True, also print the parsed content indented by four
            spaces. Defaults to False, which only prints the success message.

    Returns:
        None. A read or parse failure is reported with a printed message
        rather than raised.
    """
    try:
        # Open and read the JSON file
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)

        print('Successfully read in JSON file.')
        if verbose:
            print(json.dumps(data, indent=4))

    # OSError: missing/unreadable file; TypeError: file_name is None;
    # ValueError: invalid JSON or undecodable bytes
    except (OSError, TypeError, ValueError) as e:
        print(f"Error reading the JSON file: {e}")

# Check that the downloaded JSON file can be opened and parsed
pretty_print_json(json_file_path)


# 2.2 Flatten the JSON for easier data processing and write to a new .json file 
# Function to read JSON data
def read_json(file_name):
    """Load and return the parsed content of the JSON file at *file_name*.

    Any failure (missing file, invalid JSON, ...) is printed and reported to
    the caller as a None return value.
    """
    try:
        with open(file_name, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        print(f"Error reading the JSON file: {err}")
        return None
    return parsed

# Function to flatten JSON
def flatten_json(y):
    """Flatten nested dicts/lists into a single-level dict.

    Nested keys are joined with '_' and list items contribute their position,
    e.g. {"a": {"b": 1}, "c": [10]} becomes {"a_b": 1, "c_0": 10}.

    Args:
        y: The parsed JSON value to flatten (normally one registry entry dict).

    Returns:
        dict mapping each flattened key to its scalar value. Empty dicts and
        empty lists contribute no key; a bare scalar passed as *y* is stored
        under the empty-string key.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (not `type(x) is ...`) so dict/list subclasses are walked too
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, name + key + '_')
        elif isinstance(x, list):
            for position, item in enumerate(x):
                flatten(item, name + str(position) + '_')
        else:
            # Drop the trailing '_' added by the enclosing level
            out[name[:-1]] = x

    flatten(y)
    return out

# Function to save flattened JSON to a file
def save_flattened_json(flattened_data, output_file_name):
    """Write *flattened_data* to *output_file_name* as JSON indented by four spaces.

    Prints a confirmation on success, or the error message if the file could
    not be written; nothing is returned or raised.
    """
    try:
        with open(output_file_name, 'w', encoding='utf-8') as sink:
            json.dump(flattened_data, sink, indent=4)
    except Exception as err:
        print(f"Error saving the flattened JSON file: {err}")
    else:
        print(f"Flattened JSON data saved to '{output_file_name}'")

# Load the raw registry download
data = read_json(json_file_path)

if not data:
    print("No data to process.")
else:
    # One flat dict per registry entry
    flattened_data = list(map(flatten_json, data))
    flattened_file_name = f"flattened_{file_name}"
    json_folder_path = "DOME_Registry_JSON_Files"
    # From here on json_file_path points at the flattened file, not the raw download
    json_file_path = os.path.join(json_folder_path, flattened_file_name)
    save_flattened_json(flattened_data, json_file_path)

# 2.3 Convert the flattened JSON to TSV
import json
import csv
import os

# Function to read flattened JSON data
def read_flattened_json(file_name):
    """Load and return the flattened JSON stored at *file_name*.

    Any failure is printed and reported to the caller as a None return value.
    """
    try:
        with open(file_name, 'r', encoding='utf-8') as handle:
            flattened = json.load(handle)
    except Exception as err:
        print(f"Error reading the flattened JSON file: {err}")
        return None
    return flattened

# Make sure the folder for TSV output exists
if os.path.exists('DOME_Registry_TSV_Files'):
    print('Folder already exists for storing DOME Registry TSV files')
else:
    print('Creating folder to store DOME Registry TSV files')
    os.makedirs('DOME_Registry_TSV_Files')

# Function to write JSON data to a TSV file
def write_json_to_tsv(json_data, tsv_file_name):
    """Write a list of flat dicts to a tab-separated file.

    The header is the union of all keys across the entries, in the order each
    key is first seen, so the column order is the same on every run. Entries
    that lack a key get an empty cell in that column.

    Args:
        json_data: Iterable of dicts (one per row); it is iterated twice.
        tsv_file_name: Path of the TSV file to create or overwrite.

    Returns:
        None. Failures are reported with a printed message rather than raised.
    """
    try:
        # dict.fromkeys de-duplicates while keeping first-seen order; a plain
        # set would make the column order vary between runs
        headers = list(dict.fromkeys(key for entry in json_data for key in entry))

        # Write data to TSV file
        with open(tsv_file_name, 'w', newline='', encoding='utf-8') as tsvfile:
            writer = csv.DictWriter(tsvfile, fieldnames=headers, delimiter='\t')
            writer.writeheader()
            writer.writerows(json_data)

        print(f"JSON data written to '{tsv_file_name}'")
    except Exception as e:
        print(f"Error writing to the TSV file: {e}")

# Load the flattened entries back from disk
flattened_data = read_flattened_json(json_file_path)

# The TSV keeps the flattened file's name with ".json" swapped for ".tsv"
tsv_file_name = os.path.splitext(flattened_file_name)[0] + '.tsv'
tsv_file_path = os.path.join('DOME_Registry_TSV_Files', tsv_file_name)

# Convert to TSV only when there is something to write
if not flattened_data:
    print("No data to process.")
else:
    write_json_to_tsv(flattened_data, tsv_file_path)

Successfully read in JSON file.
Flattened JSON data saved to 'DOME_Registry_JSON_Files/flattened_DOME_Registry_Contents_2025-11-20.json'
Folder already exists for storing DOME Registry TSV files
JSON data written to 'DOME_Registry_TSV_Files/flattened_DOME_Registry_Contents_2025-11-20.tsv'


## 3. DOME Registry TSV data file will be formatted with shortid as the row index and other fields cleaned (publication data) and ordered by D O M E fields 

In [7]:
#3 Reorder TSV using pandas data frame 
import pandas as pd

# Read the TSV file as a DataFrame using pandas
df = pd.read_csv(tsv_file_path, sep='\t')

# Column-name prefixes used to group the flattened fields
prefix_publications_cols = 'publication_'
prefix_tags_cols = 'publication_tags_'
prefix_data_cols = 'matches_data'
prefix_optimization_cols = 'matches_optimization'
prefix_model_cols = 'matches_model'
prefix_evaluation_cols = 'matches_evaluation'

# Separate columns based on whether they start with the prefix
publication_columns = [col for col in df.columns if col.startswith(prefix_publications_cols) and not col.startswith(prefix_tags_cols)]
publication_tags_columns = [col for col in df.columns if col.startswith(prefix_tags_cols)]
# Sort tags columns numerically (e.g., publication_tags_0, publication_tags_1, ...);
# a non-numeric suffix sorts as 0
publication_tags_columns = sorted(publication_tags_columns, key=lambda x: int(x.split('_')[-1]) if x.split('_')[-1].isdigit() else 0)
matches_data_columns = [col for col in df.columns if col.startswith(prefix_data_cols)]
matches_optimization_columns = [col for col in df.columns if col.startswith(prefix_optimization_cols)]
matches_model_columns = [col for col in df.columns if col.startswith(prefix_model_cols)]
matches_evaluation_columns = [col for col in df.columns if col.startswith(prefix_evaluation_cols)]
other_columns = [col for col in df.columns if not col.startswith('matches_') and not col.startswith('publication_')]

# Reorder columns: general fields, publication fields, tags, then the D, O, M, E groups
reordered_columns = (other_columns + publication_columns + publication_tags_columns + matches_data_columns +
                     matches_optimization_columns + matches_model_columns + matches_evaluation_columns)

# Keep any column that matched none of the groups above (for example a
# 'matches_' field outside the four listed prefixes) by appending it at the
# end, rather than silently dropping it from the saved TSV
already_placed = set(reordered_columns)
unplaced_columns = [col for col in df.columns if col not in already_placed]
reordered_columns = reordered_columns + unplaced_columns

# Use shortid as the row index and overwrite the TSV in the new column order
df = df[reordered_columns].set_index('shortid')
df.to_csv(tsv_file_path, sep='\t', index=True, encoding='utf-8')

print(f"Reordered TSV data saved to '{tsv_file_path}'")

Reordered TSV data saved to 'DOME_Registry_TSV_Files/flattened_DOME_Registry_Contents_2025-11-20.tsv'


## 4. Add PMCID, PMID and Europe PMC ID columns to the DOME Registry TSV by looking up each DOI with the NCBI PMC ID Converter API

In [None]:
# 4. From DOIs get PMCIDs and Europe PMC IDs for full text search
import pandas as pd
import requests
import re

# Load the DOME entries TSV into a pandas DataFrame
df = pd.read_csv(tsv_file_path, sep='\t')

# Collect the distinct, non-missing DOI values
doi_column = df['publication_doi']
dois = doi_column.dropna().unique()

# Function to clean and normalize DOI strings
def clean_doi(doi_string):
    """
    Clean DOI string by removing common prefixes and URLs.
    Handles formats like:
    - https://doi.org/10.1038/nature123
    - http://dx.doi.org/10.1016/j.cell.2020
    - doi:10.1126/science.abc456
    - 10.1002/anie.202100001

    Args:
        doi_string: Raw DOI value; may be missing (None/NaN) or a non-string.

    Returns:
        The clean DOI, e.g. 10.1038/nature123, or None when the value is
        missing or contains nothing but whitespace/prefix. A value that does
        not start with '10.' is still returned, after a printed warning.
    """
    if pd.isna(doi_string):
        return None

    # Convert to string and strip whitespace
    doi_string = str(doi_string).strip()

    # Remove the resolver URL prefix (doi.org, dx.doi.org or www.doi.org)
    doi_string = re.sub(r'^https?://(?:dx\.|www\.)?doi\.org/', '', doi_string, flags=re.IGNORECASE)

    # Remove 'doi:' prefix, then any whitespace left around the DOI
    doi_string = re.sub(r'^doi:\s*', '', doi_string, flags=re.IGNORECASE).strip()

    # A blank value is treated as missing, so callers never look up an empty DOI
    if not doi_string:
        return None

    # Validate that it starts with '10.' (all DOIs start with 10.)
    if not doi_string.startswith('10.'):
        print(f"Warning: Potentially invalid DOI format: {doi_string}")

    return doi_string

# Clean the DOIs. Each DOI is cleaned exactly once (so a warning is printed at
# most once per value); missing/blank results are dropped, and values that
# become identical after cleaning are looked up only once.
cleaned_dois = (clean_doi(doi) for doi in dois)
dois = list(dict.fromkeys(doi for doi in cleaned_dois if doi))

print(f"Cleaned {len(dois)} DOIs for processing")

# Map DOIs to PMCIDs and Europe PMC IDs using the NCBI PMC ID Converter API
def map_dois_to_ids(dois, batch_size=1):
    """Look up PMCID / PMID for each DOI via the NCBI PMC ID Converter.

    Args:
        dois: Sequence of clean DOI strings (no URL or 'doi:' prefix).
        batch_size: Number of DOIs sent per request, comma-separated.

    Returns:
        dict mapping a DOI to {'pmcid', 'europepmc_id', 'pmid'}. Each record
        is stored under the DOI spelling returned by the service and, when it
        matches case-insensitively, under the spelling that was requested.
        DOIs in a batch whose request failed map to all-None values. DOIs the
        service returned no record for are absent from the dict.
    """
    api_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    id_mapping = {}

    for i in range(0, len(dois), batch_size):
        batch = dois[i:i + batch_size]
        # Passing the IDs through `params` URL-encodes DOI characters such as
        # '<', '>', '#' or ';' that would otherwise corrupt the query string
        params = {
            'tool': 'my_tool',
            'email': 'my_email@example.com',
            'ids': ','.join(batch),
            'format': 'json',
        }

        records = None
        try:
            response = requests.get(api_url, params=params, timeout=30)
            if response.status_code == 200:
                records = response.json().get('records', [])
        except (requests.RequestException, ValueError) as error:
            # One failed lookup must not abort the remaining batches
            print(f"Warning: ID lookup failed for {list(batch)}: {error}")

        if records is None:
            for doi in batch:
                id_mapping[doi] = {'pmcid': None, 'europepmc_id': None, 'pmid': None}
            continue

        # Lower-cased requested DOI -> spelling used in the request
        requested = {str(doi).lower(): doi for doi in batch}

        for record in records:
            pmcid = record.get('pmcid')
            pmid = record.get('pmid')
            ids = {
                'pmcid': pmcid,
                # The PMCID (kept with its 'PMC' prefix) is used as the Europe PMC ID;
                # without one, fall back to 'MED/<pmid>'
                'europepmc_id': pmcid if pmcid else (f"MED/{pmid}" if pmid else None),
                'pmid': pmid,
            }

            returned_doi = record.get('doi')
            if returned_doi:
                id_mapping[returned_doi] = ids
                requested_doi = requested.get(str(returned_doi).lower())
                if requested_doi is not None:
                    id_mapping[requested_doi] = ids
            elif len(batch) == 1:
                # A record without a DOI can only belong to the single DOI asked
                # for; never store it under a None key
                id_mapping[batch[0]] = ids

    return id_mapping

# Map DOIs to PMCIDs and Europe PMC IDs
doi_to_id_mapping = map_dois_to_ids(dois)

# DOI names are case-insensitive, so index the results by lower-cased DOI
ids_by_doi = {str(doi).lower(): ids for doi, ids in doi_to_id_mapping.items() if doi}

# Normalise every row's DOI with clean_doi - the same function used to build the
# lookup list - so rows written as 'http://dx.doi.org/...', 'doi:...' or with
# surrounding whitespace still find their IDs
row_doi_keys = [
    cleaned.lower() if cleaned else None
    for cleaned in (clean_doi(raw_doi) for raw_doi in df['publication_doi'])
]

# Add the mapped IDs to the DataFrame (rows without a match get None)
for id_field in ('pmcid', 'europepmc_id', 'pmid'):
    df[f'mapped_{id_field}'] = [ids_by_doi.get(key, {}).get(id_field) for key in row_doi_keys]

# Save the updated DataFrame to a new TSV file
output_tsv_file_name = f'DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_{current_date}.tsv'
df.to_csv(output_tsv_file_name, sep='\t', index=False)
print(f"Updated DataFrame with mapped PMCIDs and Europe PMC IDs saved to '{output_tsv_file_name}'")

Updated DataFrame with mapped PMCIDs and Europe PMC IDs saved to 'DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_2025-11-16.tsv'


## 5. Use EPMC API to download full text PDFs of all DOME Registry entries and store in folder named DOME_Registry_PMC_PDFs

In [8]:
# 5. Download full text PDFs using PMCIDs from Europe PMC
# Note: Europe PMC does not directly provide PDFs through REST API - we need to use alternative methods
 
import pandas as pd
import requests
import os
import time

# Load the DOME entries TSV (with mapped IDs) into a pandas DataFrame
df = pd.read_csv(output_tsv_file_name, sep='\t')

# Distinct PMCIDs that have a value
pmcids = df['mapped_pmcid'].dropna().unique()

# Folder that receives the PDF files
output_folder = 'DOME_Registry_PMC_PDFs'
os.makedirs(output_folder, exist_ok=True)

# Work out which PMCIDs still need a main PDF (ones already on disk are skipped)
to_download_pmcid = []
for pmcid in pmcids:
    already_present = os.path.exists(f'{output_folder}/{pmcid}_main.pdf')
    if not already_present:
        print(f"PDF for PMCID {pmcid} not yet downloaded.")
        to_download_pmcid.append(pmcid)
        continue
    print(f"PDF for PMCID {pmcid} already downloaded.")

print(f"\nNeed to download {len(to_download_pmcid)} PDFs out of {len(pmcids)} total entries.\n")

# Function to download full text PDF and supplementary materials
def _fetch_publisher_pdf(pmcid, pdf_path):
    """Try the full-text URLs listed in the Europe PMC search result; return True if a PDF was saved."""
    metadata_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=PMCID:{pmcid}&resultType=core&format=json"
    metadata_response = requests.get(metadata_url, timeout=30)
    if metadata_response.status_code != 200:
        return False

    metadata = metadata_response.json()
    if not metadata.get('hitCount', 0) > 0:
        return False

    result = metadata['resultList']['result'][0]
    url_list = result.get('fullTextUrlList') or {}

    for url_info in url_list.get('fullTextUrl', []):
        if url_info.get('documentStyle') != 'pdf' and url_info.get('availabilityCode') != 'OA':
            continue
        pdf_url = url_info.get('url')
        if not pdf_url or '.pdf' not in pdf_url.lower():
            continue

        try:
            pdf_response = requests.get(pdf_url, timeout=30, allow_redirects=True)
        except requests.RequestException:
            # One unreachable link should not stop the remaining links being tried
            continue

        if pdf_response.status_code == 200 and pdf_response.headers.get('Content-Type', '').startswith('application/pdf'):
            with open(pdf_path, 'wb') as file:
                file.write(pdf_response.content)
            print(f"  ✓ Downloaded main PDF from publisher")
            return True

    return False


def _fetch_epmc_rendered_pdf(pmcid, pdf_path):
    """Try the europepmc.org '?pdf=render' URL; return True if a PDF was saved."""
    pmc_oa_url = f"https://europepmc.org/articles/{pmcid}?pdf=render"
    pmc_response = requests.get(pmc_oa_url, timeout=30, allow_redirects=True)

    if pmc_response.status_code != 200 or len(pmc_response.content) <= 1000:
        print(f"  ✗ Could not retrieve PDF (status: {pmc_response.status_code})")
        return False

    # Check if it's actually a PDF (an HTML page can also come back with status 200)
    if pmc_response.content[:4] != b'%PDF':
        print(f"  ✗ Could not retrieve PDF (not openly available)")
        return False

    with open(pdf_path, 'wb') as file:
        file.write(pmc_response.content)
    print(f"  ✓ Downloaded main PDF from PMC OA service")
    return True


def _guess_supp_extension(file_url, content_type):
    """Choose a file extension from the URL (substring match) or, failing that, the Content-Type."""
    url_lower = file_url.lower()
    # Checked in this order; legacy '.xls' / '.doc' links are stored as '.xlsx' / '.docx'
    url_rules = (
        (('.pdf',), '.pdf'),
        (('.xlsx', '.xls'), '.xlsx'),
        (('.docx', '.doc'), '.docx'),
        (('.zip',), '.zip'),
    )
    for needles, extension in url_rules:
        if any(needle in url_lower for needle in needles):
            return extension

    if 'pdf' in content_type:
        return '.pdf'
    if 'excel' in content_type or 'spreadsheet' in content_type:
        return '.xlsx'
    return '.dat'  # Default extension


def _download_supplementary_files(pmcid):
    """Best-effort download of supplementary files for *pmcid*; return the number of files saved."""
    supp_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/supplementaryFiles"
    try:
        supp_response = requests.get(supp_url, timeout=30)
    except Exception as e:
        # Supplementary files are not critical: report and carry on
        print(f"  ⚠ Could not query supplementary files: {str(e)}")
        return 0

    if supp_response.status_code != 200:
        return 0

    # NOTE(review): the Europe PMC REST documentation appears to describe this
    # endpoint as returning a ZIP archive rather than JSON - confirm. A body that
    # starts with the ZIP signature is therefore saved as-is.
    if supp_response.content[:4] == b'PK\x03\x04':
        archive_path = os.path.join(output_folder, f"{pmcid}_supp.zip")
        try:
            with open(archive_path, 'wb') as file:
                file.write(supp_response.content)
        except OSError as e:
            print(f"  ⚠ Could not save supplementary archive: {str(e)}")
            return 0
        print(f"  ✓ Downloaded supplementary files archive")
        return 1

    try:
        supp_data = supp_response.json()
    except ValueError:
        return 0  # No supplementary files available
    if not isinstance(supp_data, dict):
        return 0

    saved = 0
    for idx_supp, supp_file in enumerate(supp_data.get('supplementaryFiles') or [], 1):
        file_url = supp_file.get('url')
        if not file_url:
            continue

        # Download all supplementary files (not just PDFs)
        try:
            file_response = requests.get(file_url, timeout=30, allow_redirects=True)
            if file_response.status_code != 200:
                continue

            file_ext = _guess_supp_extension(file_url, file_response.headers.get('Content-Type', ''))
            supp_output_file = os.path.join(output_folder, f"{pmcid}_supp_{idx_supp}{file_ext}")
            with open(supp_output_file, 'wb') as file:
                file.write(file_response.content)
            print(f"  ✓ Downloaded supplementary file {idx_supp}{file_ext}")
            saved += 1
        except Exception as e_supp:
            print(f"  ⚠ Could not download supplementary file {idx_supp}: {str(e_supp)}")

    return saved


def download_pdfs(pmcids):
    """
    Download main PDFs and supplementary files for the given PMCIDs.

    For each PMCID the main PDF is looked for in two places, in order:
    1. a PDF link from the article metadata (Europe PMC search result)
    2. the europepmc.org '?pdf=render' URL
    The second source is tried whenever the first did not produce a file,
    including when the first raised an error. Supplementary files are then
    downloaded on a best-effort basis.

    Files are written to the module-level `output_folder` as
    '<pmcid>_main.pdf' and '<pmcid>_supp_*'.

    Args:
        pmcids: Iterable with a length, holding PMCID strings such as 'PMC1421439'.

    Returns:
        None. Progress and a final summary are printed; every PMCID is counted
        exactly once as either downloaded or failed.
    """
    success_count = 0
    fail_count = 0
    supp_count = 0

    for idx, pmcid in enumerate(pmcids, 1):
        print(f"[{idx}/{len(pmcids)}] Processing {pmcid}...")

        pdf_path = os.path.join(output_folder, f"{pmcid}_main.pdf")
        downloaded = False

        for fetch in (_fetch_publisher_pdf, _fetch_epmc_rendered_pdf):
            try:
                if fetch(pmcid, pdf_path):
                    downloaded = True
                    break
            except Exception as e:
                # Report and fall through to the next source
                print(f"  ✗ Error downloading main PDF: {str(e)}")

        if downloaded:
            success_count += 1
        else:
            fail_count += 1

        # Try to download supplementary files
        supp_count += _download_supplementary_files(pmcid)

        # Rate limiting - be respectful to the API
        time.sleep(0.5)

    print(f"\n{'='*60}")
    print(f"DOWNLOAD SUMMARY")
    print(f"{'='*60}")
    print(f"Main PDFs successfully downloaded: {success_count}")
    print(f"Main PDFs failed/not available: {fail_count}")
    print(f"Supplementary files downloaded: {supp_count}")
    print(f"{'='*60}\n")

# Fetch PDFs only for the PMCIDs that are not on disk yet
if not to_download_pmcid:
    print("All PDFs already downloaded. Skipping download step.")
else:
    download_pdfs(to_download_pmcid)

# Record, per row, whether a main PDF now exists in the output folder
print("Updating TSV with PDF download status...")
pdf_downloadable = [
    'yes' if pd.notna(row_pmcid) and os.path.exists(f'{output_folder}/{row_pmcid}_main.pdf') else 'no'
    for row_pmcid in df['mapped_pmcid']
]

# Store the status column and write the TSV back to the same file
df['pdf_downloadable'] = pdf_downloadable
df.to_csv(output_tsv_file_name, sep='\t', index=False)
print(f"✓ Updated TSV with PDF download status saved to '{output_tsv_file_name}'")

print("\nBlock 5 complete.")

PDF for PMCID PMC1421439 not yet downloaded.
PDF for PMCID PMC1892091 not yet downloaded.
PDF for PMCID PMC2213690 not yet downloaded.
PDF for PMCID PMC1847686 not yet downloaded.
PDF for PMCID PMC2561051 not yet downloaded.
PDF for PMCID PMC2275242 not yet downloaded.
PDF for PMCID PMC2665034 not yet downloaded.
PDF for PMCID PMC2638158 not yet downloaded.
PDF for PMCID PMC2752621 not yet downloaded.
PDF for PMCID PMC2660303 not yet downloaded.
PDF for PMCID PMC3169429 not yet downloaded.
PDF for PMCID PMC3009519 not yet downloaded.
PDF for PMCID PMC3340366 not yet downloaded.
PDF for PMCID PMC3292016 not yet downloaded.
PDF for PMCID PMC3396452 not yet downloaded.
PDF for PMCID PMC3542245 not yet downloaded.
PDF for PMCID PMC3912131 not yet downloaded.
PDF for PMCID PMC4058174 not yet downloaded.
PDF for PMCID PMC3967921 not yet downloaded.
PDF for PMCID PMC4289375 not yet downloaded.
PDF for PMCID PMC4507953 not yet downloaded.
PDF for PMCID PMC4315436 not yet downloaded.
PDF for PM

## 6. Download supplementary files (PDF and DOC files) from the NCBI PMC FTP server

In [2]:
# 6. Download supplementary files (PDF and DOC only) using PMID from NCBI PubMed FTP
 
import pandas as pd
import os
from ftplib import FTP
import time

# Read in DOME Entries TSV as dataframe
# (output_tsv_file_name is assigned in step 4, so that cell must have been run first)
df = pd.read_csv(output_tsv_file_name, sep='\t')

# Extract PMIDs from the DataFrame
if 'mapped_pmid' in df.columns:
    # Convert each PMID to a plain digit string (e.g. 12345.0 -> '12345'); missing stays None
    df['mapped_pmid'] = df['mapped_pmid'].apply(lambda x: str(int(x)) if pd.notna(x) else None)
    pmids = df['mapped_pmid'].dropna().unique()
    print(f"Found {len(pmids)} PMIDs to process for supplementary files")
else:
    print("Error: 'mapped_pmid' column not found in TSV.")
    pmids = []

# Define the output folder for supplementary files
supp_output_folder = 'DOME_Registry_PMC_Supplementary'
os.makedirs(supp_output_folder, exist_ok=True)

# List the folder once instead of once per PMID
existing_supp_files = os.listdir(supp_output_folder)

# Track which PMIDs need downloading
to_download_pmids = []
for pmid in pmids:
    prefix = f"PMID{pmid}_supp_"
    existing_files = [f for f in existing_supp_files if f.startswith(prefix)]
    if existing_files:
        print(f"Supplementary files for PMID {pmid} already downloaded ({len(existing_files)} files).")
    else:
        to_download_pmids.append(pmid)

print(f"\nNeed to check {len(to_download_pmids)} PMIDs for supplementary files.\n")

# Function to download supplementary files from NCBI PubMed FTP
def _retrieve_ftp_file(ftp, remote_name, local_path):
    """Download *remote_name* from the current FTP directory to *local_path*, removing a partial file on failure."""
    try:
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary(f'RETR {remote_name}', local_file.write)
    except Exception:
        # A truncated file would later be counted as an already-downloaded supplement
        if os.path.exists(local_path):
            os.remove(local_path)
        raise


def download_supp_from_ftp(pmids):
    """
    Download supplementary PDF and DOC files from the NCBI FTP server.

    For each PMID the matching PMCID is read from the module-level `df`, and
    the directory /pub/pmc/manuscript/PMC<id>/ on ftp.ncbi.nlm.nih.gov is
    listed. Every .pdf/.doc/.docx file found there is saved into
    `supp_output_folder` as 'PMID<pmid>_supp_<n>_<filename>'.
    NOTE(review): an earlier description mentioned /pub/pmc/oa_package/ while
    the code uses /pub/pmc/manuscript/ - confirm which layout is intended.

    Args:
        pmids: Iterable with a length, holding PMID strings that match the
            values in df['mapped_pmid'].

    Returns:
        None. Progress and a final summary are printed; FTP errors are
        reported per PMID and do not stop the loop.
    """
    total_files_downloaded = 0
    entries_with_supp = 0
    entries_without_supp = 0

    ftp_host = 'ftp.ncbi.nlm.nih.gov'
    base_path = '/pub/pmc/manuscript/'

    for idx, pmid in enumerate(pmids, 1):
        print(f"[{idx}/{len(pmids)}] Processing PMID {pmid}...")

        try:
            # The context manager closes the connection even if login or a transfer fails
            with FTP(ftp_host, timeout=30) as ftp:
                ftp.login()  # Anonymous login

                try:
                    # Get PMCID for this PMID from our dataframe
                    pmcid = df[df['mapped_pmid'] == pmid]['mapped_pmcid'].iloc[0]

                    if pd.notna(pmcid):
                        pmcid_clean = pmcid.replace('PMC', '')
                        article_path = f"{base_path}PMC{pmcid_clean}/"

                        # Try to change to this directory
                        ftp.cwd(article_path)

                        # List all files in directory
                        files = ftp.nlst()

                        # Filter for PDF and DOC files only
                        supp_files = [f for f in files if f.lower().endswith(('.pdf', '.doc', '.docx'))]

                        if supp_files:
                            print(f"  Found {len(supp_files)} supplementary file(s)")

                            for file_idx, filename in enumerate(supp_files, 1):
                                # Only the base name goes into the local path, so a listing
                                # entry containing directories cannot escape the output folder
                                safe_name = os.path.basename(filename)
                                local_filename = os.path.join(supp_output_folder, f"PMID{pmid}_supp_{file_idx}_{safe_name}")

                                _retrieve_ftp_file(ftp, filename, local_filename)

                                file_size = os.path.getsize(local_filename)
                                print(f"  ✓ Downloaded: {filename} ({file_size:,} bytes)")
                                total_files_downloaded += 1

                            entries_with_supp += 1
                        else:
                            print(f"  ℹ No PDF/DOC supplementary files found")
                            entries_without_supp += 1
                    else:
                        print(f"  ✗ No PMCID found for PMID {pmid}")
                        entries_without_supp += 1

                except Exception as e:
                    print(f"  ℹ No supplementary files accessible via FTP: {str(e)}")
                    entries_without_supp += 1

        except Exception as e:
            print(f"  ✗ FTP connection error: {str(e)}")
            entries_without_supp += 1

        # Rate limiting
        time.sleep(0.5)

    print(f"\n{'='*60}")
    print(f"SUPPLEMENTARY FILES DOWNLOAD SUMMARY (FTP)")
    print(f"{'='*60}")
    print(f"Total supplementary files downloaded: {total_files_downloaded}")
    print(f"Entries with supplementary files: {entries_with_supp}")
    print(f"Entries without supplementary files: {entries_without_supp}")
    print(f"{'='*60}\n")

# Download supplementary files
if to_download_pmids:
    download_supp_from_ftp(to_download_pmids)
else:
    print("All supplementary files already checked/downloaded.")

# Update TSV with supplementary files information
if 'mapped_pmid' in df.columns:
    print("Updating TSV with supplementary files information...")

    # List the folder once (after any downloads) instead of once per row; sorting
    # makes the recorded file-name order the same on every run
    downloaded_supp_files = sorted(os.listdir(supp_output_folder))

    supp_file_count = []
    supp_file_names = []

    for pmid in df['mapped_pmid']:
        if pd.notna(pmid):
            prefix = f"PMID{pmid}_supp_"
            supp_files = [f for f in downloaded_supp_files if f.startswith(prefix)]
        else:
            supp_files = []

        supp_file_count.append(len(supp_files))
        # None (an empty cell in the TSV) when the entry has no supplementary files
        supp_file_names.append('; '.join(supp_files) if supp_files else None)

    df['supplementary_file_count'] = supp_file_count
    df['supplementary_file_names'] = supp_file_names

    df.to_csv(output_tsv_file_name, sep='\t', index=False)
    print(f"✓ Updated TSV saved to '{output_tsv_file_name}'")

    total_supp_files = sum(supp_file_count)
    entries_with_supp = sum(1 for count in supp_file_count if count > 0)

    print(f"\nSupplementary Files Statistics:")
    print(f"  Total supplementary files: {total_supp_files}")
    print(f"  Entries with supplementary files: {entries_with_supp}/{len(df)}")

print("\nBlock 6 complete.")

NameError: name 'output_tsv_file_name' is not defined

## 7. Enrich TSV with title and abstract data from Europe PMC for all DOME Registry entries

In [None]:
# 7. Enrich the TSV file with title and abstract columns from Europe PMC
 
import pandas as pd
import requests
import time

# Read in DOME Entries TSV as dataframe via pandas library functions
df = pd.read_csv(output_tsv_file_name, sep='\t')

# Check if title and abstract columns already exist
if 'article_title' in df.columns and 'article_abstract' in df.columns:
    print("Title and abstract columns already exist in TSV.")
    print("Checking for entries that need to be enriched...")
    # Count how many entries already have data
    existing_count = df['article_title'].notna().sum()
    print(f"{existing_count} out of {len(df)} entries already have title/abstract data.")
else:
    print("Adding new columns for title and abstract...")
    # Only create the column(s) that are actually missing, so that data in a
    # column that already exists is not overwritten with None.
    for column in ('article_title', 'article_abstract'):
        if column not in df.columns:
            df[column] = None

# A column that is entirely empty in the TSV is read back as float64 (all NaN).
# Cast to object so that writing strings into it later does not rely on
# pandas' deprecated silent upcasting.
for column in ('article_title', 'article_abstract'):
    df[column] = df[column].astype(object)
# Function to fetch article details from Europe PMC
def fetch_article_details(pmcid):
    """
    Fetch title and abstract for a given PMCID from Europe PMC.

    Queries the Europe PMC REST search endpoint with ``PMCID:<pmcid>`` and
    reads the first hit. Every failure mode (network error, non-200 status,
    unparsable or unexpected payload, no hits) is reported on stdout and
    results in ``(None, None)`` instead of an exception.

    Args:
        pmcid (str): PubMed Central ID

    Returns:
        tuple: (title, abstract) or (None, None) if not found. Either element
        may individually be None when the record lacks that field.
    """
    search_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    # Let requests build the query string so the PMCID value is URL-encoded
    query_params = {
        'query': f"PMCID:{pmcid}",
        'resultType': 'core',
        'format': 'json',
    }

    try:
        response = requests.get(search_url, params=query_params, timeout=30)
        if response.status_code != 200:
            # Previously a silent failure; surface the status for diagnosis
            print(f"  ✗ Europe PMC returned HTTP {response.status_code} for {pmcid}")
            return None, None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout problems; ValueError: invalid JSON body
        print(f"  ✗ Error fetching details: {str(e)}")
        return None, None

    try:
        if data.get('hitCount', 0) > 0:
            article = data['resultList']['result'][0]
            return article.get('title', None), article.get('abstractText', None)
    except (AttributeError, KeyError, IndexError, TypeError) as e:
        # The payload did not have the expected hitCount/resultList/result shape
        print(f"  ✗ Error fetching details: {str(e)}")

    return None, None

# Enrich every row that has a PMCID but is missing a title and/or an abstract
print("\nEnriching TSV with title and abstract data...")
success_count = fail_count = skip_count = 0

# Build the work queue first so progress can be reported as [n/total]
pending = []
for idx, row in df.iterrows():
    pmcid = row.get('mapped_pmcid')
    if pd.isna(pmcid):
        continue
    if pd.isna(row.get('article_title')) or pd.isna(row.get('article_abstract')):
        pending.append((idx, pmcid))
total_to_process = len(pending)

print(f"Need to enrich {total_to_process} entries with title/abstract data.\n")

# Every row that is not queued (no PMCID, or already enriched) counts as skipped
skip_count = len(df) - total_to_process

# Fetch details for each queued row
processed = 0
for processed, (idx, pmcid) in enumerate(pending, start=1):
    print(f"[{processed}/{total_to_process}] Processing {pmcid}...")

    title, abstract = fetch_article_details(pmcid)

    # Both values are required; a partial result is treated as a failure
    if not (title and abstract):
        print(f"  ✗ Failed to retrieve article details")
        fail_count += 1
    else:
        df.at[idx, 'article_title'] = title
        df.at[idx, 'article_abstract'] = abstract
        print(f"  ✓ Added title and abstract")
        success_count += 1

    # Rate limiting - pause between requests to be respectful to the API
    time.sleep(0.5)

separator = '=' * 60
print(f"\n{separator}")
print(f"TITLE/ABSTRACT ENRICHMENT SUMMARY")
print(f"{separator}")
print(f"Successfully enriched: {success_count}")
print(f"Failed/not available: {fail_count}")
print(f"Skipped (already had data or no PMCID): {skip_count}")
print(f"{separator}\n")

# Persist the enriched dataframe back to the same TSV
df.to_csv(output_tsv_file_name, sep='\t', index=False)
print(f"✓ Enriched TSV with title and abstract columns saved to '{output_tsv_file_name}'")

# Preview up to three enriched rows, truncating long text
print(f"\nSample of enriched data (first 3 rows with title/abstract):")
has_title = df['article_title'].notna()
sample_df = df.loc[has_title, ['mapped_pmcid', 'article_title', 'article_abstract']].head(3)
if sample_df.empty:
    print("No entries with title/abstract data found.")
else:
    for _, sample_row in sample_df.iterrows():
        sample_title = sample_row['article_title']
        sample_abstract = sample_row['article_abstract']

        print(f"\nPMCID: {sample_row['mapped_pmcid']}")

        if len(str(sample_title)) > 100:
            print(f"Title: {sample_title[:100]}...")
        else:
            print(f"Title: {sample_title}")

        if len(str(sample_abstract)) > 150:
            print(f"Abstract: {str(sample_abstract)[:150]}...")
        else:
            print(f"Abstract: {sample_abstract}")

print("\nBlock 7 complete.")

## 8. Generate metadata from the DOME entries TSV (availability of full-text PDF files, supplementary files and title/abstract data, total entries, etc.) plus graphs of data validity vs. expected inputs

In [11]:

# 8. Create a metadata readout file as a TSV, a corresponding text file to explain its contents,
#    and graphs visualising the data validation

#import libraries
import csv
import re
import os
import pandas as pd
import matplotlib.pyplot as plt

# Read in the TSV file
df = pd.read_csv(output_tsv_file_name, sep='\t')

# Fail fast with a clear message if an earlier block has not added its columns,
# instead of a bare KeyError naming a single column part-way through.
required_columns = ['pdf_downloadable', 'article_title', 'article_abstract', 'supplementary_file_count']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"'{output_tsv_file_name}' is missing column(s) {missing_columns}; "
        "re-run the earlier blocks that add them before generating metadata."
    )


def _percent(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


# Calculate metadata
total_entries = len(df)
pdf_counts = df['pdf_downloadable'].value_counts()
pdf_yes = pdf_counts.get('yes', 0)
pdf_no = pdf_counts.get('no', 0)

# Check for title/abstract data availability (both fields must be present)
title_abstract_available = (df['article_title'].notna() & df['article_abstract'].notna()).sum()
title_abstract_missing = total_entries - title_abstract_available

# Check for supplementary files
entries_with_supp = (df['supplementary_file_count'] > 0).sum()
entries_without_supp = total_entries - entries_with_supp
total_supp_files = df['supplementary_file_count'].sum()

# Create a metadata DataFrame
metadata = pd.DataFrame({
    'Metric': [
        'Total Entries', 
        'PDF Available (Yes)', 
        'PDF Available (No)', 
        'Title/Abstract Available', 
        'Title/Abstract Missing',
        'Entries with Supplementary Files',
        'Entries without Supplementary Files',
        'Total Supplementary Files Count'
    ],
    'Count': [
        total_entries, 
        pdf_yes, 
        pdf_no, 
        title_abstract_available, 
        title_abstract_missing,
        entries_with_supp,
        entries_without_supp,
        int(total_supp_files)
    ]
})

# Save metadata to a new TSV file
metadata_tsv_path = 'DOME_Registry_TSV_Files/DOME_Metadata.tsv'
metadata.to_csv(metadata_tsv_path, sep='\t', index=False)
print(f"Metadata saved to '{metadata_tsv_path}'")

# Plot bar charts (one panel per availability metric)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Bar chart for PDF availability
axes[0].bar(['Yes', 'No'], [pdf_yes, pdf_no], color=['green', 'red'])
axes[0].set_title('PDF Availability')
axes[0].set_xlabel('Availability')
axes[0].set_ylabel('Paper count')

# Bar chart for title/abstract availability
axes[1].bar(['Available', 'Missing'], [title_abstract_available, title_abstract_missing], color=['green', 'red'])
axes[1].set_title('Title/Abstract Data Availability')
axes[1].set_xlabel('Availability')
axes[1].set_ylabel('Paper count')

# Bar chart for supplementary files
axes[2].bar(['With Supp Files', 'Without Supp Files'], [entries_with_supp, entries_without_supp], color=['blue', 'gray'])
axes[2].set_title('Supplementary Files Availability')
axes[2].set_xlabel('Availability')
axes[2].set_ylabel('Paper count')

# Adjust layout to prevent overlap
plt.tight_layout()

# Save the plot as an image file
plot_image_path = 'DOME_Registry_TSV_Files/DOME_Metadata_Bar_Charts.png'
plt.savefig(plot_image_path, dpi=300, bbox_inches='tight')
print(f"Bar charts saved to '{plot_image_path}'")

# Show the plot
plt.show()

# Print detailed summary; percentages go through _percent so an empty TSV
# reports 0.0% instead of raising ZeroDivisionError
print(f"\n{'='*60}")
print(f"DOME REGISTRY METADATA SUMMARY")
print(f"{'='*60}")
print(f"Total entries: {total_entries}")
print(f"PDFs available: {pdf_yes} ({_percent(pdf_yes, total_entries):.1f}%)")
print(f"Title/Abstract available: {title_abstract_available} ({_percent(title_abstract_available, total_entries):.1f}%)")
print(f"Entries with supplementary files: {entries_with_supp} ({_percent(entries_with_supp, total_entries):.1f}%)")
print(f"Total supplementary files: {int(total_supp_files)}")
if entries_with_supp > 0:
    print(f"Average supplementary files per entry (with supp): {total_supp_files/entries_with_supp:.2f}")
print(f"{'='*60}")

# To add
# 8.2 Turn TSV data into corresponding text file to verbally explain metrics
# 8.3 Turn TSV into corresponding graphed data to visualise the metrics

print("\nBlock 8 complete.")

KeyError: 'abstract_title_downloadable'