# **Python script to download latest DOME Registry contents, related full text papers & provide DOME Registry entries metadata read out (20241202)**
1. DOME Registry contents will be downloaded by API call proividng the json file of DOME Registry data
2. DOME Registry data json will be flattened and converted into CSV for working with entries data (row based data)
3. DOME Registry CSV will be checked and used to produce a metadata readout file (+ graphs)
4. DOME Registry DOIs of articles will be convrted to PMCIDs for full text retrieval 
5. DOME Registry entries will be downloaded as full XML files using PMCIDs list and NCBI Entrez service (Replace with EPMC when API works [^1] )

[^1]: EPMC full text XL API module issues on 20241204

## 1. Latest DOME Registry contents will be downloaded by DOME Registry API call proividing the .json file of DOME Registry data for the given day 

In [None]:
# 1. Use the DOME API to download all entries of the DOME Registry and store this in a json file 
import os
from datetime import datetime
import requests

# Define the URL for the call
url = "https://registry.dome-ml.org/api/review?skip=0&limit=250&text=%20&public=true&sort=publication.year&asc=true"

# Make an API request to the URL
response = requests.get(url, headers={'accept': '*/*'})

# Create folder to store all JSON files
if not os.path.exists('DOME_Registry_JSON_Files'):
    os.makedirs('DOME_Registry_JSON_Files')
else:
    print('Folder already exists for storing DOME Registry JSON files')

# Specify the desired folder path for JSON files
json_folder_path = "DOME_Registry_JSON_Files"

# Check if the request was successful
if response.status_code == 200:
    # Get the current date in ISO format for file naming
    current_date = datetime.now().strftime('%Y-%m-%d')

    # Create the output file name 
    file_name = f"DOME_Registry_Contents_{current_date}.json"
    json_file_path = os.path.join(json_folder_path, file_name)

    # Check if the file pathway already exists
    if os.path.exists(json_file_path):
        print(f"File already exists for today's date.")
        overwrite = input('Do you want to overwrite the file? (y/n): ') 
        if overwrite.lower() != 'y':
            print('Exiting without overwriting file')
        else:
            print('Overwriting file')
            # Save the content to a file
            with open(json_file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"DOME Registry data downloaded and saved to '{json_file_path}'")
else:
    print(f"Failed to retrieve the data. Status code: {response.status_code}")


Folder already exists for storing DOME Registry JSON files
File already exists for today's date.
Overwriting file
DOME Registry data downloaded and saved to 'DOME_Registry_JSON_Files/DOME_Registry_Contents_2024-12-10.json'


## 2. DOME Registry data .json file will be flattened and converted into CSV for easier working with entries data (row based data format)

In [36]:
# 2. Produce DOME Registry contents metadata .csv file and data visualisation
import json

# 2.1 Pretty print DOME Registry contents JSON file for inspection to ensure all looks as expected

# Function to read in and pretty-print the JSON DOME Registry file entry
def pretty_print_json(file_name):
    try:
        # Open and read the JSON file
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        
        # Pretty-print the JSON data
        print(json.dumps(data, indent=4))
    
    except Exception as e:
        print(f"Error reading the JSON file: {e}")

# Call the function to pretty-print the JSON file
#pretty_print_json(file_name)


# 2.2 Flatten the JSON for easier data processing and write to a new .json file 
# Function to read JSON data
def read_json(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except Exception as e:
        print(f"Error reading the JSON file: {e}")
        return None

# Function to flatten JSON
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

# Function to save flattened JSON to a file
def save_flattened_json(flattened_data, output_file_name):
    try:
        with open(output_file_name, 'w', encoding='utf-8') as file:
            json.dump(flattened_data, file, indent=4)
        print(f"Flattened JSON data saved to '{output_file_name}'")
    except Exception as e:
        print(f"Error saving the flattened JSON file: {e}")

# Read JSON data
data = read_json(file_name)

# Flatten JSON data and save to a new JSON file
if data:
    flattened_data = [flatten_json(entry) for entry in data]
    flattened_file_name = ("flattened_"+file_name)
    # Make file path to save flattened JSON file
    json_folder_path = "DOME_Registry_JSON_Files"
    json_file_path = os.path.join(json_folder_path, flattened_file_name)
    save_flattened_json(flattened_data, json_file_path)
    
    # Print the flattened JSON data to view it

else:
    print("No data to process.")



#2.3 Convert flattened json to csv 
# Function to read flattened JSON data
import json
import csv
import os

# Define the path to the flattened JSON file
#flattened_file_name = 'flattened_DOME_Registry_Contents.json'  # Replace with your actual file name

# Function to read flattened JSON data
def read_flattened_json(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except Exception as e:
        print(f"Error reading the flattened JSON file: {e}")
        return None

# CSV folders create
if not os.path.exists('DOME_Registry_CSV_Files'):
    os.makedirs('DOME_Registry_CSV_Files')
else:
    print('Folder already exists for storing DOME Registry CSV files')

# Function to write JSON data to a CSV file
def write_json_to_csv(json_data, csv_file_name):
    try:
        # Determine all possible headers from the entire dataset
        headers = set()
        for entry in json_data:
            headers.update(entry.keys())
        headers = list(headers)
        
        # Write data to CSV file
        with open(csv_file_name, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=headers)
            writer.writeheader()
            for entry in json_data:
                writer.writerow(entry)
        
        print(f"JSON data written to '{csv_file_name}'")
    except Exception as e:
        print(f"Error writing to the CSV file: {e}")

# Read flattened JSON data
flattened_data = read_flattened_json(flattened_file_name)

# Process JSON data into CSV
if flattened_data:
    csv_file_name = flattened_file_name[:-5]+'.csv'
    csv_file_path = os.path.join('DOME_Registry_CSV_Files', csv_file_name)
    write_json_to_csv(flattened_data, csv_file_path)
else:
    print("No data to process.")

csv_file_name = flattened_file_name[:-5]+'.csv'

Flattened JSON data saved to 'DOME_Registry_JSON_Files/flattened_DOME_Registry_Contents_2024-12-10.json'
Folder already exists for storing DOME Registry CSV files
JSON data written to 'DOME_Registry_CSV_Files/flattened_DOME_Registry_Contents_2024-12-10.csv'


## 3. DOME Registry CSV data file will be formatted with shortid as the row index and other fields cleaned (publication data) and ordered by D O M E fields 

In [None]:
#3.2 reorder data frame
import pandas as pd

df = pd.read_csv(csv_file_name)

# Define the path to your CSV file
# csv_file_name = 'path_to_your_csv_file.csv'  # Replace with your actual file name

# Read in DOME Entries CSV as dataframe via pandas library functions
# df = pd.read_csv(csv_file_name)

# Define the prefixes to match and group csv data
prefix_publications_cols = 'publication_'
prefix_data_cols = 'matches_data'
prefix_optimization_cols = 'matches_optimization'
prefix_model_cols = 'matches_model'
prefix_evaluation_cols = 'matches_evaluation'

# Separate columns based on whether they start with the prefix
publication_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
matches_data_columns = [col for col in df.columns if col.startswith(prefix_data_cols)]
matches_optimization_columns = [col for col in df.columns if col.startswith(prefix_optimization_cols)]
matches_model_columns = [col for col in df.columns if col.startswith(prefix_model_cols)]
matches_evaluation_columns = [col for col in df.columns if col.startswith(prefix_evaluation_cols)]
other_columns = [col for col in df.columns if not col.startswith('matches_') and not col.startswith('publication_')]

# Reorder columns
reordered_columns = (other_columns + publication_columns + matches_data_columns +
                     matches_optimization_columns + matches_model_columns + matches_evaluation_columns)
df = df[reordered_columns]

# Print the reordered DataFrame
#print(df.head())

df = pd.DataFrame(df).set_index('shortid')
csv_file_name = os.path.join('DOME_Registry_CSV_Files',csv_file_name)
df.to_csv(csv_file_name, sep=',', index=True, encoding='utf-8')




## 4. DOME Registry data csv will have a column added at the end with PMCIDs returned from DOI search using NCBI E-Utilities API 

print(matches_publication_columns)

In [39]:
# 4. Get DOIs -> PMCIDs full text search
# DOIs to PMCIDs
import pandas as pd
import requests

# Define the path to your CSV file
#csv_file_name = 'path_to_your_csv_file.csv'  # Replace with your actual file name

# Read in DOME Entries CSV as dataframe via pandas library functions
df = pd.read_csv(csv_file_name)

# Extract DOIs from the DataFrame
dois = df['publication_doi'].dropna().unique()

# Function to map DOIs to PMCIDs using NCBI E-utilities API
def map_dois_to_pmcids(dois):
    pmcid_mapping = {}
    for doi in dois:
        url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids={doi}&format=json"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            records = data.get('records', [])
            if records:
                for record in records:
                    if 'pmcid' in record:
                        pmcid_mapping[doi] = record['pmcid']
                    else:
                        pmcid_mapping[doi] = None
        else:
            pmcid_mapping[doi] = None
    return pmcid_mapping

# Map DOIs to PMCIDs
doi_to_pmcid_mapping = map_dois_to_pmcids(dois)

# Add the mapped PMCIDs to the DataFrame
df['mapped_pmcid'] = df['publication_doi'].map(doi_to_pmcid_mapping)

# TO UPDATE FILE NAMING TO CORRELATE BETTER

# Save the updated DataFrame to a new CSV file
output_csv_file_name = 'updated_DOME_Registry_Contents.csv'
df.to_csv(output_csv_file_name, index=False)

# Print the updated DataFrame
print(df.head())

#TO FIX REQUEST INTO SMALLER BATCHES VS SINGLE REQUESTS FOR SPEED
#to add number of pmcids missing readout for metadata file below

      shortid                                  uuid                   created  \
0  6i0xepuivt  66a94333-8cd1-499c-86ef-0497a4c4dabc  2022-09-01T15:16:05.444Z   
1  nlj5x3dld8  147ddf2b-6b53-4335-b62f-87994d284310  2022-09-01T15:16:05.445Z   
2  ysqyy92zyr  600b20de-7c70-41af-ad39-33121af090ef  2024-03-26T17:49:30.048Z   
3  qx3ex71jye  b863eb51-d9ae-4fc0-bfd4-006db90d1631  2024-03-27T13:25:49.790Z   
4  v536tc3b5t  28fe7de1-ac05-4cf2-bfa8-d5ddd1ba32b8  2022-09-01T15:16:05.443Z   

                        _id                   updated  score  public  \
0  63516fedb9c880af1f305b5c  2022-09-01T15:16:05.444Z   0.67    True   
1  63516fedb9c880af1f305b93  2022-09-01T15:16:05.445Z   0.67    True   
2  66030aaa1502715bfe53d65c  2024-03-26T17:49:30.048Z   0.71    True   
3  66041e5d1502715bfe53d70a  2024-03-27T13:25:49.790Z   0.76    True   
4  63516fedb9c880af1f305b1c  2022-09-01T15:16:05.443Z   0.81    True   

   publication_updated publication_journal  \
0  01/28/2022 00:13:56  BMC Bioinf

## 5. Use EPMC API to return and save full text xml of all DOME Registry entries and store in folder named PMC_full_texts

In [None]:
# D5. ownload using epmc api the full text using pmcids into folder
 
import pandas as pd
import requests
import os

# Define the path to your CSV file
csv_file_name = 'updated_DOME_Registry_Contents.csv'  # Replace with your actual file name

# Read in DOME Entries CSV as dataframe via pandas library functions
df = pd.read_csv(csv_file_name)

# Extract PMCIDs from the DataFrame
pmcids = df['mapped_pmcid'].dropna().unique()

# Define the output folder for full text files
output_folder = 'PMC_Full_Texts'
os.makedirs(output_folder, exist_ok=True)

# Function to download full text for each PMCID using Europe PMC API
def download_full_text(pmcids):
    for pmcid in pmcids:
        url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
        response = requests.get(url)
        if response.status_code == 200:
            full_text = response.text
            output_file = os.path.join(output_folder, f"{pmcid}.xml")
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(full_text)
            print(f"Full text for PMCID {pmcid} saved to '{output_file}'")
        else:
            print(f"Failed to retrieve full text for PMCID {pmcid}. Status code: {response.status_code}")

# Download full text for each PMCID
download_full_text(pmcids)

# print how many successfully downloaded and how many failed to download - TO ADD


Full text for PMCID PMC1421439 saved to 'PMC_Full_Texts/PMC1421439.xml'
Full text for PMCID PMC1847686 saved to 'PMC_Full_Texts/PMC1847686.xml'
Full text for PMCID PMC2213690 saved to 'PMC_Full_Texts/PMC2213690.xml'
Full text for PMCID PMC1892091 saved to 'PMC_Full_Texts/PMC1892091.xml'
Full text for PMCID PMC2638158 saved to 'PMC_Full_Texts/PMC2638158.xml'
Full text for PMCID PMC2665034 saved to 'PMC_Full_Texts/PMC2665034.xml'
Full text for PMCID PMC2275242 saved to 'PMC_Full_Texts/PMC2275242.xml'
Full text for PMCID PMC2561051 saved to 'PMC_Full_Texts/PMC2561051.xml'
Full text for PMCID PMC2660303 saved to 'PMC_Full_Texts/PMC2660303.xml'
Full text for PMCID PMC2752621 saved to 'PMC_Full_Texts/PMC2752621.xml'
Full text for PMCID PMC3009519 saved to 'PMC_Full_Texts/PMC3009519.xml'
Failed to retrieve full text for PMCID PMC3169429. Status code: 404
Full text for PMCID PMC3542245 saved to 'PMC_Full_Texts/PMC3542245.xml'
Full text for PMCID PMC3396452 saved to 'PMC_Full_Texts/PMC3396452.x

## 6. Metadata generated on DOME Entries data csv such as of avaialability of full text XML files, total entries, etc + some graphs of data validity vs expected inputs 

In [None]:

# 6. Create metadata file readout as a csv, corresponding text file to explain contents and graphs to go with these
# Metadata file readout as CSV and text file to explain contents and graph visualisation of data validation 

#import libraries
import csv
import re
import os
import pandas as pd
# 6.1 Calculate metadata and produce csv file
# Read in csv with pandas into dataframe for analysis
df = pd.read_csv('updated_DOME_Registry_Contents.csv')
df = df.set_index('shortid')
#df.head()

# Get dataframe total entries count based on rows in index

# Set empty counter for rows count, this will be used as the figure of DOME Resgitry entries in the csv data downloaded
rows = 0
for row in df.index:
    #print(row)
    rows = rows + 1
print(rows)

# For entry rows count how many empty or non-conforming rows (regex checks)
df.head()

for rows_entry in df[:]:
    for entry in rows_entry:
        if :
            x
        else:
            x
# 6.2 Turn csv data into corresponding text file to verbally explain metrics


# 6.3 Turn csv into into corresponding graphed data to visualise the metrics
'''



214


'\n# For entry rows count how many empty or non-conforming rows (regex checks)\ndf.head()\n\nfor rows_entry in df[:]:\n    for entry in rows_entry:\n        if :\n            x\n        else:\n            x\n# 6.2 Turn csv data into corresponding text file to verbally explain metrics\n\n\n# 6.3 Turn csv into into corresponding graphed data to visualise the metrics\n'