# **Python script to download latest DOME Registry contents, related full text papers & provide DOME Registry entries metadata read out (20241202)**
1. DOME Registry contents will be downloaded by API call proividng the json file of DOME Registry data
2. DOME Registry data json will be flattened and converted into CSV for working with entries data (row based data)
3. DOME Registry CSV will be checked and used to produce a metadata readout file (+ graphs)
4. DOME Registry DOIs of articles will be convrted to PMCIDs for full text retrieval 
5. DOME Registry entries will be downloaded as full XML files using PMCIDs list and NCBI Entrez service (Replace with EPMC when API works [^1] )

[^1]: EPMC full text XL API module issues on 20241204

## 1. DOME Registry contents will be downloaded by API call proividng the json file of DOME Registry data

In [4]:
# 1. Use the DOME API to download all entries of the DOME Registry and store this in a json file 
import requests
import os
from datetime import datetime

# Define the URL for the API call: check the API documentation for the correct URL on the DOME Registry website
url = "https://registry.dome-ml.org/api/review?skip=0&limit=250&text=%20&public=true&sort=publication.year&asc=true"

# Make an API request to the URL
response = requests.get(url, headers={'accept': '*/*'})

# Check if the request was successful
if response.status_code == 200:
    # Get the current date in ISO format for file naming
    # Potentially update fiel datetime granularity if needing to run more regularly than daily, DOME Registry contents unlikely to be more regular than this
    current_date = datetime.now().strftime('%Y-%m-%d')
    

    # Create the output file name 
    file_name = f"DOME_Registry_Contents_{current_date}.json"

    # Check if the file pathway already exists
    if os.path.exists(file_name):
        print(f"File already exists for today's date, do you want to overwrite? (y/n)")
        overwrite = input('Do you want to overwrite the file? (y/n): ') 
        if overwrite == 'n':
            print('Exiting without overwriting file')
            exit()
        elif overwrite == 'y':
            print('Overwriting file')
        else:
            print('Invalid input, exiting')
            exit()

    
    # Save the content to a file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(response.text)
    
    print(f"DOME Registry data downloaded and saved to '{file_name}'")
else:
    print(f"Failed to retrieve the data. Status code: {response.status_code}")



File already exists for today's date, do you want to overwrite? (y/n)
Overwriting file
DOME Registry data downloaded and saved to 'DOME_Registry_Contents_2024-12-05.json'


## 2. DOME Registry data json will be flattened and converted into CSV for working with entries data (row based data)

In [5]:
# 2. Produce DOME Registry contents metadata .csv file and data visualisation
import json

# 2.1 Pretty print DOME Registry contents JSON file for inspection to ensure all looks as expected

# Function to read and pretty-print the JSON file sample entry
def pretty_print_json(file_name):
    try:
        # Open and read the JSON file
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        
        # Pretty-print the JSON data
        print(json.dumps(data, indent=4))
    
    except Exception as e:
        print(f"Error reading the JSON file: {e}")

# Call the function to pretty-print the JSON file
# pretty_print_json(file_name)


# 2.2 Flatten the JSON for easier data processing and write to a new .json file 
# Function to read JSON data
def read_json(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except Exception as e:
        print(f"Error reading the JSON file: {e}")
        return None

# Function to flatten JSON
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

# Function to save flattened JSON to a file
def save_flattened_json(flattened_data, output_file_name):
    try:
        with open(output_file_name, 'w', encoding='utf-8') as file:
            json.dump(flattened_data, file, indent=4)
        print(f"Flattened JSON data saved to '{output_file_name}'")
    except Exception as e:
        print(f"Error saving the flattened JSON file: {e}")

# Read JSON data
data = read_json(file_name)

# Flatten JSON data and save to a new file
if data:
    flattened_data = [flatten_json(entry) for entry in data]
    flattened_file_name = ("flattened_"+file_name)
    save_flattened_json(flattened_data, flattened_file_name)
    
    # Print the flattened JSON data to view it
    # to add a print of file output name and sucess ftatement - print(flattened_file_name)
    #print(json.dumps(flattened_data, indent=4))
else:
    print("No data to process.")



#2.3 Convert flattened json to csv 
# Function to read flattened JSON data
import json
import csv
import os

# Define the path to the flattened JSON file
#flattened_file_name = 'flattened_DOME_Registry_Contents.json'  # Replace with your actual file name

# Function to read flattened JSON data
def read_flattened_json(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except Exception as e:
        print(f"Error reading the flattened JSON file: {e}")
        return None

# Function to write JSON data to a CSV file
def write_json_to_csv(json_data, csv_file_name):
    try:
        # Determine all possible headers from the entire dataset
        headers = set()
        for entry in json_data:
            headers.update(entry.keys())
        headers = list(headers)
        
        # Write data to CSV file
        with open(csv_file_name, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=headers)
            writer.writeheader()
            for entry in json_data:
                writer.writerow(entry)
        
        print(f"JSON data written to '{csv_file_name}'")
    except Exception as e:
        print(f"Error writing to the CSV file: {e}")

# Read flattened JSON data
flattened_data = read_flattened_json(flattened_file_name)

# Process JSON data into CSV
if flattened_data:
    csv_file_name = flattened_file_name[:-5]+'.csv'
    write_json_to_csv(flattened_data, csv_file_name)
else:
    print("No data to process.")


Flattened JSON data saved to 'flattened_DOME_Registry_Contents_2024-12-05.json'
JSON data written to 'flattened_DOME_Registry_Contents_2024-12-05.csv'


## 3. DOME Registry CSV will be analysed for entry compliance and used to produce a metadata readout file (+ graphs - TBC)

In [None]:
# Production of the DOME Registry fields validity data & subsequent metadata csv file 
import csv
import re
import os

# 3.1  Simple print of all DOME fields & simple explainer put into text file 
# for assisting with use of DOME Regsitry entry data
# for header in csv_file_name:
#    print(header)

# Define regexes to check various CSV header field entries
# Define the EPMC regex pattern for PMIDs
pmid_pattern = re.compile(r'^\d{8}$')

# Define the regex pattern for DOIs
doi_pattern = re.compile(r'^10.\d{4,9}/[-._;()/:A-Z0-9]+$', re.IGNORECASE)

# Function to read CSV data
def read_csv(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            data = [row for row in reader]
        return data
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return None

# Function to check PMIDs and DOIs and generate metadata
def check_pmids_and_dois_and_generate_metadata(data, csv_file_name):
    pmid_valid = 0
    pmid_invalid = 0
    doi_valid = 0
    doi_invalid = 0
    total_entries = len(data)
    
    for row in data:
        pmid = row.get('publication_pmid', '')
        doi = row.get('publication_doi', '')
        
        if pmid_pattern.match(pmid):
            pmid_valid += 1
        else:
            pmid_invalid += 1
        
        if doi_pattern.match(doi):
            doi_valid += 1
        else:
            doi_invalid += 1
    
    # Print the results
    print(f"{pmid_valid} of {total_entries} PMIDs valid")
    print(f"{pmid_invalid} of {total_entries} PMIDs invalid")
    print(f"{doi_valid} of {total_entries} DOIs valid")
    print(f"{doi_invalid} of {total_entries} DOIs invalid")
    
    # Create metadata CSV file
    metadata_file_name = f"Metadata_{os.path.basename(csv_file_name)}"
    with open(metadata_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Metric', 'Value'])
        writer.writerow(['Total Entries', total_entries])
        writer.writerow(['Valid PMIDs', pmid_valid])
        writer.writerow(['Invalid PMIDs', pmid_invalid])
        writer.writerow(['Valid DOIs', doi_valid])
        writer.writerow(['Invalid DOIs', doi_invalid])
    
    print(f"Metadata written to '{metadata_file_name}'")

# Read CSV data
csv_data = read_csv(csv_file_name)

# Check PMIDs and DOIs and generate metadata
if csv_data:
    check_pmids_and_dois_and_generate_metadata(csv_data, csv_file_name)
else:
    print("No data to process.")



f
l
a
t
t
e
n
e
d
_
D
O
M
E
_
R
e
g
i
s
t
r
y
_
C
o
n
t
e
n
t
s
_
2
0
2
4
-
1
2
-
0
4
.
c
s
v
211 of 214 PMIDs valid
3 of 214 PMIDs invalid
208 of 214 DOIs valid
6 of 214 DOIs invalid
Metadata written to 'Metadata_flattened_DOME_Registry_Contents_2024-12-04.csv'


In [6]:

# 3.1  Simple print of all DOME fields & simple explainer put into text file 
# for assisting with use of DOME Registry entry data
# reformat using dataframe the DOME entries csv columns into more logical format
import csv
import pandas as pd #because who doesn't love a panda 

#Read in DOME Entries CSV as dataframe via pandas library functions
print(csv_file_name)
DOME_Entries_dataframe = pd.read_csv(csv_file_name)

# View data frame to inspect all data appears ok
DOME_Entries_dataframe.head()
DOME_Entries_dataframe.shape

#Get number of entries in header row correpsonding to DOME Registry entries fields 
# from a given entry related to its originating JSON file
i=0
header_entries_for_text_file = []
for header_entry in (DOME_Entries_dataframe.columns):
     i = i+1
     header_entries_for_text_file.append(header_entry)
print('Number of DOME Registry field entries: ' + str(i))
print(header_entries_for_text_file)

# to remove redundant fields and check over these - TBC
df = DOME_Entries_dataframe.reindex(sorted(DOME_Entries_dataframe.columns), axis=1)
df.head()
# to create more metadata and graph of entries (to work and think on)
# to add and choose main ID for rows header
# to do DOI checks and regex
# to get PMC full text from DOI and store in local folder

row_names = df['uuid']
#print(row_names)

# Set row names as shortid whichh corresponds to DOME Registry unique short id 
df = pd.DataFrame(df).set_index('shortid')

# Reorder metadata to start of columns list
# Define the prefixes to match and group csv data
prefix_matches_cols = 'matches_'
prefix_publications_cols= 'matches_publication'
prefix_data_cols= 'matches_data'
prefix_optimization_cols= 'matches_optimization'
prefix_model_cols= 'matches_model'
prefix_evaluation_cols= 'matches_evaluation'

# Separate columns based on whether they start with the prefix
matches_publication_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
matches_data_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
matches_optimization_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
matches_model_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
matches_evaluation_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
other_columns = [col for col in df.columns if not col.startswith(prefix_matches_cols)]

# Reorder columns
reordered_columns = other_columns + matches_publication_columns + matches_data_columns + matches_optimization_columns + matches_model_columns + matches_evaluation_columns
df = df[reordered_columns]

print(df.head())

df.to_csv(csv_file_name, sep=',', index=True, encoding='utf-8')



flattened_DOME_Registry_Contents_2024-12-05.csv
Number of DOME Registry field entries: 45
['matches_evaluation/availability', 'matches_model/duration', 'score', 'matches_publication/authors', 'publication_authors', 'matches_model/availability', 'matches_publication/year', 'matches_optimization/parameters', 'matches_publication/updated', 'matches_publication/doi', 'matches_optimization/fitting', 'publication_skip', 'matches_optimization/config', 'publication_doi', 'matches_optimization/encoding', 'matches_publication/pmid', 'matches_evaluation/method', 'matches_optimization/features', 'matches_optimization/meta', 'updated', 'uuid', 'publication_updated', 'matches_dataset/availability', 'public', 'matches_optimization/algorithm', 'matches_evaluation/measure', 'publication_pmid', 'publication_created', 'matches_evaluation/confidence', 'matches_model/interpretability', 'matches_evaluation/comparison', 'matches_publication/title', 'matches_dataset/provenance', 'publication_year', 'publicati

In [1]:
print(matches_publication_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)])


NameError: name 'df' is not defined

In [8]:
# View data frame to inspect all data appears ok
DOME_Entries_dataframe.head()
DOME_Entries_dataframe.shape

#Get number of entries in header row correpsonding to DOME Registry entries fields 
# from a given entry related to its originating JSON file
i=0
header_entries_for_text_file = []
for header_entry in (DOME_Entries_dataframe.columns):
     i = i+1
     header_entries_for_text_file.append(header_entry)
print('Number of DOME Registry field entries: ' + str(i))
print(header_entries_for_text_file)

# to remove redundant fields and check over these - TBC
df = DOME_Entries_dataframe.reindex(sorted(DOME_Entries_dataframe.columns), axis=1)
df.head()
# to create more metadata and graph of entries (to work and think on)
# to add and choose main ID for rows header
# to do DOI checks and regex
# to get PMC full text from DOI and store in local folder

row_names = df['uuid']
#print(row_names)

# Set row names as shortid whichh corresponds to DOME Registry unique short id 
df = pd.DataFrame(df).set_index('shortid')
df.head()




        

Number of DOME Registry field entries: 45
['matches_evaluation/availability', 'matches_model/duration', 'score', 'matches_publication/authors', 'publication_authors', 'matches_model/availability', 'matches_publication/year', 'matches_optimization/parameters', 'matches_publication/updated', 'matches_publication/doi', 'matches_optimization/fitting', 'publication_skip', 'matches_optimization/config', 'publication_doi', 'matches_optimization/encoding', 'matches_publication/pmid', 'matches_evaluation/method', 'matches_optimization/features', 'matches_optimization/meta', 'updated', 'uuid', 'publication_updated', 'matches_dataset/availability', 'public', 'matches_optimization/algorithm', 'matches_evaluation/measure', 'publication_pmid', 'publication_created', 'matches_evaluation/confidence', 'matches_model/interpretability', 'matches_evaluation/comparison', 'matches_publication/title', 'matches_dataset/provenance', 'publication_year', 'publication_done', 'matches_dataset/splits', 'publication

Unnamed: 0_level_0,_id,created,matches_dataset/availability,matches_dataset/provenance,matches_dataset/redundancy,matches_dataset/splits,matches_evaluation/availability,matches_evaluation/comparison,matches_evaluation/confidence,matches_evaluation/measure,...,publication_done,publication_journal,publication_pmid,publication_skip,publication_title,publication_updated,publication_year,score,updated,uuid
shortid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6i0xepuivt,63516fedb9c880af1f305b5c,2022-09-01T15:16:05.444Z,"yes, https://www.nature.com/articles/s41467-01...",,,,,,,,...,0.0,BMC Bioinformatics,16524483,0.0,Machine learning approaches to supporting the ...,01/28/2022 00:13:56,2006,0.67,2022-09-01T15:16:05.444Z,66a94333-8cd1-499c-86ef-0497a4c4dabc
nlj5x3dld8,63516fedb9c880af1f305b93,2022-09-01T15:16:05.445Z,Casp 11 website (https://predictioncenter.org/...,,Not assessed. In principle de novo protein str...,,,Performance achieved with methods based on aut...,,Precision as a function of effective aligned s...,...,0.0,BMC Genomics,17374164,0.0,Predicting protein function by machine learnin...,03/09/2022 10:14:51,2007,0.67,2022-09-01T15:16:05.445Z,147ddf2b-6b53-4335-b62f-87994d284310
ysqyy92zyr,66030aaa1502715bfe53d65c,2024-03-26T17:49:30.048Z,,Data are extracted from different databases (T...,Random split has been adopted for cross-valida...,"Training set: 11553 data points, No test nor v...",,no comparison with other approaches perfomed,no confidence interval reported. No statistica...,ROC curve,...,,BMC Bioinformatics,17888165,,Combining classifiers to predict gene function...,,2007,0.71,2024-03-26T17:49:30.048Z,600b20de-7c70-41af-ad39-33121af090ef
qx3ex71jye,66041e5d1502715bfe53d70a,2024-03-27T13:25:49.790Z,,Data is extracted from the PDB. Data are in cl...,All proteins in the dataset are non-homologous...,Traning set: 150 proteins; Testing set: 150 pr...,Not available,Cmparison with other approaches is missing. No...,Not reported,accuracy and segment-overlap value for alpha h...,...,,BMC Bioinformatics,17570862,,Learning biophysically-motivated parameters fo...,,2007,0.76,2024-03-27T13:25:49.790Z,b863eb51-d9ae-4fc0-bfd4-006db90d1631
v536tc3b5t,63516fedb9c880af1f305b1c,2022-09-01T15:16:05.443Z,Yes : data to reproduce the results can be do...,"""Data were extracted from publicly available d...",,A 5-fold cross-validation was performed to ass...,Yes. (https://github.com/ggonzalezp/hyperfoods),"A baseline input is used, in which all drug ta...",Confidence intervals and statistical significa...,"Balanced accuracy, F1 score, AUPR. The last tw...",...,0.0,BMC Bioinformatics,19091017,0.0,Semi-automatic conversion of BioProp semantic ...,03/25/2022 13:35:02,2008,0.81,2022-09-01T15:16:05.443Z,28fe7de1-ac05-4cf2-bfa8-d5ddd1ba32b8


In [10]:
#3.2 reorder data frame
import pandas as pd

# Define the path to your CSV file
# csv_file_name = 'path_to_your_csv_file.csv'  # Replace with your actual file name

# Read in DOME Entries CSV as dataframe via pandas library functions
# df = pd.read_csv(csv_file_name)

# Define the prefixes to match and group csv data
prefix_publications_cols = 'matches_publication'
prefix_data_cols = 'matches_data'
prefix_optimization_cols = 'matches_optimization'
prefix_model_cols = 'matches_model'
prefix_evaluation_cols = 'matches_evaluation'

# Separate columns based on whether they start with the prefix
matches_publication_columns = [col for col in df.columns if col.startswith(prefix_publications_cols)]
print(matches_model_columns)
matches_data_columns = [col for col in df.columns if col.startswith(prefix_data_cols)]
matches_optimization_columns = [col for col in df.columns if col.startswith(prefix_optimization_cols)]
matches_model_columns = [col for col in df.columns if col.startswith(prefix_model_cols)]
matches_evaluation_columns = [col for col in df.columns if col.startswith(prefix_evaluation_cols)]
other_columns = [col for col in df.columns if not col.startswith('matches_')]

# Reorder columns
reordered_columns = (other_columns + matches_publication_columns + matches_data_columns +
                     matches_optimization_columns + matches_model_columns + matches_evaluation_columns)
df = df[reordered_columns]

# Print the reordered DataFrame
print(df.head())

df.to_csv(csv_file_name, sep=',', index=True, encoding='utf-8')

['matches_model/availability', 'matches_model/duration', 'matches_model/interpretability', 'matches_model/output']
                                 _id                   created  public  \
shortid                                                                  
6i0xepuivt  63516fedb9c880af1f305b5c  2022-09-01T15:16:05.444Z    True   
nlj5x3dld8  63516fedb9c880af1f305b93  2022-09-01T15:16:05.445Z    True   
ysqyy92zyr  66030aaa1502715bfe53d65c  2024-03-26T17:49:30.048Z    True   
qx3ex71jye  66041e5d1502715bfe53d70a  2024-03-27T13:25:49.790Z    True   
v536tc3b5t  63516fedb9c880af1f305b1c  2022-09-01T15:16:05.443Z    True   

                                          publication_authors  \
shortid                                                         
6i0xepuivt               Wang H, Zheng H, Simpson D, Azuaje F   
nlj5x3dld8               Al-Shahib A, Breitling R, Gilbert DR   
ysqyy92zyr  Hui Lan, Rachel Carson , Nicholas J Provart an...   
qx3ex71jye  Blaise Gassend, Charles W O'D

In [None]:
df['uuid']

0      66a94333-8cd1-499c-86ef-0497a4c4dabc
1      147ddf2b-6b53-4335-b62f-87994d284310
2      600b20de-7c70-41af-ad39-33121af090ef
3      b863eb51-d9ae-4fc0-bfd4-006db90d1631
4      28fe7de1-ac05-4cf2-bfa8-d5ddd1ba32b8
                       ...                 
209    49b4a023-592f-4a04-bad2-827e519896e0
210    bf403e75-6baf-4278-bf96-1469c78c65e0
211    440c11f3-f064-40d7-9b1d-5d29591896b4
212    19954d39-0d13-4f99-9e5d-0624ccd5b638
213    d77983e0-5279-4379-b608-8032a2990b09
Name: uuid, Length: 214, dtype: object

In [None]:
# PMID mapper from NCBI https://pmc.ncbi.nlm.nih.gov/tools/id-converter-api/





In [None]:
import csv
import requests
import os

# Define the path to the "Valid DOME Registry" CSV file
valid_csv_file_name = 'valid_DOME_Registry_Contents.csv'  # Replace with your actual file name

# Define the output folder for the mapped identifiers
output_folder = 'Mapped_Identifiers'
os.makedirs(output_folder, exist_ok=True)

# Function to read CSV data
def read_csv(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            data = [row for row in reader]
        return data
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return None

# Function to map PMIDs to PMCIDs using NCBI E-utilities API
def map_pmids_to_pmcids(pmids):
    pmid_str = ','.join(pmids)
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=pmc&id={pmid_str}&retmode=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

# Function to save mapped identifiers to a CSV file
def save_mapped_identifiers(mapped_data, output_file_name):
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['PMID', 'PMCID'])
        for linkset in mapped_data['linksets']:
            pmid = linkset['ids'][0]
            if 'linksetdbs' in linkset:
                for linksetdb in linkset['linksetdbs']:
                    if linksetdb['dbto'] == 'pmc':
                        for link in linksetdb['links']:
                            writer.writerow([pmid, link])
            else:
                writer.writerow([pmid, ''])

# Read CSV data
csv_data = read_csv(valid_csv_file_name)

# Extract PMIDs from the CSV data
pmids = [row.get('publication_pmid', '') for row in csv_data if row.get('publication_pmid', '')]

# Map PMIDs to PMCIDs
mapped_data = map_pmids_to_pmcids(pmids)

# Save the mapped identifiers to a CSV file
if mapped_data:
    output_file_name = os.path.join(output_folder, 'mapped_identifiers.csv')
    save_mapped_identifiers(mapped_data, output_file_name)
    print(f"Mapped identifiers saved to '{output_file_name}'")
else:
    print("No data to process.")

Mapped identifiers saved to 'Mapped_Identifiers/mapped_identifiers.csv'


In [None]:
# EPMC get files XML

#get list of PMIDs
pmid = read_csv('pmids.csv')
#for pmid in pmids:
#    print(pmid)

type(pmid)

pmid[0:4]

# Define the URL for the EPMC API call






[{'PMID': '16524483'}, {'PMID': '17374164'}, {'PMID': '17888165'}, {'PMID': '17570862'}]


TypeError: object of type 'NoneType' has no len()

In [None]:
from Bio import Entrez
import os

# Define your email (required by NCBI)
Entrez.email = "gavinmichael.farrell@studenti.unipd.it"  # Replace with your actual email

# Define the PMID you want to query
pmid = 'PMC1421439'  # Replace with your actual PMID

# Define the output folder for XML files
output_folder = 'PMC_XML_Files'
os.makedirs(output_folder, exist_ok=True)

# Fetch the article from PMC
handle = Entrez.efetch(db="pmc", id=pmid, rettype="full", retmode="xml")
xml_data = handle.read()
handle.close()

# Save the XML data to a file
output_file = os.path.join(output_folder, f"{pmid}.xml")
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(str(xml_data))

print(f"XML data for PMID {pmid} saved to '{output_file}'")

XML data for PMID PMC1421439 saved to 'PMC_XML_Files/PMC1421439.xml'
