In [None]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import random

# Define the URLs and parameters for the data
bioproject_id = "PRJNA288601"
efetch_bioproject_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
efetch_biosample_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Fetch the isolate data from the BioProject
efetch_params = {
    "db": "bioproject",
    "id": bioproject_id,
    "retmode": "xml"
}
response = requests.get(efetch_bioproject_url, params=efetch_params)
response.raise_for_status()

# Parse the BioProject XML content
root = ET.fromstring(response.content)
isolates = root.findall(".//ProjectDescr/LocusTagPrefix")

# Parse each isolate to get biosample_id and locus_tag_prefix
parsed_isolates = []
for isolate in isolates:
    biosample_id = isolate.attrib.get('biosample_id')
    locus_tag_prefix = isolate.text
    if biosample_id and locus_tag_prefix:
        parsed_isolates.append({
            'biosample_id': biosample_id,
            'locus_tag_prefix': locus_tag_prefix
        })

# Check the initial number of isolates
print(f"Number of parsed isolates: {len(parsed_isolates)}")

# Randomly select 20 biosample IDs for testing
sample_size = 20
sample_isolates = random.sample(parsed_isolates, sample_size)

# Function to parse additional data
def parse_additional_data(biosample_root):
    data = {
        'biosample_id': biosample_root.findtext(".//Ids/Id[@db='BioSample']"),
        'organism_group': biosample_root.findtext(".//Organism/OrganismName"),
        'strain': biosample_root.findtext(".//Attribute[@attribute_name='strain']"),
        'isolate_identifiers': biosample_root.findtext(".//Ids/Id[@db_label='Sample name']"),
        'serovar': biosample_root.findtext(".//Attribute[@attribute_name='serovar']"),
        'isolate': biosample_root.findtext(".//Attribute[@attribute_name='isolate']"),
        'create_date': biosample_root.find(".//Status[@status='live']").attrib.get('when') if biosample_root.find(".//Status[@status='live']") is not None else None,
        'location': biosample_root.findtext(".//Attribute[@attribute_name='geo_loc_name']"),
        'isolation_source': biosample_root.findtext(".//Attribute[@attribute_name='isolation_source']"),
        'isolation_type': biosample_root.findtext(".//Attribute[@attribute_name='isolation_type']"),
        'food_origin': biosample_root.findtext(".//Attribute[@attribute_name='food_origin']"),
        'snp_cluster': biosample_root.findtext(".//Attribute[@attribute_name='snp_cluster']"),
        'min_same': biosample_root.findtext(".//Attribute[@attribute_name='min-same']"),
        'min_diff': biosample_root.findtext(".//Attribute[@attribute_name='min-diff']"),
        'assembly': biosample_root.findtext(".//Attribute[@attribute_name='assembly']"),
        'amr_genotypes': biosample_root.findtext(".//Attribute[@attribute_name='amr_genotypes']"),  # Check for AMR genotypes
        'computed_types': biosample_root.findtext(".//Attribute[@attribute_name='computed_types']"),
        'host': biosample_root.findtext(".//Attribute[@attribute_name='host']"),
        'collection_date': biosample_root.findtext(".//Attribute[@attribute_name='collection_date']"),
        'mlst': biosample_root.findtext(".//Attribute[@attribute_name='MLST']"),
        'sample_type': biosample_root.findtext(".//Attribute[@attribute_name='sample_type']"),
        'collected_by': biosample_root.findtext(".//Attribute[@attribute_name='collected_by']"),
        'host_disease': biosample_root.findtext(".//Attribute[@attribute_name='host_disease']"),
        'lat_lon': biosample_root.findtext(".//Attribute[@attribute_name='lat_lon']"),
        'sequenced_by': biosample_root.findtext(".//Attribute[@attribute_name='sequenced_by']")
    }
    # Remove keys with None values
    data = {k: v for k, v in data.items() if v is not None}
    return data

# Fetch additional data for the selected biosample_ids
additional_data = []
error_biosamples = []

for isolate in sample_isolates:
    biosample_id = isolate['biosample_id']
    efetch_params = {
        "db": "biosample",
        "id": biosample_id,
        "retmode": "xml"
    }
    response = requests.get(efetch_biosample_url, params=efetch_params)
    
    if response.status_code != 200:
        print(f"Failed to fetch data for biosample_id: {biosample_id}")
        error_biosamples.append(biosample_id)
        continue
    
    # Parse the biosample XML content
    biosample_root = ET.fromstring(response.content)
    
    # Extract additional data
    biosample_data = parse_additional_data(biosample_root)
    
    # Check if extracted data contains None values and log missing fields
    missing_fields = [k for k, v in biosample_data.items() if v is None]
    if missing_fields:
        print(f"Incomplete data for biosample_id: {biosample_id}, missing fields: {missing_fields}")
        error_biosamples.append(biosample_id)
    else:
        additional_data.append(biosample_data)

# Convert parsed isolates and additional data to DataFrames

df_sample_isolates = pd.DataFrame(sample_isolates)
df_additional_data = pd.DataFrame(additional_data)

# Merge the two DataFrames on biosample_id
df_merged = pd.merge(df_sample_isolates, df_additional_data, on="biosample_id", how="left")

# Save the merged DataFrame to a CSV file
output_file = "isolates_sample_detailed.csv"
df_merged.to_csv(output_file, index=False)

# Log the final number of records
print(f"Final number of records in CSV: {len(df_merged)}")

# Output the path of the saved file for download
output_file

# Latest details about the Entrez databases 

Sure, here's a Python script that uses the requests library to retrieve the latest details about the Entrez databases using the EInfo utility.

In [None]:
import requests
import xml.etree.ElementTree as ET

# Base URL for E-utilities
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# EInfo URL to get information about all databases
einfo_url = base_url + "einfo.fcgi"

# Send request to EInfo
response = requests.get(einfo_url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the XML response
    root = ET.fromstring(response.content)
    
    # Print database details
    for db_info in root.findall(".//DbName"):
        print(db_info.text)
else:
    print(f"Error: {response.status_code}")



# Getting information about NCBI databases

In [None]:

import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Base URL for E-utilities
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# EInfo URL to get information about all databases
einfo_url = base_url + "einfo.fcgi"

# Send request to EInfo
response = requests.get(einfo_url)

# Initialize a list to store database details
data = []

# Check if the request was successful
if response.status_code == 200:
    # Parse the XML response
    root = ET.fromstring(response.content)
    
    # Get database names
    for db_info in root.findall("./DbList/DbName"):
        db_name = db_info.text
        
        # Get detailed info for each database
        detailed_url = f"{base_url}einfo.fcgi?db={db_name}"
        detailed_response = requests.get(detailed_url)
        
        if detailed_response.status_code == 200:
            detailed_root = ET.fromstring(detailed_response.content)
            db_details = detailed_root.find(".//DbInfo")
            
            if db_details is not None:
                description = db_details.find("Description").text
                count = db_details.find("Count").text
                last_update = db_details.find("LastUpdate").text
                # Append details to the data list
                data.append([db_name, description, count, last_update])
        else:
            print(f"Error fetching details for {db_name}: {detailed_response.status_code}")
else:
    print(f"Error: {response.status_code}")

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=["Database Name", "Description", "Record Count", "Last Update"])

# Optionally, save the DataFrame to a CSV file
df.to_csv("entrez_databases.csv", index=False)

# Display the DataFrame
df






In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Base URL for E-utilities
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# EFetch URL to get information about a specific BioProject
efetch_url = base_url + "efetch.fcgi?db=bioproject&id=PRJNA288601&retmode=xml"

# Send request to EFetch
response = requests.get(efetch_url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the XML response
    root = ET.fromstring(response.content)
    
    # Extract relevant information
    project_title = root.find(".//Project/ProjectDescr/Title").text if root.find(".//Project/ProjectDescr/Title") is not None else "N/A"
    project_description = root.find(".//Project/ProjectDescr/Description").text if root.find(".//Project/ProjectDescr/Description") is not None else "N/A"
    project_accession = root.find(".//Project/ProjectID/ArchiveID").attrib.get('accession') if root.find(".//Project/ProjectID/ArchiveID") is not None else "N/A"

    
    # Create a DataFrame
    data = [[project_title, project_description, project_accession]]
    df = pd.DataFrame(data, columns=["Title", "Description", "Accession"])
    
    # Optionally, save the DataFrame to a CSV file
    df.to_csv("bioproject_PRJNA288601.csv", index=False)
    
    # Display the DataFrame
    #print(df)
    
    
else:
    print(f"Error: {response.status_code}")

df

In [None]:
import requests
import xml.etree.ElementTree as ET

def get_bioproject_linked_databases():
    # Base URL for E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    
    # EInfo URL to get information about the BioProject database
    einfo_url = base_url + "einfo.fcgi?db=bioproject"
    
    # Send request to EInfo
    response = requests.get(einfo_url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the XML response
        root = ET.fromstring(response.content)
        
        # Extract linked databases
        linked_dbs = set()
        for link in root.findall(".//Link"):
            db_to = link.find("DbTo")
            if db_to is not None:
                linked_dbs.add(db_to.text)
        
        return linked_dbs
    else:
        print(f"Error: {response.status_code}")
        return None

linked_databases = get_bioproject_linked_databases()

if linked_databases:
    print("All linked NCBI Databases with BioProject:")
    for db in linked_databases:
        print(db)
else:
    print("No linked databases found or an error occurred.")



In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_details(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    efetch_url = f"{base_url}efetch.fcgi?db=bioproject&id={bioproject_id}&retmode=xml"
    
    response = requests.get(efetch_url)
    if response.status_code == 200:
        return ET.fromstring(response.content)
    else:
        print(f"Error fetching BioProject details: {response.status_code}")
        return None

def fetch_linked_records(bioproject_id, db):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    elink_url = f"{base_url}elink.fcgi?dbfrom=bioproject&db={db}&id={bioproject_id}&cmd=neighbor"
    
    response = requests.get(elink_url)
    if response.status_code == 200:
        return ET.fromstring(response.content)
    else:
        print(f"Error fetching linked records: {response.status_code}")
        return None

def fetch_linked_record_details(ids, db):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    efetch_url = f"{base_url}efetch.fcgi?db={db}&id={','.join(ids)}&retmode=xml&rettype=docsum"
    
    response = requests.get(efetch_url)
    if response.status_code == 200:
        return ET.fromstring(response.content)
    else:
        print(f"Error fetching linked record details: {response.status_code}")
        return None

# Main script
bioproject_id = "PRJNA288601"
bioproject_details = fetch_bioproject_details(bioproject_id)

# Extract and print basic BioProject details
if bioproject_details is not None:
    project_id = bioproject_details.find(".//ArchiveID").attrib.get('accession', 'N/A')
    title = bioproject_details.find(".//ProjectDescr/Title").text if bioproject_details.find(".//ProjectDescr/Title") is not None else "N/A"
    description = bioproject_details.find(".//ProjectDescr/Description").text if bioproject_details.find(".//ProjectDescr/Description") is not None else "N/A"
    access = bioproject_details.find(".//ProjectDescr/Access").text if bioproject_details.find(".//ProjectDescr/Access") is not None else "N/A"
    release_date = bioproject_details.find(".//ProjectDescr/ProjectReleaseDate").text if bioproject_details.find(".//ProjectDescr/ProjectReleaseDate") is not None else "N/A"
    last_update = bioproject_details.find(".//ProjectDescr/ProjectUpdateDate").text if bioproject_details.find(".//ProjectDescr/ProjectUpdateDate") is not None else "N/A"

    print(f"Project ID: {project_id}")
    print(f"Title: {title}")
    print(f"Description: {description}")
    print(f"Access: {access}")
    print(f"Release Date: {release_date}")
    print(f"Last Update: {last_update}")

# Fetch linked records in various databases
linked_dbs = ["biosample", "sra", "nuccore", "nucest", "nucgss","assembly"]
for db in linked_dbs:
    linked_records = fetch_linked_records(bioproject_id, db)
    if linked_records is not None:
        id_list = linked_records.findall(".//LinkSetDb/Link/Id")
        ids = [id_elem.text for id_elem in id_list]
        if ids:
            linked_details = fetch_linked_record_details(ids, db)
            if linked_details is not None:
                # Extract and print details for each linked record
                for record in linked_details.findall(".//DocSum"):
                    record_id = record.find("Id").text if record.find("Id") is not None else "N/A"
                    summary = record.find("Item[@Name='Title']").text if record.find("Item[@Name='Title']") is not None else "N/A"
                    print(f"{db} ID: {record_id}")
                    print(f"Summary: {summary}")
        else:
            print(f"No linked records found for {db}")
    else:
        print(f"No linked records found for {db}")





In [2]:
pip install --upgrade google-api-python-client

Collecting google-api-python-clientNote: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'c:\\Python312\\Scripts\\normalizer.exe' -> 'c:\\Python312\\Scripts\\normalizer.exe.deleteme'




  Downloading google_api_python_client-2.132.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 (from google-api-python-client)
  Downloading google_auth-2.30.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 (from google-api-python-client)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-ap

In [2]:
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

query = (
    "SELECT name FROM [bigquery-public-data:usa_names.usa_1910_2013] "
    'WHERE state = "TX" '
    "LIMIT 100"
)

# Set use_legacy_sql to True to use legacy SQL syntax.
job_config = bigquery.QueryJobConfig(use_legacy_sql=True)

# Start the query and waits for query job to complete, passing in the extra configuration.
results = client.query_and_wait(
    query, job_config=job_config
)  # Make an API request.

print("The query data:")
for row in results:
    print(row)


ModuleNotFoundError: No module named 'google.cloud'

In [1]:
from google.cloud import bigquery

client = bigquery.Client()

# Perform a query.
QUERY = (
    'SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` '
    'WHERE state = "TX" '
    'LIMIT 100')
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

for row in rows:
    print(row.name)

ModuleNotFoundError: No module named 'google.cloud'

In [1]:
import requests
import csv

# Define the API endpoint and parameters
api_url = "https://api.ncbi.nlm.nih.gov/pathogen/v2/pathogenisolate"  # Update this URL based on the actual API endpoint
params = {
    "pathogen": "Enterobacteriaceae",  # Example pathogen
    "collection_date": "2023-01-01:2023-12-31",  # Example date range
    # Add additional parameters as required
}

# Send a GET request to the API
response = requests.get(api_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Define the CSV file path
    csv_file_path = "isolates_data.csv"
    
    # Write data to CSV
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(data[0].keys())
        # Write the data
        for isolate in data:
            writer.writerow(isolate.values())
    
    print(f"Data successfully saved to {csv_file_path}")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")



Failed to fetch data. Status code: 404


In [2]:
import requests
import csv
import xml.etree.ElementTree as ET

# Define the API endpoint and parameters
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
search_url = base_url + "esearch.fcgi"
fetch_url = base_url + "efetch.fcgi"

# Define search parameters
search_params = {
    "db": "nucleotide",  # or the appropriate database, e.g., "genome", "assembly"
    "term": "Enterobacteriaceae[Organism]",  # Replace with your search term
    "retmax": 10,  # Number of records to fetch, increase as needed
    "retmode": "xml"
}

# Send search request to NCBI
search_response = requests.get(search_url, params=search_params)

if search_response.status_code == 200:
    # Parse the search results
    search_data = ET.fromstring(search_response.content)
    id_list = [id_elem.text for id_elem in search_data.findall('.//Id')]
    
    # Fetch the detailed records
    fetch_params = {
        "db": "nucleotide",  # Same as the search database
        "id": ",".join(id_list),  # Comma-separated list of IDs
        "retmode": "xml"
    }
    fetch_response = requests.get(fetch_url, params=fetch_params)
    
    if fetch_response.status_code == 200:
        # Parse the fetch results
        fetch_data = ET.fromstring(fetch_response.content)
        
        # Define the CSV file path
        csv_file_path = "isolates_data.csv"
        
        # Write data to CSV
        with open(csv_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["ID", "Title", "Source"])  # Update with appropriate headers
            
            for docsum in fetch_data.findall('.//DocSum'):
                uid = docsum.find('.//Id').text
                title = docsum.find('.//Item[@Name="Title"]').text
                source = docsum.find('.//Item[@Name="Source"]').text
                writer.writerow([uid, title, source])
        
        print(f"Data successfully saved to {csv_file_path}")
    else:
        print(f"Failed to fetch details. Status code: {fetch_response.status_code}")
else:
    print(f"Failed to search. Status code: {search_response.status_code}")


Data successfully saved to isolates_data.csv


In [5]:
import requests
import csv
import xml.etree.ElementTree as ET

# Define the API endpoints
search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Define search parameters
search_params = {
    "db": "nucleotide",  # Database to search
    "term": "Enterobacteriaceae[Organism]",  # Search term
    "retmax": 10,  # Number of records to fetch
    "retmode": "xml"
}

# Send search request to NCBI
search_response = requests.get(search_url, params=search_params)

if search_response.status_code == 200:
    # Parse the search results
    search_data = ET.fromstring(search_response.content)
    id_list = [id_elem.text for id_elem in search_data.findall('.//Id')]
    
    if id_list:
        # Fetch the detailed records
        fetch_params = {
            "db": "nucleotide",  # Same as the search database
            "id": ",".join(id_list),  # Comma-separated list of IDs
            "retmode": "xml"
        }
        fetch_response = requests.get(fetch_url, params=fetch_params)
        
        if fetch_response.status_code == 200:
            # Parse the fetch results
            fetch_data = ET.fromstring(fetch_response.content)
            
            # Define the CSV file path
            csv_file_path = "isolates_data.csv"
            
            # Write data to CSV
            with open(csv_file_path, mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["ID", "Title", "Source"])  # Update with appropriate headers
                
                for docsum in fetch_data.findall('.//Seq-entry'):
                    uid = docsum.find('.//Seq-id_other/Textseq-id_accession').text if docsum.find('.//Seq-id_other/Textseq-id_accession') else 'N/A'
                    title = docsum.find('.//TSeq_gi').text if docsum.find('.//TSeq_gi') else 'N/A'
                    source = docsum.find('.//TSeq_taxid').text if docsum.find('.//TSeq_taxid') else 'N/A'
                    writer.writerow([uid, title, source])
            
            print(f"Data successfully saved to {csv_file_path}")
        else:
            print(f"Failed to fetch details. Status code: {fetch_response.status_code}")
    else:
        print("No IDs found in the search results.")
else:
    print(f"Failed to search. Status code: {search_response.status_code}")


Data successfully saved to isolates_data.csv


In [7]:
import requests
import csv
import xml.etree.ElementTree as ET

# Define the API endpoints
search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Define search parameters
search_params = {
    "db": "nucleotide",  # Database to search
    "term": "Enterobacteriaceae[Organism]",  # Search term
    "retmax": 10,  # Number of records to fetch
    "retmode": "xml"
}

# Send search request to NCBI
search_response = requests.get(search_url, params=search_params)

if search_response.status_code == 200:
    # Parse the search results
    search_data = ET.fromstring(search_response.content)
    id_list = [id_elem.text for id_elem in search_data.findall('.//Id')]
    
    print(f"Found {len(id_list)} IDs")
    if id_list:
        # Fetch the detailed records
        fetch_params = {
            "db": "nucleotide",  # Same as the search database
            "id": ",".join(id_list),  # Comma-separated list of IDs
            "retmode": "xml"
        }
        fetch_response = requests.get(fetch_url, params=fetch_params)
        
        if fetch_response.status_code == 200:
            # Parse the fetch results
            fetch_data = ET.fromstring(fetch_response.content)
            
            # Define the CSV file path
            csv_file_path = "isolates_data.csv"
            
            # Write data to CSV
            with open(csv_file_path, mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["ID", "Title", "Source"])  # Update with appropriate headers
                
                for docsum in fetch_data.findall('.//TSeq'):
                    uid = docsum.find('.//TSeq_accver').text if docsum.find('.//TSeq_accver') else 'N/A'
                    title = docsum.find('.//TSeq_orgname').text if docsum.find('.//TSeq_orgname') else 'N/A'
                    source = docsum.find('.//TSeq_taxid').text if docsum.find('.//TSeq_taxid') else 'N/A'
                    writer.writerow([uid, title, source])
            
            print(f"Data successfully saved to {csv_file_path}")
        else:
            print(f"Failed to fetch details. Status code: {fetch_response.status_code}")
            print(fetch_response.text)  # Print the response for debugging
    else:
        print("No IDs found in the search results.")
else:
    print(f"Failed to search. Status code: {search_response.status_code}")
    print(search_response.text)  # Print the response for debugging


Found 10 IDs
Data successfully saved to isolates_data.csv


In [13]:
import requests
import csv
import xml.etree.ElementTree as ET

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "bioproject",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch isolate data: {response.status_code}")
    
    return response.text

def parse_isolate_data(xml_data):
    root = ET.fromstring(xml_data)
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        return

    with open(filename, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys())
        writer.writeheader()
        for row in data:
            writer.writerow(row)

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    bioproject_ids = parse_bioproject_data(xml_data)
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        isolate_xml = fetch_isolate_data(bioproject_id)
        isolate_data = parse_isolate_data(isolate_xml)
        all_isolate_data.extend(isolate_data)
    
    save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")

if __name__ == "__main__":
    main()
isolate_data

Data saved to isolates_data.csv


NameError: name 'isolate_data' is not defined

In [14]:
import requests
import csv
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "bioproject",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch isolate data: {response.status_code}")
    
    return response.text

def parse_isolate_data(xml_data):
    root = ET.fromstring(xml_data)
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        return
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    bioproject_ids = parse_bioproject_data(xml_data)
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        isolate_xml = fetch_isolate_data(bioproject_id)
        isolate_data = parse_isolate_data(isolate_xml)
        all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Data saved to isolates_data.csv


AttributeError: 'NoneType' object has no attribute 'head'

In [16]:
import requests
import csv
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "bioproject",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch isolate data: {response.status_code}")
    
    return response.text

def parse_isolate_data(xml_data):
    root = ET.fromstring(xml_data)
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        isolate_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched isolate data for {bioproject_id}: {isolate_xml[:500]}")  # Debug print
        
        isolate_data = parse_isolate_data(isolate_xml)
        print(f"Parsed isolate data for {bioproject_id}: {isolate_data[:5]}")  # Debug print
        
        all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched isolate data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary bioproject 20140903//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20140903/esummary_bioproject.dtd">
<eSummaryResult>
<DocumentSummarySet status="OK">
<DbBuild>Build240609-1500.1</DbBuild>

<DocumentSummary uid="288601">
	<TaxId>0</TaxId>
	<Project_Id>288601</Project_Id>
	<Project_Acc>PRJNA288601</Project_Acc>
	<Pro

In [19]:
import requests
import csv
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "bioproject",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch isolate data: {response.status_code}")
    
    return response.text

def parse_isolate_data(xml_data):
    root = ET.fromstring(xml_data)
    print(f"XML Structure:\n{ET.tostring(root, encoding='utf8').decode('utf8')}")
    
    isolate_data = []
    for docsum in root.findall(".//DocumentSummary"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        isolate_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched isolate data for {bioproject_id}: {isolate_xml[:500]}")  # Debug print
        
        isolate_data = parse_isolate_data(isolate_xml)
        print(f"Parsed isolate data for {bioproject_id}: {isolate_data[:5]}")  # Debug print
        
        all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched isolate data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary bioproject 20140903//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20140903/esummary_bioproject.dtd">
<eSummaryResult>
<DocumentSummarySet status="OK">
<DbBuild>Build240609-1500.1</DbBuild>

<DocumentSummary uid="288601">
	<TaxId>0</TaxId>
	<Project_Id>288601</Project_Id>
	<Project_Acc>PRJNA288601</Project_Acc>
	<Pro

In [20]:
import requests
import csv
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    query_params = {
        "dbfrom": "bioproject",
        "db": "biosample",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch linked data: {response.status_code}")
    
    return response.text

def parse_linked_data(xml_data):
    root = ET.fromstring(xml_data)
    linked_ids = [id_elem.text for id_elem in root.findall(".//LinkSetDb//Link//Id")]
    return linked_ids

def fetch_summary_data(linked_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "biosample",
        "id": linked_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch summary data: {response.status_code}")
    
    return response.text

def parse_summary_data(xml_data):
    root = ET.fromstring(xml_data)
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        linked_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched linked data for {bioproject_id}: {linked_xml[:500]}")  # Debug print
        
        linked_ids = parse_linked_data(linked_xml)
        print(f"Linked IDs for {bioproject_id}: {linked_ids}")  # Debug print
        
        for linked_id in linked_ids:
            summary_xml = fetch_summary_data(linked_id)
            print(f"Fetched summary data for {linked_id}: {summary_xml[:500]}")  # Debug print
            
            isolate_data = parse_summary_data(summary_xml)
            print(f"Parsed summary data for {linked_id}: {isolate_data[:5]}")  # Debug print
            
            all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched linked data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
<eLinkResult>

  <LinkSet>
    <DbFrom>bioproject</DbFrom>
    <IdList>
      <Id>288601</Id>
    </IdList>
    <LinkSetDb>
      <DbTo>biosample</DbTo>
      <LinkName>bioproject_biosample_sp</LinkName>
      
        <Link>
				<Id>41748333</Id>


KeyboardInterrupt: 

In [24]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    query_params = {
        "dbfrom": "bioproject",
        "db": "biosample",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch linked data: {response.status_code}")
    
    return response.text

def parse_linked_data(xml_data):
    root = ET.fromstring(xml_data)
    linked_ids = [id_elem.text for id_elem in root.findall(".//LinkSetDb//Link//Id")]
    return linked_ids

def fetch_summary_data(linked_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "biosample",
        "id": linked_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch summary data: {response.status_code}")
    
    return response.text

def parse_summary_data(xml_data):
    root = ET.fromstring(xml_data)
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        linked_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched linked data for {bioproject_id}: {linked_xml[:500]}")  # Debug print
        
        linked_ids = parse_linked_data(linked_xml)
        print(f"Linked IDs for {bioproject_id}: {linked_ids[:20]}")  # Debug print  # Only show first 5 IDs for debugging
        
        for linked_id in linked_ids[:5]:  # Limit to 5 records for testing
            summary_xml = fetch_summary_data(linked_id)
            print(f"Fetched summary data for {linked_id}: {summary_xml[:500]}")  # Debug print
            
            isolate_data = parse_summary_data(summary_xml)
            print(f"Parsed summary data for {linked_id}: {isolate_data[:5]}")  # Debug print
            
            all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched linked data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
<eLinkResult>

  <LinkSet>
    <DbFrom>bioproject</DbFrom>
    <IdList>
      <Id>288601</Id>
    </IdList>
    <LinkSetDb>
      <DbTo>biosample</DbTo>
      <LinkName>bioproject_biosample_sp</LinkName>
      
        <Link>
				<Id>41748333</Id>


In [25]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    query_params = {
        "dbfrom": "bioproject",
        "db": "biosample",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch linked data: {response.status_code}")
    
    return response.text

def parse_linked_data(xml_data):
    root = ET.fromstring(xml_data)
    linked_ids = [id_elem.text for id_elem in root.findall(".//LinkSetDb//Link//Id")]
    return linked_ids

def fetch_summary_data(linked_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "biosample",
        "id": linked_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch summary data: {response.status_code}")
    
    return response.text

def parse_summary_data(xml_data):
    root = ET.fromstring(xml_data)
    print(f"Summary XML Structure:\n{ET.tostring(root, encoding='utf8').decode('utf8')}")
    
    isolate_data = []
    for docsum in root.findall(".//DocSum"):
        isolate = {}
        for item in docsum.findall(".//Item"):
            isolate[item.get('Name')] = item.text
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        linked_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched linked data for {bioproject_id}: {linked_xml[:500]}")  # Debug print
        
        linked_ids = parse_linked_data(linked_xml)
        print(f"Linked IDs for {bioproject_id}: {linked_ids[:5]}")  # Debug print  # Only show first 5 IDs for debugging
        
        for linked_id in linked_ids[:5]:  # Limit to 5 records for testing
            summary_xml = fetch_summary_data(linked_id)
            print(f"Fetched summary data for {linked_id}: {summary_xml[:500]}")  # Debug print
            
            isolate_data = parse_summary_data(summary_xml)
            print(f"Parsed summary data for {linked_id}: {isolate_data[:5]}")  # Debug print
            
            all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched linked data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
<eLinkResult>

  <LinkSet>
    <DbFrom>bioproject</DbFrom>
    <IdList>
      <Id>288601</Id>
    </IdList>
    <LinkSetDb>
      <DbTo>biosample</DbTo>
      <LinkName>bioproject_biosample_sp</LinkName>
      
        <Link>
				<Id>41748333</Id>


In [26]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_bioproject_data(bioproject_acc):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "bioproject",
        "term": bioproject_acc,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    
    return response.text

def parse_bioproject_data(xml_data):
    root = ET.fromstring(xml_data)
    ids = [id_elem.text for id_elem in root.findall(".//Id")]
    return ids

def fetch_isolate_data(bioproject_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    query_params = {
        "dbfrom": "bioproject",
        "db": "biosample",
        "id": bioproject_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch linked data: {response.status_code}")
    
    return response.text

def parse_linked_data(xml_data):
    root = ET.fromstring(xml_data)
    linked_ids = [id_elem.text for id_elem in root.findall(".//LinkSetDb//Link//Id")]
    return linked_ids

def fetch_summary_data(linked_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query_params = {
        "db": "biosample",
        "id": linked_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=query_params)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch summary data: {response.status_code}")
    
    return response.text

def parse_summary_data(xml_data):
    root = ET.fromstring(xml_data)
    print(f"Summary XML Structure:\n{ET.tostring(root, encoding='utf8').decode('utf8')}")
    
    isolate_data = []
    for docsum in root.findall(".//DocumentSummary"):
        isolate = {}
        isolate['Title'] = docsum.findtext('Title')
        isolate['Accession'] = docsum.findtext('Accession')
        isolate['Date'] = docsum.findtext('Date')
        isolate['PublicationDate'] = docsum.findtext('PublicationDate')
        isolate['ModificationDate'] = docsum.findtext('ModificationDate')
        isolate['Organization'] = docsum.findtext('Organization')
        isolate['Taxonomy'] = docsum.findtext('Taxonomy')
        isolate['Organism'] = docsum.findtext('Organism')
        isolate['SourceSample'] = docsum.findtext('SourceSample')
        isolate['Identifiers'] = docsum.findtext('Identifiers')
        isolate['Infraspecies'] = docsum.findtext('Infraspecies')
        isolate['Package'] = docsum.findtext('Package')
        isolate['SortKey'] = docsum.findtext('SortKey')
        
        # Parsing SampleData as well
        sample_data = docsum.find('SampleData')
        if sample_data is not None:
            sample_data_text = ET.tostring(sample_data, encoding='utf8').decode('utf8')
            isolate['SampleData'] = sample_data_text
        
        isolate_data.append(isolate)
    return isolate_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return pd.DataFrame()  # Return an empty DataFrame if data is empty
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

def main():
    bioproject_acc = "PRJNA288601"
    xml_data = fetch_bioproject_data(bioproject_acc)
    print(f"Fetched BioProject data: {xml_data[:500]}")  # Debug print
    
    bioproject_ids = parse_bioproject_data(xml_data)
    print(f"BioProject IDs: {bioproject_ids}")  # Debug print
    
    all_isolate_data = []
    for bioproject_id in bioproject_ids:
        linked_xml = fetch_isolate_data(bioproject_id)
        print(f"Fetched linked data for {bioproject_id}: {linked_xml[:500]}")  # Debug print
        
        linked_ids = parse_linked_data(linked_xml)
        print(f"Linked IDs for {bioproject_id}: {linked_ids[:5]}")  # Debug print  # Only show first 5 IDs for debugging
        
        for linked_id in linked_ids[:5]:  # Limit to 5 records for testing
            summary_xml = fetch_summary_data(linked_id)
            print(f"Fetched summary data for {linked_id}: {summary_xml[:500]}")  # Debug print
            
            isolate_data = parse_summary_data(summary_xml)
            print(f"Parsed summary data for {linked_id}: {isolate_data[:5]}")  # Debug print
            
            all_isolate_data.extend(isolate_data)
    
    df = save_to_csv(all_isolate_data, "isolates_data.csv")
    print("Data saved to isolates_data.csv")
    return df

if __name__ == "__main__":
    df = main()
    print(df.head())  # Display the first few rows of the DataFrame


Fetched BioProject data: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>288601</Id>
</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>PRJNA288601[All Fields]</Term>    <Field>All Fields</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTrans
BioProject IDs: ['288601']
Fetched linked data for 288601: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
<eLinkResult>

  <LinkSet>
    <DbFrom>bioproject</DbFrom>
    <IdList>
      <Id>288601</Id>
    </IdList>
    <LinkSetDb>
      <DbTo>biosample</DbTo>
      <LinkName>bioproject_biosample_sp</LinkName>
      
        <Link>
				<Id>41748333</Id>


In [28]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Open the URL
driver.get("https://www.ncbi.nlm.nih.gov/pathogens/isolates/#bioproject_acc:(%22PRJNA288601%22)")

# Add a wait to ensure the page loads
time.sleep(5)  # Adjust the sleep time as necessary

# Locate the download button and click it
download_button = driver.find_element(By.XPATH, '//button[text()="Download"]')  # You might need to adjust this XPath
download_button.click()

# Wait for the download options to appear and select CSV
time.sleep(2)  # Adjust the sleep time as necessary
csv_option = driver.find_element(By.XPATH, '//option[text()="Excel (.csv)"]')  # You might need to adjust this XPath
csv_option.click()

# Confirm the download
download_confirm_button = driver.find_element(By.XPATH, '//button[text()="Download"]')  # Adjust the XPath if necessary
download_confirm_button.click()

# Wait for the download to complete
time.sleep(10)  # Adjust the sleep time as necessary

# Close the browser
driver.quit()


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[text()="Download"]"}
  (Session info: chrome=125.0.6422.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0118B8E3+45827]
	(No symbol) [0x0111DCC4]
	(No symbol) [0x0101150F]
	(No symbol) [0x010520BC]
	(No symbol) [0x0105216B]
	(No symbol) [0x0108E0F2]
	(No symbol) [0x01072E44]
	(No symbol) [0x0108C034]
	(No symbol) [0x01072B96]
	(No symbol) [0x01046998]
	(No symbol) [0x0104751D]
	GetHandleVerifier [0x01444513+2899763]
	GetHandleVerifier [0x0149793D+3240797]
	GetHandleVerifier [0x012113B4+593364]
	GetHandleVerifier [0x012182DC+621820]
	(No symbol) [0x011270A4]
	(No symbol) [0x011237A8]
	(No symbol) [0x01123947]
	(No symbol) [0x011159FE]
	BaseThreadInitThunk [0x75927BA9+25]
	RtlInitializeExceptionChain [0x7745BE7B+107]
	RtlClearBits [0x7745BDFF+191]


In [29]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Open the URL
driver.get("https://www.ncbi.nlm.nih.gov/pathogens/isolates/#bioproject_acc:(%22PRJNA288601%22)")

try:
    # Wait for the table to load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
    )
    
    # Locate the download button and click it
    download_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="Download"]'))
    )
    download_button.click()

    # Wait for the download options to appear and select CSV
    csv_option = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//option[text()="Excel (.csv)"]'))
    )
    csv_option.click()

    # Confirm the download
    download_confirm_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="Download"]'))
    )
    download_confirm_button.click()

    # Wait for the download to complete
    time.sleep(10)  # Adjust the sleep time as necessary

finally:
    # Close the browser
    driver.quit()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x0118B8E3+45827]
	(No symbol) [0x0111DCC4]
	(No symbol) [0x0101150F]
	(No symbol) [0x010520BC]
	(No symbol) [0x0105216B]
	(No symbol) [0x0108E0F2]
	(No symbol) [0x01072E44]
	(No symbol) [0x0108C034]
	(No symbol) [0x01072B96]
	(No symbol) [0x01046998]
	(No symbol) [0x0104751D]
	GetHandleVerifier [0x01444513+2899763]
	GetHandleVerifier [0x0149793D+3240797]
	GetHandleVerifier [0x012113B4+593364]
	GetHandleVerifier [0x012182DC+621820]
	(No symbol) [0x011270A4]
	(No symbol) [0x011237A8]
	(No symbol) [0x01123947]
	(No symbol) [0x011159FE]
	BaseThreadInitThunk [0x75927BA9+25]
	RtlInitializeExceptionChain [0x7745BE7B+107]
	RtlClearBits [0x7745BDFF+191]
