In [None]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import random

# Define the URLs and parameters for the data.
# Both requests go to the same E-utilities efetch endpoint; the target
# database is selected per request through the "db" query parameter.
bioproject_id = "PRJNA288601"
efetch_bioproject_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
efetch_biosample_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Fetch the isolate data from the BioProject
efetch_params = {
    "db": "bioproject",
    "id": bioproject_id,
    "retmode": "xml",
}
# requests applies no timeout by default, so an unresponsive server would
# block this call forever; bound the wait explicitly (seconds).
response = requests.get(efetch_bioproject_url, params=efetch_params, timeout=30)
# Abort early on an HTTP error: nothing below can work without this document.
response.raise_for_status()

# Parse the BioProject XML content
root = ET.fromstring(response.content)
isolates = root.findall(".//ProjectDescr/LocusTagPrefix")

# Parse each isolate to get biosample_id and locus_tag_prefix.
# Entries missing either the biosample_id attribute or the element text are
# skipped, because both values are needed further down.
parsed_isolates = []
for isolate in isolates:
    biosample_id = isolate.attrib.get('biosample_id')
    locus_tag_prefix = isolate.text
    if biosample_id and locus_tag_prefix:
        parsed_isolates.append({
            'biosample_id': biosample_id,
            'locus_tag_prefix': locus_tag_prefix
        })

# Check the initial number of isolates
print(f"Number of parsed isolates: {len(parsed_isolates)}")

# Randomly select up to 20 biosample IDs for testing.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of parsed isolates.
sample_size = 20
if len(parsed_isolates) < sample_size:
    print(f"Only {len(parsed_isolates)} isolates available; sampling all of them instead of {sample_size}")
sample_isolates = random.sample(parsed_isolates, min(sample_size, len(parsed_isolates)))

# Function to parse additional data
def parse_additional_data(biosample_root):
    """Extract the known metadata fields from a parsed BioSample XML tree.

    Args:
        biosample_root: An ``xml.etree.ElementTree.Element`` to search. Every
            lookup uses a descendant (``.//``) XPath, so the root element
            itself is never matched.

    Returns:
        A dict mapping output field names to the text found for them. A field
        whose lookup yields ``None`` (no matching element, or no ``when``
        attribute on the live ``Status`` element) is left out entirely, so
        the dict may contain any subset of the fields, including none.
        Keys appear in a fixed order, which determines column order when the
        dicts are later turned into a DataFrame.
    """
    text_of = biosample_root.findtext

    def attribute(name):
        # Text of the first <Attribute attribute_name="..."> descendant.
        return text_of(f".//Attribute[@attribute_name='{name}']")

    # The creation date is the "when" attribute of the live Status element.
    live_status = biosample_root.find(".//Status[@status='live']")
    if live_status is not None:
        created = live_status.attrib.get('when')
    else:
        created = None

    # (output key, extracted value) pairs, listed in output order.
    extracted = [
        ('biosample_id', text_of(".//Ids/Id[@db='BioSample']")),
        ('organism_group', text_of(".//Organism/OrganismName")),
        ('strain', attribute('strain')),
        ('isolate_identifiers', text_of(".//Ids/Id[@db_label='Sample name']")),
        ('serovar', attribute('serovar')),
        ('isolate', attribute('isolate')),
        ('create_date', created),
        ('location', attribute('geo_loc_name')),
        ('isolation_source', attribute('isolation_source')),
        ('isolation_type', attribute('isolation_type')),
        ('food_origin', attribute('food_origin')),
        ('snp_cluster', attribute('snp_cluster')),
        ('min_same', attribute('min-same')),
        ('min_diff', attribute('min-diff')),
        ('assembly', attribute('assembly')),
        ('amr_genotypes', attribute('amr_genotypes')),
        ('computed_types', attribute('computed_types')),
        ('host', attribute('host')),
        ('collection_date', attribute('collection_date')),
        ('mlst', attribute('MLST')),
        ('sample_type', attribute('sample_type')),
        ('collected_by', attribute('collected_by')),
        ('host_disease', attribute('host_disease')),
        ('lat_lon', attribute('lat_lon')),
        ('sequenced_by', attribute('sequenced_by')),
    ]

    # Drop fields that were not found; empty-string values are kept.
    return {field: value for field, value in extracted if value is not None}

# Fetch additional data for the selected biosample_ids.
# Every failure mode (network error, non-200 status, unparseable XML, nothing
# extracted) is logged, recorded in error_biosamples and skipped, so a single
# bad record cannot abort the whole run.
additional_data = []
error_biosamples = []

for isolate in sample_isolates:
    biosample_id = isolate['biosample_id']
    efetch_params = {
        "db": "biosample",
        "id": biosample_id,
        "retmode": "xml",
    }

    try:
        # requests applies no timeout by default; bound the wait (seconds)
        # so an unresponsive server cannot hang the loop.
        response = requests.get(efetch_biosample_url, params=efetch_params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for biosample_id: {biosample_id} ({exc})")
        error_biosamples.append(biosample_id)
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data for biosample_id: {biosample_id}")
        error_biosamples.append(biosample_id)
        continue

    # Parse the biosample XML content
    try:
        biosample_root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"Invalid XML for biosample_id: {biosample_id} ({exc})")
        error_biosamples.append(biosample_id)
        continue

    # Extract additional data
    biosample_data = parse_additional_data(biosample_root)

    # parse_additional_data already drops every field whose value is None,
    # so scanning the result for None values can never find anything. The
    # detectable failure is a response from which no field at all could be
    # extracted.
    if not biosample_data:
        print(f"Incomplete data for biosample_id: {biosample_id}, no fields could be extracted")
        error_biosamples.append(biosample_id)
        continue

    # The merge below joins on biosample_id. If the response did not carry
    # one, fall back to the ID that was requested so the record is not
    # silently dropped from the join.
    biosample_data.setdefault('biosample_id', biosample_id)
    additional_data.append(biosample_data)

# Convert parsed isolates and additional data to DataFrames.
# Column names are given explicitly where needed so that the join key exists
# even when a list is empty; otherwise pd.merge raises KeyError.
df_sample_isolates = pd.DataFrame(sample_isolates, columns=['biosample_id', 'locus_tag_prefix'])
if additional_data:
    df_additional_data = pd.DataFrame(additional_data)
else:
    df_additional_data = pd.DataFrame(columns=['biosample_id'])

# Merge the two DataFrames on biosample_id.
# A left join keeps every sampled isolate, including those whose BioSample
# fetch failed (their extra columns are left empty).
df_merged = pd.merge(df_sample_isolates, df_additional_data, on="biosample_id", how="left")

# Save the merged DataFrame to a CSV file
output_file = "isolates_sample_detailed.csv"
df_merged.to_csv(output_file, index=False)

# Log the final number of records
print(f"Final number of records in CSV: {len(df_merged)}")

# Output the path of the saved file for download
output_file