## Optional: Find the accession number of each taxon's reference assembly for easier bulk download

In [21]:
import json
import subprocess as sp
from tqdm import tqdm
import pandas as pd

# Taxa to look up, written as NCBI scientific names with underscores in place
# of spaces. The order here is the row order of the resulting overview table.
taxa = [
    "Homo_sapiens",            # human
    "Macaca_mulatta",          # Rhesus monkey
    "Mus_musculus",            # house mouse
    "Canis_lupus_familiaris",  # dog
    "Equus_caballus",          # horse
    "Pipistrellus_kuhlii",     # Kuhl's pipistrelle
    "Bos_taurus",              # cattle
    "Loxodonta_africana",      # African savanna elephant
    "Gallus_gallus",           # chicken
    "Xenopus_tropicalis",      # tropical clawed frog
]


def parse_ncbidatasets_summary(taxon):
    """Find the reference (or representative) genome assembly of a taxon.

    Runs ``datasets summary genome taxon <name>`` and scans the JSON printed
    on stdout for the first assembly whose ``assembly_status`` is ``current``
    and whose ``refseq_category`` is ``reference genome`` or
    ``representative genome``.

    Args:
        taxon: Taxon name. It is lower-cased and underscores are replaced by
            spaces before querying, so ``"Homo_sapiens"`` is looked up as
            ``"homo sapiens"``.

    Returns:
        ``(accession, common_name)`` of the first matching assembly;
        ``common_name`` is ``None`` when the record carries none.
        ``(None, None)`` when no assembly qualifies, so the result can always
        be unpacked into two names.

    Raises:
        RuntimeError: The ``datasets`` command exited with a non-zero status.
        FileNotFoundError: The ``datasets`` executable is not on ``PATH``.
        json.JSONDecodeError: The command's stdout is not valid JSON.
    """
    query = taxon.lower().replace('_', ' ')

    # Pass an argument list rather than a shell string: the taxon name is
    # handed over verbatim, so quotes or other shell metacharacters in it can
    # neither break the command nor be interpreted by a shell.
    completed = sp.run(
        ['datasets', 'summary', 'genome', 'taxon', query],
        capture_output=True,
    )
    # Judge failure by exit status, and raise a real exception type (raising a
    # bare string is itself a TypeError).
    if completed.returncode != 0:
        message = completed.stderr.decode('UTF-8', errors='replace').strip()
        raise RuntimeError(
            f'datasets summary failed for "{query}" '
            f'(exit code {completed.returncode}): {message}'
        )

    summary = json.loads(completed.stdout)

    # Assembly records are read from the "reports" list; a summary without
    # that key (or with a null value) is treated as containing no assemblies.
    for assembly in summary.get('reports') or []:
        assembly_info = assembly['assembly_info']
        if assembly_info['assembly_status'] != 'current':
            continue
        refseq_category = assembly_info.get('refseq_category')
        if refseq_category in ('reference genome', 'representative genome'):
            return assembly['accession'], assembly['organism'].get('common_name')

    return None, None


def reference_accessions(taxlist):
    """Collect the reference assembly accession for each taxon in a list.

    Each taxon is passed to ``parse_ncbidatasets_summary``. Taxa for which an
    accession comes back contribute one row to the result; for the others a
    warning is printed and no row is added.

    Args:
        taxlist: Iterable of taxon names accepted by
            ``parse_ncbidatasets_summary``.

    Returns:
        A ``pandas.DataFrame`` with the string columns ``Name``,
        ``Common_name`` and ``Accession``, one row per taxon that had a
        reference assembly, in input order.
    """
    rows = []
    for taxon in tqdm(taxlist):
        hit = parse_ncbidatasets_summary(taxon)
        # The lookup may yield None instead of a pair when nothing matches;
        # unpacking None directly would raise TypeError and the warning below
        # could never be reached.
        gcf, specname = hit if hit is not None else (None, None)
        if gcf:
            rows.append([str(taxon), str(specname), str(gcf)])
        else:
            print(f'Warning: No reference assembly found for {taxon}')
    return pd.DataFrame(rows, columns=['Name', 'Common_name', 'Accession'])

# Resolve every taxon to its reference assembly, save the table as a
# tab-separated file without the index column, and render it in the notebook.
taxdf = reference_accessions(taxlist=taxa)
taxdf.to_csv(path_or_buf='../data/taxa_overview.tsv', index=False, sep='\t')
display(taxdf)

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.97s/it]


Unnamed: 0,Name,Common_name,Accession
0,Homo_sapiens,human,GCF_000001405.40
1,Macaca_mulatta,Rhesus monkey,GCF_003339765.1
2,Mus_musculus,house mouse,GCF_000001635.27
3,Canis_lupus_familiaris,dog,GCF_011100685.1
4,Equus_caballus,horse,GCF_002863925.1
5,Pipistrellus_kuhlii,Kuhl's pipistrelle,GCF_014108245.1
6,Bos_taurus,cattle,GCF_002263795.3
7,Loxodonta_africana,African savanna elephant,GCF_030014295.1
8,Gallus_gallus,chicken,GCF_016699485.2
9,Xenopus_tropicalis,tropical clawed frog,GCF_000004195.4
