# Get information about the available genomes given a list of taxons

- Guillem Ylla, Ph.D., 2021, Post-Doctoral Fellow at Harvard University 

In [122]:
import sys
import zipfile
import pandas as pd
from pprint import pprint
from datetime import datetime
from collections import defaultdict, Counter
import ncbi.datasets

- This script uses the ncbi.datasets python library. (More info https://www.ncbi.nlm.nih.gov/datasets/)

In [123]:
## Set up api
# Create the API client first, then wrap it in the genome endpoint object
# that the queries in this notebook call.
api_client = ncbi.datasets.ApiClient()
api_instance = ncbi.datasets.GenomeApi(api_client)

**Indicate file to read: here**
- The file must include the header "taxon" and 1 taxon per row

In [124]:
# Name of the file to read (replace "Taxons_to_check.txt" with your own file).
filetoread = "Taxons_to_check.txt"

Check first 5 lines of the file

In [125]:
# Load the taxon table from the file named by `filetoread`.
taxonlist = pd.read_csv(filetoread)
# Preview the first five rows (shown as the cell output in the notebook).
taxonlist.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'taxons_to_check.txt'

## Number of genome assemblies available per taxon
- 1 species might have more than 1 assembly

In [None]:
# Print, for every taxon in the input table, how many genome assemblies NCBI reports.
# Iterating the 'taxon' column directly avoids building a Series per row with
# iterrows() when only this one value is needed.
for tax_name in taxonlist['taxon']:
    # query NCBI
    genome_summary = api_instance.assembly_descriptors_by_taxon(taxon=tax_name, limit='all')
    # total_count can be None when nothing is returned for the taxon;
    # report that as 0 instead of printing "None".
    n_assemblies = genome_summary.total_count
    if n_assemblies is None:
        n_assemblies = 0
    print(f"- {tax_name}: assemblies; {n_assemblies}")

## List the species of the available genomes

In [None]:
# For every taxon in the input table, query NCBI and display one table row
# per assembly that is returned.
for _, row in taxonlist.iterrows():
    tax_name = row['taxon']
    columns = ['spp', 'acc', 'level',"numChrScaff","sub.date","org.rank","taxID","assembly.length"]
    # query NCBI
    genome_summary = api_instance.assembly_descriptors_by_taxon(taxon=tax_name, limit='all')
    print(f"{tax_name}; {genome_summary.total_count} assemblies")

    # A total_count of None means there is nothing to tabulate for this taxon.
    if genome_summary.total_count is None:
        print("No genomes")
        continue

    # Collect the fields of interest from each assembly descriptor,
    # in the same order as `columns`.
    records = [
        [
            asm.org.sci_name,
            asm.assembly_accession,
            asm.assembly_level,
            len(asm.chromosomes),
            asm.submission_date,
            asm.org.rank,
            asm.org.tax_id,
            asm.seq_length,
        ]
        for asm in (entry.assembly for entry in genome_summary.assemblies)
    ]
    genomesdf = pd.DataFrame(records, columns=columns)
    display(genomesdf)

## Number of genome assemblies available per Species

In [None]:
# Print, for every taxon in the input table, how many distinct species have
# at least one genome assembly at NCBI.

# Column labels of the per-assembly table; identical for every taxon, so
# they are defined once outside the loop.
columns = ['spp', 'acc', 'level',"numChrScaff","sub.date","org.rank","taxID","assembly.length"]

for tax_name in taxonlist['taxon']:
    # query NCBI
    genome_summary = api_instance.assembly_descriptors_by_taxon(taxon=tax_name, limit='all')

    if genome_summary.total_count is None:
        # Nothing returned for this taxon: report zero species, using the
        # same message format as the non-empty case below.
        print(f"{tax_name}; species with genome: ", 0)
        continue

    # One row per assembly, with fields in the same order as `columns`.
    lst = [
        [
            assembly.org.sci_name,
            assembly.assembly_accession,
            assembly.assembly_level,
            len(assembly.chromosomes),
            assembly.submission_date,
            assembly.org.rank,
            assembly.org.tax_id,
            assembly.seq_length,
        ]
        for assembly in (d.assembly for d in genome_summary.assemblies)
    ]
    genomesdf = pd.DataFrame(lst, columns=columns)
    # Several assemblies may belong to the same species, so count unique names.
    print(f"{tax_name}; species with genome: ", len(genomesdf.spp.unique()))
