In [1]:
import os
import hashlib
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from zipfile import ZipFile

import pandas as pd
from ncbi import datasets

In [2]:
def download_assembly_package(assembly_accessions, zipfile):
    """Download the NCBI Datasets assembly package for one accession.

    Parameters
    ----------
    assembly_accessions
        The accession to request. It is wrapped in a one-element list before
        being handed to the API.
    zipfile
        Path the response body is written to (opened in binary write mode).

    Returns
    -------
    None

    Notes
    -----
    This is best-effort: a ``datasets.ApiException`` is logged and swallowed
    rather than propagated. The request is issued before the output file is
    opened, so when the request itself fails no file is created and callers
    can detect the failure by checking whether ``zipfile`` exists.
    """
    import logging  # local import: not provided by the notebook's import cell

    dl_instance = datasets.DownloadApi(datasets.ApiClient())
    try:
        api_response = dl_instance.download_assembly_package(
            [assembly_accessions],
            include_sequence=True,
            _preload_content=False,
        )
        with open(zipfile, 'wb') as f:
            f.write(api_response.data)
    except datasets.ApiException as e:
        # Keep going with the remaining accessions, but leave a trace of the
        # failure instead of dropping it silently.
        logging.getLogger(__name__).warning(
            'Download failed for %s: %s', assembly_accessions, e
        )

def uncompress(zipfile, outfile):
    """Concatenate every ``*fna`` member of a zip archive into ``outfile``.

    Parameters
    ----------
    zipfile
        Path of the zip archive to read.
    outfile
        Path of the output file. Members whose name ends with ``'fna'`` are
        appended in archive order; an archive without such members yields an
        empty output file.

    Raises
    ------
    Any exception raised while opening or reading the archive, or while
    writing the output, is propagated to the caller.

    Notes
    -----
    Data is first written to ``<outfile>.part`` and moved into place only
    after every member has been read successfully. A failed extraction
    therefore never leaves a truncated ``outfile`` behind, and the archive
    handle is closed on every path.
    """
    partial = str(outfile) + '.part'
    try:
        with ZipFile(zipfile) as archive, open(partial, 'wb') as f:
            for sequence_file in archive.namelist():
                if sequence_file.endswith('fna'):
                    f.write(archive.read(sequence_file))
        # Only publish the result once it is complete.
        os.replace(partial, outfile)
    except BaseException:
        # Drop the incomplete temporary file, then let the caller see the error.
        if os.path.exists(partial):
            os.remove(partial)
        raise

def fn(assembly_accessions, zipfile, seqfile):
    """Download one assembly package, extract its sequences, delete the zip.

    Parameters
    ----------
    assembly_accessions
        Accession passed to ``download_assembly_package``.
    zipfile
        Path where the downloaded archive is stored temporarily.
    seqfile
        Path of the extracted sequence file.

    Notes
    -----
    If the download produced no archive, nothing else happens. Extraction
    errors are logged and swallowed so one bad archive does not stop the
    batch. The archive is removed exactly once whether or not extraction
    succeeded.
    """
    import logging  # local import: not provided by the notebook's import cell

    download_assembly_package(assembly_accessions=assembly_accessions, zipfile=zipfile)
    if not os.path.exists(zipfile):
        return

    had_output = os.path.exists(seqfile)
    try:
        uncompress(zipfile=zipfile, outfile=seqfile)
    except Exception as e:
        logging.getLogger(__name__).warning(
            'Extraction failed for %s: %s', assembly_accessions, e
        )
        # Remove a partial output created by this call so that a later run,
        # which skips accessions whose sequence file exists, retries it.
        if not had_output and os.path.exists(seqfile):
            os.remove(seqfile)
    finally:
        # Single removal point: the original removed the archive in the
        # except branch and then again unconditionally, which raised
        # FileNotFoundError after every failed extraction.
        os.remove(zipfile)

In [3]:
organism = 'Enterococcus faecalis'
# Numeric code for each assembly level; only code 1 ('Complete Genome') is kept below.
level = {'Chromosome': 2, 'Complete Genome': 1, 'Contig': 4, 'Scaffold': 3}

df = pd.read_csv(
    '/media/Central_Lab_Storage/NcbiASM/assembly_summary.txt',
    sep='\t',
    header=1,  # column names are taken from the second line of the file
    usecols=range(12),
    # Skip malformed rows. `error_bad_lines=False` was deprecated in
    # pandas 1.3 and removed in 2.0; this is its documented replacement.
    on_bad_lines='skip',
    low_memory=False,
)

df = df[df['organism_name'].notna()]
# .copy() gives an independent frame, so the column assignment below does not
# trigger SettingWithCopyWarning.
df = df[df['organism_name'].str.startswith(organism)].copy()
df['assembly_level'] = df['assembly_level'].map(level)
df = df[df['assembly_level'] == 1]

asm_accs = set(df['# assembly_accession'])
len(asm_accs)

64

In [3]:
# Load the prokaryotes table and keep only the rows that have a RefSeq FTP entry.
# NOTE(review): this rebinds `asm_accs`, replacing the set built from
# assembly_summary.txt in the preceding cell. The file sits under
# Shigella_dysenteriae while the download directory used later is
# Enterococcus_faecalis - confirm which accession set is intended.
asm_info = pd.read_csv('/media/Central_Lab_Storage/NcbiASM/Shigella_dysenteriae/prokaryotes.csv')
asm_info = asm_info.dropna(subset=['RefSeq FTP'])

asm_accs = set(asm_info['Assembly'])
len(asm_accs)

66

In [4]:
# Directory that holds the temporary '<accession>.zip' downloads and the
# extracted '<accession>.fa' sequence files.
dirpath = Path('/media/Central_Lab_Storage/NcbiASM/Enterococcus_faecalis/RefSeq')

In [5]:
# Make sure the target directory exists; otherwise every worker fails when it
# tries to open its zip path for writing, and that failure is never shown.
dirpath.mkdir(parents=True, exist_ok=True)

# Keep the futures so that worker failures can be reported afterwards.
futures = {}
with ThreadPoolExecutor(8) as executor:
    for acc in asm_accs:
        zipfile = dirpath/(acc + '.zip')
        seqfile = dirpath/(acc + '.fa')
        # Accessions whose sequence file is already present are skipped.
        if not seqfile.exists():
            futures[acc] = executor.submit(fn, acc, zipfile, seqfile)

# Leaving the `with` block waits for all workers, so every future is done here.
for acc, future in sorted(futures.items()):
    error = future.exception()
    if error is not None:
        print('{}: worker raised {!r}'.format(acc, error))
    elif not (dirpath/(acc + '.fa')).exists():
        print('{}: no sequence file was produced'.format(acc))

In [None]:
# assembly_summary.txt URL: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt