## Fetch assemblies

In [7]:
import pandas as pd
from Bio import Entrez
import json
# Contact e-mail used by Bio.Entrez. The value below looks like a placeholder;
# presumably it should be replaced with a real address before running
# (confirm against the NCBI E-utilities usage guidelines).
Entrez.email = "Your.Name.Here@example.org"

In [2]:
def get_ids(term):
    """Search the NCBI ``assembly`` database and return the matching UIDs.

    Args:
        term: Entrez query string, passed unchanged to ``Entrez.esearch``.

    Returns:
        A flat list with one entry per UID found in the ``IdList`` of the
        search result; empty when nothing matches. No ``retmax`` is passed,
        so the E-utilities default result limit applies.
    """
    handle = Entrez.esearch(db="assembly", term=term)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if parsing the response fails.
        handle.close()
    # Copy the ids element-wise. Appending ``IdList`` itself would nest the
    # whole result as a single item, so a caller looping over the return
    # value would receive one list instead of the individual ids.
    return list(record["IdList"])

In [13]:
#Fetch raw output
def get_raw_assembly_summary(id):
    """Fetch the full esummary record of an assembly, as parsed by Entrez.

    Args:
        id: Assembly UID forwarded to ``Entrez.esummary``. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        The object produced by ``Entrez.read`` for the esummary response.
        Individual fields can be reached through
        ``record['DocumentSummarySet']['DocumentSummary'][0]``, e.g.
        ``[...]['AssemblyName']`` for the assembly name.
    """
    # XML output: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=assembly&id=79781&report=%22full%22
    handle = Entrez.esummary(db="assembly", id=id, report="full")
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection whether or not parsing succeeded.
        handle.close()

def get_assembly_summary_json(id):
    """Fetch the full esummary record of an assembly as a JSON string.

    Args:
        id: Assembly UID forwarded to ``Entrez.esummary``. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        The parsed esummary record serialised with ``json.dumps``: keys
        sorted, 4-space indentation, ``','`` / ``': '`` separators.
    """
    handle = Entrez.esummary(db="assembly", id=id, report="full")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection whether or not parsing succeeded.
        handle.close()
    # Convert the parsed output to pretty-printed JSON.
    return json.dumps(record, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
# Print the id and the JSON-formatted summary for each search hit.
# The species name is spelled "canettii" (double i).
term = "mycobacterium canettii"
# Loop variable is not called ``id`` so the builtin stays usable in later cells.
for assembly_id in get_ids(term):
    print(assembly_id)
    # print(get_raw_assembly_summary(assembly_id))  # For raw output
    print(get_assembly_summary_json(assembly_id))  # JSON formatted

In [28]:
# Fetch the full summary of one assembly UID and keep a pretty-printed JSON
# copy of the parsed record in ``x`` for inspection.
handle = Entrez.esummary(db="assembly", id='642398', report="full")
try:
    record = Entrez.read(handle)
finally:
    # Release the connection once the response has been read (or failed).
    handle.close()
x = json.dumps(record, sort_keys=True, indent=4, separators=(',', ': '))

## Put assembly info into a table

In [48]:
def get_assemblies(searchterm):
    """Collect summary metadata for every assembly matching an Entrez query.

    Runs one ``esearch`` against the ``assembly`` database (``retmax=8000``)
    and then one ``esummary`` request per returned id. Each row is printed
    as soon as it has been fetched.

    Args:
        searchterm: Entrez query string passed to ``Entrez.esearch``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``accession``, ``species``,
        ``assembly_id``, ``biosample`` and ``bioproject``, one row per id.
        ``bioproject`` is ``None`` for a summary that lists no entry under
        ``GB_BioProjects``.
    """
    search_handle = Entrez.esearch(db="assembly", term=searchterm, retmax=8000)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        search_handle.close()

    rows = []
    for uid in search_record['IdList']:
        summary_handle = Entrez.esummary(db="assembly", id=uid, report="full")
        try:
            summary_record = Entrez.read(summary_handle)
        finally:
            # Close each per-id handle so long runs do not pile up connections.
            summary_handle.close()
        summ = summary_record['DocumentSummarySet']['DocumentSummary'][0]

        # Guard against a summary without GB_BioProjects entries: indexing
        # [0] unconditionally would abort the whole run on the first such
        # record and discard every row fetched so far.
        projects = summ.get('GB_BioProjects') or []
        bioproject = projects[0]['BioprojectAccn'] if projects else None

        row = [
            summ['AssemblyAccession'],
            summ['SpeciesName'],
            summ['AssemblyName'],
            summ['BioSampleAccn'],
            bioproject,
        ]
        print(row)
        rows.append(row)

    return pd.DataFrame(
        rows,
        columns=['accession', 'species', 'assembly_id', 'biosample', 'bioproject'],
    )

In [None]:
# Query each organism in turn and stack the per-term tables into one DataFrame.
search_terms = [
    "Mycobacterium marinum[Orgn]",
    "Mycobacterium canettii[Orgn]",
    "Mycobacterium ulcerans[Orgn]",
    "Mycobacterium tuberculosis",
]
found = []
for term in search_terms:
    df = get_assemblies(term)
    found.append(df)
# Rebind ``found`` from the list of tables to the single concatenated table.
found = pd.concat(found)

In [43]:
# Save the combined table as CSV, leaving out the DataFrame index column.
found.to_csv("assemblies_data.csv", index=False)