In [3]:
from Bio import Entrez
import pandas as pd
Entrez.email = "ecutts@mit.edu"
Entrez.api_key = '057f36326473f362b4123bee57854f2c3208'

Lots of this drawn from https://stackoverflow.com/questions/16504238/attempting-to-obtain-taxonomic-information-from-biopython. Thanks to whoever asked this question 7 years ago.

In [4]:
def get_tax_id(species):
    """to get data from ncbi taxomomy, we need to have the taxid. we can
    get that by passing the species name to esearch, which will return
    the tax id"""
    species = species.replace(' ', "+").strip()
    search = Entrez.esearch(term = species, db = "taxonomy", retmode = "xml")
    record = Entrez.read(search)
    return record['IdList'][0]

def get_tax_data(taxid):
    """once we have the taxid, we can fetch the record"""
    search = Entrez.efetch(id = taxid, db = "taxonomy", retmode = "xml")
    return Entrez.read(search)

taxid = get_tax_id('Gramella forsetii KT0803')
data = get_tax_data(taxid)
lineage = {d['Rank']:d['ScientificName'] for d in \
           data[0]['LineageEx'] if d['Rank'] in ['phylum','class','family', 'order', 'genus', 'species']}

In [5]:
def get_tax_data_for_list(species_list, tax_levels=['phylum','class','family', 'order', 'genus', 'species']):
    taxid_list = []
    lineage_list = []

    for species in species_list:
        print(species)
        taxid = get_tax_id(species) # Apply your functions
        data = get_tax_data(taxid)
        lineage = {d['Rank']:d['ScientificName'] for d in \
                   data[0]['LineageEx'] if d['Rank'] in tax_levels}

        taxid_list.append(taxid) # Append the data to lists already initiated        
        lineage_list.append(lineage)

    for lineage in lineage_list: #fill with '' when no tax data given
        for tax_level in tax_levels:
            if tax_level not in lineage.keys():
                lineage[tax_level] = ''
    
    columns = dict.fromkeys(['Organism Name', 'taxid'] + tax_levels) # Make columns for DataFrame
    columns['Organism Name'] = species_list
    columns['taxid'] = taxid_list
    for tax_level in tax_levels:
        columns[tax_level] = [l[tax_level] for l in lineage_list]

    data_frame = pd.DataFrame(columns)
    
    return taxid_list, lineage_list, data_frame

In [6]:
df = pd.read_csv('Data/NCBI-bacteroidetes-reference-genomes.csv')
df.rename(columns={'#Organism Name': 'Organism Name'}, inplace=True)
df.replace(to_replace=' Chromosome', value='Chromosome', inplace = True) # error with space before chromosome category
taxid_list = []
df.head()

Unnamed: 0,Organism Name,Organism Groups,Strain,BioSample,BioProject,Assembly,Level,Size(Mb),GC%,Replicons,WGS,Scaffolds,CDS,Release Date,GenBank FTP,RefSeq FTP
0,Porphyromonas gingivalis ATCC 33277,Bacteria;FCB group;Bacteroidetes/Chlorobi group,ATCC 33277,SAMD00060922,PRJDA19051,GCA_000010505.1,Complete,2.35489,48.4,chromosome:NC_010729.1/AP009380.1,,1,1891,2008-05-20T23:06:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
1,Gramella forsetii KT0803,Bacteria;FCB group;Bacteroidetes/Chlorobi group,KT0803,SAMEA2272667,PRJEA19061,GCA_000060345.1,Complete,3.79847,36.6,chromosome:NC_008571.1/CU207366.1,,1,3368,2006-11-06T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
2,Alistipes putredinis DSM 17216,Bacteria;FCB group;Bacteroidetes/Chlorobi group,DSM 17216,SAMN00000002,PRJNA19655,GCA_000154465.1,Scaffold,2.55068,53.3,,ABFK02,12,2209,2007-09-11T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...
3,Phocaeicola coprocola,Bacteria;FCB group;Bacteroidetes/Chlorobi group,AF24-2,SAMN09734467,PRJNA482748,GCA_003458565.1,Scaffold,3.55378,40.8,,QRUU01,166,2733,2018-09-05T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...
4,Bacteroides intestinalis DSM 17393,Bacteria;FCB group;Bacteroidetes/Chlorobi group,DSM 17393,SAMN00000015,PRJNA20523,GCA_000172175.1,Contig,6.0526,42.8,,ABJL02,8,4494,2008-01-30T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...


In [10]:
genome_list = df['Organism Name'].tolist()
chunks = []
for i in range(len(genome_list)//50):
    chunks.append(genome_list[(i*50):(i+1)*50])
    if i == len(genome_list)//50 - 1:
        chunks.append(genome_list[(i+1)*50:len(genome_list)])

In [21]:
taxid_lists = []
lineage_lists = []
data_frames = []
import time

count = 0
for chunk in chunks:
    print('starting chunk ' + str(count))
    taxid_list, lineage_list, data_frame = get_tax_data_for_list(chunk)
    print('appending')
    taxid_lists.append(taxid_list)
    lineage_list.append(lineage_list)
    data_frames.append(data_frame)
    count += 1
    print('sleeping 3 sec')
    time.sleep(3)

starting chunk 0
Porphyromonas gingivalis ATCC 33277
Gramella forsetii KT0803
Alistipes putredinis DSM 17216
Phocaeicola coprocola
Bacteroides intestinalis DSM 17393
Bacteroides caccae
Parabacteroides merdae
Bacteroides uniformis
Bacteroides stercoris ATCC 43183
Microscilla marina ATCC 23134
Polaribacter irgensii 23-P
Parabacteroides distasonis
Psychroflexus torquis ATCC 700755
Croceibacter atlanticus HTCC2559
Leeuwenhoekiella blandensis MED217
Zobellia galactanivorans
Salinibacter ruber DSM 13855
Kordia algicida OT-1
Prevotella ruminicola
Robiginitalea biformata HTCC2501
Dokdonia donghaensis DSW-1
Bacteroides finegoldii CL09T03C10
Bacteroides eggerthii
Phocaeicola plebeius
Phocaeicola dorei
Alloprevotella tannerae ATCC 51259
Porphyromonas uenonis DSM 23387 = JCM 13868
Prevotella bivia DSM 20514
Porphyromonas endodontalis
Capnocytophaga gingivalis
Phocaeicola coprophilus
Capnocytophaga sputigena
Parabacteroides johnsonii CL02T12C29
Prevotella copri
Bacteroides cellulosilyticus
Prevotel

Winogradskyella psychrotolerans RS-3
Flavobacterium saliperosum
Cellulophaga baltica 18
Cyclobacterium qasimii
Coprobacter fastidiosus NSB1
Prevotella brevis P6B11
Capnocytophaga haemolytica
Flavobacterium limnosediminis JC2902
Thermophagus xiamenensis
Porphyromonas crevioricanis
Zhouia amylolytica AD3
Arenitalea lutea
Phocaeicola abscessus CCUG 55929
Sphingobacterium paucimobilis HER1398
Bacteroides neonati
Algoriphagus machipongonensis
Gillisia marina
Galbibacter marinus
Sediminibacterium salmoneum NBRC 103935
Bacteroides reticulotermitis JCM 10512
Olleya marilimosa CAM030
Aquimarina megaterium
Aquimarina macrocephali JAMB N27
Dyadobacter tibetensis Y620-1
Bacteroides timonensis
Alistipes timonensis JC136
Alistipes ihumii AP11
Flavobacterium succinicans
Bacteroides acidifaciens
Prevotella falsenii DSM 22864 = JCM 15124
Prevotella aurantiaca JCM 15754
Prevotella dentasini JCM 15908
Bacteroides rodentium JCM 16496
Prevotella shahii DSM 15611 = JCM 12083
Bacteroides paurosaccharolyticus

Chryseobacterium molle
Pedobacter caeni
Flavobacterium granuli
Hymenobacter daecheongensis DSM 21074
Wenyingzhuangia marina
Flavobacterium micromati
Leeuwenhoekiella marinoflava
Cruoricaptor ignavus
Chryseobacterium polytrichastri
Reichenbachiella agariperforans
Tangfeifania diversioriginum
Myroides xuanwuensis
Chryseobacterium carnipullorum
Muricauda flava
Flavobacterium flevense
Maribacter aquivivus
Winogradskyella jejuensis
Flavobacterium xanthum
Mesonia phycicola
Hymenobacter psychrotolerans DSM 18569
Psychroflexus salarius
Flavobacterium terrae
Flavobacterium fluvii
Flavobacterium pectinovorum
Flavobacterium fontis
Flavobacterium defluvii
Flavobacterium xinjiangense
Aquimarina spongiae
Bacteroides luti
Rufibacter ruber
Prevotella phocaeensis
Flavobacterium cucumis
appending
sleeping 3 sec
starting chunk 12
Algoriphagus zhangzhouensis
Algoriphagus halophilus
Chryseobacterium zeae
Chryseobacterium scophthalmum
Chitinophaga niabensis
Gramella flava JLT2011
Vaginella massiliensis
Chry

Ichthyenterobacterium magnum
Chryseobacterium arachidiradicis
Tenacibaculum lutimaris
Flavobacterium kingsejongi
appending
sleeping 3 sec
starting chunk 18
Sphingobacterium siyangense
Sphingobacterium detergens
Chitinophaga barathri
Bergeyella cardium
Hanstruepera crassostreae
Mucilaginibacter gracilis
Pedobacter chitinilyticus
Flavobacterium endophyticum
Gillisia mitskevichiae
Flavobacterium limicola
Chryseobacterium defluvii
Maribacter vaceletii
Pedobacter alluvionis
Tenacibaculum discolor
Mesonia aquimarina
Ulvibacter antarcticus
Flavobacterium weaverense
Euzebyella marina
Chryseobacterium nematophagum
Larkinella soli
Rufibacter immobilis
Pedobacter jejuensis
Sinomicrobium pectinilyticum
Chryseobacterium daecheongense
Chryseobacterium nakagawai
Aureibaculum marinum
Chryseobacterium shandongense
Chryseobacterium bernardetii
Flavobacterium magnum
Flavobacterium album
Flavobacterium pallidum
Flavobacterium crocinum
Butyricimonas faecihominis
Amniculibacterium aquaticum
Larkinella knui


Flavobacterium petrolei
Cochleicola gelatinilyticus
Sphingobacterium shayense
Pedobacter boryungensis
Capnocytophaga felis
Winogradskyella eckloniae
Winogradskyella litoriviva
Winogradskyella undariae
Sphingobacterium puteale
Chryseobacterium aureum
Ancylomarina salipaludis
Duncaniella freteri
Empedobacter tilapiae
Aequorivita sinensis
Aequorivita lutea
Maribacter luteus
Costertonia aggregata
Macellibacteroides fermentans
Lutibacter citreus
Maribacter algicola
Flavobacterium alkalisoli
Chryseobacterium binzhouense
Chryseobacterium candidae
Cyclobacterium xiamenense
Bacteroides ovatus
Cytophaga hutchinsonii ATCC 33406
Bacteroides thetaiotaomicron
Bacteroides vulgatus ATCC 8482
appending
sleeping 3 sec
starting chunk 25
Flavobacterium psychrophilum
Prevotella melaninogenica ATCC 25845
Chryseobacterium gleum
Sphingobacterium spiritivorum
Marivirga tractuosa DSM 4126
Phocaeicola salanitronis DSM 18170
Riemerella anatipestifer ATCC 11845 = DSM 15868
Haliscomenobacter hydrossis DSM 1100
Rune

In [14]:
# all_taxa = pd.concat(data_frames)
# all_taxa.to_csv('Data/NCBI-bacteroidetes-all.csv', index=False)
# ^ use above only when running the code in the box above

all_taxa = pd.read_csv('Data/NCBI-bacteroidetes-all.csv')
all_taxa.head()

Unnamed: 0,Organism Name,taxid,phylum,class,family,order,genus,species
0,Porphyromonas gingivalis ATCC 33277,431947,Bacteroidetes,Bacteroidia,Porphyromonadaceae,Bacteroidales,Porphyromonas,Porphyromonas gingivalis
1,Gramella forsetii KT0803,411154,Bacteroidetes,Flavobacteriia,Flavobacteriaceae,Flavobacteriales,Gramella,Gramella forsetii
2,Alistipes putredinis DSM 17216,445970,Bacteroidetes,Bacteroidia,Rikenellaceae,Bacteroidales,Alistipes,Alistipes putredinis
3,Phocaeicola coprocola,310298,Bacteroidetes,Bacteroidia,,Bacteroidales,Phocaeicola,
4,Bacteroides intestinalis DSM 17393,471870,Bacteroidetes,Bacteroidia,Bacteroidaceae,Bacteroidales,Bacteroides,Bacteroides intestinalis


I want to have somewhere in the range of 100-200 taxa on the tree. So let's see how each of the taxonomic levels breaks down. Deleted Blattabacterium sp. (Cryptocercus punctulatus) str. Cpu	because it caused error: "list index out of range" when I tried to retrieve its taxid.

In [15]:
tax_levels = all_taxa.columns.tolist()[2:]
tax_level_counts = dict.fromkeys(tax_levels, 0)

for tax_level in tax_levels:
    tax_level_counts[tax_level] = len(set(all_taxa[tax_level]))
    
print(tax_level_counts)

{'phylum': 1, 'class': 7, 'family': 50, 'order': 8, 'genus': 298, 'species': 271}


In [16]:
pure_genus_data_frames = []
for genus in set(all_taxa['genus']):
    pure_genus_data_frames.append(all_taxa[all_taxa['genus'] == genus])

In [56]:
genus_representatives = []
# begin filtering
for pure_genus_data_frame in pure_genus_data_frames:
    
    genus_representative = None
    
    if(len(pure_genus_data_frame)) == 1: # if there is only one representative, it is the genus representative
        genus_representative = pure_genus_data_frame['Organism Name'].tolist()[0]
    
    else: # screen by assembly level
        assembly_levels = {'Contig': [], 'Scaffold': [], 'Complete': [], 'Chromosome': []} # group organisms by assembly level
       
        for organism in pure_genus_data_frame['Organism Name']:
            assembly_level = df[df['Organism Name'] == organism]['Level'].tolist()[0]
            assembly_levels[assembly_level].append(organism)
            
        if len(assembly_levels['Complete']) >= 1: # if 1 or more complete
            if len(assembly_levels['Complete']) == 1: # if only 1 complete
                genus_representative = assembly_levels['Complete'][0] # make it the representative
            else:
                to_consider = assembly_levels['Complete'] # else consider only complete genomes in next step
        
        elif len(assembly_levels['Chromosome']) >= 1: # elif 1 or more chromosome
            if len(assembly_levels['Chromosome']) == 1: # if only 1 chromosme
                genus_representative = assembly_levels['Chromosome'][0]
            else:
                to_consider = assembly_levels['Chromosme'] # else consider only chromosome genomes in next step
        
        elif len(assembly_levels['Scaffold']) >= 1: # elif 1 or more scaffold
            if len(assembly_levels['Scaffold']) == 1: # if only 1 scaffold
                genus_representative = assembly_levels['Scaffold'][0] # make it the representative
            else:
                to_consider = assembly_levels['Scaffold'] # else consider only scaffold genomes in next step
        
        else: 
            to_consider = pure_genus_data_frame['Organism Name'].tolist() # else consider contig genomes
    
    
    if genus_representative != None: # if a genus representative was already found
        genus_representatives.append(genus_representative) # return it an move on
    
    else:
        tax_levels = pure_genus_data_frame.columns[2:]
        filled_tax_level_count = dict.fromkeys(range(len(tax_levels)+1))
        for key in filled_tax_level_count:
            filled_tax_level_count[key] = []
            
        for organism in to_consider:
            row = pure_genus_data_frame[pure_genus_data_frame['Organism Name'] == organism]
            
            count = 0 # count filled tax info columns
            for tax_level in row.columns[2:]:
                if row[tax_level].tolist()[0] != 'nan':
                    count += 1
            filled_tax_level_count[count].append(organism)
        
        for i in range(len(tax_levels)+1):
            key = len(tax_levels)-i
            if len(filled_tax_level_count[key]) >= 1:
                genus_representative = min(filled_tax_level_count[key], key=len)# if there's more than one option, take the one with the shortest name
        genus_representatives.append(genus_representative)

In [71]:
representatives_df = all_taxa[all_taxa['Organism Name'].isin(genus_representatives)]
assemblys = df[df['Organism Name'].isin(genus_representatives)]['Assembly']
levels = df[df['Organism Name'].isin(genus_representatives)]['Level']
representatives_df.insert(2, column='Refseq Assembly', value=assemblys)
representatives_df.insert(3, column='Level', value=levels)

In [74]:
representatives_df.to_csv('Data/NCBI-bacteroidetes-genus-representatives.csv', index=False)
representatives_df['taxid'].to_csv('Data/NCBI-bacteroidetes-genus-representatives-taxids.csv', header=False, index=False)
representatives_df.head()

Unnamed: 0,Organism Name,taxid,Refseq Assembly,Level,phylum,class,family,order,genus,species
9,Microscilla marina ATCC 23134,313606,GCA_000169175.1,Contig,Bacteroidetes,Cytophagia,Microscillaceae,Cytophagales,Microscilla,Microscilla marina
11,Parabacteroides distasonis,823,GCA_012273055.1,Chromosome,Bacteroidetes,Bacteroidia,Tannerellaceae,Bacteroidales,Parabacteroides,
12,Psychroflexus torquis ATCC 700755,313595,GCA_000153485.2,Complete,Bacteroidetes,Flavobacteriia,Flavobacteriaceae,Flavobacteriales,Psychroflexus,Psychroflexus torquis
13,Croceibacter atlanticus HTCC2559,216432,GCA_000196315.1,Complete,Bacteroidetes,Flavobacteriia,Flavobacteriaceae,Flavobacteriales,Croceibacter,Croceibacter atlanticus
15,Zobellia galactanivorans,63186,GCA_000973105.1,Complete,Bacteroidetes,Flavobacteriia,Flavobacteriaceae,Flavobacteriales,Zobellia,
