In [1]:
import sys
sys.path.append('../')
from utility.file_utility import FileUtility
from scipy.sparse import csr_matrix
import os
import tqdm
path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/taxonomy/ncbi-blast-2.5.0+/bin/'
os.environ['PATH'] += ':'+path
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from utility.math_utility import get_sym_kl_rows
from nltk import FreqDist
import operator
import numpy as np
import pandas as pd

In [2]:
class FastaMarkers2Excall:
    
    def __init__(self,fasta_file, matrix_path, feature_file_path, p_value_threshold=0.01, remove_redundants=False):
        self.seq_IDS=FileUtility.read_fasta_sequences_ids(fasta_file)
        self.remove_redundants=remove_redundants
        self.ez_taxa_dict={x.split()[0]:x.split()[1].split(';') for x in FileUtility.load_list('/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_id_taxonomy.txt')}
        self.mat=FileUtility.load_sparse_csr(matrix_path)
        self.features=FileUtility.load_list(feature_file_path)
        self.align_markers(p_value_threshold)        
        self.redundant_columns_indentification()

    def get_pandas_df(self):
        res=self.finalize_the_results()
        idx2col={0:'taxonomy',1:'marker',2:'direction',3:'taxonomylevel',4:'nummarkers',5:'pvalue'}
        table={'taxonomy':[],'marker':[],'direction':[],'taxonomylevel':[],'nummarkers':[],'pvalue':[]}
        for row in res:
            for idx, val in enumerate(row):
                table[idx2col[idx]].append(val)
        return pd.DataFrame(data=table,columns=['direction', 'taxonomy','marker','pvalue','nummarkers','taxonomylevel'])

    def finalize_the_results(self):
        results=self.extract_results()
        final_results=[]
        for x in results:
            if len(x)>1:
                final_results.append(FastaMarkers2Excall.find_best_record(x))
            else:
                x=x[0]
                taxa=x[1][0:-1]
                seq=x[0]
                d=x[1][-1]
                length=len(taxa.split(';'))-np.sum([1 if FastaMarkers2Excall.isGenomeName(name) else 0 for name in taxa.split(';')])
                final_results.append([taxa,seq,d,length,1,x[2]])
        candidates=sorted(final_results, key=lambda element: (element[2],-element[3],element[5],element[4]))
        return candidates
        
    def extract_results(self):
        results=[]
        for group in self.equiv_classes:
            results.append([self.aligned_markers[x] for x in group])
        return results
            
    def update_matrix_by_markers(self):
        '''
            called by align_markers
        '''
        new_matrix=[]
        for feature, taxnomy, pvalue in self.aligned_markers:
            column=self.features.index(feature)
            new_matrix.append(self.mat[:,column].toarray().T[0].tolist())
        new_matrix=np.array(new_matrix)
        self.update_matrix=new_matrix

    @staticmethod
    def find_best_record(records):
        candiates=[(x[0],x[1],x[2],-len(x[1].split(';'))) for x in records]
        taxa_freq=FreqDist([y[1] for y in candiates]).most_common()
        final=[]
        for taxa, freq in taxa_freq:
            min_idx = np.argmin([rec[2] for rec in records if rec[1]==taxa])
            rep_marker=[rec[0] for rec in records if rec[1]==taxa][min_idx]
            pval=np.median([rec[2] for rec in records if rec[1]==taxa])
            direction=taxa[-1]
            taxlevel=len(taxa[0:-1].split(';'))-np.sum([1 if FastaMarkers2Excall.isGenomeName(name) else 0 for name in taxa[0:-1].split(';')])
            final.append([taxa[0:-1],rep_marker,direction, taxlevel, freq,pval])
        candidates=sorted(final, key=lambda element: (-element[3],element[5],element[4]))
        return candidates[0]

    @staticmethod
    def find_equiv_classes(list_of_pairs):
        found_list=[]
        for x,y in list_of_pairs:
            if found_list==[]:
                found_list.append(set([x,y]))
            else:
                idx_to_add=-1
                idx_to_add_list=[]
                for idx,group in enumerate(found_list):
                    if x in group or y in group:
                        idx_to_add=idx
                        idx_to_add_list.append(idx)
                if idx_to_add==-1:
                    found_list.append(set([x,y]))
                else:
                    res=set([x,y])
                    for st in idx_to_add_list:
                        res=res.union(found_list[st])
                    for i in idx_to_add_list[::-1]:
                        found_list.remove(found_list[i])
                    found_list.append(res)
        return found_list
    
    @staticmethod
    def isGenomeName(inputString):
        return np.sum([1 if char.isdigit() else 0 for char in inputString])/np.sum([1 if char.isalpha() else 0 for char in inputString])>0.8

    def lowest_certain_level(self,results):
        
        levels_id={'Superkingdom':1,'phylum':1,'class':2,'order':3,'family':4,'genus':5,'species':6}
        species=set([x[0][levels_id['species']] for x in results])
        genuses=set([x[0][levels_id['genus']] for x in results])
        families=set([x[0][levels_id['family']] for x in results])
        orders=set([x[0][levels_id['order']] for x in results])
        classes=set([x[0][levels_id['class']] for x in results])
        phylums=set([x[0][levels_id['phylum']] for x in results])
        Superkingdoms=set([x[0][levels_id['Superkingdom']] for x in results])

        if len(species)==1:
            return ';'.join(results[0][0])
        elif len(genuses)==1:
            return ';'.join(results[0][0][0:6])
        elif len(families)==1:
            return ';'.join(results[0][0][0:5])
        if len(orders)==1:
            return ';'.join(results[0][0][0:4])
        elif len(classes)==1:
            return ';'.join(results[0][0][0:3]) 
        elif len(phylums)==1:
            return ';'.join(results[0][0][0:2]) 
        elif len(Superkingdoms)==1:
            return ';'.join(results[0][0][0:1]) 
        else:
            return False
    
    def redundant_columns_indentification(self):
        distances=get_sym_kl_rows(self.update_matrix)
        flatten_distances=distances.flatten()
        #self.list_of_pairs=np.argwhere(distances<np.percentile(flatten_distances, 5, axis=0)).tolist()
        if self.remove_redundants:
            self.list_of_pairs=np.argwhere(distances==0).tolist()
            self.equiv_classes=FastaMarkers2Excall.find_equiv_classes(self.list_of_pairs)
        else:
            self.list_of_pairs=[(i,i) for i in range(distances.shape[0])]
            self.equiv_classes=FastaMarkers2Excall.find_equiv_classes(self.list_of_pairs)
            
        
    def align_markers(self,p_value_threshold):
        final_results=[]
        for idx, (seq, description) in tqdm.tqdm(self.seq_IDS.items()):
            pval=float(description.split(':')[1])
            if pval<=p_value_threshold:
                FileUtility.create_fasta_file('temp.fasta',[seq],['temp'])
                blastx_cline=NcbiblastnCommandline(query='temp.fasta', db="/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta", evalue=0.001, outfmt=5, out="temp.xml")
                blastx_cline()
                f=open("temp.xml",'r')
                blast_records = NCBIXML.parse(f)
                flag=False
                score=-1
                alignment_length=-1
                results=[]
                for blast_record in blast_records:
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            if not flag and score==-1:
                                score=hsp.score
                                alignment_length=hsp.align_length
                                flag=True
                            if hsp.score >= score and hsp.align_length>=alignment_length and 'Eukarya' not in self.ez_taxa_dict[alignment.hit_id]:
                                results.append((self.ez_taxa_dict[alignment.hit_id],hsp.expect))
                if len(results)>0:
                    res=self.lowest_certain_level(results)
                    if res:
                        final_results.append((seq,res+idx[-1],pval))
                    else:
                        final_results.append((seq,'ZZZNOVEL'+idx[-1],pval))
                else:
                    final_results.append((seq,'ZZZNOVEL'+idx[-1],pval))

        # sorted markers by the taxonomy information of the last certain level
        self.aligned_markers=sorted(final_results, key=operator.itemgetter(1), reverse=False)
        self.min_p_value=p_value_threshold
        self.update_matrix_by_markers()

In [4]:
fasta_file='/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/markers/unt_healthy_chi2_relative.fasta'
matrix_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1.npz'
feature_file_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1_features'
FM2EXC=FastaMarkers2Excall(fasta_file, matrix_path, feature_file_path, p_value_threshold=0.05, remove_redundants=True)

100%|██████████| 1335/1335 [04:30<00:00,  4.19it/s]


In [10]:
len(FM2EXC.aligned_markers)

1335

In [22]:
def fix_taxonomy(inputstring):
    parts=inputstring.split(';')
    if not isGenomeName(parts[-1]):
        return inputstring
    else:
        return fix_taxonomy(';'.join(parts[0:-1]))
    
def isGenomeName(inputString):
        return np.sum([1 if char.isdigit() else 0 for char in inputString])/np.sum([1 if char.isalpha() else 0 for char in inputString])>0.8

In [12]:
FM2EXC.get_pandas_df()

Unnamed: 0,direction,taxonomy,marker,pvalue,nummarkers,taxonomylevel
0,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.001171,1,7
1,+,Bacteria;Firmicutes;Erysipelotrichi;Erysipelot...,atatttggcatgagctccatgcggtgctcatggctatgcggtatta...,0.002838,1,7
2,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,attcgttcgacttgcatgtattaggcacgccgccagcgttcatcct...,0.002975,1,7
3,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,ctgatcgtcgccttggtgggccgttaccccgccaacaagctaatca...,0.002999,1,7
4,+,Bacteria;Firmicutes;Erysipelotrichi;Erysipelot...,atatttggcatgagctccatgcggtgctcatggctatgcggtatta...,0.004144,1,7
5,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,ttccaatgtgggggaccttcctctcagaacccctactgatcgtcgc...,0.004227,1,7
6,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,attcgttcgacttgcatgtattaggcacgccgccagcgttcatcct...,0.005245,1,7
7,+,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,tgagccgttacctcaccaactagctaatacaacgcaggtccatctc...,0.005720,1,7
8,+,Bacteria;Firmicutes;Erysipelotrichi;Erysipelot...,atgcgccataggtccatccctgcgctatccccgaaaggatatttgg...,0.005720,1,7
9,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,tcatctctaaaagagattcgttcgacttgcatgtattaggcacgcc...,0.005720,1,7


In [76]:
font_map={1:15,2:14,3:13,4:12, 5:8,6:7,7:4}
taxonomy=FM2EXC.get_pandas_df()['taxonomy'].tolist()
direction=FM2EXC.get_pandas_df()['direction'].tolist()
taxlev=FM2EXC.get_pandas_df()['taxonomylevel'].tolist()

logpval=[round(-np.log(x)) for x in FM2EXC.get_pandas_df()['pvalue'].tolist()]


taxonomy=['.'.join(fix_taxonomy(x).split(';')) for x in taxonomy]
tax_freq=dict(FreqDist(taxonomy).most_common())
logpval_frq=[tax_freq[x] for idx,x in enumerate(taxonomy)]

#taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]
annot=['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color',('r' if x=='+' else ('b' if x=='-' else 'g'))])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]
annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color','w'])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))==5]
annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation',taxonomy[idx].split('.')[-1]])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]

## OUTER RINGS
annot=annot+['\t'.join([taxonomy[idx].split('.')[1],'annotation',taxonomy[idx].split('.')[1]])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>1]
annot=annot+['\t'.join([taxonomy[idx].split('.')[1],'annotation_rotation',str(1)])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>1]
annot=annot+['\t'.join([taxonomy[idx].split('.')[1],'annotation_font_size',str(9)])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>1 ]
annot=annot+['\t'.join([taxonomy[idx].split('.')[1],'annotation_background_color','#eedbfc'])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>1]

## Clades
annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'clade_marker_size',str(logpval_frq[idx])])  for idx, x in enumerate(direction)  if len(taxonomy[idx].split('.'))>5 ]
annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'clade_marker_edge_width',str(logpval[idx])])  for idx, x in enumerate(direction)  if len(taxonomy[idx].split('.'))>5 ]

annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_rotation',str(1)])  for idx, x in enumerate(direction)  if len(taxonomy[idx].split('.'))>5]
annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_font_size',str(font_map[taxlev[idx]])])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5 ]
annot=annot+['annotation_background_offset\t0.5']
annot=annot+['clade_marker_edge_color\t#4f1a49']
annot=annot+['branch_color\t#4f1a49']
annot=annot+['annotation_background_separation\t-0.01']
annot=annot+['annotation_background_width\t0.2']


#https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
#asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan_annotate.py --annot ../annot.txt ../test.txt  ../new.xml
#asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15
taxonomy=[x for x in taxonomy if len(x.split('.'))>5]
FileUtility.save_list('/mounts/data/proj/asgari/dissertation/libraries/test.txt',taxonomy)
FileUtility.save_list('/mounts/data/proj/asgari/dissertation/libraries/annot.txt',annot)

In [52]:
writer = pd.ExcelWriter('RA_redudant_removed_cleaned.xlsx')
FM2EXC.get_pandas_df().to_excel(writer,'Untreated_Healthy')
writer.save()

In [None]:
fasta_file='/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/markers/tr_healthy_chi2_relative.fasta'
matrix_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1.npz'
feature_file_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1_features'
FM2EXCtvsh=FastaMarkers2Excall(fasta_file, matrix_path, feature_file_path, p_value_threshold=0.05, remove_redundants=True)

In [56]:
fasta_file='/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/markers/unt_tr_chi2_relative.fasta'
matrix_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1.npz'
feature_file_path='../../16S_datasets/ra/rep/ra_selfposcpe_50000_cpe_-1_features'
FM2EXCunvst=FastaMarkers2Excall(fasta_file, matrix_path, feature_file_path, p_value_threshold=0.05, remove_redundants=True)

100%|██████████| 1749/1749 [02:16<00:00, 12.83it/s]


In [66]:
FM2EXCtvsh.get_pandas_dft()

AttributeError: 'FastaMarkers2Excall' object has no attribute 'get_pandas_dft'

Unnamed: 0,direction,taxonomy,marker,pvalue,nummarkers,taxonomylevel
0,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.002085,2,7
1,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.003613,4,7
2,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.003613,4,7
3,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,attcgttcgacttgcatgtattaggcacgccgccagcgttcatcct...,0.004110,2,7
4,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,attcgttcgacttgcatgtattaggcacgccgccagcgttcatcct...,0.004110,2,7
5,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.004227,5,7
6,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,aatgtgggggaccttcctctcagaacccctactgatcgtcgccttg...,0.004227,5,7
7,+,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,tgagccgttacctcaccaactagctaatacaacgcaggtccatctc...,0.005720,1,7
8,+,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,tcatctctaaaagagattcgttcgacttgcatgtattaggcacgcc...,0.005720,2,7
9,+,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,tgctgcctcccgtaggagtttggaccgtgtctcagttccaatgtgg...,0.005818,1,7


In [57]:
writer = pd.ExcelWriter('RA_redudant_removed_cleaned.xlsx')
FM2EXC.get_pandas_df().to_excel(writer,'Untreated_Healthy')
FM2EXCtvsh.get_pandas_df().to_excel(writer,'Treated_Healthy')
FM2EXCunvst.get_pandas_df().to_excel(writer,'Untreated_Treated')
writer.save()

In [58]:
FM2EXC.equiv_classes

[{0},
 {1},
 {2},
 {4},
 {6},
 {7},
 {9},
 {10},
 {11},
 {12},
 {14},
 {15},
 {16},
 {17},
 {18},
 {13, 19, 20, 21},
 {22},
 {23},
 {24},
 {25},
 {61},
 {87},
 {92},
 {116},
 {118},
 {128},
 {153},
 {171},
 {190},
 {198},
 {196, 199},
 {205},
 {207, 212},
 {214},
 {215},
 {217},
 {219},
 {221},
 {222},
 {225},
 {226},
 {227},
 {228},
 {230},
 {231},
 {232},
 {233},
 {234},
 {235},
 {236},
 {237},
 {238},
 {242},
 {229, 239, 240, 241, 243},
 {244},
 {245},
 {246},
 {247},
 {248},
 {249},
 {250},
 {251},
 {252},
 {253},
 {254},
 {255},
 {256},
 {257},
 {258},
 {259},
 {260},
 {261},
 {262},
 {263},
 {264},
 {265, 266},
 {267},
 {268},
 {269},
 {270},
 {271},
 {272},
 {273},
 {274},
 {275},
 {276},
 {277},
 {278},
 {279},
 {280},
 {281},
 {282, 283},
 {284},
 {285},
 {286},
 {287},
 {288, 289},
 {290},
 {299},
 {301},
 {303, 304},
 {305},
 {309},
 {311},
 {315},
 {307, 316},
 {317},
 {302, 306, 308, 310, 312, 313, 314, 318, 319, 320, 321, 322, 323},
 {324, 325, 326, 327, 328, 329, 330},
 

In [40]:
def find_equiv_classes(list_of_pairs):
    found_list=[]
    for x,y in list_of_pairs:
        if found_list==[]:
            found_list.append(set([x,y]))
        else:
            idx_to_add=-1
            idx_to_add_list=[]
            for idx,group in enumerate(found_list):
                if x in group or y in group:
                    idx_to_add=idx
                    idx_to_add_list.append(idx)
            if idx_to_add==-1:
                found_list.append(set([x,y]))
            else:
                res=set([x,y])
                for st in idx_to_add_list:
                    res=res.union(found_list[st])
                for i in idx_to_add_list[::-1]:
                    found_list.remove(found_list[i])
                found_list.append(res)
    return found_list

In [41]:
a

[[0, 0],
 [1, 1],
 [1, 217],
 [2, 2],
 [3, 3],
 [3, 5],
 [3, 25],
 [3, 26],
 [3, 28],
 [3, 35],
 [4, 4],
 [4, 25],
 [5, 3],
 [5, 5],
 [5, 6],
 [5, 25],
 [5, 26],
 [5, 28],
 [5, 35],
 [5, 37]]

In [42]:
find_equiv_classes(a)

add 0 ,  0
add 1 ,  1
current  [{0}, {1}]
add 1 ,  217  in  {1}
list:  [1]
current  [{0}, {1, 217}]
add 2 ,  2
current  [{0}, {1, 217}, {2}]
add 3 ,  3
current  [{0}, {1, 217}, {2}, {3}]
add 3 ,  5  in  {3}
list:  [3]
current  [{0}, {1, 217}, {2}, {3, 5}]
add 3 ,  25  in  {3, 5}
list:  [3]
current  [{0}, {1, 217}, {2}, {25, 3, 5}]
add 3 ,  26  in  {25, 3, 5}
list:  [3]
current  [{0}, {1, 217}, {2}, {25, 26, 3, 5}]
add 3 ,  28  in  {25, 26, 3, 5}
list:  [3]
current  [{0}, {1, 217}, {2}, {3, 5, 25, 26, 28}]
add 3 ,  35  in  {3, 5, 25, 26, 28}
list:  [3]
current  [{0}, {1, 217}, {2}, {3, 35, 5, 25, 26, 28}]
add 4 ,  4
current  [{0}, {1, 217}, {2}, {3, 35, 5, 25, 26, 28}, {4}]
add 4 ,  25  in  {4}
list:  [3, 4]
current  [{0}, {1, 217}, {2}, {3, 4, 35, 5, 25, 26, 28}]
add 5 ,  3  in  {3, 4, 35, 5, 25, 26, 28}
list:  [3]
current  [{0}, {1, 217}, {2}, {3, 4, 5, 35, 25, 26, 28}]
add 5 ,  5  in  {3, 4, 5, 35, 25, 26, 28}
list:  [3]
current  [{0}, {1, 217}, {2}, {3, 4, 5, 35, 25, 26, 28}]
add 5 

[{0}, {1, 217}, {2}, {3, 4, 5, 6, 25, 26, 28, 35, 37}]

In [25]:
import itertools
a_sorted=list(set(itertools.chain(*a)))
a_sorted.sort()
a_sorted

[0, 1, 2, 3, 4, 5, 6, 25, 26, 28, 35, 37, 217]

In [18]:
a=FM2EXC.list_of_pairs[0:20]

In [19]:
a

[[0, 0],
 [1, 1],
 [1, 217],
 [2, 2],
 [3, 3],
 [3, 5],
 [3, 25],
 [3, 26],
 [3, 28],
 [3, 35],
 [4, 4],
 [4, 25],
 [5, 3],
 [5, 5],
 [5, 6],
 [5, 25],
 [5, 26],
 [5, 28],
 [5, 35],
 [5, 37]]

In [33]:
find_equiv_classes(a)

add 0 ,  0
add 1 ,  1
add 1 ,  217  in  {1}
add 2 ,  2
add 3 ,  3
add 3 ,  5  in  {3}
add 3 ,  25  in  {3, 5}
add 3 ,  26  in  {25, 3}
add 3 ,  28  in  {26, 3}
add 3 ,  35  in  {3, 28}
add 4 ,  4
add 4 ,  25  in  {4}
add 5 ,  3  in  {3, 35}
add 5 ,  5  in  {3, 5}
add 5 ,  6  in  {5}
add 5 ,  25  in  {5, 6}
add 5 ,  26  in  {25, 5}
add 5 ,  28  in  {26, 5}
add 5 ,  35  in  {28, 5}
add 5 ,  37  in  {35, 5}


[{0}, {1, 217}, {2}, {5, 37}]

In [59]:
taxonomy=FileUtility.load_list('../../16S_datasets/EZ/raw/eztaxon_id_taxonomy.txt')

In [61]:
taxonomy=[x.split('\t')[1].replace(';','.') for x in taxonomy if not x.split('\t')[1].split(';')[0]=='Eukarya']

In [63]:
FileUtility.save_list('/mounts/data/proj/asgari/dissertation/libraries/EZBIO.txt',taxonomy)