In [2]:
import operator
import os
import sys
import tempfile

# The project-local ``utility`` package lives one directory up.
sys.path.append('../')

import numpy as np
import tqdm
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastnCommandline
from nltk import FreqDist
from scipy.sparse import csr_matrix

from utility.file_utility import FileUtility
from utility.math_utility import get_sym_kl_rows

path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/taxonomy/ncbi-blast-2.5.0+/bin/'
os.environ['PATH'] += ':'+path

In [None]:
class FastaMarkers2Excall:
    
    def __init__(self,fasta_file, matrix_path, feature_file_path, p_value_threshold=0.01):
        self.seq_IDS=FileUtility.read_fasta_sequences_ids(fasta_file)
        self.ez_taxa_dict={x.split()[0]:x.split()[1].split(';') for x in FileUtility.load_list('/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_id_taxonomy.txt')}
        self.align_markers(p_value_threshold)
        
        self.mat=FileUtility.load_sparse_csr(matrix_path)
        self.features=FileUtility.load_list(feature_file_path)
        self.redundant_columns_indentification()
    
    def extract_results():
        results=[]
        for group in self.equiv_classes:
            results.append([self.aligned_markers[x] for x in group])
        
            
    def update_matrix_by_markers(self):
        '''
            called by align_markers
        '''
        new_matrix=[]
        for feature, taxnomy, pvalue in self.aligned_markers:
            column=self.features.index(feature)
            new_matrix.append(self.mat[:,column].toarray().T[0].tolist())
            new_matrix=np.array(new_matrix)
        self.update_matrix=new_matrix

    @staticmethod
    def find_equiv_classes(list_of_pairs):
        found_list=[]
        for x,y in list_of_pairs:
            if found_list==[]:
                found_list.append(set([x,y]))
            else:
                idx_to_add=-1
                for idx,group in enumerate(found_list):
                    if x in group or y in group:
                        idx_to_add=idx
                if idx_to_add==-1:
                    found_list.append(set([x,y]))
                else:
                    found_list[idx_to_add]=found_list[idx_to_add].union(set([x,y]))
        return found_list
    
    @staticmethod
    def isGenomeName(inputString):
    return np.sum([1 if char.isdigit() else 0 for char in inputString])/np.sum([1 if char.isalpha() else 0 for char in inputString])>0.8

    def lowest_certain_level(self,results):
        
        levels_id={'Superkingdom':1,'phylum':1,'class':2,'order':3,'family':4,'genus':5,'species':6}
        species=set([x[0][levels_id['species']] for x in results])
        genuses=set([x[0][levels_id['genus']] for x in results])
        families=set([x[0][levels_id['family']] for x in results])
        orders=set([x[0][levels_id['order']] for x in results])
        classes=set([x[0][levels_id['class']] for x in results])
        phylums=set([x[0][levels_id['phylum']] for x in results])
        Superkingdoms=set([x[0][levels_id['Superkingdom']] for x in results])

        if len(species)==1:
            return ';'.join(results[0][0])
        elif len(genuses)==1:
            return ';'.join(results[0][0][0:6])
        elif len(families)==1:
            return ';'.join(results[0][0][0:5])
        if len(orders)==1:
            return ';'.join(results[0][0][0:4])
        elif len(classes)==1:
            return ';'.join(results[0][0][0:3]) 
        elif len(phylums)==1:
            return ';'.join(results[0][0][0:2]) 
        elif len(Superkingdoms)==1:
            return ';'.join(results[0][0][0:1]) 
        else:
            return False
    
    def redundant_columns_indentification(self):
        distances=get_sym_kl_rows(self.update_matrix)
        flatten_distances=distances.flatten()
        list_of_pairs=np.argwhere(distances<np.percentile(d, 10, axis=0)).tolist()
        self.equiv_classes=FastaMarkers2Excall.find_equiv_classes(list_of_pairs)
        
    def align_markers(self,p_value_threshold):
        final_results=[]
        for idx, (seq, description) in tqdm.tqdm(self.seq_IDS.items()):
            pval=float(description.split(':')[1])
            if pval<=p_value_threshold:
                FileUtility.create_fasta_file('temp.fasta',[seq],['temp'])
                blastx_cline=NcbiblastnCommandline(query='temp.fasta', db="/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta", evalue=0.001, outfmt=5, out="temp.xml")
                blastx_cline()
                f=open("temp.xml",'r')
                blast_records = NCBIXML.parse(f)
                flag=False
                score=-1
                alignment_length=-1
                results=[]
                for blast_record in blast_records:
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            if not flag and score==-1:
                                score=hsp.score
                                alignment_length=hsp.align_length
                                flag=True
                            if hsp.score >= score and hsp.align_length>=alignment_length:
                                results.append((self.ez_taxa_dict[alignment.hit_id],hsp.expect))
                if len(results)>0:
                    res=self.lowest_certain_level(results)
                    if res:
                        final_results.append((seq,res+idx[-1],pval))
                else:
                    final_results.append((seq,'ZZZNOVEL'+idx[-1],pval))
        # sorted markers by the taxonomy information of the last certain level
        self.aligned_markers=sorted(final_results, key=operator.itemgetter(1), reverse=False)
        self.min_p_value=p_value_threshold
        self.update_matrix_by_markers()