In [5]:
import sys
sys.path.append('../')
import numpy as np
from utility.file_utility import FileUtility
import random
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from utility.math_utility import normalize_mat


In [6]:
# Load the taxonomy annotation lines. Each line is later split on whitespace and
# fields 1..N are joined into a lineage label; the line position is used as an
# index into the segmented corpus, so both files must have the same line order.
# NOTE(review): field 0 is presumably a sequence identifier — confirm against the file.
taxonomy_gg_all=FileUtility.load_list('../../16S_general_data/GG/ids_min20.txt')

In [9]:
# Taxonomic rank -> number of whitespace-separated lineage fields (after field 0)
# that are concatenated to form the label for that rank.
levels={'phylum':2,'class':3,'order':4,'family':5,'genus':6,'species':7}

# Only (taxonomy, weight) pairs whose rounded weight is strictly above this value
# are written to the per-level "prob" lists.
MIN_PROB=0.1

## loading the segmented file (once: it is identical for every level, so
## re-reading it inside the loop only repeated the same disk I/O)
sequences = FileUtility.load_list('../../16S_general_data/GG/segmented_corpus_min20.txt')

for level in levels:
    print (level)

    ## making a dic for this level going from tax-id to list of rows in the segmented file
    taxonomy_ids=dict()
    for num_idx, line in enumerate(taxonomy_gg_all):
        tax_id=''.join(line.split()[1:(levels[level]+1)])
        taxonomy_ids.setdefault(tax_id, []).append(num_idx)

    ## make a corpus (each line is a distinct taxonomy) and all merged sequences for that
    print('start corpus making')
    texts=[]
    taxonomy_list=[]
    for taxa, list_id in taxonomy_ids.items():
        texts.append(' '.join(sequences[idx] for idx in list_id))
        taxonomy_list.append(taxa)
    print('end corpus making')

    # Raw term counts per taxonomy (no idf, no norm); tokens are the
    # whitespace-separated segments of the merged text.
    tf_vectorizer = TfidfVectorizer(use_idf=False, analyzer='word',
                                    norm=None, stop_words=[], lowercase=True, binary=False, smooth_idf=False, tokenizer=str.split)
    tf_vec = tf_vectorizer.fit_transform(texts)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old method on releases that predate it.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        feature_names = tf_vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = tf_vectorizer.get_feature_names()

    print ('normalization')
    # NOTE(review): normalize_mat is a project helper; presumably it rescales each
    # column (segment) across taxonomies — confirm in utility.math_utility.
    normalize_matrix=normalize_mat(tf_vec,axis=0)
    normalize_matrix=normalize_matrix.toarray()

    # One output line per feature (column): space-separated "taxonomy###weight"
    # entries, in row order, for every taxonomy whose rounded weight exceeds MIN_PROB.
    prob=[]
    for col_idx in range(normalize_matrix.shape[1]):
        # Round the whole column once instead of rounding each cell twice.
        rounded_col=np.round(normalize_matrix[:,col_idx],5)
        kept_rows=np.nonzero(rounded_col > MIN_PROB)[0].tolist()
        prob.append(' '.join('###'.join([taxonomy_list[row_idx],str(rounded_col[row_idx])]) for row_idx in kept_rows))

    print ('saving')
    FileUtility.save_sparse_csr('../../16S_general_data/GG//intermediate/gg_tf_'+level+'_mat', tf_vec)
    FileUtility.save_list('../../16S_general_data/GG/intermediate/gg_'+level,taxonomy_list)
    FileUtility.save_list('../../16S_general_data/GG/intermediate/gg_cpe_'+level+'_prob',prob)
    FileUtility.save_list('../../16S_general_data/GG/intermediate/gg_cpe_'+level+'_features',feature_names)

family
start corpus making
end corpus making
normalization
saving
class
start corpus making
end corpus making
normalization
saving
order
start corpus making
end corpus making
normalization
saving
phylum
start corpus making
end corpus making
normalization
saving
genus
start corpus making
end corpus making
normalization
saving
species
start corpus making
end corpus making
normalization
saving


In [15]:
# Build a nested lookup: segment -> rank -> list of (taxonomy label, weight)
# pairs, read back from the per-level "prob" and "features" lists. Line i of the
# prob list is interpreted as belonging to line i of the features list.
seq2taxa={}
for level in levels:
    level_prefix='../../16S_general_data/GG/intermediate/gg_cpe_'+level
    prob=FileUtility.load_list(level_prefix+'_prob')
    features=FileUtility.load_list(level_prefix+'_features')
    for f_idx,feature in enumerate(features):
        level_entries=seq2taxa.setdefault(feature,dict()).setdefault(level,[])
        # Parse the whole line first so a malformed entry leaves the list untouched.
        parsed_pairs=[]
        for entry in prob[f_idx].split():
            fields=entry.split('###')
            parsed_pairs.append((fields[0],float(fields[1])))
        level_entries.extend(parsed_pairs)

In [17]:
# Persist the segment -> rank -> [(taxonomy, weight)] mapping built above.
# NOTE(review): the .pickle extension suggests pickle serialization inside
# FileUtility.save_obj — confirm, and only load this file from trusted sources.
FileUtility.save_obj('../../16Seq2Seg/data_config/seq2tax.pickle', seq2taxa)

In [21]:
# Spot check: display the per-rank taxonomy assignments stored for one segment.
seq2taxa['aagtccgttgg']

{'class': [('k__Archaea;p__Euryarchaeota;c__Halobacteria;', 0.96667)],
 'family': [('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;',
   0.96667)],
 'genus': [('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Halobacterium;',
   0.38333),
  ('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Halorhabdus;',
   0.5)],
 'order': [('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;',
   0.96667)],
 'phylum': [('k__Archaea;p__Euryarchaeota;', 0.96667)],
 'species': [('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Halorhabdus;s__',
   0.5),
  ('k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Halobacterium;s__',
   0.38333)]}