In [1]:
import sys
sys.path.append('../')
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import numpy as np
from multiprocessing import Pool
import tqdm
import random
from scipy import sparse
from utility.file_utility import FileUtility
from Bio import SeqIO
import timeit
from make_representations.cpe_apply import CPE



In [2]:

class Metagenomic16SReadRepresentation:
    '''
        Make k-mer from directory of fasta files
    '''

    def __init__(self, fasta_files, indexing, sampling_number=3000, num_p=20):
        '''
        :param fasta_files: list of fasta files
        :param indexing: the index
        :param sampling_number:
        :param num_p:
        '''
        self.fasta_files=fasta_files
        self.num_p=num_p
        self.sampling_number=sampling_number
        self.indexing=indexing

    def generate_cpes_all_reads(self, cpe_file, vocab, save=False, norm=False):
        f=open(cpe_file,'r')
        self.CPE_Applier=CPE(f,separator='', merge_size=vocab)
        self.cpe_vocab=[''.join(x.split()).replace('</w>','').lower() for x in FileUtility.load_list(cpe_file)[1::]]
        self.cpe_vocab=list(set(self.cpe_vocab))
        self.cpe_vocab.sort()
        self.cpe_vectorizer = TfidfVectorizer(use_idf=False, vocabulary=self.cpe_vocab, analyzer='word',
                                          norm=None, stop_words=[], lowercase=True, binary=False, tokenizer=str.split)

        data = dict()#np.zeros((len(self.fasta_files), len(self.cpe_vocab))).astype(np.float64)

        # multi processing extraction of cpe distributions
        pool = Pool(processes=self.num_p)
        for ky, (v,s) in tqdm.tqdm(pool.imap_unordered(self.get_cpe_distribution_reads, self.fasta_files, chunksize=1),
                               total=len(self.fasta_files)):
            FileUtility.save_obj('RA_reads_'+str(self.indexing[ky]), v)
        FileUtility.save_list('RA_reads_vocab',self.cpe_vocab)
        # normalize the frequencies
        if norm:
            data = normalize(data, axis=1, norm='l1')
        data = sparse.csr_matrix(data)

        return data

    def get_cpe_distribution_reads(self, file_name):
        corpus=[]
        if file_name[-1]=='q':
            for cur_record in SeqIO.parse(file_name, "fastq"):
                corpus.append(str(cur_record.seq).lower())
        else:
            for cur_record in SeqIO.parse(file_name, "fasta"):
                corpus.append(str(cur_record.seq).lower())
        tot_size=len(corpus)
        if self.sampling_number==-1:
            random.shuffle(corpus)
        else:
            corpus = random.sample(corpus, min(self.sampling_number,len(corpus)))
        corpus=[self.CPE_Applier.segment(x) for x in corpus]
        return file_name,(self.cpe_vectorizer.fit_transform(corpus).toarray(),tot_size)


In [3]:
fasta_files, mapping = FileUtility.read_fasta_directory('../../../datasets/deepbio/microbiome/RA/', 'fastq')

In [4]:
Meta16S=Metagenomic16SReadRepresentation(fasta_files,mapping, sampling_number=5000)

In [5]:
Meta16S.generate_cpes_all_reads('../../16S_datasets/ra/cpe/ra_cpe',vocab=50000)

100%|██████████| 114/114 [1:13:18<00:00, 18.56s/it]


<1x1 sparse matrix of type '<class 'numpy.object_'>'
	with 0 stored elements in Compressed Sparse Row format>

In [None]:
for i in range(0,114):
    print(i)
    a=FileUtility.load_obj('RA_reads_'+str(i)+'.pickle')

Process ForkPoolWorker-36:
Process ForkPoolWorker-24:
Process ForkPoolWorker-34:
Process ForkPoolWorker-30:
Process ForkPoolWorker-28:
Process ForkPoolWorker-40:
Process ForkPoolWorker-22:
Process ForkPoolWorker-35:
Process ForkPoolWorker-39:
Process ForkPoolWorker-25:
Process ForkPoolWorker-31:
Process ForkPoolWorker-37:
Process ForkPoolWorker-32:
Process ForkPoolWorker-23:
Process ForkPoolWorker-29:
Process ForkPoolWorker-26:
Process ForkPoolWorker-21:
Process ForkPoolWorker-38:
Process ForkPoolWorker-27:
Process ForkPoolWorker-33:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Trace