In [5]:
import sys
sys.path.append('../')
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from utility.file_utility import FileUtility
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
%matplotlib inline 

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from Bio import SeqIO
from nltk import FreqDist
import random
import itertools

from make_representations.cpe_efficient import train_cpe

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [12]:
class ReadFasta(object):
    '''
    In-memory view of a FASTA file: lower-cased sequences plus one label per record.

    Attributes filled by the constructor:
        corpus -- list of lower-cased sequence strings, in file order
        labels -- list of label_idf_func(lower-cased record id), aligned with corpus
    '''
    def __init__(self, fasta_address, label_idf_func):
        '''
            fasta_address: FASTA source handed to Bio.SeqIO.parse(..., 'fasta')
            label_idf_func: callable mapping a lower-cased record id to its label
        '''
        self.labels=[]
        self.corpus=[]
        for cur_record in SeqIO.parse(fasta_address, 'fasta'):
            self.corpus.append(str(cur_record.seq).lower())
            self.labels.append(label_idf_func(str(cur_record.id).lower()))

    def get_samples(self, envs, N):
        '''
            Draw a random sample (without replacement) of sequences for each label.

            envs: list of labels to sample from
            N: sample size per label; -1 means "take every record of that label"

            Returns (corpus, labels): two aligned lists, grouped in the order of envs.
            Raises ValueError (from random.sample) when N exceeds the number of
            records available for one of the labels.
        '''
        corpus=[]
        labels=[]
        for env in envs:
            selected=[idx for idx,v in enumerate(self.labels) if env==v]
            # Resolve the size per label in a local: overwriting N here would make
            # the first label's size leak into every following label.
            sample_size=len(selected) if N==-1 else N
            idxs=random.sample(selected, sample_size)
            corpus.extend(self.corpus[idx] for idx in idxs)
            labels.extend(self.labels[idx] for idx in idxs)
        return corpus, labels

    def get_vector_rep(self, corpus, k, restricted=True):
        '''
            Build an L1-normalised TF-IDF matrix of character k-mers.

            corpus: iterable of sequence strings
            k: k-mer length (ngram_range is (k, k))
            restricted: when True the feature set is fixed to all 4**k strings over
                        'atcg' (in itertools.product order); when False the
                        vocabulary is learned from corpus.

            Returns whatever TfidfVectorizer.fit_transform returns for corpus.
        '''
        # vocabulary=None is the vectorizer's default, i.e. "learn it from the data".
        vocab = [''.join(xs) for xs in itertools.product('atcg', repeat=k)] if restricted else None
        tf_vec = TfidfVectorizer(use_idf=True, vocabulary=vocab, analyzer='char', ngram_range=(k, k),
                                 norm='l1', stop_words=[], lowercase=True, binary=False)
        return tf_vec.fit_transform(corpus)

In [13]:
# Load the whole FASTA file into memory; a record's label is the part of its
# (lower-cased) id that precedes the first '.'.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
FST = ReadFasta(
    '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/new/environment_16S.fa',
    lambda x: x.split('.')[0],
)

In [14]:
map_type=FileUtility.load_obj('../../datasets/processed_data/eco/map_label_type.pickle')
eco=['soil', 'marine','bioreactor','freshwater','groundwater','sediment','bioreactor_sludge','food_fermentation','compost','rhizosphere','food','hydrocarbon','marine_sediment','activated_sludge','aquatic','hot_springs','freshwater_sediment','ant_fungus_garden']
orgs=['human_gut','bovine_gut','mouse_gut','chicken_gut','termite_gut']

In [1]:
# Label names of the ecological environments.
eco = [
    'soil', 'marine', 'bioreactor', 'freshwater', 'groundwater', 'sediment',
    'bioreactor_sludge', 'food_fermentation', 'compost', 'rhizosphere', 'food',
    'hydrocarbon', 'marine_sediment', 'activated_sludge', 'aquatic', 'hot_springs',
    'freshwater_sediment', 'ant_fungus_garden',
]

# Label names of the host-associated (gut) environments.
orgs = [
    'human_gut',
    'bovine_gut',
    'mouse_gut',
    'chicken_gut',
    'termite_gut',
]

In [5]:
# Sort in place: every later use of `orgs` sees the alphabetical order.
orgs.sort()
# Human-readable listing with underscores shown as spaces.
readable_orgs = ', '.join(name.replace('_', ' ') for name in orgs)
print(readable_orgs)

bovine gut, chicken gut, human gut, mouse gut, termite gut


In [16]:
# get_samples draws with the global `random` module; seed it so the
# 10000-sequences-per-environment sample is identical after a kernel restart.
random.seed(0)
corpus_eco, labels_eco=FST.get_samples(eco,10000)

In [17]:
# The label list is the same for every k, so it is written once instead of
# being rewritten on each loop iteration.
FileUtility.save_list('../../datasets/processed_data/eco_10000/K/eco_label_restrictedkmer.txt',labels_eco)
for k in [3,4,5,6,7,8]:
    print (k)
    # TF-IDF over the fixed 4**k 'atcg' vocabulary, one matrix file per k.
    vec_pres=FST.get_vector_rep(corpus_eco, k,restricted=True)
    FileUtility.save_sparse_csr('../../datasets/processed_data/eco_10000/K/'+str(k)+'-mer'+'_eco_restrictedmer.npz', vec_pres)
    #RF=RFClassifier(vec_pres, labels_eco)
    #RF.tune_and_eval('/mounts/data/proj/asgari/dissertation/git_repos/MicroPheno/results/classification_results/env/'+str(k)+'_'+'eco_restrcited')

3
4
5
6
7
8


In [7]:
# Train the CPE segmentation on the sampled eco sequences.
# NOTE(review): 10000 is presumably the number of merge operations and the two
# paths the merge table / frequency outputs -- confirm against train_cpe.
train_cpe(
    corpus_eco,
    '../../datasets/env/npe_eco_10000',
    10000,
    '../../datasets/env/npe_eco_10000_freq',
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [17]:
# get_samples draws with the global `random` module; seed it so the
# 620-sequences-per-host sample is identical after a kernel restart.
random.seed(0)
corpus_orgs, labels_orgs=FST.get_samples(orgs,620)

In [9]:
# Train the CPE segmentation on the sampled host-associated sequences.
# NOTE(review): 10000 is presumably the number of merge operations and the two
# paths the merge table / frequency outputs -- confirm against train_cpe.
train_cpe(
    corpus_orgs,
    '../../datasets/env/npe_org_10000',
    10000,
    '../../datasets/env/npe_org_10000_freq',
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [20]:
from make_representations.cpe_apply import BPE

In [38]:
# Open the trained eco merge table with a context manager so the handle is
# always closed. Segmentation stays inside the block in case BPE reads from
# the handle lazily rather than in its constructor.
with open('../../datasets/env/npe_eco_10000','r') as f:
    CPE_Applier_echo=BPE(f,separator='')
    # One segmented string per sampled eco sequence, in corpus order.
    new_corpus_echo=[CPE_Applier_echo.segment(x) for x in corpus_eco]


In [49]:
# Word-level (unigram) TF-IDF with L1-normalised rows; the vocabulary is
# learned when the vectorizer is fitted.
tf_vec = TfidfVectorizer(
    use_idf=True,
    analyzer='word',
    ngram_range=(1, 1),
    norm='l1',
    stop_words=[],
    lowercase=True,
    binary=False,
)

In [53]:
# Fit first: the vectorizer has no vocabulary until fit_transform has run, so
# reading the feature names beforehand fails on a fresh top-to-bottom run.
eco_tf_idf=tf_vec.fit_transform(new_corpus_echo)
FileUtility.save_list('../../datasets/env/cpe_eco_vocab',tf_vec.get_feature_names())

In [51]:
# Persist the eco TF-IDF matrix.
FileUtility.save_sparse_csr(
    '../../datasets/env/cpe_eco',
    eco_tf_idf,
)

In [52]:
# Persist the eco labels (one entry per sampled sequence).
FileUtility.save_list(
    '../../datasets/env/data_config/cpe_eco_label',
    labels_eco,
)

In [44]:
# Open the trained org merge table with a context manager so the handle is
# always closed. Segmentation stays inside the block in case BPE reads from
# the handle lazily rather than in its constructor.
with open('../../datasets/env/npe_org_10000','r') as f:
    CPE_Applier_org=BPE(f,separator='')
    # One segmented string per sampled host-associated sequence, in corpus order.
    new_corpus_org=[CPE_Applier_org.segment(x) for x in corpus_orgs]



In [48]:
# Word-level (unigram) TF-IDF with L1-normalised rows for the org corpus.
tf_vec = TfidfVectorizer(use_idf=True, analyzer='word', ngram_range=(1,1),
                         norm='l1', stop_words=[], lowercase=True, binary=False)

# Stored under its own name: reusing `eco_tf_idf` here would silently replace
# the eco matrix with the org one for any cell run afterwards.
org_tf_idf=tf_vec.fit_transform(new_corpus_org)

# The vocabulary is only available after fitting.
FileUtility.save_list('../../datasets/env/cpe_org_vocab',tf_vec.get_feature_names())

FileUtility.save_sparse_csr('../../datasets/env/cpe_org',org_tf_idf)

FileUtility.save_list('../../datasets/env/data_config/cpe_org_label',labels_orgs)


In [2]:
from classifier.random_forest import RFClassifier


In [46]:
# The label list does not depend on k: write it once rather than six times.
FileUtility.save_list('../../datasets/env/data_config/eco_label_restrictedkmer.txt',labels_eco)
for k in [3,4,5,6,7,8]:
    print (k)
    #vec_pres=FST.get_vector_rep(corpus_eco, k,restricted=True)
    #FileUtility.save_sparse_csr('../../datasets/env/'+str(k)+'-mer'+'_eco_restrictedmer.npz', vec_pres)
    #RF=RFClassifier(vec_pres, labels_eco)
    #RF.tune_and_eval('/mounts/data/proj/asgari/dissertation/git_repos/MicroPheno/results/classification_results/env/'+str(k)+'_'+'eco_restrcited')

3
4
5
6
7
8


In [19]:
# Reload the saved 6-mer representation and its labels, then tune/evaluate a
# random forest on them.
# NOTE: this rebinds `labels_eco` to the list read back from disk.
for k in [6]:
    print(k)
    kmer_matrix_path = '../../datasets/processed_data/eco_10000/K/' + str(k) + '-mer' + '_eco_restrictedmer.npz'
    vec_pres = FileUtility.load_sparse_csr(kmer_matrix_path)
    labels_eco = FileUtility.load_list(
        '../../datasets/processed_data/eco_10000/K/eco_label_restrictedkmer.txt'
    )
    print(vec_pres.shape)
    RF = RFClassifier(vec_pres, labels_eco)
    results_prefix = '../../datasets/results/eco_10000/K/' + str(k) + '_' + 'eco_restrcited'
    RF.tune_and_eval(results_prefix)

6
(180000, 4096)


KeyboardInterrupt: 

In [47]:
# The label list is the same for every k, so it is written once instead of
# being rewritten on each loop iteration.
FileUtility.save_list('../../datasets/env/data_config/org_label_restrictedkmer.txt',labels_orgs)
for k in range(3,9):
    print (k)
    # TF-IDF over the fixed 4**k 'atcg' vocabulary, one matrix file per k.
    vec_pres=FST.get_vector_rep(corpus_orgs, k, restricted=True)
    FileUtility.save_sparse_csr('../../datasets/env/'+str(k)+'-mer'+'_org_restrictedkmer',vec_pres)
    #RF=RFClassifier(vec_pres, labels_orgs)
    #RF.tune_and_eval('/mounts/data/proj/asgari/dissertation/git_repos/MicroPheno/results/classification_results/env/'+str(k)+'_'+'org_restrictedkmer')

3
4
5
6
7
8


In [6]:
# NOTE(review): bare literal whose only effect is echoing 1 as the cell output;
# looks like a leftover scratch cell -- confirm before deleting.
1

1