In [1]:
import sys
sys.path.append('../')
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from utility.file_utility import FileUtility
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
%matplotlib inline 

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from Bio import SeqIO
from nltk import FreqDist
import random
import itertools

from make_representations.cpe_efficient import train_cpe

Populating the interactive namespace from numpy and matplotlib


In [2]:
class ReadFasta(object):
    '''
        In-memory view of a FASTA file: one lower-cased sequence string and one
        label per record, plus helpers to draw per-label samples and to turn
        sequences into k-mer TF-IDF vectors.
    '''
    def __init__(self, fasta_address, label_idf_func):
        '''
            fasta_address: FASTA source handed to SeqIO.parse(..., 'fasta')
            label_idf_func: function applied to each lower-cased record id;
                            its return value is stored as the record's label
        '''
        self.labels=[]
        self.corpus=[]
        for cur_record in SeqIO.parse(fasta_address, 'fasta'):
            self.corpus.append(str(cur_record.seq).lower())
            self.labels.append(str(cur_record.id).lower())
        self.labels=[label_idf_func(l) for l in self.labels]

    def get_samples(self, envs, N, seed=None):
        '''
            Randomly draws sequences for each requested label.

            envs: iterable of labels; results are grouped in the order given
            N: number of sequences to draw per label (without replacement);
               -1 means "all sequences of that label, in random order"
            seed: optional seed for a private random.Random, making the draw
                  reproducible; None (default) uses the global random module
                  exactly as before

            Returns (corpus, labels): two parallel lists.
            Raises ValueError if N is negative (other than -1) or larger than
            the number of sequences available for one of the labels.
        '''
        # The random module exposes the same shuffle/sample API as a Random instance.
        rng = random if seed is None else random.Random(seed)
        labels=[]
        corpus=[]
        for env in envs:
            selected=[idx for idx,v in enumerate(self.labels) if env==v]
            if N==-1:
                rng.shuffle(selected)
                idxs=selected
            else:
                if N<0 or N>len(selected):
                    # Fail with the offending label instead of random.sample's
                    # generic "Sample larger than population" message.
                    raise ValueError('cannot sample N={} sequences for env {!r}: {} available'.format(N, env, len(selected)))
                idxs=rng.sample(selected, N)
            # extend in place; rebuilding the lists per env copies them every time
            corpus.extend(self.corpus[idx] for idx in idxs)
            labels.extend(self.labels[idx] for idx in idxs)
        return corpus, labels

    def get_vector_rep(self, corpus, k, restricted=True):
        '''
            Fits a character k-mer TfidfVectorizer (use_idf, L1 norm) on corpus
            and returns the result of fit_transform.

            corpus: iterable of sequence strings
            k: k-mer length; ngram_range is (k, k)
            restricted: True  -> vocabulary fixed to every length-k string
                                 over 'atcg' (4**k entries)
                        False -> vocabulary learned from corpus
        '''
        # Both modes share every setting except the optional fixed vocabulary.
        params = dict(use_idf=True, analyzer='char', ngram_range=(k, k),
                      norm='l1', stop_words=[], lowercase=True, binary=False)
        if restricted:
            params['vocabulary'] = [''.join(xs) for xs in itertools.product('atcg', repeat=k)]
        tf_vec = TfidfVectorizer(**params)
        return tf_vec.fit_transform(corpus)

In [3]:
# Load the 16S FASTA file; each record's label is the part of its id before the first '.'.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this mount exists.
FST = ReadFasta(
    fasta_address='/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/new/environment_16S.fa',
    label_idf_func=lambda record_id: record_id.split('.')[0]
)

In [4]:
# Mapping whose items are iterated below as (label, type) pairs.
# NOTE(review): presumably a pickle load (file extension) - only load trusted files; confirm in FileUtility.
map_type=FileUtility.load_obj('../../datasets/processed_data/eco/map_label_type.pickle')


In [6]:
# Keep every label whose type in map_type is 'echo'.
# NOTE(review): 'echo' looks like a misspelling of 'eco', but it has to match the
# values stored in map_label_type.pickle - confirm against the data before changing it.
complete_eco = [label for label, label_type in map_type.items() if label_type == 'echo']

In [17]:
# Hand-written label lists: environmental labels (eco) and host-gut labels (orgs).
eco = [
    'soil',
    'marine',
    'bioreactor',
    'freshwater',
    'groundwater',
    'sediment',
    'bioreactor_sludge',
    'food_fermentation',
    'compost',
    'rhizosphere',
    'food',
    'hydrocarbon',
    'marine_sediment',
    'activated_sludge',
    'aquatic',
    'hot_springs',
    'freshwater_sediment',
    'ant_fungus_garden',
]
orgs = [
    'human_gut',
    'bovine_gut',
    'mouse_gut',
    'chicken_gut',
    'termite_gut',
]

In [8]:
# Draw 200 random sequences (without replacement) for every label in complete_eco.
# The draw uses the unseeded global random module, so each run picks different sequences.
corpus_eco, labels_eco = FST.get_samples(envs=complete_eco, N=200)

In [9]:
# For each k: build the restricted k-mer TF-IDF matrix of the sampled corpus and
# write it to disk together with the matching label list.
for k in [6]:
    print(k)
    vec_pres = FST.get_vector_rep(corpus_eco, k, restricted=True)
    FileUtility.save_sparse_csr(
        '../../datasets/processed_data/eco_all_classes/{}-mer_eco_restrictedmer_all.npz'.format(k),
        vec_pres
    )
    # The label file name does not depend on k, so every iteration rewrites the same file.
    FileUtility.save_list(
        '../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt',
        labels_eco
    )
    # Disabled experiment, kept for reference:
    #RF=RFClassifier(vec_pres, labels_eco)
    #RF.tune_and_eval('/mounts/data/proj/asgari/dissertation/git_repos/MicroPheno/results/classification_results/env/'+str(k)+'_'+'eco_restrcited')

6


In [3]:
from classifier.svm import SVM
from classifier.random_forest import RFClassifier


In [4]:
# Reload the 6-mer feature matrix and label list from the same paths the
# feature-building loop writes to, so this step does not need the sampling cells.
X=FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz')
Y=FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
# RFClassifier comes from classifier.random_forest; the string passed to
# tune_and_eval is presumably the output path/prefix for results - confirm in that module.
MRF = RFClassifier(X, Y)
MRF.tune_and_eval('../../datasets/results/eco_all/RF_all_classes')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [5]:
# Same features and labels, evaluated with the SVM wrapper from classifier.svm.
# NOTE(review): this rebinds MRF, the name that held the RFClassifier instance;
# after this cell MRF refers to the SVM object.
MRF = SVM(X, Y)
MRF.tune_and_eval('../../datasets/results/eco_all/SVM_all_classes')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [13]:
# Sanity check: the column count should be 4**6 = 4096, the size of the
# restricted 6-mer vocabulary over 'atcg'.
X.shape

(6800, 4096)

In [15]:
# Sanity check: there must be exactly one label per row of X.
len(Y)

6800