Merge branch 'master' of https://github.com/fabriziocosta/EDeN

fabriziocosta · Aug 4, 2016 · bf4b9b5 · bf4b9b5
2 parents 9898cfb + 2c6e210
commit bf4b9b5
Show file tree

Hide file tree

Showing 6 changed files with 393 additions and 24 deletions.
diff --git a/eden/model.py b/eden/model.py
@@ -88,6 +88,16 @@ def predict(self, iterable):
         data_matrix = self._data_matrix(iterable)
         return self.estimator.predict(data_matrix)
 
+    def annotate(self, iterable):
+        assert(is_iterable(iterable)), 'Not iterable'
+        graphs = mp_pre_process(iterable,
+                                pre_processor=self.pre_processor,
+                                pre_processor_args=self.pre_processor_args,
+                                n_blocks=self.pre_processor_n_blocks,
+                                block_size=self.pre_processor_block_size,
+                                n_jobs=self.pre_processor_n_jobs)
+        return self.vectorizer.annotate(graphs, self.estimator)
+
     def decision_function(self, iterable):
         data_matrix = self._data_matrix(iterable)
         return self.estimator.decision_function(data_matrix)

diff --git a/graphprot/graphprot_seqmodel b/graphprot/graphprot_seqmodel
@@ -1,15 +1,21 @@
 #!/usr/bin/env python
 
-import os
+import argparse
 import logging
-
+import os
+import sys
 from collections import deque
 from eden.model_base import ModelInitializerBase, main_script
 from eden.sequence import Vectorizer
+from eden.util import configure_logging
+from eden.util import save_output
+from eden.util import serialize_dict
 from GArDen.convert.sequence import FastaToSeq
 from numpy.random import randint, uniform
 from sklearn.linear_model import SGDClassifier
+from time import time
 
+logger = logging.getLogger(__name__)
 
 DESCRIPTION = """
 GraphProt:
@@ -76,28 +82,30 @@ class ModelInitializer(ModelInitializerBase):
                     # sniff kmer length from first dictionary key
                     kmer_len = len(priorstable.keys()[0])
                     # look up probabilities
-                    for i in range(len(seq)):
+                    for i in range(len(seq) - kmer_len + 1):
                         try:
                             prob = priorstable[seq[i:(i + kmer_len)]]
                         except KeyError:
-                            prob = None
+                            raise Exception("error, no prior assigned to kmer '{}'".format(seq[i:(i + kmer_len)]))
                         priors.append(prob)
-                    # compensate for the fact that the probabilities relate
-                    # to the center positions of the kmers
+                    # fill in values for positions without kmer score
+                    priors += [None] * (kmer_len - 1)
+                    # scores relate to the center positions of the kmers, so
+                    # we shift the values accordingly
                     priors.rotate(int(kmer_len / 2))
                     priors = list(priors)
                     # combine weights and priors
                     comb = []
                     for w, p in zip(weights, priors):
                         if p is not None:
-                            comb.append(max(0, w - (priors_weight * p)))
+                            comb.append(max(0.0000000001, w - (priors_weight * p)))
                         else:
-                            comb.append(0)
+                            comb.append(w)
                     weights = comb
                 yield id, seq, weights
 
         pre_processor_parameters = {"vp_weight": [1.0],
-                                    "context_weight": [0.0]}
+                                    "context_weight": [0.0000000001]}
 
         if (args.priors_file is not None):
             # add k-mer probabilities to parameters
@@ -117,6 +125,22 @@ class ModelInitializer(ModelInitializerBase):
             'nbits': [14, 16, 18]}
         return vectorizer, vectorizer_parameters
 
+    def estimator_init(self, args):
+        """Setup the estimator and set of matching parameter choices."""
+        # defaults to first entry of each list if no parameter optimization is done
+        # set to defaults of SGDClassifier here
+        estimator = SGDClassifier(average=True, class_weight='balanced', shuffle=True)
+        estimator_parameters = {'n_iter': [5] + list(randint(5, 200, size=args.n_iter)),
+                                'penalty': ['l2', 'l1', 'elasticnet'],
+                                'l1_ratio': [0.15] + list(uniform(0.1, 0.9, size=args.n_iter)),
+                                'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
+                                'power_t': [0.5] + list(uniform(0.1, size=args.n_iter)),
+                                'alpha': [0.0001] + [10 ** x for x in range(-8, 0)],
+                                'eta0': [10 ** x for x in range(-4, -1)],
+                                'learning_rate': ["optimal", "invscaling", "constant"],
+                                'n_jobs': [-1]}
+        return estimator, estimator_parameters
+
     def add_arguments(self, parser):
         """Add arguments for the main call."""
         parser.add_argument('--version', action='version', version='0.1')
@@ -167,10 +191,81 @@ class ModelInitializer(ModelInitializerBase):
         return parser
 
 
+def main_predict_profile(model_initializer, args):
+    iterator = model_initializer.load_data(args)
+    from itertools import tee
+    iterator, iterator_ = tee(iterator)
+
+    from eden.model import ActiveLearningBinaryClassificationModel
+    model = ActiveLearningBinaryClassificationModel()
+    model.load(args.model_file)
+    logger.info(model.get_parameters())
+
+    output = []
+    g = model.annotate(iterator)
+    from itertools import izip
+    for (header, seq), (vertices, scores) in izip(iterator_, g):
+        assert "".join(vertices) == seq.upper().replace('T', 'U'), "error: pre_processor was not applied. got raw sequence '{}'".format(seq)
+        for pos, score in enumerate(scores):
+            output.append("{}\t{}\t{}\n".format(header, pos, score))
+    save_output(text=output, output_dir_path=args.output_dir_path, out_file_name='profile.txt')
+
+
 if __name__ == "__main__":
-    model_initializer = ModelInitializer()
-    main_script(model_initializer=model_initializer,
-                description=DESCRIPTION,
-                epilog=EPILOG,
-                prog_name=os.path.basename(__file__),
-                logger=logging.getLogger())
+    if "predict_profile" in sys.argv:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("-v", "--verbosity",
+                            action="count",
+                            help="Increase output verbosity")
+        parser.add_argument("-x", "--no-logging",
+                            dest="no_logging",
+                            help="If set, do not log on file.",
+                            action="store_true")
+        parser.add_argument(
+            "action",
+            help="What to do.")
+        parser.add_argument("-i", "--input-file",
+                            dest="input_file",
+                            help="Path to file containing input.",
+                            required=True)
+        parser.add_argument("-m", "--model-file",
+                            dest="model_file",
+                            help="Model file name. Note: it will be located in the output directory.",
+                            default="model")
+        parser.add_argument("-o", "--output-dir",
+                            dest="output_dir_path",
+                            help="Path to output directory.",
+                            default="out")
+        args = parser.parse_args()
+        assert args.action == "predict_profile"
+        model_initializer = ModelInitializer()
+
+        prog_name = "graphprot_seqmodel"
+        if args.no_logging:
+            configure_logging(logger, verbosity=args.verbosity)
+        else:
+            configure_logging(logger, verbosity=args.verbosity, filename=prog_name + '.log')
+
+        logger.debug('-' * 80)
+        logger.debug('Program: %s' % prog_name)
+        logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))
+
+        start_time = time()
+        try:
+            main_predict_profile(model_initializer, args)
+        except Exception:
+            import datetime
+            curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
+            logger.exception("Program run failed on %s" % curr_time)
+            exit(1)
+        finally:
+            end_time = time()
+            logger.info('Elapsed time: %.1f sec', end_time - start_time)
+        exit(0)
+    else:
+        model_initializer = ModelInitializer()
+        main_script(model_initializer=model_initializer,
+                    description=DESCRIPTION,
+                    epilog=EPILOG,
+                    prog_name=os.path.basename(__file__),
+                    logger=logging.getLogger())
diff --git a/graphprot/prepare_graphprot_seqs.py b/graphprot/prepare_graphprot_seqs.py
@@ -12,6 +12,10 @@
 #   * check if seq length and core length arguments match or handle properly
 #   * handle input/output error gracefully
 #   * check if input bed coordinates are stranded
+#   * seq_len == core_len does not work
+#   * have to set default output prefix
+#   * negatives may be shuffled onto the wrong strand. investigate if this is a recent bedtools thing. otherwise shuffle separately for both strands
+#   * handle "crazy" bed formats (eg 7 fields from piranha)
 
 from __future__ import print_function
 import argparse
@@ -94,7 +98,7 @@
     raise Exception("Error: bad length calculation.")
 
 
-def dbg_head(sites, description="", n=npeek, run=args.debug):
+def dbg_head(sites, description="", n=npeek, run=True):
     """Print the first few bed entries."""
     if run:
         logging.debug(description)
@@ -153,7 +157,7 @@ def get_flanks(cores,
     if cores.count() == flanks_upstream.count() == flanks_downstream.count():
         return flanks_upstream, flanks_downstream
     else:
-        if args.debug:
+        if True:
             cores.saveas("debug_cores.bed")
             flanks_upstream.saveas("debug_upstream.bed")
             flanks_downstream.saveas("debug_downstream.bed")

diff --git a/test/simple_negatives.fa b/test/simple_negatives.fa
@@ -0,0 +1,100 @@
+>id_0
+NEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_1
+ANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_2
+AANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_3
+AAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_4
+AAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_5
+AAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_6
+AAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_7
+AAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_8
+AAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_9
+AAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_10
+AAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_11
+AAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_12
+AAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_13
+AAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_14
+AAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_15
+AAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_16
+AAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_17
+AAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_18
+AAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_19
+AAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_20
+AAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_21
+AAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_22
+AAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_23
+AAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_24
+AAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_25
+AAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAA
+>id_26
+AAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAA
+>id_27
+AAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAA
+>id_28
+AAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAA
+>id_29
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAA
+>id_30
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAA
+>id_31
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAA
+>id_32
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAA
+>id_33
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAA
+>id_34
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAA
+>id_35
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAA
+>id_36
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAA
+>id_37
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAA
+>id_38
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAA
+>id_39
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAA
+>id_40
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAA
+>id_41
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAA
+>id_42
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAA
+>id_43
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAA
+>id_44
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAA
+>id_45
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAA
+>id_46
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAA
+>id_47
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAA
+>id_48
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAA
+>id_49
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEA