Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/fabriziocosta/EDeN
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziocosta committed Aug 4, 2016
2 parents 9898cfb + 2c6e210 commit bf4b9b5
Show file tree
Hide file tree
Showing 6 changed files with 393 additions and 24 deletions.
10 changes: 10 additions & 0 deletions eden/model.py
Expand Up @@ -88,6 +88,16 @@ def predict(self, iterable):
data_matrix = self._data_matrix(iterable)
return self.estimator.predict(data_matrix)

def annotate(self, iterable):
assert(is_iterable(iterable)), 'Not iterable'
graphs = mp_pre_process(iterable,
pre_processor=self.pre_processor,
pre_processor_args=self.pre_processor_args,
n_blocks=self.pre_processor_n_blocks,
block_size=self.pre_processor_block_size,
n_jobs=self.pre_processor_n_jobs)
return self.vectorizer.annotate(graphs, self.estimator)

def decision_function(self, iterable):
data_matrix = self._data_matrix(iterable)
return self.estimator.decision_function(data_matrix)
Expand Down
125 changes: 110 additions & 15 deletions graphprot/graphprot_seqmodel
@@ -1,15 +1,21 @@
#!/usr/bin/env python

import os
import argparse
import logging

import os
import sys
from collections import deque
from eden.model_base import ModelInitializerBase, main_script
from eden.sequence import Vectorizer
from eden.util import configure_logging
from eden.util import save_output
from eden.util import serialize_dict
from GArDen.convert.sequence import FastaToSeq
from numpy.random import randint, uniform
from sklearn.linear_model import SGDClassifier
from time import time

logger = logging.getLogger(__name__)

DESCRIPTION = """
GraphProt:
Expand Down Expand Up @@ -76,28 +82,30 @@ class ModelInitializer(ModelInitializerBase):
# sniff kmer length from first dictionary key
kmer_len = len(priorstable.keys()[0])
# look up probabilities
for i in range(len(seq)):
for i in range(len(seq) - kmer_len + 1):
try:
prob = priorstable[seq[i:(i + kmer_len)]]
except KeyError:
prob = None
raise Exception("error, no prior assigned to kmer '{}'".format(seq[i:(i + kmer_len)]))
priors.append(prob)
# compensate for the fact that the probabilities relate
# to the center positions of the kmers
# fill in values for positions without kmer score
priors += [None] * (kmer_len - 1)
# scores relate to the center positions of the kmers, so
# we shift the values accordingly
priors.rotate(int(kmer_len / 2))
priors = list(priors)
# combine weights and priors
comb = []
for w, p in zip(weights, priors):
if p is not None:
comb.append(max(0, w - (priors_weight * p)))
comb.append(max(0.0000000001, w - (priors_weight * p)))
else:
comb.append(0)
comb.append(w)
weights = comb
yield id, seq, weights

pre_processor_parameters = {"vp_weight": [1.0],
"context_weight": [0.0]}
"context_weight": [0.0000000001]}

if (args.priors_file is not None):
# add k-mer probabilities to parameters
Expand All @@ -117,6 +125,22 @@ class ModelInitializer(ModelInitializerBase):
'nbits': [14, 16, 18]}
return vectorizer, vectorizer_parameters

def estimator_init(self, args):
"""Setup the estimator and set of matching parameter choices."""
# defaults to first entry of each list if no parameter optimization is done
# set to defaults of SGDClassifier here
estimator = SGDClassifier(average=True, class_weight='balanced', shuffle=True)
estimator_parameters = {'n_iter': [5] + list(randint(5, 200, size=args.n_iter)),
'penalty': ['l2', 'l1', 'elasticnet'],
'l1_ratio': [0.15] + list(uniform(0.1, 0.9, size=args.n_iter)),
'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
'power_t': [0.5] + list(uniform(0.1, size=args.n_iter)),
'alpha': [0.0001] + [10 ** x for x in range(-8, 0)],
'eta0': [10 ** x for x in range(-4, -1)],
'learning_rate': ["optimal", "invscaling", "constant"],
'n_jobs': [-1]}
return estimator, estimator_parameters

def add_arguments(self, parser):
"""Add arguments for the main call."""
parser.add_argument('--version', action='version', version='0.1')
Expand Down Expand Up @@ -167,10 +191,81 @@ class ModelInitializer(ModelInitializerBase):
return parser


def main_predict_profile(model_initializer, args):
iterator = model_initializer.load_data(args)
from itertools import tee
iterator, iterator_ = tee(iterator)

from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel()
model.load(args.model_file)
logger.info(model.get_parameters())

output = []
g = model.annotate(iterator)
from itertools import izip
for (header, seq), (vertices, scores) in izip(iterator_, g):
assert "".join(vertices) == seq.upper().replace('T', 'U'), "error: pre_processor was not applied. got raw sequence '{}'".format(seq)
for pos, score in enumerate(scores):
output.append("{}\t{}\t{}\n".format(header, pos, score))
save_output(text=output, output_dir_path=args.output_dir_path, out_file_name='profile.txt')


if __name__ == "__main__":
model_initializer = ModelInitializer()
main_script(model_initializer=model_initializer,
description=DESCRIPTION,
epilog=EPILOG,
prog_name=os.path.basename(__file__),
logger=logging.getLogger())
if "predict_profile" in sys.argv:
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--verbosity",
action="count",
help="Increase output verbosity")
parser.add_argument("-x", "--no-logging",
dest="no_logging",
help="If set, do not log on file.",
action="store_true")
parser.add_argument(
"action",
help="What to do.")
parser.add_argument("-i", "--input-file",
dest="input_file",
help="Path to file containing input.",
required=True)
parser.add_argument("-m", "--model-file",
dest="model_file",
help="Model file name. Note: it will be located in the output directory.",
default="model")
parser.add_argument("-o", "--output-dir",
dest="output_dir_path",
help="Path to output directory.",
default="out")
args = parser.parse_args()
assert args.action == "predict_profile"
model_initializer = ModelInitializer()

prog_name = "graphprot_seqmodel"
if args.no_logging:
configure_logging(logger, verbosity=args.verbosity)
else:
configure_logging(logger, verbosity=args.verbosity, filename=prog_name + '.log')

logger.debug('-' * 80)
logger.debug('Program: %s' % prog_name)
logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))

start_time = time()
try:
main_predict_profile(model_initializer, args)
except Exception:
import datetime
curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
logger.exception("Program run failed on %s" % curr_time)
exit(1)
finally:
end_time = time()
logger.info('Elapsed time: %.1f sec', end_time - start_time)
exit(0)
else:
model_initializer = ModelInitializer()
main_script(model_initializer=model_initializer,
description=DESCRIPTION,
epilog=EPILOG,
prog_name=os.path.basename(__file__),
logger=logging.getLogger())
8 changes: 6 additions & 2 deletions graphprot/prepare_graphprot_seqs.py
Expand Up @@ -12,6 +12,10 @@
# * check if seq length and core length arguments match or handle properly
# * handle input/output error gracefully
# * check if input bed coordinates are stranded
# * seq_len == core_len does not work
# * have to set default output prefix
# * negatives may be shuffled onto the wrong strand. investigate if this is a recent bedtools thing. otherwise shuffle separately for both strands
# * handle "crazy" bed formats (eg 7 fields from piranha)

from __future__ import print_function
import argparse
Expand Down Expand Up @@ -94,7 +98,7 @@
raise Exception("Error: bad length calculation.")


def dbg_head(sites, description="", n=npeek, run=args.debug):
def dbg_head(sites, description="", n=npeek, run=True):
"""Print the first few bed entries."""
if run:
logging.debug(description)
Expand Down Expand Up @@ -153,7 +157,7 @@ def get_flanks(cores,
if cores.count() == flanks_upstream.count() == flanks_downstream.count():
return flanks_upstream, flanks_downstream
else:
if args.debug:
if True:
cores.saveas("debug_cores.bed")
flanks_upstream.saveas("debug_upstream.bed")
flanks_downstream.saveas("debug_downstream.bed")
Expand Down
100 changes: 100 additions & 0 deletions test/simple_negatives.fa
@@ -0,0 +1,100 @@
>id_0
NEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_1
ANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_2
AANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_3
AAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_4
AAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_5
AAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_6
AAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_7
AAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_8
AAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_9
AAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_10
AAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_11
AAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_12
AAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_13
AAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_14
AAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_15
AAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_16
AAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_17
AAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_18
AAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_19
AAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_20
AAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_21
AAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_22
AAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_23
AAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_24
AAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAAA
>id_25
AAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAAA
>id_26
AAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAAA
>id_27
AAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAAA
>id_28
AAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAAA
>id_29
AAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAAA
>id_30
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAAA
>id_31
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAAA
>id_32
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAAA
>id_33
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAAA
>id_34
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAAA
>id_35
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAAA
>id_36
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAAA
>id_37
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAAA
>id_38
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAAA
>id_39
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAAA
>id_40
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAAA
>id_41
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAAA
>id_42
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAAA
>id_43
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAAA
>id_44
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAAA
>id_45
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAAA
>id_46
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAAA
>id_47
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAAA
>id_48
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEAA
>id_49
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEGATIVEA

0 comments on commit bf4b9b5

Please sign in to comment.