-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
97 lines (81 loc) · 3.77 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import tensorflow as tf
import numpy as np
## DNA Processing
# DNA bases in the order used by the one-hot encoding (index 0 -> 'A', ..., 3 -> 'T').
letters = ['A', 'C', 'G', 'T']


def to_letter(lst):
    """Map a one-hot base vector to its DNA letter.

    Returns the entry of `letters` at the position of the first truthy
    element of `lst`, e.g. [0, 1, 0, 0] becomes 'C'. Returns '?' when no
    element is truthy (all zeros, or an empty sequence).
    """
    # First-nonzero search; see
    # https://stackoverflow.com/questions/19502378/python-find-first-instance-of-non-zero-number-in-list
    for position, flag in enumerate(lst):
        if flag:
            return letters[position]
    return '?'
# Convert a one-hot encoded numpy array representing a chunk of DNA sequence to a DNA string
def onehot_to_str(elem):
    """Decode one one-hot encoded DNA sequence into a string.

    `elem` is iterated row by row; each row is converted to a plain list
    and translated by `to_letter`, and the resulting letters are
    concatenated in order.
    """
    return ''.join(to_letter(list(row)) for row in elem)
# Convert a batch of one-hot encoded numpy arrays representing DNA to a list of DNA strings
def batch_to_strs(batch_x):
    """Decode a batch of one-hot encoded DNA sequences.

    Returns a list with one DNA string per element of `batch_x`, in the
    same order, each produced by `onehot_to_str`.
    """
    return [onehot_to_str(sample) for sample in batch_x]
def flatten(arr):
    """Return `arr` reshaped to one dimension, keeping every element."""
    # A single -1 dimension lets NumPy infer the length from the element count.
    one_dimensional = (-1,)
    return np.reshape(arr, one_dimensional)
# Convert categorical labels to one-hot labels
def to_onehot(labels, num_categories):
    """Convert categorical labels to a one-hot matrix.

    Returns a uint8 array of shape (len(labels), num_categories) in which
    row i has a 1 at index labels[i] and 0 everywhere else.
    """
    onehot = np.zeros((len(labels), num_categories), dtype=np.uint8)
    # Each `row` is a view into `onehot`, so writing to it fills the matrix.
    for row, label in zip(onehot, labels):
        row[label] = 1
    return onehot
## TensorFlow helper methods
def weight_variable(shape):
    """Create a new weight variable of the given shape.

    The initial value is drawn from a Xavier initializer
    (tf.contrib.layers.xavier_initializer with its default settings).
    """
    return tf.Variable(tf.contrib.layers.xavier_initializer()(shape))
# Weight variable with L2 regularization with coefficient beta
def weight_variable_reg(name, shape, beta):
    """Get or create a Xavier-initialized weight variable via tf.get_variable.

    name:  variable name passed to tf.get_variable.
    shape: shape of the variable.
    beta:  L2 regularization coefficient; any falsy value (0, None) means
           that no regularizer is attached.
    """
    xavier = tf.contrib.layers.xavier_initializer()
    # regularizer=None is tf.get_variable's default, i.e. the same as omitting it.
    l2_penalty = tf.contrib.layers.l2_regularizer(beta) if beta else None
    return tf.get_variable(
        name=name,
        shape=shape,
        initializer=xavier,
        regularizer=l2_penalty,
    )
# Bias variable of the given shape, initialized to all 0.1 [TODO: Uniform small random initialization instead?]
def bias_variable(shape):
    """Create a new bias variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
# 1D convolution with stride 1 and zero padding
def conv1d(x, W):
    """Convolve `x` with the filter `W` in one dimension.

    Uses stride 1 and 'SAME' padding, both fixed here.
    """
    convolved = tf.nn.conv1d(x, W, stride=1, padding='SAME')
    return convolved
# Cross entropy loss. Higher weight_falsepos will penalize false positives more relative to false negatives
def cross_entropy(y_pred, y_true, weight_falsepos=1):
    """Weighted binary cross-entropy, averaged over all elements.

    y_pred:          predictions; assumed to be probabilities in [0, 1]
                     (TODO confirm against callers).
    y_true:          targets, same shape as y_pred.
    weight_falsepos: multiplier on the (1 - y_true) term; values above 1
                     penalize false positives more than false negatives.
    """
    # Small offset so that log() never receives exactly 0.
    eps = 1e-12
    positive_term = -y_true * tf.log(y_pred + eps)
    negative_term = weight_falsepos * (1 - y_true) * tf.log(1 - y_pred + eps)
    return tf.reduce_mean(positive_term - negative_term)
# Leaky ReLU
def lrelu(x, alpha=0.01):
    """Leaky ReLU: the elementwise maximum of `x` and `alpha * x`.

    With the default alpha of 0.01, negative inputs are scaled by 0.01
    and non-negative inputs pass through unchanged.
    """
    leaked = alpha * x
    return tf.maximum(x, leaked)
# Adam optimizer for given loss, optionally with decayed learning rate (although decay is basically irrelevant for Adam...)
def adam_opt(loss, start_lr, decay_every_num_batches=0, decay_base=0.98):
    """Build an Adam training op that minimizes `loss`.

    loss:                    tensor to minimize.
    start_lr:                initial (or, without decay, constant) learning rate.
    decay_every_num_batches: if truthy, the learning rate follows a staircase
                             exponential decay with this many steps per stage;
                             if falsy (the default), no decay is applied.
    decay_base:              decay rate handed to tf.train.exponential_decay.

    Returns the op produced by AdamOptimizer.minimize.
    """
    if decay_every_num_batches:
        # A non-trainable counter is passed to minimize() as global_step;
        # the decay schedule reads it to pick the current learning rate.
        step_counter = tf.Variable(0, trainable=False)
        decayed_lr = tf.train.exponential_decay(
            start_lr, step_counter, decay_every_num_batches,
            decay_base, staircase=True)
        return tf.train.AdamOptimizer(decayed_lr).minimize(
            loss, global_step=step_counter)
    return tf.train.AdamOptimizer(start_lr).minimize(loss)
# Compute the fraction (between 0 and 1) of predictions that are correct
def compute_accuracy(y_pred, y_true):
    """Return the mean of correct predictions as a float32 tensor.

    When the last dimension of `y_pred` is 1, predictions are rounded and
    compared to `y_true` elementwise. Otherwise the argmax over the last
    axis of `y_pred` is compared with the argmax over the last axis of
    `y_true`.
    """
    last_dim = y_pred.get_shape().as_list()[-1]
    if last_dim == 1:
        hits = tf.equal(tf.round(y_pred), y_true)
    else:
        # Multiclass: y_pred holds logits, not probabilities, so compare argmaxes.
        hits = tf.equal(tf.argmax(y_pred, axis=-1), tf.argmax(y_true, axis=-1))
    return tf.reduce_mean(tf.cast(hits, tf.float32))
def coverage_placeholder(length):
    """Create a float32 placeholder with no static shape.

    `length` is accepted but currently unused: the placeholder is created
    with shape=None.
    """
    # TODO: understand why shape=None is used rather than the constrained
    # variant below. NOTE(review): presumably it still works because a
    # placeholder without a shape accepts a fed tensor of any shape - confirm.
    # return tf.placeholder(tf.float32, shape=[None, length])
    return tf.placeholder(dtype=tf.float32, shape=None)
def dna_placeholder(length):
    """Create a float32 placeholder of shape [None, length, 4] for DNA input."""
    # Last dimension is 4 because the data is one-hot encoded.
    one_hot_depth = 4
    return tf.placeholder(dtype=tf.float32, shape=[None, length, one_hot_depth])