-
Notifications
You must be signed in to change notification settings - Fork 0
/
regression_linear_keras.py
173 lines (141 loc) · 6.41 KB
/
regression_linear_keras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
'''Splits the selected chromosomes into contiguous "buckets" of a specified size and directly attempts to predict the number of
indel mutations within each bucket. Uses a very simple convolutional layer followed by two fully-connected layers'''
# TODO: Compare correlation with that of random selected examples
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
from sys import argv
import utils
import cs273b
from sequence_analysis import plot_seq_logo
np.random.seed(1)
data_dir = '/datadrive/project_data/'
reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
del ambiguous_bases
# TODO: Can augment training set with overlapping windows (i.e. starting at random positions)
forbidden_chroms = [1, 2]
validation_chrom = 8#np.random.choice(19) + 3
print "Validation chromosome is %d" % validation_chrom
k = 200
window_size = 2*k+1
windows_per_bin = 50
margin = 15
expanded_window_size = window_size + 2*margin
batch_size = 50
num_train_ex = 500000
epochs = 2
num_indels = []
seq = []
for i in range(1, 23):
if i in forbidden_chroms:
continue
if i == 23:
ch = 'X'
else:
ch = str(i)
print('Processing ' + ch)
referenceChr = reference[ch]
c_len = len(referenceChr) # Total chromosome length
num_windows = (c_len-2*margin)//window_size
num_indels_ch = [0]*num_windows # True number of indels in each window
#insertionLocations = np.loadtxt(data_dir + "indelLocations{}_ins.txt".format(ch)).astype(int)
#deletionLocations = np.loadtxt(data_dir + "indelLocations{}_del.txt".format(ch)).astype(int)
#indelLocations = np.concatenate((insertionLocations, deletionLocations)) - 1
indel_data_load = np.load(data_dir + "indelLocationsFiltered" + ch + ".npy")
indelLocations = indel_data_load[indel_data_load[:, 2] == 1, 0] - 1
indelLocations = np.array(indelLocations, dtype = int)
del indel_data_load
for il in indelLocations:
if il < margin: continue
if il >= num_windows*window_size: break
num_indels_ch[il // window_size] += 1
if i == validation_chrom:
num_indels_val = num_indels_ch
else:
num_indels.extend(num_indels_ch)
#del num_indels_ch, insertionLocations, deletionLocations, indelLocations # Preserve memory
del num_indels_ch, indelLocations # Preserve memory
seq_ch = []
for w in range(num_windows):
# First window predictions start at index margin, but we include sequence context of length 'margin' around it, so its input array starts at index 0
window_lb, window_ub = w*window_size, (w+1)*window_size + 2*margin # Include additional sequence context of length 'margin' around each window
seq_ch.append(utils.flatten(referenceChr[window_lb:window_ub]))
if i == validation_chrom:
seq_val = seq_ch
else:
seq.extend(seq_ch)
del seq_ch
del reference, referenceChr
order = [x for x in range(len(seq))]
random.shuffle(order)
seq = np.array([seq[i] for i in order]) # Shuffle the training data, so we can easily choose a random subset for testing
num_indels = np.array([num_indels[i] for i in order])
x_train = np.array(seq[:num_train_ex])
y_train = np.array(num_indels[:num_train_ex])
x_test = np.array(seq_val)
y_test = np.array(num_indels_val)
#np.save(data_dir + 'RegrKerasTestSeq' + str(validation_chrom) + str(complexity_thresh) + '.npy', x_test)
#np.save(data_dir + 'RegrKerasLinTestLab' + str(validation_chrom) + '.npy', y_test)
print('Mean # indels per window: {}'.format(float(sum(y_train))/len(y_train)))
import keras
from keras.regularizers import l2
from keras.layers import Conv1D, Dense, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
use_lrelu = False
activation = 'linear' if use_lrelu else 'relu' # make linear to enable advanced activations
model = Sequential()
model.add(Dense(1, activation='linear', input_shape=(expanded_window_size*4, )))
model.compile(loss=keras.losses.mean_squared_error,
optimizer=keras.optimizers.Adam(),
metrics=['mse', 'mae']) # Minimize the MSE. Also report mean absolute error
model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1)
# Predictions on the test set
y_pred = utils.flatten(model.predict(x_test, batch_size=batch_size, verbose=1))
#np.save(data_dir + 'RegrKerasLinTestLabPred' + str(validation_chrom) + '.npy', y_pred)
#model.save(data_dir + 'RegrKerasModel' + str(validation_chrom) + str(complexity_thresh) + '.h5')
for layer in model.layers:
weights = layer.get_weights()
weights = np.reshape(np.array(weights[0]), [-1, 4])
print np.mean(weights, axis = 0)
from scipy import stats
from sklearn import linear_model
# Compute the correlation between the test set predictions and the true values
r, p = stats.pearsonr(y_test, y_pred)
print('')
print('r value: {}'.format(r))
print('p value: {}'.format(p))
bin_preds, bin_trues = [], []
for i in range(len(y_test)//windows_per_bin):
pred_agg, true_agg = 0, 0
for j in range(i*windows_per_bin, i*windows_per_bin + windows_per_bin):
pred_agg += y_pred[j]
true_agg += y_test[j]
bin_preds.append(pred_agg)
bin_trues.append(true_agg)
bin_preds, bin_trues = np.array(bin_preds), np.array(bin_trues)
mae = np.mean(np.abs(bin_preds - bin_trues))
rms = np.sqrt(np.mean(np.square(bin_preds - bin_trues)))
r_bin, p_bin = stats.pearsonr(bin_trues, bin_preds)
avg_pred = np.mean(bin_preds)
avg_true = np.mean(bin_trues)
print('Bin size: {}, MAE: {}, RMS error: {}, r: {}, p-value: {}, average indels predicted: {}, average indels actual: {}'.format(windows_per_bin*window_size, mae, rms, r_bin, p_bin, avg_pred, avg_true))
regr = linear_model.LinearRegression()
regr.fit(np.expand_dims(bin_trues, axis=1), bin_preds)
reg_pred = regr.predict(np.expand_dims(bin_trues, axis=1))
plt.scatter(bin_trues, bin_preds)
plt.xlabel('True number of indels')
plt.ylabel('Predicted number of indels')
plt.title('Predicted vs. actual indel mutation counts ($r = {:.2f}'.format(r) + (', p =$ {:.2g})'.format(p) if p else ', p < 10^{-15})$'))
line_x = np.arange(min(np.amax(bin_preds), np.amax(bin_trues)))
plt.plot(line_x, line_x, color='m', linewidth=2.5)
plt.plot(bin_trues, reg_pred, color='g', linewidth=2.5)
#plt.savefig('indel_rate_pred_keras_lin' + str(validation_chrom) + '.png')
#np.save(data_dir + 'RegrKerasBinPred' + str(validation_chrom) + '.npy', bin_preds)
#np.save(data_dir + 'RegrKerasBinTrue' + str(validation_chrom) + '.npy', bin_trues)