-
Notifications
You must be signed in to change notification settings - Fork 0
/
rate_conv_new.py
115 lines (93 loc) · 3.72 KB
/
rate_conv_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import utils
import cs273b
from random import shuffle
from scipy import stats
from sklearn import metrics
import keras
from keras.regularizers import l2
from keras.layers import Conv1D, Dense, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
# Loading reference chromosome and setting random seed
np.random.seed(1)
data_dir = '/datadrive/project_data/'
reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
del ambiguous_bases
# Size of each bucket over which we predict the total number of indels
bsize = 10000
# Test on a separate chromosome, hold out chromosome 1, train on the rest
test_chrom = np.random.choice(range(2, 23), 1, replace=False)
print "Test chromosome is %d" % test_chrom
# Populate train and test data
train_num_indels = []
train_seq = []
test_num_indels = []
test_seq = []
for i in range(2,23):
ch = str(i)
print('Processing ' + ch)
referenceChr = reference[ch]
c_len = len(referenceChr)
# Load indel data
indel_data_load = np.load(data_dir + "indelLocationsFiltered" + str(i) + ".npy")
# Filter by sequence complexity and filter value around 20 sized window and complexity threshold
indelLocations = indel_data_load[indel_data_load[:, 2] == 1, 0] - 1
num_buckets = c_len//bsize
num_indels_ch = [0]*num_buckets
for il in indelLocations:
if il//bsize >= num_buckets: break
num_indels_ch[il// bsize] += 1
del indelLocations, indel_data_load
seq_ch = np.array_split(referenceChr[:num_buckets*bsize], num_buckets)
if i == test_chrom:
test_seq.extend(seq_ch)
test_num_indels.extend(num_indels_ch)
else:
train_seq.extend(seq_ch)
train_num_indels.extend(num_indels_ch)
del num_indels_ch, seq_ch
del reference, referenceChr
# Shuffle the data
order = [x for x in range(0, len(train_seq))]
shuffle(order)
x_train = np.array([train_seq[i] for i in order])
y_train = np.array([train_num_indels[i] for i in order])
x_test = np.array(test_seq)
y_test = np.array(test_num_indels)
del train_num_indels, test_num_indels, train_seq, test_seq
# Neural network model
model = Sequential()
# Convolutional layer
model.add(Conv1D(10, kernel_size=5, activation='relu', input_shape=(bsize, 4)))#, kernel_regularizer=l2(0.0001)))
model.add(Flatten())
# FC hidden layer
model.add(Dense(500, activation='relu'))#, kernel_regularizer=l2(0.01)))
# Output layer. ReLU activation because we are trying to predict a nonnegative value!
model.add(Dense(1, activation = 'relu'))
# Minimize the MSE. Also report mean absolute error
model.compile(loss = keras.losses.mean_squared_error, optimizer = keras.optimizers.Adam(), metrics = ['mae'])
# Need to use small batch size when buckets are very large, due to memory limitations
batch_size = 10
model.fit(x_train, y_train, batch_size = batch_size, epochs = 6, verbose = 1)#validation_data=(x_test, y_test))
#score = model.evaluate(x_test, y_test, verbose=0)
#print('Test loss:', score[0])
#print('Test accuracy:', score[1])
# Predictions on the test set
ytestout = utils.flatten(model.predict(x_test, batch_size = 10, verbose = 1))
# Compute the correlation between the test set predictions and the true values
r, p = stats.pearsonr(y_test, ytestout)
print "\nMetrics of the model"
print(r)
print(p)
print metrics.mean_squared_error(y_test, ytestout)
# Scatterplot of predicted vs. true values
plt.scatter(y_test, ytestout)
plt.xlabel('True number of indels')
plt.ylabel('Predicted number of indels')
plt.title('Predicted vs. actual indel mutation rates ($r = {:.2f}'.format(r) + ', p < 10^{-10}$)')
plt.plot(y_test, y_test, color='m', linewidth=2.5)
plt.savefig('indel_rate_pred_full_chrom.png')