-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_genome_preds_spaced.py
106 lines (82 loc) · 4.07 KB
/
process_genome_preds_spaced.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Process the mutation rate / mutation number predictions made by some model (e.g. simple_conv_predict)
# and compare the predicted values to the true values.
import numpy as np
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
#from sklearn import metrics
from scipy import stats
from sklearn import linear_model
def process_predictions(arr, small_window, small_window_size, bsize):
arr2 = np.array_split(arr[:(len(arr)//bsize)*bsize], len(arr)//bsize) # split the first part, with a size divisible by bsize
arr2.append(arr[(len(arr)//bsize)*bsize:]) # tack on the last piece
thresh = 0.8 # Threshold above which an example is declared to be an indel
results = [[], []]
for window in arr2:
num_indels_true = np.sum(window[:, 1]) # Actual number of indels in the bucket
num_indels_pred = np.sum(window[window[:, 0]%small_window == small_window_size, 2] > thresh) # Predicted number of indels in the bucket, using thresholding
results[0].append(num_indels_true)
results[1].append(num_indels_pred)
return results
small_window = 2*20 + 1
small_window_size = 20
arr1 = np.load("/datadrive/project_data/genomeIndelPredictionsValChrom.npy")
arr2 = np.load("/datadrive/project_data/genomeIndelPredictionsTestChrom.npy")
#arr1 = np.load("/datadrive/project_data/genomeIndelPredictionsValChromEntrpy.npy")
#arr2 = np.load("/datadrive/project_data/genomeIndelPredictionsTestChromEntrpy.npy")
bsize_val = [10000, 20000, 50000]
for bsize in bsize_val:
print "Bsize = {}".format(bsize)
print "Adjacent non-overlapping window"
results = process_predictions(arr1, small_window, small_window_size, bsize)
r, p = stats.pearsonr(results[0], results[1]) # Compute correlation between the model predictions from the above process, and the true values
print "On the validation set"
print(r)
print(p)
# Linearly rescale predicted values to match true test labels, since we didn't sum over all locations
regr = linear_model.LinearRegression()
regr.fit(np.expand_dims(results[0], axis=1), results[1])
results = process_predictions(arr2, small_window, small_window_size, bsize)
reg_pred = regr.predict(np.expand_dims(results[0], axis=1))
r, p = stats.pearsonr(results[1], reg_pred) # Compute correlation between the model predictions from the above process, and the true values
print "On the test set"
print(r)
print(p)
print "All locations"
results = process_predictions(arr1, 1, 0, bsize)
r, p = stats.pearsonr(results[0], results[1]) # Compute correlation between the model predictions from the above process, and the true values
print "On the validation set"
print(r)
print(p)
# Linearly rescale predicted values to match true test labels, since we didn't sum over all locations
regr = linear_model.LinearRegression()
regr.fit(np.expand_dims(results[0], axis=1), results[1])
results = process_predictions(arr2, 1, 0, bsize)
reg_pred = regr.predict(np.expand_dims(results[0], axis=1))
r, p = stats.pearsonr(results[1], reg_pred) # Compute correlation between the model predictions from the above process, and the true values
print "On the test set"
print(r)
print(p)
plt.plot(range(2000), arr1[2000:4000, 2])
plt.plot(range(2000), arr1[2000:4000, 1], color='r')
plt.title('Variation of predicted probabilities and true labels on validation chromosome')
plt.savefig('var_pred_prob.png')
#plt.savefig('var_pred_prob_entrpy.png')
plt.clf()
bsize = 10000
results = process_predictions(arr1, small_window, small_window_size, bsize)
regr = linear_model.LinearRegression()
regr.fit(np.expand_dims(results[0], axis=1), results[1])
results = process_predictions(arr2, small_window, small_window_size, bsize)
reg_pred = regr.predict(np.expand_dims(results[0], axis=1))
plt.scatter(results[0], results[1])
plt.xlabel('True number of indels')
plt.ylabel('Predicted number of indels without correction')
plt.savefig('true_vs_pred.png')
#plt.savefig('true_vs_pred_entrpy.png')
plt.clf()
#plt.scatter(results[1], reg_pred)
#plt.xlabel('True number of indels')
#plt.ylabel('Predicted number of indels with correction')
#plt.savefig('true_vs_pred_corr.png')
#plt.clf()