-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_genome_preds.py
61 lines (49 loc) · 2.93 KB
/
process_genome_preds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Process the mutation rate / mutation number predictions made by some model (e.g. simple_conv_predict)
# and compare the predicted values to the true values.
import numpy as np
chrom = 21
arr = np.load("/datadrive/project_data/genomeIndelPredictionsUnSpaced{}.npy".format(chrom))
sumprobs = True # Whether to make predictions based on summing the probabilites of each position being an indel over each bucket, or counting the number of positions whose predicted probability of being an indel is above some amount.
print('Loaded true and predicted indel locations') # arr[:,1] and arr[:,2], respectively (arr[:,0] contains the indices)
bsize = 10000 # Size of the buckets that we aggregate predictions over
arr2 = np.array_split(arr[:(len(arr)//bsize)*bsize], len(arr)//bsize) # split the first part, with a size divisible by bsize
thresh = 0.8 # Threshold above which an example is declared to be an indel (if we are thresholding rather than summing probabilities). Set higher since the classes are heavily imbalanced.
avg_sum = 0
avg_pred = 0
sumcount = 0
results = [[], []]
probs = []
for window in arr2:
num_indels_true = np.sum(window[:, 1]) # Actual number of indels in the bucket
avg_sum += num_indels_true
sumcount += 1
if sumprobs:
num_indels_pred = np.sum(window[:, 2]) # Predicted number of indels in the bucket, using summing of probabilities
else: # Threshold and count instead
num_indels_pred = np.sum(window[:, 2] > thresh) # Predicted number of indels in the bucket, using thresholding
avg_pred += num_indels_pred
results[0].append(num_indels_true)
results[1].append(num_indels_pred)
probs.append(sumprobs)
results[1] = np.array(results[1]) * avg_sum / avg_pred # Rescale to bring predictions around the same order of magnitude as true number of indels
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
#r2 = metrics.r2_score(results[0], results[1]) # This gives bizarre values?? TODO: investigate
pred = results[1]
r, p = stats.pearsonr(results[0], pred) # Compute correlation between the model predictions from the above process, and the true values
print('r value: {}'.format(r))
print('p value: {}'.format(p))
# Linearly rescale predicted values to match true test labels (otherwise they will be way too high as the model was calibrated assuming a 50-50 indel vs nonindel split)
regr = linear_model.LinearRegression()
regr.fit(np.expand_dims(results[0], axis=1), pred)
reg_pred = regr.predict(np.expand_dims(results[0], axis=1))
plt.scatter(results[0], pred)
plt.xlabel('True number of indels')
plt.ylabel('Predicted number of indels')
plt.title('Predicted vs. actual indel mutation counts ($r = {:.2f}'.format(r) + (', p =$ {:.2g})'.format(p) if p else ', p < 10^{-15})$'))
plt.plot(results[0], reg_pred, color='m', linewidth=2.5)
descr = 'summed' if sumprobs else 'thresh_{}'.format(thresh)
plt.savefig('indel_rate_pred_{}_unspaced_{}.png'.format(chrom, descr))