-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_genome_locations_sample.py
68 lines (64 loc) · 3.44 KB
/
create_genome_locations_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
import numpy as np
import cs273b
import entropy
np.random.seed(1)
data_dir = "/datadrive/project_data/"
# Read the reference chromosome
reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
del ambiguous_bases
# Read the gnomad indels data file
indels_file = pd.read_csv(data_dir + "gnomad_indels.tsv", delimiter = '\t')
# Convert true or false filter value to 1 or 0
indels_file[indels_file.columns[4]] = indels_file[indels_file.columns[4]].astype(int)
# Output file names
filename_base = data_dir + "indelLocationsFiltered"
filename_base_2 = data_dir + "nonindelLocationsSampled"
# Windows to compute entropy. The smaller window is for defining sequence complexity, larger for input to CNN
window1 = 50
window2 = 20
k_seq_complexity = window2
# Process per chromosome, from 1-22
for chromosome in range(1, 23):
##
print "Processing chromosome {}".format(chromosome)
# Reference chromosome
reference_chrom = reference[str(chromosome)]
# Read the indels in this chromosome
locations_indels = np.array(np.logical_and(indels_file[indels_file.columns[0]] == chromosome, indels_file[indels_file.columns[1]] + k_seq_complexity < len(reference_chrom)))
indel_indices = np.array(indels_file.iloc[locations_indels, 1], dtype = int)
##
# Store insertions or deletions information
ins_del = np.zeros(len(indel_indices), dtype = int)
ins_del_indices = np.arange(len(locations_indels))
ins_del_indices = ins_del_indices[locations_indels]
for i in range(len(indel_indices)):
loc = ins_del_indices[i]
if(len(list(indels_file.iloc[loc, 3])) > len(list(indels_file.iloc[loc, 2]))): # 'Alt' longer than 'ref': insertion
ins_del[i] = 1
##
# Add complexity information
indel_sequence_indices = np.arange(2*k_seq_complexity + 1) - k_seq_complexity
indel_sequence_indices = np.repeat(indel_sequence_indices, indel_indices.shape[0], axis = 0)
indel_sequence_indices = np.reshape(indel_sequence_indices, [-1, indel_indices.shape[0]])
indel_sequence_indices += np.transpose(indel_indices)
indel_sequence_complexity = entropy.entropySequence(reference_chrom[indel_sequence_indices.transpose(), :])
# Save the indels data
allele_count = np.array(indels_file.iloc[locations_indels, 7], dtype = int)
filter_value = np.array(indels_file.iloc[locations_indels, 4], dtype = int)
print np.sum(indel_sequence_complexity)
arr = np.concatenate((np.expand_dims(indel_indices, axis = 1), np.expand_dims(allele_count, axis = 1), np.expand_dims(filter_value, axis = 1), np.expand_dims(ins_del, axis = 1), np.expand_dims(indel_sequence_complexity, axis = 1)), axis = 1)
np.save(filename_base + str(chromosome) + '.npy', arr)
'''
##
# Sample and save a set of strict non-indels
nonzeroLocationsRef = np.where(np.any(reference_chrom != 0, axis = 1))[0]
rel_size_neg_large = 1.5
# Remove positions that have any indels around their window
window_containing_indel = np.arange(2*window1 + 1) - window1
window_containing_indel = np.repeat(window_containing_indel, len(indel_indices), axis = 0)
window_containing_indel = np.reshape(window_containing_indel, [-1, len(indel_indices)])
window_containing_indel += np.transpose(indel_indices)
neg_positions_large = np.random.choice(list(set(nonzeroLocationsRef) - set(np.reshape(window_containing_indel, -1))), size = int(rel_size_neg_large*len(indel_indices)), replace = False)
np.save(filename_base_2 + str(chromosome) + '.npy', neg_positions_large)
'''