/
SplitBamCellTypes.py
223 lines (181 loc) · 7.74 KB
/
SplitBamCellTypes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import pysam
import pandas as pd
import argparse
import timeit
import sys
import numpy as np
def meta_to_dict(txt,tissue):
metadata = pd.read_csv(txt, delimiter = "\t")
# Clean index column
metadata['Index_clean'] = metadata['Index'].str.replace('-.*$','',regex=True)
# If tissue provided, append tissue id to cell type to be recognised in downstream analysis
if tissue == None:
metadata['Cell_type_clean'] = metadata['Cell_type'].str.replace(' ','_',regex=True)
else:
tissue = tissue.replace(" ", "_")
metadata['Cell_type_clean'] = metadata['Cell_type'].str.replace(' ','_',regex=True)
metadata['Cell_type_clean'] = str(tissue) + '__' + metadata['Cell_type_clean'].astype(str)
# Create dicitionary with cell types and cell barcodes
DICT = metadata.set_index('Index_clean')['Cell_type_clean'].to_dict()
ALL_CELL_TYPES = metadata['Cell_type_clean'].unique()
del metadata
return(DICT, ALL_CELL_TYPES)
def split_bam(bam, txt, outdir,donor,tissue,max_NM,max_NH,min_MAPQ,n_trim):
start = timeit.default_timer()
# 1. Transform table to dictionary
DICT, ALL_CELL_TYPES = meta_to_dict(txt,tissue)
if len(DICT) < 1:
print ('Warning: No cell barcodes found in the --meta file')
sys.exit()
# 2. Open infile
infile=pysam.Samfile(bam, "rb")
# 3. Create and open out bam files
DICT_files = {}
for cell_type in ALL_CELL_TYPES:
outfile="{}/{}.{}.bam".format(outdir, donor, cell_type)
outfile_wb = pysam.AlignmentFile(outfile, "wb",template=infile)
DICT_files[cell_type] = outfile_wb
# 4. Start read counts
total_reads = 0
FILTER_dict = {'Total_reads' : 0,'Pass_reads' : 0, 'CB_not_found' : 0, 'CB_not_matched' : 0} # To store filter reasons
# 5. Check reads and split them in bam files
for read in infile.fetch():
FILTER_dict['Total_reads'] += 1 # To count the total number of reads analysed
total_reads += 1
if (total_reads % 5000000 == 0):
print ('Number of reads already processed: ' + str(total_reads))
# Check if CB tag is present
try:
barcode = read.opt("CB")
except:
FILTER2 = 'CB_not_found'
FILTER_dict[FILTER2] += 1
continue
# Check if CB code matches with an annotated cell type
barcode = barcode.split("-")[0]
try:
CELL_TYPE = DICT[barcode]
except:
FILTER2 = 'CB_not_matched'
FILTER_dict[FILTER2] += 1
continue
# Final filters
FILTER = []
# Check number of mismatches in the read
if (max_NM != None):
try:
if (read.opt("nM") > max_NM):
FILTER.append('nM')
except KeyError as e:
FILTER.append('nM_not_found')
# Check number of hits
if (max_NH != None):
try:
if (read.opt("NH") > max_NH):
FILTER.append('NH')
except KeyError as e:
FILTER.append('NH_not_found')
# Check mapping quality
if (min_MAPQ > 0):
try:
if (read.mapq < min_MAPQ):
FILTER.append('MAPQ')
except:
FILTER.append('MAPQ_not_found')
# Making a decision about the read (filtered or not)
if (len(FILTER) > 0): # If there are reasons to filter read
FILTER2 = ';'.join(FILTER)
FILTER_dict[FILTER2] = FILTER_dict.get(FILTER2, 0) + 1
continue
else:
FILTER_dict['Pass_reads'] += 1
# Only for PASS reads
# Trim last and first bases of the read (reduce quality) if specified
# It does not consider the soft-clip bases at the beginning/end of the read for the counting
# It also considers the expected adapter lengths (up to 30) of 10x library prep to remove bases in long soft-clip sequences
if (n_trim > 0):
CIGAR_tuple = read.cigartuples
if (CIGAR_tuple != None and len(CIGAR_tuple) > 1):
# Check the number of bases to trim at the beginning of the read
# Extension of the soft-clipped bases
if (CIGAR_tuple[0][0] == 4):
# As there are some library preparation adapters inside the 10x prep protocol that
# are not properly removed (up to 30 bps), if we observed that the softclip bases are around more than 20, we
# prefer to be conservative and remove the expected 30 pbs + the n_trim parameter
if (CIGAR_tuple[0][1] >= 20 and CIGAR_tuple[0][1] < 30):
trim_start = 30 + n_trim
else:
trim_start = CIGAR_tuple[0][1] + n_trim
else:
trim_start = n_trim
# Check the number of bases to trim at the end of the read
# Extension of the soft-clipped bases
if (CIGAR_tuple[-1][0] == 4):
# As there are some library preparation adapters inside the 10x prep protocol that
# are not properly removed (up to 30 bps), if we observed that the softclip bases are around more than 20, we
# prefer to be conservative and remove the expected 30 pbs + the n_trim parameter
if (CIGAR_tuple[-1][1] >= 20 and CIGAR_tuple[-1][1] < 30):
trim_end = 30 + n_trim
else:
trim_end = CIGAR_tuple[-1][1] + n_trim
else:
trim_end = n_trim
else:
trim_start = n_trim
trim_end = n_trim
# Set base qualities to 0 at the beginning and the end of the read if specified
# Get first and last qualities
Q_indexes = list(np.arange(trim_start)) + list((np.arange(trim_end) + 1)*-1)
Q = read.query_qualities
for Qi in Q_indexes:
Q[Qi] = 0
# Substitute qualities
read.query_qualities = Q
# Print passed read
DICT_files[CELL_TYPE].write(read)
# 6. Close opened files
infile.close()
for key in DICT_files.keys():
DICT_files[key].close()
# 7. Get and create report
outfile_report = outfile="{}/{}.report.txt".format(outdir, donor, cell_type)
stop = timeit.default_timer()
endtime = round((stop - start),2)
FILTER_dict['Total_time'] = endtime
data_df = pd.DataFrame([FILTER_dict])
data_df.to_csv(outfile_report, index = False, sep = '\t')
# 8. Index bam files
for cell_type in ALL_CELL_TYPES:
bam="{}/{}.{}.bam".format(outdir, donor, cell_type)
pysam.index(bam)
def initialize_parser():
parser = argparse.ArgumentParser(description='Split alignment file into cell type specific BAMs')
parser.add_argument('--bam', type=str, default=1, help='BAM file to be analysed (Sorted by coordinate)', required = True)
parser.add_argument('--meta', type=str, default=1, help='Metadata file mapping cell barcodes to cell type information', required = True)
parser.add_argument('--id', type=str, default = 'Sample', help='Sample ID', required = False)
parser.add_argument('--max_nM', type=int, default = None, help='Maximum number of mismatches permitted to consider reads for analysis. By default, this filter is switched off, although we recommed using --max_nM 5. If applied, this filter requires having the nM tag in the bam file. [Default: Switched off]', required = False)
parser.add_argument('--max_NH', type=int, default = None, help='Maximum number of alignment hits permitted to consider reads for analysis. By default, this filter is switched off, although we recommend using --max_NH 1. This filter requires having the NH tag in the bam file. [Default: Switched off]', required = False)
parser.add_argument('--min_MQ', type=int, default = 255, help='Minimum mapping quality required to consider reads for analysis. Set this value to 0 to switch this filter off. --min_MQ 255 is recommended for RNA data, and --min_MQ 30 for DNA data. [Default: 255]', required = False)
parser.add_argument('--n_trim', type=int, default = 0, help='Number of bases trimmed by setting the base quality to 0 at the beginning and end of each read [Default: 0]', required = False)
parser.add_argument('--outdir', default = '.', help='Out directory', required = False)
return (parser)
def main():
# 1. Arguments
parser = initialize_parser()
args = parser.parse_args()
bam = args.bam
outdir = args.outdir
txt = args.meta
donor = args.id
tissue = None
max_NM = args.max_nM
min_MAPQ = args.min_MQ
n_trim = args.n_trim
max_NH = args.max_NH
# 2. Split bam file
split_bam(bam, txt, outdir,donor,tissue,max_NM,max_NH,min_MAPQ,n_trim)
#-------------
# Execute code
#-------------
if __name__ == '__main__':
main()