/
coverage.py
204 lines (162 loc) · 5.7 KB
/
coverage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
"""
Plot the distribution of ref-aligned RAD tags on chromosomes
of the reference genome and calculate stats such as distance
between markers after applying filtering for missing data.
"""
import tempfile
import h5py
import numpy as np
import pandas as pd
from .locus_extracter import LocusExtracter
# import tested at init
try:
import toyplot
except ImportError:
pass
_TOYPLOT_IMPORT = """
This ipyrad analysis tool requires the following software
that you can install with conda using this command:
conda install toytree -c eaton-lab
"""
class Coverage:
def __init__(
self,
data,
# scaffold_idxs=None,
imap=None,
minmap=None,
mincov=4,
minsnps=0,
maxmissing=1.0,
consensus_reduce=False,
exclude=None,
):
"""
...
Parameters
----------
seqs_database: The .seqs.hdf5 database file from ipyrad.
"""
# check imports
if not sys.modules.get("toyplot"):
raise ImportError(_TOYPLOT_IMPORT)
# load args
self.data = data
self.imap = imap
self.minmap = minmap
self.minsnps = minsnps
self.maxmissing = maxmissing
self.consensus_reduce = consensus_reduce
self.mincov = mincov
self.exclude = exclude
# self.scaffold_idxs = scaffold_idxs
# if self.scaffold_idxs is None:
# self.scaffold_idxs = range(12)
# to be filled
self.phymap = None
self.slens = None
self.snames = None
self.faidict = None
# run functions
self._load_phymap()
self._apply_filters()
def _load_phymap(self):
"load the phymap to get information for selecting scaffolds by idx"
with h5py.File(self.data, 'r') as io5:
# load the phymap with spans of each RAD tag pre-filtering
self.phymap = io5["phymap"][:]
# parse names and lengths from db
scafnames = [i.decode() for i in io5["scaffold_names"][:]]
scaflens = io5["scaffold_lengths"][:]
# organize as a DF
self.scaffold_table = pd.DataFrame(
data={
"scaffold_name": scafnames,
"scaffold_length": scaflens,
},
columns=["scaffold_name", "scaffold_length"],
)
# mask for min scafflen
# if self.scaffold_minlen:
# self.scaffold_table = (
# self.scaffold_table[self.scaffold_table.scaffold_length > self.scaffold_minlen]
# )
# self.scaffold_table.reset_index(inplace=True)
#self.scaffold_table.reset_index(inplace=True, drop=True)
# if self.scaffold_minlen:
# self.mask_minlen = np.array(scaflens) > self.scaffold_minlen
# scafnames = np.array(scafnames)[self.mask_minlen]
# scaflens = np.array(scaflens)[self.mask_minlen]
def _apply_filters(self):
"""
...
"""
# filter all loci
ext = LocusExtracter(
name="coverage-plot",
workdir=tempfile.gettempdir(),
data=self.data,
imap=self.imap,
minmap=self.minmap,
minsnps=self.minsnps,
maxmissing=self.maxmissing,
consensus_reduce=self.consensus_reduce,
mincov=self.mincov,
exclude=self.exclude,
quiet=True,
)
ext.ipcluster['cores'] = 4
ext.run(auto=True)
# subset phymap to the scaffolds in scaffold_idx
self._phymap = ext.phymap.copy()
self.phymap = ext.phymap[~ext.phymap.filtered].reset_index()
def draw(self, scaffold_idxs=None, width=None, height=None, **kwargs):
# check for .apply_filters()
# get phymap for scaffs
scaffilter = np.sum(
[self.phymap.chroms == i + 1 for i in scaffold_idxs],
axis=0).astype(bool)
phymap = self.phymap[scaffilter]
# report number of loci passing filter on scaff idxs
print(
"{} loci passed filtering on selected scaffolds."
.format(phymap.shape[0]))
# subset scaff table
scaffold_table = self.scaffold_table.iloc[scaffold_idxs, :]
# set up
width = (width if width else 800)
height = (height if height else (100 + scaffold_table.shape[0] * 20))
canvas = toyplot.Canvas(width, height)
axes = canvas.cartesian(**kwargs)
# adjusters
nudge = scaffold_table.scaffold_length.max() * 0.01
baseline = 0
# iterate over scaffolds
for scaff in scaffold_table.index:
# draw the length of the scaffold
axes.rectangle(
0, scaffold_table.scaffold_length[scaff],
baseline, baseline - 1,
color=toyplot.color.Palette()[0],
opacity=0.7,
)
# add markers on the scaffolds
scafmarkers = phymap[phymap.chroms == scaff + 1]
axes.scatterplot(
scafmarkers.pos0,
np.repeat(baseline - 0.5, scafmarkers.pos0.size),
color='black',
marker="|",
)
axes.text(
scaffold_table.scaffold_length[scaff] + nudge,
baseline - 0.5,
scaffold_table.scaffold_name[scaff],
style={"text-anchor": "start"},
color="black",
)
baseline -= 1.5
axes.y.show = False
axes.x.label.text = "genomic position (bp)"
return canvas, axes