/
sharing.py
234 lines (195 loc) · 7.92 KB
/
sharing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python
"""
Plot the pairwise locus sharing and pairwise missingness.
"""
import tempfile
import h5py
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import time
from ..assemble.utils import IPyradError
from .snps_extracter import SNPsExtracter
from .utils import progressbar
# import tested at init
try:
import seaborn as sns
except ImportError:
pass
_SEABORN_IMPORT = """
This ipyrad analysis tool requires the following software
that you can install with conda using this command:
conda install seaborn -c conda-forge
"""
_IMPORT_VCF_INFO = """
Converting vcf to HDF5 using default ld_block_size: {}
Typical RADSeq data generated by ipyrad/stacks will ignore this value.
You can use the ld_block_size parameter of the PCA() constructor to change
this value.
"""
class Sharing:
    """
    Compute and plot a pairwise matrix of locus sharing and locus
    missingness among samples.

    After ``run()`` the attribute ``sharing_matrix`` holds a DataFrame
    with counts of loci for which both samples have data in one
    triangle and counts of loci for which both samples are missing
    data in the other triangle. ``draw()`` renders it as a heatmap.

    Parameters
    ----------
    data: str
        Path to a .snps.hdf5 database file from ipyrad, or a
        .vcf/.vcf.gz file (which is first converted to HDF5).
    imap: dict, optional
        Dict mapping population names to lists of sample names,
        passed through to SNPsExtracter.
    minmap: dict, optional
        Dict mapping population names to minimum sample coverage,
        passed through to SNPsExtracter.
    mincov: int or float
        Minimum sample coverage to retain a SNP (SNPsExtracter).
    minsnps: int
        Stored but not currently used by this tool.  # NOTE(review)
    maxmissing: float
        Stored but not currently used by this tool.  # NOTE(review)
    quiet: bool
        Suppress progress/info messages.
    ld_block_size: int, optional
        Linkage-block size used only when converting VCF input to
        HDF5; defaults to 20000. Has no effect on RAD data or on
        HDF5 input. (New keyword; previously this name was read but
        never defined, raising NameError on VCF input.)
    """
    def __init__(
        self,
        data,
        imap=None,
        minmap=None,
        mincov=0,
        minsnps=0,
        maxmissing=1.0,
        quiet=True,
        ld_block_size=None,
        ):
        # seaborn is an optional dependency imported at module level;
        # fail early with install instructions if it is absent.
        if not sys.modules.get("seaborn"):
            raise ImportError(_SEABORN_IMPORT)

        # store args
        self.data = data
        self.imap = imap
        self.minmap = minmap
        self.mincov = mincov
        self.minsnps = minsnps
        self.maxmissing = maxmissing
        # BUG FIX: quiet was never stored but self.quiet was read below.
        self.quiet = quiet
        self.ld_block_size = ld_block_size

        # to be filled by run()/parsing
        self.snps = None
        self.names = None

        # VCF input is converted to HDF5 first; ld_block_size has no
        # effect on typical RAD data generated by ipyrad/stacks.
        if self.data.endswith((".vcf", ".vcf.gz")):
            # BUG FIX: vcf_to_hdf5 was referenced but never imported.
            # Import path mirrors the sibling pca tool — TODO confirm.
            from .vcf_to_hdf5 import VCFtoHDF5 as vcf_to_hdf5

            if not self.ld_block_size:
                self.ld_block_size = 20000
                if not self.quiet:
                    print(_IMPORT_VCF_INFO.format(self.ld_block_size))

            converter = vcf_to_hdf5(
                name=data.split("/")[-1].split(".vcf")[0],
                data=self.data,
                ld_block_size=self.ld_block_size,
                quiet=quiet,
            )
            # run the converter and point self.data at the new HDF5 file
            converter.run()
            self.data = converter.database

        # parse genotypes and sample names from the database file
        ext = SNPsExtracter(
            data=self.data,
            imap=self.imap,
            minmap=minmap,
            mincov=mincov,
            quiet=quiet,
        )
        ext.parse_genos_from_hdf5()
        self.snps = ext.snps
        self.names = ext.names

    def run(self):
        """
        Fill self.sharing_matrix with pairwise counts of loci shared
        (both samples have data) in one triangle and jointly missing
        (both samples missing) in the other, plus two symmetric helper
        matrices (_shared, _missed) used for reordering in draw().
        """
        nsamp = len(self.names)
        sidx = np.arange(nsamp)

        # combined matrix: shared data in one triangle, shared
        # missingness in the other.
        sharing_matrix = pd.DataFrame(np.zeros((nsamp, nsamp)))
        # Reordering the combined matrix is not straightforward, so
        # keep the two quantities in separate symmetric matrices too.
        _shared = pd.DataFrame(np.zeros((nsamp, nsamp)))
        _missed = pd.DataFrame(np.zeros((nsamp, nsamp)))

        pairs = list(itertools.combinations(sidx, 2))
        ntotal = len(pairs)
        nfinished = 0
        start = time.time()
        message = "Calculating shared loci/missingness"
        progressbar(nfinished, ntotal, start, message)

        for i, j in pairs:
            snps = self.snps[[i, j]]
            # 9 encodes a missing genotype in the extracted SNP matrix.
            n_missing = np.sum(np.all(snps == 9, axis=0))
            n_shared = np.sum(np.all(snps != 9, axis=0))

            # BUG FIX: use .loc instead of chained assignment
            # (df[col][row] = x), which is unreliable under pandas
            # copy-on-write. Original col-first order maps to
            # .loc[row, col].
            sharing_matrix.loc[j, i] = n_shared
            sharing_matrix.loc[i, j] = n_missing
            _shared.loc[j, i] = n_shared
            _shared.loc[i, j] = n_shared
            _missed.loc[j, i] = n_missing
            _missed.loc[i, j] = n_missing

            nfinished += 1
            progressbar(nfinished, ntotal, start, message)

        # label rows/columns with sample names
        for mat in (_shared, _missed, sharing_matrix):
            mat.columns = self.names
            mat.index = self.names

        self.sharing_matrix = sharing_matrix
        self._shared = _shared
        self._missed = _missed

    def draw(self,
        width=None,
        height=None,
        scaled=True,
        keep_order=None,
        sort=None,
        cmap="YlGnBu",
        **kwargs):
        """
        Draw the sharing/missingness matrix as a seaborn heatmap.

        Parameters
        ----------
        :param float width: Width of the figure to draw.
        :param float height: Height of the figure to draw.
        :param bool scaled: Whether to plot the heatmap with absolute
            values or rescale to show values proportional to the
            maximum sharing and missingness.
        :param list/dict keep_order: A list or dict of samples to
            retain and the order to plot them in. This can be a
            subset of the samples in the analysis.
        :param str sort: Sort samples in descending order of locus
            sharing. Possible values are "loci" or "missing".
        :param str cmap: A valid matplotlib cmap for the figure.

        Returns
        -------
        (fig, ax) matplotlib Figure and Axes objects.
        """
        # set up figure dimensions
        width = (width if width else 20)
        height = (height if height else 20)

        # reorder columns either by an explicit list or by sorting on
        # shared loci / shared missingness
        if keep_order or sort:
            if keep_order:
                # allow an imap-style dict; flatten to a list
                if isinstance(keep_order, dict):
                    keep_order = [
                        item for sublist in keep_order.values()
                        for item in sublist
                    ]
                # only retain names actually present in the data
                names = [x for x in keep_order if x in self.names]
            elif sort:
                if sort == "loci":
                    names = list(
                        self._shared.mean(axis=0)
                        .sort_values(ascending=False).index)
                elif sort == "missing":
                    names = list(
                        self._missed.mean(axis=0)
                        .sort_values(ascending=False).index)
                else:
                    raise IPyradError(
                        " `sort` parameter must be either \"loci\" or \"missing\".")

            # rebuild the combined matrix in the new sample order from
            # the symmetric helper matrices (BUG FIX: .loc instead of
            # chained assignment; original col-first order preserved).
            new_df = self.sharing_matrix.copy()[names].loc[names]
            for n1, n2 in itertools.combinations(names, 2):
                new_df.loc[n2, n1] = self._shared.loc[n2, n1]
                new_df.loc[n1, n2] = self._missed.loc[n1, n2]
            self.sharing_matrix = new_df

        # split the combined matrix into its two triangles
        mask = np.zeros_like(self.sharing_matrix)
        mask[np.triu_indices_from(mask)] = True
        shared = self.sharing_matrix.to_numpy() * mask
        missed = self.sharing_matrix.to_numpy() * np.logical_not(mask)

        cbar_label = "# shared snps/missingness"
        if scaled:
            # BUG FIX: guard against division by zero (all-zero
            # triangle would previously produce NaNs/warnings).
            max_shared = np.max(shared)
            max_missed = np.max(missed)
            if max_shared:
                shared = shared / max_shared
            if max_missed:
                missed = missed / max_missed
            # BUG FIX: "missigness" typo in the user-facing label.
            cbar_label = "proportional shared snps/missingness"

        dat = shared + missed
        self.dat = pd.DataFrame(
            dat,
            columns=self.sharing_matrix.columns,
            index=self.sharing_matrix.index,
        )

        fig, ax = plt.subplots(figsize=(width, height))
        with sns.axes_style("white"):
            ax = sns.heatmap(
                self.dat,
                square=True,
                cmap=cmap,
                cbar_kws={'label': cbar_label},
            )
        ax.figure.axes[-1].yaxis.label.set_size(height)
        return fig, ax