/
sharing.py
234 lines (195 loc) · 7.92 KB
/
sharing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python
"""
Plot the pairwise locus sharing and pairwise missingness.
"""
import tempfile
import h5py
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import time
from ..assemble.utils import IPyradError
from .snps_extracter import SNPsExtracter
from .utils import progressbar
# import tested at init
try:
import seaborn as sns
except ImportError:
pass
_SEABORN_IMPORT = """
This ipyrad analysis tool requires the following software
that you can install with conda using this command:
conda install seaborn -c conda-forge
"""
_IMPORT_VCF_INFO = """
Converting vcf to HDF5 using default ld_block_size: {}
Typical RADSeq data generated by ipyrad/stacks will ignore this value.
You can use the ld_block_size parameter of the PCA() constructor to change
this value.
"""
class Sharing:
    """
    Compute and plot a pairwise matrix of locus sharing and locus
    missingness among samples.

    After ``run()`` the attribute ``sharing_matrix`` holds a DataFrame
    with counts of loci for which both samples have data in one
    triangle and counts of loci for which both samples are missing
    data in the other triangle. ``draw()`` renders it as a heatmap.

    Parameters
    ----------
    data: str
        Path to a .snps.hdf5 database file from ipyrad, or a
        .vcf/.vcf.gz file (which is first converted to HDF5).
    imap: dict, optional
        Dict mapping population names to lists of sample names,
        passed through to SNPsExtracter.
    minmap: dict, optional
        Dict mapping population names to minimum sample coverage,
        passed through to SNPsExtracter.
    mincov: int or float
        Minimum sample coverage to retain a SNP (SNPsExtracter).
    minsnps: int
        Stored but not currently used by this tool.  # NOTE(review)
    maxmissing: float
        Stored but not currently used by this tool.  # NOTE(review)
    quiet: bool
        Suppress progress/info messages.
    ld_block_size: int, optional
        Linkage-block size used only when converting VCF input to
        HDF5; defaults to 20000. Has no effect on RAD data or on
        HDF5 input. (New keyword; previously this name was read but
        never defined, raising NameError on VCF input.)
    """
    def __init__(
        self,
        data,
        imap=None,
        minmap=None,
        mincov=0,
        minsnps=0,
        maxmissing=1.0,
        quiet=True,
        ld_block_size=None,
        ):
        # seaborn is an optional dependency imported at module level;
        # fail early with install instructions if it is absent.
        if not sys.modules.get("seaborn"):
            raise ImportError(_SEABORN_IMPORT)

        # store args
        self.data = data
        self.imap = imap
        self.minmap = minmap
        self.mincov = mincov
        self.minsnps = minsnps
        self.maxmissing = maxmissing
        # BUG FIX: quiet was never stored but self.quiet was read below.
        self.quiet = quiet
        self.ld_block_size = ld_block_size

        # to be filled by run()/parsing
        self.snps = None
        self.names = None

        # VCF input is converted to HDF5 first; ld_block_size has no
        # effect on typical RAD data generated by ipyrad/stacks.
        if self.data.endswith((".vcf", ".vcf.gz")):
            # BUG FIX: vcf_to_hdf5 was referenced but never imported.
            # Import path mirrors the sibling pca tool — TODO confirm.
            from .vcf_to_hdf5 import VCFtoHDF5 as vcf_to_hdf5

            if not self.ld_block_size:
                self.ld_block_size = 20000
                if not self.quiet:
                    print(_IMPORT_VCF_INFO.format(self.ld_block_size))

            converter = vcf_to_hdf5(
                name=data.split("/")[-1].split(".vcf")[0],
                data=self.data,
                ld_block_size=self.ld_block_size,
                quiet=quiet,
            )
            # run the converter and point self.data at the new HDF5 file
            converter.run()
            self.data = converter.database

        # parse genotypes and sample names from the database file
        ext = SNPsExtracter(
            data=self.data,
            imap=self.imap,
            minmap=minmap,
            mincov=mincov,
            quiet=quiet,
        )
        ext.parse_genos_from_hdf5()
        self.snps = ext.snps
        self.names = ext.names

    def run(self):
        """
        Fill self.sharing_matrix with pairwise counts of loci shared
        (both samples have data) in one triangle and jointly missing
        (both samples missing) in the other, plus two symmetric helper
        matrices (_shared, _missed) used for reordering in draw().
        """
        nsamp = len(self.names)
        sidx = np.arange(nsamp)

        # combined matrix: shared data in one triangle, shared
        # missingness in the other.
        sharing_matrix = pd.DataFrame(np.zeros((nsamp, nsamp)))
        # Reordering the combined matrix is not straightforward, so
        # keep the two quantities in separate symmetric matrices too.
        _shared = pd.DataFrame(np.zeros((nsamp, nsamp)))
        _missed = pd.DataFrame(np.zeros((nsamp, nsamp)))

        pairs = list(itertools.combinations(sidx, 2))
        ntotal = len(pairs)
        nfinished = 0
        start = time.time()
        message = "Calculating shared loci/missingness"
        progressbar(nfinished, ntotal, start, message)

        for i, j in pairs:
            snps = self.snps[[i, j]]
            # 9 encodes a missing genotype in the extracted SNP matrix.
            n_missing = np.sum(np.all(snps == 9, axis=0))
            n_shared = np.sum(np.all(snps != 9, axis=0))

            # BUG FIX: use .loc instead of chained assignment
            # (df[col][row] = x), which is unreliable under pandas
            # copy-on-write. Original col-first order maps to
            # .loc[row, col].
            sharing_matrix.loc[j, i] = n_shared
            sharing_matrix.loc[i, j] = n_missing
            _shared.loc[j, i] = n_shared
            _shared.loc[i, j] = n_shared
            _missed.loc[j, i] = n_missing
            _missed.loc[i, j] = n_missing

            nfinished += 1
            progressbar(nfinished, ntotal, start, message)

        # label rows/columns with sample names
        for mat in (_shared, _missed, sharing_matrix):
            mat.columns = self.names
            mat.index = self.names

        self.sharing_matrix = sharing_matrix
        self._shared = _shared
        self._missed = _missed

    def draw(self,
        width=None,
        height=None,
        scaled=True,
        keep_order=None,
        sort=None,
        cmap="YlGnBu",
        **kwargs):
        """
        Draw the sharing/missingness matrix as a seaborn heatmap.

        Parameters
        ----------
        :param float width: Width of the figure to draw.
        :param float height: Height of the figure to draw.
        :param bool scaled: Whether to plot the heatmap with absolute
            values or rescale to show values proportional to the
            maximum sharing and missingness.
        :param list/dict keep_order: A list or dict of samples to
            retain and the order to plot them in. This can be a
            subset of the samples in the analysis.
        :param str sort: Sort samples in descending order of locus
            sharing. Possible values are "loci" or "missing".
        :param str cmap: A valid matplotlib cmap for the figure.

        Returns
        -------
        (fig, ax) matplotlib Figure and Axes objects.
        """
        # set up figure dimensions
        width = (width if width else 20)
        height = (height if height else 20)

        # reorder columns either by an explicit list or by sorting on
        # shared loci / shared missingness
        if keep_order or sort:
            if keep_order:
                # allow an imap-style dict; flatten to a list
                if isinstance(keep_order, dict):
                    keep_order = [
                        item for sublist in keep_order.values()
                        for item in sublist
                    ]
                # only retain names actually present in the data
                names = [x for x in keep_order if x in self.names]
            elif sort:
                if sort == "loci":
                    names = list(
                        self._shared.mean(axis=0)
                        .sort_values(ascending=False).index)
                elif sort == "missing":
                    names = list(
                        self._missed.mean(axis=0)
                        .sort_values(ascending=False).index)
                else:
                    raise IPyradError(
                        " `sort` parameter must be either \"loci\" or \"missing\".")

            # rebuild the combined matrix in the new sample order from
            # the symmetric helper matrices (BUG FIX: .loc instead of
            # chained assignment; original col-first order preserved).
            new_df = self.sharing_matrix.copy()[names].loc[names]
            for n1, n2 in itertools.combinations(names, 2):
                new_df.loc[n2, n1] = self._shared.loc[n2, n1]
                new_df.loc[n1, n2] = self._missed.loc[n1, n2]
            self.sharing_matrix = new_df

        # split the combined matrix into its two triangles
        mask = np.zeros_like(self.sharing_matrix)
        mask[np.triu_indices_from(mask)] = True
        shared = self.sharing_matrix.to_numpy() * mask
        missed = self.sharing_matrix.to_numpy() * np.logical_not(mask)

        cbar_label = "# shared snps/missingness"
        if scaled:
            # BUG FIX: guard against division by zero (all-zero
            # triangle would previously produce NaNs/warnings).
            max_shared = np.max(shared)
            max_missed = np.max(missed)
            if max_shared:
                shared = shared / max_shared
            if max_missed:
                missed = missed / max_missed
            # BUG FIX: "missigness" typo in the user-facing label.
            cbar_label = "proportional shared snps/missingness"

        dat = shared + missed
        self.dat = pd.DataFrame(
            dat,
            columns=self.sharing_matrix.columns,
            index=self.sharing_matrix.index,
        )

        fig, ax = plt.subplots(figsize=(width, height))
        with sns.axes_style("white"):
            ax = sns.heatmap(
                self.dat,
                square=True,
                cmap=cmap,
                cbar_kws={'label': cbar_label},
            )
        ax.figure.axes[-1].yaxis.label.set_size(height)
        return fig, ax