# Compare k-mer abundance and presence/absence vs "traditional" single-cell RNA-seq processing

- 500 k-mers, with abundance, were hashed from ~50k cells in Tabula Muris using `sourmash`


## Load Tabula Muris Senis data with fixed annotations

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

import khtools

%load_ext autoreload
%autoreload 2

In [2]:
import scanpy.api as sc
sc. logging.print_versions()

scanpy==1.3.2 anndata==0.6.11 numpy==1.14.6 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [4]:
# %%time

# h5ad = '/home/olga/ibm_lg/tabula-muris-senis/maca-facs-re-processed-annotated-fixed-tissues.h5ad'

# tms = sc.read_h5ad(h5ad)
# tms

## Read 50k x 50k similarity matrix (~50GB csv file)

In [5]:
csv = '/home/olga/pureScratch/olgabot-maca/facs/sourmash_compare_k31_ignore_abundance.csv'
! ls -lha $csv

-rw-rw-r-- 1 olga olga 14G Aug  7 11:51 /home/olga/pureScratch/olgabot-maca/facs/sourmash_compare_k31_ignore_abundance.csv


Had to increase `sample` to be the number of bytes(?) per row for `dd.read_csv` to be happy

In [None]:
%time similarity = dd.read_csv(csv, sample=2048000)

In [None]:
%time similarity.iloc[:, :5].head()

In [None]:
annotations = pd.read_csv('https://github.com/czbiohub/tabula-muris/raw/master/00_data_ingest/18_global_annotation_csv/annotations_facs.csv', 
                          index_col='cell')
annotations.index = annotations.index.str.replace('.', '-')
annotations.columns = annotations.columns.str.replace('.', '_')
annotations['sample_id'] = annotations.index
annotations = annotations.fillna("NA")
print(annotations.shape)
annotations.head()

In [None]:
color_cols = ['tissue', 'subtissue', 'cell_ontology_class', 'mouse_id', 'mouse_sex', 'plate_barcode']
palette_names = ['tab20', 'tab10', 'cubehelix', 'husl', 'Paired', 'Set2']
palettes = dict(zip(color_cols, palette_names))
palettes

In [None]:
graphs = {}
positions = {}

In [None]:
key = 'k31_with_abundance'

n_neighbors = 5
figure_folder = 's3://kmer-hashing/tabula-muris/n_hashes=500/'
figure_prefix = 'molecule=DNA_ksize=31_ignore-abundance=True'
title = 'Tabula Muris'

g, pos = khtools.knn.build_graph_and_plot(similarity, annotations, n_neighbors,
                                          color_cols, palettes, figure_folder, figure_prefix, title)

graphs[key] = g
positions[key] = pos

In [27]:
khtools.__version__

'0.1.0'

## Run Leiden Clustering


Below is excerpted from [scanpy's leiden clustering algorithm](https://github.com/theislab/scanpy/blob/master/scanpy/tools/_leiden.py)

In [6]:


try:
    import leidenalg
except ImportError:
    raise ImportError('Please install the leiden algorithm: `pip3 install leidenalg`.')

# convert it to igraph
g = scanpy.utils.get_igraph_from_adjacency(adjacency, directed=directed)
# flip to the default partition type if not overriden by the user
if partition_type is None:
    partition_type = leidenalg.RBConfigurationVertexPartition
# prepare find_partition arguments as a dictionary, appending to whatever the user provided
# it needs to be this way as this allows for the accounting of a None resolution
# (in the case of a partition variant that doesn't take it on input)

partition_kwargs = None

if partition_kwargs is None:
    partition_kwargs = {}
if use_weights:
    partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
partition_kwargs['n_iterations'] = n_iterations
partition_kwargs['seed'] = random_state
if resolution is not None:
    partition_kwargs['resolution_parameter'] = resolution
# clustering proper
part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
# store output into adata.obs
groups = np.array(part.membership)


ImportError: Please install the leiden algorithm: `pip3 install leidenalg`.

## Make confusion matrix with Tabula Muris Senis leiden clustering

In [None]:
tms.obs['leiden'].head()