# Compare k-mer abundance and presence/absence vs "traditional" single-cell RNA-seq processing

- 500 k-mers, with abundance, were hashed from ~50k cells in Tabula Muris using `sourmash`


## Load Tabula Muris Senis data with fixed annotations

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

import khtools

%load_ext autoreload
%autoreload 2

In [2]:
# Record the versions of scanpy and its dependencies used for this analysis.
import scanpy.api as sc

sc.logging.print_versions()

scanpy==1.3.2 anndata==0.6.11 numpy==1.14.6 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [4]:
# Disabled: load the re-processed, annotated Tabula Muris Senis AnnData object.
# NOTE(review): the last cell of this notebook reads `tms.obs['leiden']`, so
# this cell has to be re-enabled (and the path below must exist) before that
# cell can run on a fresh kernel.

# %%time

# h5ad = '/home/olga/ibm_lg/tabula-muris-senis/maca-facs-re-processed-annotated-fixed-tissues.h5ad'

# tms = sc.read_h5ad(h5ad)
# tms

## Read 50k x 50k similarity matrix (~14GB csv file)

In [13]:
csv = '/home/olga/pureScratch/olgabot-maca/facs/sourmash_compare_k31_ignore_abundance.csv'
! ls -lha $csv

-rw-rw-r-- 1 olga olga 14G Aug  7 11:51 /home/olga/pureScratch/olgabot-maca/facs/sourmash_compare_k31_ignore_abundance.csv


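Had to increase `sample` (the number of bytes `dd.read_csv` reads up front to infer the column dtypes) so that it covers at least one full row of this very wide (~50k-column) file; with the default value `dd.read_csv` failed.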

In [14]:
# Build the dask dataframe for the similarity matrix. `sample` is raised to
# 2,048,000 bytes so that dtype inference sees enough of this very wide file.
# The recorded run of this cell took about 7 minutes of wall time.
%time similarity = dd.read_csv(csv, sample=2048000)

CPU times: user 6min 57s, sys: 3.74 s, total: 7min 1s
Wall time: 7min


In [None]:
# Peek at the top-left corner of the similarity matrix (first five columns).
similarity.iloc[:, 0:5].head(n=5)

In [15]:
%time similarity.iloc[:, :5].head()

CPU times: user 45.5 s, sys: 436 ms, total: 45.9 s
Wall time: 45.2 s


Unnamed: 0,A1-B000127-3_38_F-1-1_S1,A1-B000126-3_39_F-1-1_S289,A1-B000167-3_56_F-1-1_S228,A1-B000168-3_57_F-1-1_S177,A1-B000412-3_56_F-1-1_S110
0,1.0,0.048,0.052,0.0,0.034
1,0.048,1.0,0.094,0.002,0.034
2,0.052,0.094,1.0,0.0,0.036
3,0.0,0.002,0.0,1.0,0.0
4,0.034,0.034,0.036,0.0,1.0


In [16]:
# Load the per-cell FACS annotations published with Tabula Muris, indexed by
# cell id.
annotations = pd.read_csv(
    'https://github.com/czbiohub/tabula-muris/raw/master/00_data_ingest/18_global_annotation_csv/annotations_facs.csv',
    index_col='cell')

# Replace the literal '.' in cell ids and column names. `regex=False` is passed
# explicitly because '.' is a regex wildcard: without it the result depends on
# how the installed pandas version treats single-character patterns.
annotations.index = annotations.index.str.replace('.', '-', regex=False)
annotations.columns = annotations.columns.str.replace('.', '_', regex=False)

# Keep the cell id available as a regular column as well as the index.
annotations['sample_id'] = annotations.index

# Use the string "NA" for missing annotations.
annotations = annotations.fillna("NA")
print(annotations.shape)
annotations.head()

(44949, 24)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,Neurog3>0_raw,Neurog3>0_scaled,cell_ontology_class,cell_ontology_id,cluster_ids,free_annotation,mouse_id,mouse_sex,plate_barcode,subsetA,...,subsetC_cluster_ids,subsetD,subsetD_cluster_ids,subsetE,subsetE_cluster_ids,subtissue,tissue,tissue_tSNE_1,tissue_tSNE_2,sample_id
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1-B000610-3_56_F-1-1,,,bladder cell,CL:1001319,5,Bladder mesenchymal cell,3_56_F,F,B000610,,...,,,,,,,Bladder,-7.182841,-10.895094,A1-B000610-3_56_F-1-1
A1-B002764-3_38_F-1-1,,,bladder urothelial cell,CL:1001428,3,Luminal bladder epithelial cell,3_38_F,F,B002764,,...,,,,,,,Bladder,4.199059,-11.05024,A1-B002764-3_38_F-1-1
A1-B002771-3_39_F-1-1,,,bladder cell,CL:1001319,5,Bladder mesenchymal cell,3_39_F,F,B002771,,...,,,,,,,Bladder,-11.995435,-7.325534,A1-B002771-3_39_F-1-1
A1-D041914-3_8_M-1-1,,,bladder cell,CL:1001319,5,Bladder mesenchymal cell,3_8_M,M,D041914,,...,,,,,,,Bladder,-6.820022,-14.174246,A1-D041914-3_8_M-1-1
A1-D042253-3_9_M-1-1,,,bladder cell,CL:1001319,2,Bladder mesenchymal cell,3_9_M,M,D042253,,...,,,,,,,Bladder,-24.163538,-7.499349,A1-D042253-3_9_M-1-1


## Dimensionality reduction

### PCA on similarities

In [20]:
from sklearn.decomposition import PCA

# Reduce the similarity matrix to 100 principal components.
# - copy=False lets PCA overwrite its input instead of duplicating it; with
#   this setting the projection must be obtained via `fit_transform`.
# - random_state is fixed so that the result is reproducible if the default
#   svd_solver ('auto') selects the randomized solver for this input size.
pca = PCA(n_components=100, copy=False, random_state=0)


In [None]:
# Fit the PCA on the similarity matrix, keep the projected coordinates and
# print the shape of the result.
# NOTE(review): `pca` is assigned both in the sklearn cell above and in the
# dask_ml cell below, and their execution counts are out of order, so which
# estimator runs here depends on the order the cells were executed in.
similarities_pca = pca.fit_transform(similarity)
print(similarities_pca.shape)

In [18]:
# Alternative PCA implementation from dask-ml. Running this cell rebinds both
# `PCA` and `pca`, replacing the scikit-learn estimator defined earlier.
try:
    from dask_ml.decomposition import PCA
except ImportError as err:
    # dask-ml is an optional dependency; fail with an actionable message.
    raise ImportError('Please install dask-ml: `pip3 install dask-ml`.') from err

pca = PCA(n_components=100, copy=False)

ModuleNotFoundError: No module named 'dask_ml'

### Old way of doing nearest neighbor graphs

In [None]:
# Disabled: previous approach, which built and plotted a k-nearest-neighbor
# graph from the similarity matrix with `khtools.knn.build_graph_and_plot`.
# NOTE(review): `key` below says 'k31_with_abundance', while `figure_prefix`
# and the CSV loaded above are both "ignore abundance" — confirm which label is
# intended before re-enabling.

# color_cols = ['tissue', 'subtissue', 'cell_ontology_class', 'mouse_id', 'mouse_sex', 'plate_barcode']
# palette_names = ['tab20', 'tab10', 'cubehelix', 'husl', 'Paired', 'Set2']
# palettes = dict(zip(color_cols, palette_names))
# palettes

# graphs = {}
# positions = {}

# key = 'k31_with_abundance'

# n_neighbors = 5
# figure_folder = 's3://kmer-hashing/tabula-muris/n_hashes=500/'
# figure_prefix = 'molecule=DNA_ksize=31_ignore-abundance=True'
# title = 'Tabula Muris'

# g, pos = khtools.knn.build_graph_and_plot(similarity, annotations, n_neighbors,
#                                           color_cols, palettes, figure_folder, figure_prefix, title)

# graphs[key] = g
# positions[key] = pos

In [None]:
# Show the installed khtools version.
khtools.__version__

## Run Leiden Clustering


Below is excerpted from [scanpy's leiden clustering algorithm](https://github.com/theislab/scanpy/blob/master/scanpy/tools/_leiden.py)

In [6]:


# Stand-alone adaptation of the body of scanpy's `leiden` function: build an
# igraph graph from an adjacency matrix and partition it with leidenalg.
try:
    import leidenalg
except ImportError as err:
    raise ImportError('Please install the leiden algorithm: `pip3 install leidenalg`.') from err

# The notebook only binds `sc` (scanpy.api); import the module that is actually
# referenced below so that `scanpy.utils` resolves.
import scanpy.utils

# These names are function arguments in scanpy's `leiden`; outside that
# function they have to be defined explicitly. The values mirror that
# function's defaults.
resolution = 1
random_state = 0
directed = True
use_weights = True
n_iterations = -1
partition_type = None

# NOTE(review): `adjacency` is not defined anywhere in this notebook. It must
# be assigned in an earlier cell before this cell can run.
# convert it to igraph
g = scanpy.utils.get_igraph_from_adjacency(adjacency, directed=directed)
# fall back to the default partition type when none was chosen
if partition_type is None:
    partition_type = leidenalg.RBConfigurationVertexPartition

# Keyword arguments for `leidenalg.find_partition`. They are assembled as a
# dict so that `resolution_parameter` can be left out when `resolution` is None
# (for partition types that do not accept it).
partition_kwargs = {}
if use_weights:
    partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
partition_kwargs['n_iterations'] = n_iterations
partition_kwargs['seed'] = random_state
if resolution is not None:
    partition_kwargs['resolution_parameter'] = resolution

# clustering proper
part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
# cluster membership of every vertex, as an array
groups = np.array(part.membership)


ImportError: Please install the leiden algorithm: `pip3 install leidenalg`.

## Make confusion matrix with Tabula Muris Senis leiden clustering

In [None]:
# Preview the Leiden cluster labels stored on the Tabula Muris Senis object.
# NOTE(review): `tms` is only assigned in the commented-out loading cell near
# the top of this notebook, so this raises NameError on a fresh kernel unless
# that cell is re-enabled.
tms.obs['leiden'].head()