In [16]:
import glob
import os


# Reminder to install s3fs to read files from aws
import s3fs

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

import seaborn as sns

import scanpy.api as sc

import holoviews as hv
hv.extension('bokeh')
hv.archive.auto()

%matplotlib inline

<IPython.core.display.Javascript object>

Automatic capture is now enabled. [2018-08-13 18:00:24]


In [2]:
# S3 prefix holding the sourmash compare outputs read below (k21.csv, k51.csv).
prefix = 's3://olgabot-maca/lung_cancer/sourmash_v3_compare/'

In [3]:
# List what is stored under the prefix (needs the AWS CLI on PATH).
! aws s3 ls $prefix

2018-08-11 20:55:33  495993571 k21.csv
2018-08-12 04:50:24  493603458 k51.csv


In [4]:
# Load the k21 matrix straight from S3 (pandas reads s3:// URLs through s3fs),
# then report its size before previewing the first rows.
k21 = pd.read_csv(prefix + 'k21.csv')
print(k21.shape)
k21.head()

(5054, 5054)


Unnamed: 0,N21_B000420_S105,J1_B000420_S289,E17_B003570_S197,J3_B003570_S3,J4_B003528_S220,N7_B003511_S223,J1_B003527_S37,M7_B002097_S235,E11_B003125_S59,P4_B000420_S136,...,N6_B003511_S222,F16_B003586_S124,N5_B002097_S257,H10_B000420_S250,G10_B002097_S94,D11_B003588_S119,I13_B002095_S13,I12_B002078_S96,H7_B003573_S43,H7_B002073_S31
0,1.0,0.233503,0.204714,0.20588,0.273945,0.144828,0.245614,0.234462,0.2531,0.225784,...,0.224091,0.14706,0.05502,0.273144,0.047128,0.231464,0.268401,0.245437,0.227942,0.306655
1,0.233503,1.0,0.500012,0.760364,0.735239,0.090735,0.647935,0.656478,0.501949,0.721579,...,0.603041,0.080676,0.187699,0.704378,0.091348,0.718925,0.701346,0.608602,0.677743,0.385192
2,0.204714,0.500012,1.0,0.488453,0.535987,0.007709,0.534389,0.520645,0.341469,0.505667,...,0.512072,0.005416,0.087916,0.530547,0.004342,0.614005,0.655328,0.485525,0.763197,0.24623
3,0.20588,0.760364,0.488453,1.0,0.762437,0.007388,0.698376,0.63389,0.545744,0.731272,...,0.693178,0.005915,0.122706,0.709315,0.006246,0.724951,0.693079,0.591731,0.680974,0.338534
4,0.273945,0.735239,0.535987,0.762437,1.0,0.020786,0.809938,0.690954,0.56144,0.75421,...,0.686681,0.017176,0.14022,0.809692,0.019579,0.810531,0.766423,0.666658,0.715552,0.384226


In [5]:
# Turn similarities into distances (a similarity of 1.0 becomes a distance of 0.0).
k21_distances = 1-k21

In [12]:
import umap

def umap_embedding(distances, n_neighbors=5, min_dist=0.3, random_state=None):
    """Project a precomputed distance matrix onto two UMAP dimensions.

    Parameters
    ----------
    distances : pandas.DataFrame
        Square sample-by-sample distance matrix. Its column labels become the
        index of the returned frame.
    n_neighbors : int
        Forwarded to ``umap.UMAP``.
    min_dist : float
        Forwarded to ``umap.UMAP``.
    random_state : int, numpy RandomState or None
        Seed forwarded to ``umap.UMAP``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int to get the same layout on
        every run.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``distances`` with columns ``UMAP_1`` and
        ``UMAP_2``.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric='precomputed',
                        random_state=random_state)
    embedding = reducer.fit_transform(distances)
    return pd.DataFrame(embedding, index=distances.columns,
                        columns=['UMAP_1', 'UMAP_2'])

In [13]:
# Use a wider neighbourhood than the function default for the k21 embedding.
n_neighbors = 25

jaccard_embedding = umap_embedding(k21_distances, n_neighbors=n_neighbors)
jaccard_embedding.head()

  warn('Using precomputed metric; transform will be unavailable for new data')


Unnamed: 0,UMAP_1,UMAP_2
N21_B000420_S105,-1.99261,-4.15717
J1_B000420_S289,1.604047,0.930919
E17_B003570_S197,2.268971,-5.340893
J3_B003570_S3,2.3369,-0.329377
J4_B003528_S220,2.698936,1.765426


In [19]:
# Both UMAP columns are value dimensions, given as (column name, display label).
value_dimensions = [(f'UMAP_{axis}', f'UMAP {axis}') for axis in (1, 2)]
embedding_hv = hv.Table(jaccard_embedding.reset_index(), vdims=value_dimensions)
embedding_hv

In [None]:
%%opts Scatter [width=400 height=400 tools=['hover']]
%%opts Scatter (color='#262626')


# color_cycle = hv.Cycle()

scatters = embedding_hv.to.scatter('UMAP 1', 'UMAP 2')
# scatters.overlay('Cell ontology class')
scatters.overlay()

In [22]:
from scanpy.neighbors import compute_neighbors_umap, compute_connectivities_umap
import louvain
from scanpy.utils import get_igraph_from_adjacency

def louvain_cluster(distances, n_neighbors=5, random_state=0, resolution=None):
    """Louvain-cluster samples described by a precomputed distance matrix.

    Builds a k-nearest-neighbour graph with scanpy's UMAP helpers and
    partitions it with the ``louvain`` package.

    Parameters
    ----------
    distances : pandas.DataFrame or array-like
        Square, precomputed sample-by-sample distance matrix.
    n_neighbors : int
        Number of neighbours used when building the graph.
    random_state : int
        Seed handed to both the neighbour search and ``louvain.set_rng_seed``.
    resolution : float or None
        Resolution parameter of the partition; ``None`` is treated as 1.

    Returns
    -------
    numpy.ndarray
        Cluster membership, one entry per row of ``distances``.
    """
    distance_matrix = np.asarray(distances)
    n_obs = distance_matrix.shape[0]

    # Forward the caller's seed; this call used to hard-code random_state=0,
    # which silently ignored the function's own parameter.
    knn_indices, knn_dists = compute_neighbors_umap(
        distance_matrix, n_neighbors, random_state=random_state,
        metric='precomputed', verbose=True)

    # Only the adjacency (connectivities) is needed to build the graph; the
    # first return value is discarded instead of shadowing `distances`.
    _, adjacency = compute_connectivities_umap(
        knn_indices, knn_dists, n_obs=n_obs, n_neighbors=n_neighbors)

    g = get_igraph_from_adjacency(adjacency, directed=True)

    if resolution is None:
        resolution = 1
    try:
        louvain.set_rng_seed(random_state)
        part = louvain.find_partition(g, louvain.RBConfigurationVertexPartition,
                                      resolution_parameter=resolution)
    except AttributeError:
        # Fallback for a louvain package lacking the API used above; per the
        # note carried over from scanpy, results are then not fully
        # reproducible (upgrade with "pip install louvain --upgrade").
        print("Using RBCConfiguration")
        part = louvain.find_partition(g, method='RBConfiguration',
                                      resolution_parameter=resolution)
    return np.array(part.membership)


# Cluster the k21 distances and count how many distinct groups came out.
groups = louvain_cluster(k21_distances, n_neighbors=25)
unique_groups = np.unique(groups)
n_clusters = unique_groups.size
n_clusters

17

In [23]:
# Peek at the per-sample cluster labels (NumPy abbreviates the long array).
groups

array([ 1,  2,  1, ..., 13,  1,  5])

In [26]:
# Attach the k-mer Louvain labels to the embedding (adds a column in place).
jaccard_embedding['louvain_groups'] = groups

# The cluster label is the key dimension; the two UMAP axes are the values.
key_dimensions = [('louvain_groups', 'Louvain groups')]
value_dimensions = [(f'UMAP_{axis}', f'UMAP {axis}') for axis in (1, 2)]
embedding_hv_louvain = hv.Table(
    jaccard_embedding.reset_index(),
    kdims=key_dimensions,
    vdims=value_dimensions,
)
embedding_hv_louvain

In [28]:
# Join the Category20b and Category20c cycles into one longer palette.
cat20c, cat20b = hv.Cycle("Category20c"), hv.Cycle("Category20b")
new_palette = cat20b.values + cat20c.values
color_cycle = hv.Cycle(new_palette)

In [36]:
%%opts Scatter [width=600 height=400 tools=['hover']]
%%opts Scatter (color=Cycle("Category20"))


# color_cycle = hv.Cycle()

groupby = 'Louvain groups'
scatters = embedding_hv_louvain.to.scatter('UMAP 1', 'UMAP 2', groupby=groupby)
# scatters.overlay('Cell ontology class')
scatters.overlay(groupby)

## Compare to gene counts clustering

In [37]:
# AnnData file from the gene-count analysis; it already carries a 'louvain'
# column in .obs (see the repr below).
# NOTE(review): absolute local path; presumably only valid on the machine
# where /mnt/data is mounted. Confirm before re-running elsewhere.
results_file = '/mnt/data/lung_cancer.h5ad'

adata = sc.read(results_file)
adata

AnnData object with n_obs × n_vars = 3155 × 6410 
    obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
    var: 'n_cells'
    uns: 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'

In [40]:
# Gene-count Louvain labels: a categorical series indexed by cell ID.
adata.obs['louvain'].head()

index
A10_B000420_S82     3
A10_B002073_S166    7
A10_B002078_S202    1
A10_B002095_S118    8
A10_B003125_S262    1
Name: louvain, dtype: category
Categories (23, object): [0, 1, 2, 3, ..., 19, 20, 21, 22]

In [45]:
# Index-aligned assignment: only cells that also appear in adata.obs receive a
# label. The k-mer matrix holds more cells (5054) than adata (3155), so the
# rest become NaN and the column is stored as float.
jaccard_embedding['louvain_groups_gene_counts'] = adata.obs['louvain'].astype(int)

# Compare only cells that have both clusterings, and cast the labels back to
# int so the groups read "3" rather than "3.0".
has_gene_counts = jaccard_embedding['louvain_groups_gene_counts'].notnull()
embedding_both_clusterings = (
    jaccard_embedding.loc[has_gene_counts]
    .astype({'louvain_groups_gene_counts': int})
    .reset_index()
)

key_dimensions = [('louvain_groups', 'Louvain groups (kmer distances)'),
                  ('louvain_groups_gene_counts', 'Louvain groups (gene counts)')]
value_dimensions = [("UMAP_1", 'UMAP 1'), ("UMAP_2", "UMAP 2")]
embedding_hv_louvain_gene_counts = hv.Table(embedding_both_clusterings,
                        kdims=key_dimensions,
                        vdims=value_dimensions)
embedding_hv_louvain_gene_counts

In [50]:
%%opts Scatter [width=600 height=400 tools=['hover']]
%%opts Scatter (color=Cycle("Category20c"))
%%opts NdOverlay [legend_position='right']

# color_cycle = hv.Cycle()

groupby = 'Louvain groups (gene counts)'
scatters = embedding_hv_louvain_gene_counts.to.scatter('UMAP 1', 'UMAP 2', groupby=groupby)
# scatters.overlay('Cell ontology class')
scatters.overlay(groupby)

In [None]:
# Preview the per-cell annotations instead of rendering the whole table.
adata.obs.head()

In [52]:

# Look for cells with no gene-count Louvain label (the recorded result is empty).
adata.obs[adata.obs['louvain'].isnull()]

Unnamed: 0_level_0,n_genes,percent_mito,n_counts,louvain
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
