##### Combined 10 donors (NO SPL3)
##### Batch correction using ComBat
##### Leiden clusters and respective silhouette scores
---
##### hpb29

Date: 2021-02-23

In [1]:
%matplotlib widget

import warnings
warnings.filterwarnings('ignore')

import os, sys, json, operator, getpass, math
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib.pyplot as plt
from ipywidgets import widgets

from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

In [2]:
# Read the container label file so the image that produced this run is recorded.
with open('/.singularity.d/labels.json') as fh:
    singularity = json.loads(fh.read())

# Show the image version as the cell output.
singularity['Version']

'metztli.25c'

In [3]:
# Build the per-user project directory underneath the home folder.
user = getpass.getuser()
home = str(Path.home())
project_parts = (home, 'datafloor/users', user, '2020/SLX19841/')
basedir = os.path.join(*project_parts)

# Point scanpy's read/write location at the project's h5ad folder.
sc.settings.writedir = os.path.join(basedir, 'analysis/h5ad/')

In [4]:
# Scanpy session settings.
sc.settings.verbosity = 3             # verbose logging; the progress messages show up in the cell outputs
sc.settings.file_format_figs = 'svg'  # format used whenever scanpy saves a figure
# `savefigs` is not an attribute scanpy consults, so assigning it only created
# an unused attribute. The documented switch for automatic figure saving is
# `autosave`; False keeps figures on screen only.
sc.settings.autosave = False

In [5]:
# Figure typography: Arial as the sans-serif face, 14 pt base size.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.size'] = 14

In [6]:
# Date stamp (YYYYMMDD) of this run.
now = datetime.now()
prefix = datetime.strftime(now, '%Y%m%d')
print(prefix)

20210223


In [7]:
# Load the integrated, log-normalised 10-donor dataset.
# NOTE(review): the name has no extension or directory — presumably scanpy
# resolves it against sc.settings.writedir configured earlier; confirm.
data = sc.read('COMBO10_NO_SPL3_Seurat3_Integrated_lognorm')

In [12]:
# Keep only these per-cell annotation columns; .copy() detaches the result
# from the original frame.
obs_columns = [
    'batch', 'n_counts', 'n_genes', 'library',
    'donor', 'organ', 'leiden.1.2', 'silhouette.1.2',
]
data.obs = data.obs[obs_columns].copy()

In [14]:
%%time

# ComBat batch correction with 'donor' as the batch key. The return value is
# not captured, so this relies on scanpy applying the correction to `data` in place.
sc.pp.combat(data, key='donor')

Standardizing Data across genes.

Found 10 batches

Found 0 numerical variables:
	

Fitting L/S model and finding priors

Finding parametric adjustments

Adjusting data

CPU times: user 8min 8s, sys: 3min 8s, total: 11min 17s
Wall time: 11min 16s


In [15]:
# Flag highly variable genes on the corrected data and report how many were selected.
sc.pp.highly_variable_genes(data)
n_highly_variable = data.var.highly_variable.sum()
print("Highly variable genes: %d" % n_highly_variable)

extracting highly variable genes
    finished (0:00:55)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Highly variable genes: 1035


In [16]:
# Diagnostic plot of the highly-variable-gene selection made in the previous cell.
sc.pl.highly_variable_genes(data)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
# Embedding pipeline: 30 PCs on the highly variable genes -> neighbour graph -> UMAP.
sc.pp.pca(
    data,
    n_comps=30,
    use_highly_variable=True,
    svd_solver='arpack',
)
sc.pp.neighbors(data, n_pcs=30)
sc.tl.umap(data)

computing PCA
    on highly variable genes
    with n_comps=30
    finished (0:00:31)
computing neighbors
    using 'X_pca' with n_pcs = 30
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:57)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:02:03)


In [19]:
# UMAP coloured by donor, to inspect how the donors mix after the ComBat step.
sc.pl.umap(data, color=['donor'])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [20]:
# One panel per donor: every cell as a backdrop, the donor's own cells drawn on top.
fig, ax = plt.subplots(4, 3, squeeze=True, sharex=True, sharey=True, figsize=(9.5, 12.5))
fig.tight_layout()
ax = ax.ravel()
print(len(ax))

donors = data.obs.donor.unique()
if len(donors) > len(ax):
    # The grid is fixed at 4x3; report the overflow instead of dropping donors silently.
    print('Only the first %d of %d donors fit on the grid' % (len(ax), len(donors)))

# Pairing panels with donors replaces the manual counter and the blanket
# `except IndexError: pass`. That handler also swallowed indexing failures
# raised inside the loop body (e.g. a missing second collection), after which
# the counter was not advanced and the next donor landed on the same panel.
for panel, donor in zip(ax, donors):
    sc.pl.umap(data, ax=panel)
    sc.pl.umap(data[data.obs['donor'] == donor, :], color=['donor'], ax=panel,
               title=donor, legend_loc=None)
    # Presumably collections[0] is the backdrop and collections[1] the donor
    # overlay (draw order of the two calls above); enlarge the overlay markers.
    if len(panel.collections) > 1:
        panel.collections[1].set_sizes([50])
    panel.get_xaxis().set_visible(False)
    panel.get_yaxis().set_visible(False)

# Hide the panels that received no donor so they do not show as empty frames.
for spare in ax[len(donors):]:
    spare.set_visible(False)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

12


In [22]:
# UMAP coloured by the pre-existing 'leiden.1.2' labels, with a compact legend
# placed to the right of the axes.
fig, ax = plt.subplots(1, 1, figsize=(7, 6))
fig.tight_layout()
sc.pl.umap(data, color=['leiden.1.2'], ax=ax)
fig.subplots_adjust(right=0.8)  # leave room for the legend

# Build the legend on this axes explicitly rather than through pyplot's
# "current axes" state.
lgnd = ax.legend(loc=7, scatterpoints=1, fontsize=10, bbox_to_anchor=(1.2, 0.5), frameon=False)

# `Legend.legendHandles` was renamed to `legend_handles` in newer Matplotlib;
# support both so the cell keeps working across versions.
legend_markers = getattr(lgnd, 'legend_handles', None)
if legend_markers is None:
    legend_markers = lgnd.legendHandles

# Use the public setter instead of writing the private `_sizes` attribute.
for marker in legend_markers:
    marker.set_sizes([25])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [24]:
# Number of distinct clusters in the 'leiden.1.2' annotation that came with the loaded data.
data.obs['leiden.1.2'].nunique()

27

In [25]:
# Leiden clustering at resolution 1; labels are stored in data.obs['cleiden.1.0'].
sc.tl.leiden(data, resolution=1, key_added='cleiden.1.0')

running Leiden clustering
    finished: found 16 clusters and added
    'cleiden.1.0', the cluster labels (adata.obs, categorical) (0:01:01)


In [27]:
# Leiden clustering at resolution 1.5; labels are stored in data.obs['cleiden.1.5'].
sc.tl.leiden(data, resolution=1.5, key_added='cleiden.1.5')

running Leiden clustering
    finished: found 23 clusters and added
    'cleiden.1.5', the cluster labels (adata.obs, categorical) (0:01:26)


In [28]:
# Leiden clustering at resolution 1.75; labels are stored in data.obs['cleiden.1.75'].
sc.tl.leiden(data, resolution=1.75, key_added='cleiden.1.75')

running Leiden clustering
    finished: found 24 clusters and added
    'cleiden.1.75', the cluster labels (adata.obs, categorical) (0:00:58)


In [29]:
# Leiden clustering at resolution 2; labels are stored in data.obs['cleiden.2'].
sc.tl.leiden(data, resolution=2, key_added='cleiden.2')

running Leiden clustering
    finished: found 30 clusters and added
    'cleiden.2', the cluster labels (adata.obs, categorical) (0:02:32)


In [30]:
# Leiden clustering at resolution 1.85; labels are stored in data.obs['cleiden.1.85'].
sc.tl.leiden(data, resolution=1.85, key_added='cleiden.1.85')

running Leiden clustering
    finished: found 27 clusters and added
    'cleiden.1.85', the cluster labels (adata.obs, categorical) (0:01:38)


In [32]:
# data.obs keys of the Leiden clusterings to compare, ordered by resolution.
groups = [
    'cleiden.1.0',
    'cleiden.1.5',
    'cleiden.1.75',
    'cleiden.1.85',
    'cleiden.2',
]

In [33]:
%%time

# Overall silhouette score of each clustering, evaluated on the UMAP coordinates.
silhouette = {}

for group in groups:
    cluster_labels = np.asarray(data.obs[group])
    silhouette[group] = silhouette_score(
        data.obsm['X_umap'],
        cluster_labels,
        metric='euclidean',
        sample_size=None,    # no subsampling: every cell contributes
        random_state=None,
    )

CPU times: user 17min 45s, sys: 1min 4s, total: 18min 49s
Wall time: 18min 49s


In [36]:
def compute_samples_silhouettes(data, grouping, basis='X_umap', metric='euclidean'):
    """Return the silhouette coefficient of every cell for one clustering.

    Parameters
    ----------
    data
        Object exposing ``.obsm`` (embeddings keyed by name) and ``.obs``
        (per-cell annotations keyed by column name).
    grouping
        Key of ``data.obs`` holding the cluster label of each cell.
    basis
        Key of ``data.obsm`` whose coordinates serve as the feature space.
        Defaults to ``'X_umap'``, the embedding this function originally
        hard-coded.
    metric
        Distance metric forwarded to ``silhouette_samples``. Defaults to
        ``'euclidean'``, the value originally hard-coded.

    Returns
    -------
    Whatever ``silhouette_samples`` returns for these inputs (one silhouette
    value per sample according to scikit-learn's documentation).
    """
    cluster_labels = np.array(data.obs[grouping])

    return silhouette_samples(data.obsm[basis], cluster_labels, metric=metric)

In [37]:
%%time

# Per-cell silhouettes for every clustering, also stored next to the labels
# in data.obs under 'silhouette.<clustering key>'.
sample_sils = {}

for group in groups:
    colname = 'silhouette.' + group
    sample_sils[group] = compute_samples_silhouettes(data, group)
    data.obs[colname] = sample_sils[group]

CPU times: user 16min 37s, sys: 1min 5s, total: 17min 43s
Wall time: 17min 43s


In [38]:
# One UMAP panel per clustering, stacked in a single column. The keys come
# from `groups` instead of a second hand-typed copy of the same list, so the
# panels cannot drift from the clusterings that were actually scored.
sc.pl.umap(data, color=groups, ncols=1)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [39]:
%%time
# Save the object as it stands at this point in the notebook.
# NOTE(review): bare name without directory or extension — presumably written
# under sc.settings.writedir; confirm.
sc.write('COMBO10_NO_SPL3_combat_corrected', data)

CPU times: user 5min 2s, sys: 11 s, total: 5min 13s
Wall time: 5min 35s


---

In [51]:
# Display the clustering keys that the per-cluster summary iterates over.
groups

['cleiden.1.0', 'cleiden.1.5', 'cleiden.1.75', 'cleiden.1.85', 'cleiden.2']

In [67]:
# Mean per-cell silhouette within each cluster, computed separately per clustering.
clusters_means = {}

for res in groups:
    per_cluster = data.obs.groupby(res)
    clusters_means[res] = per_cluster['silhouette.' + res].mean()

In [68]:
# Side-by-side table of per-cluster mean silhouettes, one column per
# clustering. The columns follow `groups` (the same list used to fill
# `clusters_means`) rather than a hand-typed copy of its keys, so the table
# stays in step if the set of resolutions changes.
meanie = pd.concat([clusters_means[res] for res in groups], axis=1)

In [69]:
# Cast the cluster labels to integers so the rows sort numerically, not lexically.
meanie.index = list(map(int, meanie.index))

In [70]:
# Show the table ordered by cluster number (returns a sorted copy; `meanie` itself is unchanged).
meanie.sort_index(axis=0)

Unnamed: 0,silhouette.cleiden.1.0,silhouette.cleiden.1.5,silhouette.cleiden.1.75,silhouette.cleiden.1.85,silhouette.cleiden.2
0,0.325493,0.316371,0.320246,0.297935,0.311623
1,0.314097,0.212526,0.169393,0.197664,0.218906
2,0.250293,0.130227,-0.057439,0.283033,0.221286
3,0.054414,0.220242,0.268229,-0.134856,-0.363323
4,0.066223,-0.103481,0.016166,-0.010915,0.001538
5,0.134902,-0.023925,0.120904,0.195732,0.259844
6,-0.085639,0.069228,0.099612,0.287387,-0.120668
7,0.294425,0.386684,0.3115,-0.015631,0.342686
8,0.115398,-0.147404,0.078021,-0.222368,-0.044534
9,0.091584,0.081789,-0.107878,-0.105591,0.130834


In [71]:
# Make sure the target folder exists: to_csv does not create missing directories
# and would otherwise fail after all the expensive steps have already run.
os.makedirs('output', exist_ok=True)

# Tab-separated export of the per-cluster mean silhouettes, ordered by cluster number.
# NOTE(review): the date in the filename is typed by hand although `prefix`
# holds the run date — confirm which one is intended.
meanie.sort_index(axis=0).to_csv('output/20210223_COMBO10_NO_SPL3_ComBat_silhouette_leiden_clusters_galore_mean_scores.txt', sep='\t')

---