In [1]:
# Numerical / plotting stack used by every cell below.
import numpy as np
import pandas as pd
import matplotlib
from numpy import random
# Use font type 42 (TrueType) when writing PDF / PostScript figures.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import matplotlib.pyplot as plt

import os
# NOTE(review): this chdir is relative, so re-running the cell moves the
# working directory again; restart the kernel before re-executing it.
os.chdir("../../scVI/")
# The return value is discarded here (only a cell's last expression is shown).
os.getcwd()
# Root folder for every input file and saved figure; relative to the
# directory selected by the chdir above.
save_path = '../CSF/Notebooks/'

In [2]:
%matplotlib inline

In [3]:
import pickle

In [4]:
# Restore the pickled posterior object and extract its latent representation.
# NOTE(review): pickle.load can execute arbitrary code -- only open files you trust.
posterior_file = save_path + 'posterior/all_datasets.vae.full.pkl'
with open(posterior_file, 'rb') as handle:
    full = pickle.load(handle)

# `labels` obtained here is overwritten by the next cell (np.unique(..., return_inverse=True)).
latent, batch_indices, labels = full.sequential().get_latent()

In [5]:
# 2-D embedding used for every scatter plot in this notebook.
latent_u = np.load(save_path + 'UMAP/all_dataset.umap.npy')

# Per-cell metadata arrays.
celllabels = np.load(save_path + 'meta/celllabels.npy')
louvain = np.load(save_path + 'meta/louvain.k30.allgenes.npy')
isMS = np.load(save_path+'meta/isMS.npy')
isCSF = np.load(save_path+'meta/isCSF.npy')

# Sorted unique annotation names plus, for every cell, the index of its name.
celltype, labels = np.unique(celllabels, return_inverse=True)

# True for cells whose annotation is none of the excluded categories below.
excluded_labels = ['Mono Doublet', 'contamination1', 'doublet', 'B cell doublets', 'RBC']
validclusters = np.logical_and.reduce([celllabels != name for name in excluded_labels])

In [6]:
# Number of distinct Louvain cluster labels.
np.unique(louvain).size

21

In [7]:
# One hex colour per Louvain cluster (21 entries).
colors = ['#ff0000', '#940000', '#c95151', '#5e2626', '#ffe100', '#948a3b', '#165e00', '#63ff33', '#6dc951', '#1e946a', '#66ffc9', '#0077ff', '#002c5e', '#1e5594', '#66adff', '#6a00ff', '#32135e', '#a666ff', '#603b94', '#ff66d1', '#943b79']

# Median embedding coordinate [x, y] of each cluster, in np.unique(louvain) order.
mean_pos = [
    [np.median(latent_u[louvain == cluster, 0]), np.median(latent_u[louvain == cluster, 1])]
    for cluster in np.unique(louvain)
]


In [8]:
# Text-only figure: the name of every Louvain cluster placed next to the
# cluster's median position and saved as a vector PDF.
fig, ax = plt.subplots(figsize=(10, 10))

# Annotation for each Louvain cluster; position k is used for louvain == k.
celltypes = np.asarray(['CD8a','CD4','CD4','CD4','Gran',
             'B cells','NK1','Tregs','CD8n','RBC',
             'Mono','Tdg','Mono Doublet','mDC','NK2',
             'contamination1','pDC','B cell doublets','plasma','MegaK',
             'doublet']
)

for cluster_id, name in enumerate(celltypes):
    idx = (louvain == cluster_id)
    # A single fully transparent point per cluster (alpha=0): nothing visible
    # is drawn, but the axes still receive data limits from it.
    anchor = latent_u[idx][0]
    ax.scatter(anchor[0], anchor[1], label=name, edgecolors='none', c=colors[cluster_id], s=5, alpha=0)
    # Label is shifted one unit to the right of the cluster median.
    label_x = mean_pos[cluster_id][0] + 1
    label_y = mean_pos[cluster_id][1]
    ax.text(label_x, label_y, name, fontsize=30, color=colors[cluster_id])

ax.axis("off")
fig.tight_layout()
fig.savefig(save_path+'figures/SupFigure2/level1.labels.pdf')


In [9]:

# Scatter of every cell in the 2-D embedding, coloured by Louvain cluster
# (no text labels), saved as a 300-dpi PNG.
fig, ax = plt.subplots(figsize=(10, 10))

for cluster_id, name in enumerate(celltypes):
    idx = (louvain == cluster_id)
    cluster_xy = latent_u[idx]
    ax.scatter(cluster_xy[:, 0], cluster_xy[:, 1], label=name, edgecolors='none', c=colors[cluster_id], s=5)

ax.axis("off")
fig.tight_layout()
fig.savefig(save_path+'figures/SupFigure2/level1_umap.png',dpi=300)


In [10]:
from sklearn.cluster import SpectralClustering

In [11]:
Bcells = np.unique(louvain)[celltypes=='B cells'][0]

clustering = SpectralClustering(n_clusters=2, n_jobs=-1, affinity ="nearest_neighbors",n_neighbors=15)
%time Bclusters = clustering.fit_predict(latent[louvain==Bcells,:])


CPU times: user 1min 37s, sys: 224 ms, total: 1min 37s
Wall time: 4.9 s


In [12]:
# 22-colour palette; entry k is looked up via position k of np.unique(celllabels).
palette = np.asarray(['#ff0000', '#e30000', '#c70000', '#732b20', '#c7601c', '#ff9924', '#735210', '#ab9f00', '#eeff00', '#2b5724', '#1cc749', '#1cc7a5', '#5fc9e3', '#24b6ff', '#205773', '#3061ab', '#001f73', '#6b7fff', '#601cc7', '#8f3c89', '#ff24d3', '#570023'])
celllabels = np.load(save_path + 'meta/celllabels.npy')
label_names = np.unique(celllabels)

# colors[0] is the palette entry of 'B2', colors[1] that of 'B1'.
colors = np.concatenate([palette[label_names=='B2'], palette[label_names=='B1']])

In [13]:
# Zoomed scatter of the 'B cells' Louvain cluster, coloured by spectral sub-cluster.
fig, ax = plt.subplots(figsize=(5, 5))
idx = louvain==np.where(celltypes=='B cells')[0][0]
bcell_xy = latent_u[idx]

# Sub-clusters are addressed by their position (0, 1, ...) among the unique ids.
for k in range(len(np.unique(Bclusters))):
    members = (Bclusters == k)
    ax.scatter(bcell_xy[members, 0], bcell_xy[members, 1], color=colors[k], s=5)

ax.axis("off")
ax.set_xlim(5, 11)
ax.set_ylim(-19, -13)
fig.tight_layout()
fig.savefig(save_path+'figures/SupFigure2/Bcell.png',dpi=300)



In [14]:
# Text-only companion to the B-cell scatter: sub-cluster names at their median
# positions, drawn with the same axis window so the two figures line up.

# Select the 'B cells' Louvain cluster *before* computing the medians, so this
# cell does not depend on whatever `idx` a previously executed cell left behind.
idx = louvain==np.where(celltypes=='B cells')[0][0]

# Median embedding coordinate [x, y] of each spectral sub-cluster.
mean_pos = []
for i,x in enumerate(np.unique(Bclusters)):
    mean_pos.append([np.median(latent_u[idx,0][Bclusters==i]),np.median(latent_u[idx,1][Bclusters==i])])


fig, ax = plt.subplots(figsize=(5, 5))
# Sub-cluster 0 is labelled 'B2' and sub-cluster 1 'B1'; text is shifted one unit right.
for i,x in enumerate(['B2','B1']):
    ax.text(mean_pos[i][0]+1, mean_pos[i][1], x, fontsize=30,color=colors[i])

plt.axis("off")
plt.xlim(5,11)
plt.ylim(-19,-13)
plt.tight_layout()
plt.savefig(save_path+'figures/SupFigure2/Bcell.labels.pdf',dpi=300)


            


In [15]:
Mono = np.unique(louvain)[celltypes=='Mono'][0]
%time Monoclusters = clustering.fit_predict(latent[louvain==Mono,:])

CPU times: user 14.4 s, sys: 15.7 ms, total: 14.4 s
Wall time: 850 ms


In [16]:
# 22-colour palette; entry k is looked up via position k of np.unique(celllabels).
palette = np.asarray(['#ff0000', '#e30000', '#c70000', '#732b20', '#c7601c', '#ff9924', '#735210', '#ab9f00', '#eeff00', '#2b5724', '#1cc749', '#1cc7a5', '#5fc9e3', '#24b6ff', '#205773', '#3061ab', '#001f73', '#6b7fff', '#601cc7', '#8f3c89', '#ff24d3', '#570023'])
celllabels = np.load(save_path + 'meta/celllabels.npy')
label_names = np.unique(celllabels)

# colors[0] is the palette entry of 'ncMono', colors[1] that of 'Mono'.
colors = np.concatenate([palette[label_names=='ncMono'], palette[label_names=='Mono']])

In [17]:
# Zoomed scatter of the 'Mono' Louvain cluster, coloured by spectral sub-cluster.
fig, ax = plt.subplots(figsize=(5, 5))
idx = louvain==np.where(celltypes=='Mono')[0][0]
mono_xy = latent_u[idx]

# Sub-clusters are addressed by their position (0, 1, ...) among the unique ids.
for k in range(len(np.unique(Monoclusters))):
    members = (Monoclusters == k)
    ax.scatter(mono_xy[members, 0], mono_xy[members, 1], color=colors[k], s=5)

ax.axis("off")
ax.set_xlim(-4, 10)
ax.set_ylim(9, 17)
fig.tight_layout()
fig.savefig(save_path+'figures/SupFigure2/Mono.png',dpi=300)



In [18]:
# Text-only companion to the monocyte scatter: sub-cluster names at their median
# positions, drawn with the same axis window so the two figures line up.

# Select the 'Mono' Louvain cluster *before* computing the medians, so this
# cell does not depend on whatever `idx` a previously executed cell left behind.
idx = louvain==np.where(celltypes=='Mono')[0][0]

# Median embedding coordinate [x, y] of each spectral sub-cluster.
mean_pos = []
for i,x in enumerate(np.unique(Monoclusters)):
    mean_pos.append([np.median(latent_u[idx,0][Monoclusters==i]),np.median(latent_u[idx,1][Monoclusters==i])])


fig, ax = plt.subplots(figsize=(5, 5))
# Sub-cluster 0 is labelled 'ncMono' and sub-cluster 1 'Mono'; text is shifted one unit right.
for i,x in enumerate(['ncMono','Mono']):
    ax.text(mean_pos[i][0]+1, mean_pos[i][1], x, fontsize=30,color=colors[i])

plt.axis("off")
plt.xlim(-4,10)
plt.ylim(9,17)
plt.tight_layout()
plt.savefig(save_path+'figures/SupFigure2/Mono.labels.pdf',dpi=300)


            


In [19]:
mDC = np.unique(louvain)[celltypes=='mDC'][0]
%time mDCclusters = clustering.fit_predict(latent[louvain==mDC,:])

CPU times: user 2.56 s, sys: 12.7 ms, total: 2.57 s
Wall time: 311 ms


In [20]:
# 22-colour palette; entry k is looked up via position k of np.unique(celllabels).
palette = np.asarray(['#ff0000', '#e30000', '#c70000', '#732b20', '#c7601c', '#ff9924', '#735210', '#ab9f00', '#eeff00', '#2b5724', '#1cc749', '#1cc7a5', '#5fc9e3', '#24b6ff', '#205773', '#3061ab', '#001f73', '#6b7fff', '#601cc7', '#8f3c89', '#ff24d3', '#570023'])
celllabels = np.load(save_path + 'meta/celllabels.npy')
label_names = np.unique(celllabels)

# colors[0] is the palette entry of 'mDC2', colors[1] that of 'mDC1'.
colors = np.concatenate([palette[label_names=='mDC2'], palette[label_names=='mDC1']])

# Zoomed scatter of the 'mDC' Louvain cluster, coloured by spectral sub-cluster.
fig, ax = plt.subplots(figsize=(5, 5))
idx = louvain==np.where(celltypes=='mDC')[0][0]
mdc_xy = latent_u[idx]

# Sub-clusters are addressed by their position (0, 1, ...) among the unique ids.
for k in range(len(np.unique(mDCclusters))):
    members = (mDCclusters == k)
    ax.scatter(mdc_xy[members, 0], mdc_xy[members, 1], color=colors[k], s=5)

ax.axis("off")
# Only the y-range is fixed; the x-range is left to autoscaling.
ax.set_ylim(13, 17)
fig.tight_layout()
fig.savefig(save_path+'figures/SupFigure2/mDC.png',dpi=300)



In [21]:
# Text-only companion to the mDC scatter (mDC.png): sub-cluster names at their
# median positions, drawn with the same axis window so the two figures line up.

# Select the 'mDC' Louvain cluster *before* computing the medians, so this
# cell does not depend on whatever `idx` a previously executed cell left behind.
idx = louvain==np.where(celltypes=='mDC')[0][0]

# Median embedding coordinate [x, y] of each spectral sub-cluster.
mean_pos = []
for i,x in enumerate(np.unique(mDCclusters)):
    mean_pos.append([np.median(latent_u[idx,0][mDCclusters==i]),np.median(latent_u[idx,1][mDCclusters==i])])


fig, ax = plt.subplots(figsize=(5, 5))

# The mDC scatter figure fixes only ylim(13, 17) and lets x autoscale to the
# data. Reproduce that x-range here: plot the same points temporarily, read the
# autoscaled limits, then remove the points so the PDF contains text only.
ghost = ax.scatter(latent_u[idx, 0], latent_u[idx, 1], s=5)
mdc_xlim = ax.get_xlim()
ghost.remove()

# Sub-cluster 0 is labelled 'mDC2' and sub-cluster 1 'mDC1'; text is shifted one unit right.
for i,x in enumerate(['mDC2','mDC1']):
    ax.text(mean_pos[i][0]+1, mean_pos[i][1], x, fontsize=30,color=colors[i])

plt.axis("off")
# Previously xlim(-4,10)/ylim(9,17), i.e. the monocyte window, which did not
# match the mDC scatter figure.
plt.xlim(mdc_xlim)
plt.ylim(13,17)
plt.tight_layout()
plt.savefig(save_path+'figures/SupFigure2/mDC.labels.pdf',dpi=300)

In [22]:
# Genes shown in the feature plots. The list has 21 entries, but the plotting
# cells iterate over genelist[:20], so the last gene ('GNG11') is extracted
# into `exprs` yet never drawn.
genelist = ['CD3E', 'TRAC', 'IL7R', 'CD8B', 'CCR7', 'FOXP3', 'TRDC', 'GNLY', 'FCGR3A', 'SELL', 'CD79A',
            'IGHD', 'CD27', 'CD38', 'LYZ', 'WDFY4', 'FCER1A', 'S100A8', 'CD14', 'TCF4', 'GNG11']

In [23]:
gene_dataset = full.gene_dataset

# log10(1 + X) computed on the densified matrix.
logX = np.log10(gene_dataset.X.todense() + 1)

# Mean of each row of logX, reshaped to a column and subtracted from that row.
scaling_factor = logX.mean(axis=1)
norm_X = logX - scaling_factor.reshape(-1, 1)

In [24]:
# Gene names of the dataset; compared against each entry of `genelist` to
# select columns of norm_X in the next cell.
genenames = gene_dataset.gene_names

In [25]:
# For every requested gene, take the norm_X column(s) whose name matches it.
per_gene_columns = [norm_X[:, genenames == gene] for gene in genelist]

# Stack, drop singleton axes, and transpose so that rows follow norm_X's rows
# and columns follow genelist.
stacked = np.squeeze(np.asarray(per_gene_columns))
exprs = pd.DataFrame(stacked.T, columns=genelist)

In [26]:
# Sanity check: (number of rows, number of genes in genelist).
exprs.shape

(65326, 21)

In [27]:
# Keep only cells whose annotation is not one of the excluded categories.
excluded_labels = ['Mono Doublet', 'contamination1', 'doublet', 'B cell doublets', 'RBC']
validclusters = ~np.isin(celllabels, excluded_labels)

# Indices of the retained cells, shuffled so that draw order in the feature
# plots is not tied to storage order. A dedicated seeded generator makes the
# permutation (and the rendered figure) identical on every run without
# touching NumPy's global random state.
order = np.where(validclusters)[0]
random.RandomState(0).shuffle(order)

In [28]:
# Feature plots: one panel per gene (first 20 of genelist) on a 4x5 grid, each
# showing the retained cells in the 2-D embedding coloured by log(1 + exprs[gene]).
#
# The grid is created in a single plt.subplots call. The previous pattern
# (plt.subplots() followed by plt.subplot(4, 5, k)) leaves an extra full-figure
# axes behind the panels on Matplotlib >= 3.6, where overlapping axes are no
# longer removed automatically.
fig, axes = plt.subplots(4, 5, figsize=(20, 20))
for ax, x in zip(axes.flat, genelist[:20]):
    ax.scatter(latent_u[order, 0], latent_u[order, 1],c=np.log(1+np.asarray(exprs[x])[order]),
                cmap='RdYlBu_r',edgecolors='none',s=1)
    # Title is drawn in white: it reserves layout space while staying invisible
    # on a white background (the visible titles are saved by a separate cell).
    ax.set_title(x,color='w',fontsize=30)
    ax.set_xlim(-12,18)
    ax.yaxis.set_major_locator(plt.NullLocator())
    ax.xaxis.set_major_locator(plt.NullLocator())

# Lay out once, after all panels exist, instead of on every loop iteration.
fig.tight_layout()

fig.savefig(save_path + 'figures/SupFigure2/featureplot.png',dpi=300)

In [30]:
# Vector companion to the feature plots: the same 4x5 grid with only the gene
# titles visible (each panel holds a single white point at the origin).
#
# The grid is created in a single plt.subplots call. The previous pattern
# (plt.subplots() followed by plt.subplot(4, 5, k)) leaves an extra full-figure
# axes behind the panels on Matplotlib >= 3.6, where overlapping axes are no
# longer removed automatically.
fig, axes = plt.subplots(4, 5, figsize=(20, 20))
for ax, x in zip(axes.flat, genelist[:20]):
    ax.scatter(0, 0,c='white')
    ax.set_title(x,fontsize=30)
    ax.yaxis.set_major_locator(plt.NullLocator())
    ax.xaxis.set_major_locator(plt.NullLocator())

# Lay out once, after all panels exist, instead of on every loop iteration.
fig.tight_layout()

fig.savefig(save_path + 'figures/SupFigure2/featureplot.title.pdf')