In [22]:
import warnings
warnings.filterwarnings('ignore')
import sys

sys.path.append('/data/yosef2/users/chenling/scVI/')
import scvi
print(scvi.__version__)

import os
os.getcwd()

import logging
import os
import pickle

from scipy import sparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from hyperopt import hp

from scvi.inference import UnsupervisedTrainer, AlternateSemiSupervisedTrainer, SemiSupervisedTrainer
from scvi.models import VAE, SCANVI

from scvi.dataset.anndataset import AnnDatasetFromAnnData
import scanpy as sc
from anndata import read_h5ad
from anndata import AnnData

import matplotlib

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
%matplotlib inline

# Default number of training epochs. The training cell further down assigns
# n_epochs again before it is passed to trainer.train().
n_epochs = 150
# Directory prefix used to build the model checkpoint path in the training cell.
save_path = '../data/'
from copy import deepcopy

0.5.0


In [3]:
from os import path

In [4]:
# Load one pre-processed AnnData object per tissue from the current working
# directory. A tissue whose '<tissue>.scVI.h5ad' file is absent is still skipped
# (best-effort), but it is now recorded in `skipped_tissues` and reported, so a
# shorter-than-expected `all_data` list is never a silent surprise downstream.
all_data = []
skipped_tissues = []
for tissue in ['muscle', 'exopancreas', 'endopancreas', 'bladder', 'lung']:
    filename = '%s.scVI.h5ad' % tissue
    if path.isfile(filename):
        all_data.append(read_h5ad(filename))
    else:
        skipped_tissues.append(tissue)

if skipped_tissues:
    # The notebook silences the `warnings` module globally, so report via logging.
    logging.warning('No .scVI.h5ad file found for: %s', ', '.join(skipped_tissues))
        
            

In [37]:
# Merge the per-tissue objects into one: the first object is concatenated with
# all remaining ones. Raises IndexError if no tissue file was loaded (empty list).
temp = all_data[0].concatenate(all_data[1:])

In [38]:
# Display the dimensions of the merged per-tissue object.
temp.shape

(55837, 21301)

In [49]:
# Load the two blood datasets (file names suggest a 10X and a FACS pilot
# dataset — confirm), merge them into one object, and label the merged cells
# with the 'Immune' compartment.
temp_path = '../../AnnotationsRound1/data/'
tenx = read_h5ad(temp_path + 'tabula-sapiens-10X-pilot-filtered-%s.h5ad'%'blood')
ss2 = read_h5ad(temp_path + 'tabula-sapiens-facs-pilot-filtered-%s.h5ad'%'Blood')
blood = tenx.concatenate(ss2)
# A single string is assigned to the whole 'compartments' column.
blood.obs['compartments']='Immune'

In [85]:
# Combine the blood object with the per-tissue objects in `all_data`; `adata` is
# the object used by the dataset construction and marker-gene cells below.
adata = blood.concatenate(all_data)

In [51]:
# Hand-written tissue list. NOTE: a later cell reassigns batch_names from
# np.unique(adata.obs['tissue']), so this value does not survive a top-to-bottom run.
batch_names = [ 'blood','muscle','exopancreas','endopancreas','bladder','lung']

In [52]:
# Display the distinct compartment labels together with the number of cells in each.
np.unique(adata.obs['compartments'],return_counts=True)

(array(['Endothelial', 'Epithelial', 'Immune', 'Muscle', 'PNS', 'Pancreas',
        'Stromal', 'mixed'], dtype=object),
 array([ 2594, 14180, 31386,  1069,    41,   253,  7284,  8590]))

In [87]:
# Encode each cell's tissue as an integer: np.unique returns the sorted unique
# tissue names plus, for every cell, the index of its tissue within that array.
tissue_names, tissue_codes = np.unique(adata.obs['tissue'], return_inverse=True)
batch_names = tissue_names
adata.obs['batchid'] = tissue_codes

In [92]:
# Wrap the combined AnnData object as an scVI dataset, taking the batch label
# from the integer 'batchid' column created from the tissue names.
gene_dataset = AnnDatasetFromAnnData(adata, batch_label='batchid')

[2019-12-10 06:27:27,529] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2019-12-10 06:27:27,533] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2019-12-10 06:27:27,894] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2019-12-10 06:27:28,236] INFO - scvi.dataset.dataset | Downsampled from 65397 to 65397 cells


In [None]:
# Train (or reload) an scVI VAE on the combined dataset.
# Hyper-parameters for this run; n_epochs here replaces the value set in the first cell.
n_epochs = 100
lr = 1e-3
use_batches = False  # multiplied into n_batch below, so False passes n_batch=0 to the VAE
use_cuda = True

vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches)
trainer = UnsupervisedTrainer(
    vae,
    gene_dataset,
    train_size=0.75,
    use_cuda=use_cuda,
    frequency=5,
    data_loader_kwargs={"pin_memory": False},
)

# Single source of truth for the checkpoint location. Previously the existence
# check looked for 'vae.pkl' while loading and saving used 'all.vae.pkl', so the
# saved checkpoint was never detected (or a load was attempted on an unchecked file).
model_path = os.path.join(save_path, 'all.vae.pkl')

if os.path.isfile(model_path):
    # Reuse cached weights. No training runs in this branch, so the ELBO history
    # plotted in the next cell will presumably be empty — TODO confirm.
    trainer.model.load_state_dict(torch.load(model_path))
    trainer.model.eval()
else:
    trainer.train(n_epochs=n_epochs, lr=lr)
    torch.save(trainer.model.state_dict(), model_path)

HBox(children=(IntProgress(value=0, description='training', style=ProgressStyle(description_width='initial')),…

In [None]:
# Plot the ELBO values recorded by the trainer for the train and test splits.
elbo_train_set = trainer.history["elbo_train_set"]
elbo_test_set = trainer.history["elbo_test_set"]
# Spread the recorded values over the configured number of epochs instead of a
# hard-coded 100, so the axis stays correct when n_epochs is changed.
x = np.linspace(0, n_epochs, len(elbo_train_set))
# The first three recorded points are left out of the plot (presumably because
# early values dominate the y-scale — confirm).
plt.plot(x[3:], elbo_train_set[3:], label="train")
plt.plot(x[3:], elbo_test_set[3:], label="test")
plt.xlabel("epoch")
plt.ylabel("ELBO")
plt.legend()
plt.show()

In [None]:
# Build a posterior over every cell of the dataset (indices 0..len-1) from the
# trained model, then extract the latent representation together with the batch
# indices and labels returned alongside it.
full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
# Flatten batch_indices to one dimension.
batch_indices = batch_indices.ravel()

In [None]:
# Model-derived expression values for every cell, read from the posterior in
# sequential order. NOTE(review): exact shape/semantics of the two outputs
# depend on the scvi version — confirm against the scvi Posterior documentation.
imputed_values = full.sequential().imputation()
normalized_values = full.sequential().get_sample_scale()

In [None]:
# One-vs-all differential expression per label, then a genes x clusters table of
# log10(1 + raw_normalized_mean1) restricted to the top-10 rows of each cluster.
per_cluster_de, cluster_id = full.one_vs_all_degenes(
    cell_labels=gene_dataset.labels.ravel(),
    min_cells=1,
)

# First ten rows of every per-cluster result, stacked into one frame.
markers = pd.concat([cluster_de[:10] for cluster_de in per_cluster_de])

genes = np.asarray(markers.index)
marker_means = [
    cluster_de.filter(items=genes, axis=0)['raw_normalized_mean1']
    for cluster_de in per_cluster_de
]
expression = np.log10(1 + pd.concat(marker_means, axis=1))
# Columns are named after the dataset's cell types.
expression.columns = gene_dataset.cell_types

In [None]:
# Rank genes for each 'compartments' group with a Wilcoxon test on adata.X
# (use_raw=False) and store the result under adata.uns['compartment_markers'].
# n_genes is set to adata.shape[1], i.e. the size of the second dimension of adata.
# NOTE(review): scanpy documents rank_genes_groups as expecting logarithmized
# data — confirm adata.X has been log-transformed before trusting the scores.
sc.tl.rank_genes_groups(
    adata,
    "compartments",
    method="wilcoxon",
    use_raw=False,
    key_added="compartment_markers",
    n_genes=adata.shape[1]
)
# Plot the ranking stored under that key, with an independent y-axis per panel.
sc.pl.rank_genes_groups(
    adata, key="compartment_markers", sharey=False
)

In [57]:
# Flatten the stored ranking into one table with three columns per group:
# '<group>_n' (names), '<group>_s' (scores) and '<group>_p' (pvals).
result = adata.uns['compartment_markers']
groups = result['names'].dtype.names

marker_columns = {}
for group_name in groups:
    for field in ('names', 'scores', 'pvals'):
        # The column suffix is the first letter of the field name.
        marker_columns[group_name + '_' + field[:1]] = result[field][group_name]

comp_genes_scores = pd.DataFrame(marker_columns)


In [60]:
# First ten gene names in the Endothelial ranking column.
comp_genes_scores['Endothelial_n'].head(10).values

array(['IFI27', 'CLDN5', 'CAV1', 'IFITM3', 'RAMP2', 'EPAS1', 'CLEC14A',
       'GNG11', 'SPARCL1', 'AQP1'], dtype=object)

In [61]:
# First ten gene names in the Epithelial ranking column.
comp_genes_scores['Epithelial_n'].head(10).values

array(['KRT18', 'KRT8', 'SPINK1', 'CD24', 'EPCAM', 'PLA2G1B', 'DSP',
       'PRSS1', 'CLPS', 'CPA1'], dtype=object)

In [63]:
# First ten gene names in the Immune ranking column.
comp_genes_scores['Immune_n'].head(10).values

array(['PTPRC', 'SRGN', 'B2M', 'LAPTM5', 'TMSB4X', 'HLA-B', 'FTL',
       'S100A4', 'FTH1', 'GMFG'], dtype=object)

In [62]:
# First ten gene names in the Stromal ranking column.
comp_genes_scores['Stromal_n'].head(10).values

array(['MGP', 'COL6A2', 'DCN', 'CALD1', 'C1S', 'COL1A2', 'C1R', 'COL6A1',
       'GPX3', 'MFAP4'], dtype=object)

In [64]:
# First ten gene names in the PNS ranking column.
comp_genes_scores['PNS_n'].head(10).values

array(['CRYAB', 'PMP22', 'GPM6B', 'FXYD1', 'NRXN1', 'IGFBP7', 'MAL',
       'SPARC', 'PLP1', 'CD9'], dtype=object)

In [65]:
# First ten gene names in the Pancreas ranking column.
comp_genes_scores['Pancreas_n'].head(10).values

array(['PRSS1', 'CPB1', 'CELA3A', 'CTRB1', 'CELA3B', 'PNLIP', 'CLPS',
       'CPA1', 'REG1A', 'CTRC'], dtype=object)

In [73]:
# First 30 gene names of the Epithelial ranking, as an array.
epithelium = comp_genes_scores['Epithelial_n'].head(30).values

In [79]:
# First 100 gene names of the Pancreas ranking, as an array.
pancreas = comp_genes_scores['Pancreas_n'].head(100).values

In [81]:
# Epithelial genes that do not appear among the selected Pancreas genes,
# printed in their original (epithelial ranking) order.
pancreas_gene_set = set(pancreas)
epithelial_only = [gene for gene in epithelium if gene not in pancreas_gene_set]
print(epithelial_only)

['KRT18', 'KRT8', 'EPCAM', 'DSP', 'ELF3', 'CDH1', 'CYB5A', 'FAM3B', 'SH3YL1', 'SMIM22']
