In [None]:
import sys
import matplotlib
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from datetime import datetime

import numpy as np
import numpy.random as random
import pandas as pd
import scanpy as sc
import louvain
import torch

from scvi.dataset import Dataset10X, CsvDataset, AnnDatasetFromAnnData, CellMeasurement, LoomDataset, DownloadableAnnDataset
from scvi.dataset.dataset import GeneExpressionDataset
from scvi.inference import TotalPosterior, TotalTrainer, load_posterior
from scvi.models import SCANVI, TOTALVI
from scvi import set_seed

from umap import UMAP

# Control UMAP numba warnings
import warnings; warnings.simplefilter('ignore')

%matplotlib inline

# Run-wide switches.
use_cuda = True
show_plot = True
test_mode = False

# Root folder; the input/output paths in the cells below are built by appending to it.
sampleFolder = "/PATH/TO/"

# Fix the random seed through scvi's set_seed helper.
set_seed(123)

# Load data

### Load Sample1

In [None]:
### Read RNA
# Sample1 RNA data, loaded with scvi's Dataset10X reader from <sampleFolder>/Sample1/.
dataset1 = Dataset10X(save_path=sampleFolder+"Sample1/", measurement_names_column=0)
dataset1

In [None]:
### Read ADT
# Sample1 antibody (ADT) table, read from CSV.
adt1_csv = sampleFolder+"PATH/TO/rawDataADT1.csv"
datasetADT1 = CsvDataset(adt1_csv)
datasetADT1

In [None]:
### Finish dataset
# Attach the Sample1 ADT matrix to the RNA dataset as the "protein_expression" measurement;
# the ADT dataset's gene_names become the protein names.
protein_data = CellMeasurement(
    name="protein_expression",
    columns_attr_name="protein_names",
    columns=datasetADT1.gene_names,
    data=datasetADT1.X.astype(np.float32),
)
dataset1.initialize_cell_measurement(protein_data)

dataset1

### Load Sample2

In [None]:
### Read RNA
# Sample2 RNA data, loaded with scvi's Dataset10X reader from <sampleFolder>/Sample2/.
dataset2 = Dataset10X(save_path=sampleFolder+"Sample2/", measurement_names_column=0)
dataset2

In [None]:
### Read ADT
# Sample2 antibody (ADT) table, read from CSV.
adt2_csv = sampleFolder+"PATH/TO/rawDataADT2.csv"
datasetADT2 = CsvDataset(adt2_csv)
datasetADT2

In [None]:
### Finish dataset
# Attach the Sample2 ADT matrix to the RNA dataset as the "protein_expression" measurement.
# Note that this rebinds protein_data, which previously held the Sample1 measurement.
protein_data = CellMeasurement(
    name="protein_expression",
    columns_attr_name="protein_names",
    columns=datasetADT2.gene_names,
    data=datasetADT2.X.astype(np.float32),
)
dataset2.initialize_cell_measurement(protein_data)

dataset2

### Merge

In [None]:
### Concatenate datasets - via intersect
# Start from an empty container and fill it from both samples (Sample1 first, then Sample2).
samples_to_merge = [dataset1, dataset2]
all_dataset = GeneExpressionDataset()
all_dataset.populate_from_datasets(samples_to_merge)

In [None]:
### Check out merged dataset
all_dataset

In [None]:
### Save before taking HVG
import pickle
# The context manager closes (and flushes) the file even if pickling fails part-way.
with open(sampleFolder+"PATH/TO/all_dataset.pkl", "wb") as fh:
    pickle.dump(all_dataset, fh)

### HVG (4000 genes)

In [None]:
### Get HVGenes
# Reduces all_dataset in place to 4000 genes ("seurat_v2" mode, batch_correction enabled).
all_dataset.subsample_genes(
    4000,
    mode="seurat_v2",
    batch_correction=True,
)
all_dataset

### Save merged dataset after taking HVG

In [None]:
# os is not imported anywhere else in the notebook; import it here so the cell runs on a fresh kernel.
import os

# Show the working directory and the configured sample folder before writing files.
print(os.getcwd())
print(sampleFolder)

In [None]:
### Save
import pickle
# The context manager closes (and flushes) the file even if pickling fails part-way.
with open(sampleFolder+"PATH/TO/HVG_dataset.pkl", "wb") as fh:
    pickle.dump(all_dataset, fh)

### Reload merged dataset

In [None]:
### Load data again
import pickle
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(sampleFolder+"PATH/TO/HVG_dataset.pkl", "rb") as fh:
    all_dataset = pickle.load(fh)

In [None]:
all_dataset

In [None]:
### Load full dataset again
import pickle
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(sampleFolder+"PATH/TO/all_dataset.pkl", "rb") as fh:
    all_dataset_full = pickle.load(fh)

In [None]:
all_dataset_full

# Train model

In [None]:
### Initialize trainer
use_cuda = True
lr = 0.001
n_epochs = 400

# Explicit early-stopping settings, kept for reference only; the trainer below passes
# early_stopping_kwargs="auto" instead.
# NOTE(review): these values look like the defaults that "auto" selects - confirm against the installed scvi.
# early_stopping_kwargs = {
#     "early_stopping_metric": "elbo",
#     "save_best_state_metric": "elbo",
#     "patience": 45,
#     "threshold": 0,
#     "reduce_lr_on_plateau": True,
#     "lr_patience": 30,
#     "lr_factor": 0.6,
#     "posterior_class": TotalPosterior,
# }

# batch_mask was used below without ever being defined (NameError on a fresh kernel).
# NOTE(review): assumes the scvi 0.6 helper get_batch_mask_cite_seq(), which is expected to
# return one mask per batch over the protein columns - confirm for the installed version.
batch_mask = all_dataset.get_batch_mask_cite_seq()

totalvae=TOTALVI(all_dataset.nb_genes, len(all_dataset.protein_names), 
                 n_batch=all_dataset.n_batches, n_latent=20, 
                 encoder_batch=True, protein_batch_mask=batch_mask)

### Prepare trainer
# 90/10 train/test split; the adversarial loss is only switched on when there is more than one batch.
trainer = TotalTrainer(
    totalvae,
    all_dataset,
    train_size=0.90,
    test_size=0.10,
    use_cuda=use_cuda,
    frequency=1,
    batch_size=256,
    early_stopping_kwargs="auto",
    use_adversarial_loss=bool(all_dataset.n_batches > 1)
)

In [None]:
### Do training
# Wall-clock stamps are printed before and after so the training duration shows in the output.
time_format = "%H:%M:%S"
print("Start =", datetime.now().strftime(time_format))
trainer.train(lr=lr, n_epochs=n_epochs)
print("End =", datetime.now().strftime(time_format))

In [None]:
### Plotting likelihood
# One curve per data split, read from the trainer's history.
for history_key, curve_label in (("elbo_train_set", "train"), ("elbo_test_set", "test")):
    plt.plot(trainer.history[history_key], label=curve_label)
plt.title("Negative ELBO over training epochs")
# Fixed y-window; values outside 1000-1500 are not visible.
plt.ylim(1000,1500)
plt.legend()

# Get full posterior

In [None]:
### Get full posterior
# Posterior over every cell index of the dataset.
all_cell_indices = np.arange(len(all_dataset))
full_posterior = trainer.create_posterior(
    totalvae,
    all_dataset,
    indices=all_cell_indices,
    type_class=TotalPosterior,
)
# Rebind to the object returned by update() so that batch_size=32 applies from here on.
full_posterior = full_posterior.update({"batch_size":32})

### Extract latent space
latent, batch_indices, label, library_gene = full_posterior.sequential().get_latent()
# Flatten the batch indices to 1-D for display.
batch_indices = batch_indices.ravel()
batch_indices

In [None]:
latent.shape

# Save

In [None]:
# os is not imported anywhere else in the notebook; import it here so the cell runs on a fresh kernel.
import os

# Show the working directory and the configured sample folder before writing files.
print(os.getcwd())
print(sampleFolder)

In [None]:
### Save
import pickle
# The context manager closes (and flushes) the file even if pickling fails part-way.
with open(sampleFolder+"PATH/TO/fullPosterior.pkl", "wb") as fh:
    pickle.dump(full_posterior, fh)

# Reload

In [None]:
### Load data again
import pickle
# Read back the file written by the save cell. The previous path ("results_TotalVI/...")
# did not match the location the save cell writes to ("PATH/TO/...").
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(sampleFolder+"PATH/TO/fullPosterior.pkl", "rb") as fh:
    full_posterior = pickle.load(fh)

In [None]:
# Re-extract the latent space from the reloaded posterior.
latent, batch_indices, label, library_gene = full_posterior.sequential().get_latent()
# Flatten, as the first extraction does, so batch_indices has the same shape on both code paths.
batch_indices = batch_indices.ravel()
batch_indices

# Clustering 

In [None]:
### Create adata object
# AnnData built on the HVG matrix; the totalVI latent space is stored in obsm
# and the gene names become the var index.
post_adata = sc.AnnData(X=all_dataset.X)
post_adata.obsm["X_totalVI"] = latent
post_adata.var.index = all_dataset.gene_names

In [None]:
post_adata.obsm["X_totalVI"].shape

In [None]:
### Cell counts per batch index (the equivalent of R's table())
batch_ids, batch_counts = np.unique(all_dataset.batch_indices, return_counts=True)
# Rows are (batch index, number of cells).
print(np.array((batch_ids, batch_counts)).T)

In [None]:
### Run umap
# The neighbour graph is built on the totalVI latent representation stored in obsm.
sc.pp.neighbors(
    post_adata,
    use_rep="X_totalVI",
    n_neighbors=30,
    metric="correlation",
)
# min_dist: smaller values give tighter, more clumped embeddings; larger values spread the points out.
sc.tl.umap(post_adata, min_dist=0.3)

In [None]:
### Run clustering
sc.tl.louvain(post_adata, key_added="louvain", resolution=1.2)

### Prepare for umap

In [None]:
### Add sample name
# Position in d_names = batch index (Sample1 was merged first, Sample2 second).
d_names = ["Sample1","Sample2"]
# Flatten first: calling int() on a 1-element array row is deprecated in recent NumPy.
# np.ravel is a no-op on an already flat array, so either layout is handled.
# NOTE(review): assumes batch_indices holds one integer batch id per cell - confirm.
post_adata.obs["sample"] = [d_names[int(b)] for b in np.ravel(all_dataset.batch_indices)]

In [None]:
inds = np.random.permutation(np.arange(all_dataset.X.shape[0]))

In [None]:
print(all_dataset.X.shape)
print(inds.shape)

In [None]:
### Look at metaData
post_adata.obs

In [None]:
### Create umap
# Cluster labels are drawn on the embedding; the figure handle is kept so it can be saved later.
figUmap = sc.pl.umap(
    post_adata,
    color="louvain",
    legend_loc="on data",  # alternative: "right margin"
    alpha=0.9,
    ncols=1,
    return_fig=True,
)

In [None]:
### Create umap split
# post_adata is indexed with the random permutation `inds`, so cells are passed to the
# plot in shuffled order; colour is by sample.
shuffled_adata = post_adata[inds]
figUmapSplitSample = sc.pl.umap(
    shuffled_adata,
    color=["sample"],
    alpha=0.9,
    ncols=1,
    return_fig=True,
)

In [None]:
### Save plots
# Both figures share the same export settings.
savefig_kwargs = {"dpi": 200, "bbox_inches": 'tight'}
figUmap.savefig(sampleFolder+"PATH/TO/umap.png", **savefig_kwargs)
figUmapSplitSample.savefig(sampleFolder+"PATH/TO/umapSplitSample.png", **savefig_kwargs)

# Get denoised data

In [None]:
# Number of Monte Carlo samples to average over
n_samples = 15
parsed_protein_names = all_dataset.protein_names.tolist()

# Probability of background for each (cell, protein)
py_mixing = full_posterior.sequential().get_sample_mixing(
    n_samples=n_samples,
    give_mean=True,
)
# Foreground probability is the complement of the background probability.
protein_foreground_prob = pd.DataFrame(1 - py_mixing, columns=parsed_protein_names)

In [None]:
### Denoised genes, denoised proteins
# Normalised, denoised expression from the posterior, requested with transform_batch=[0,1].
denoised_genes, denoised_proteins = full_posterior.sequential().get_normalized_denoised_expression(
    n_samples=n_samples,
    give_mean=True,
    transform_batch=[0,1],
)
# Print both shapes as a sanity check.
for denoised in (denoised_genes, denoised_proteins):
    print(denoised.shape)

In [None]:
len(parsed_protein_names)

In [None]:
### Get raw values
combined_protein = all_dataset.protein_expression

In [None]:
print(combined_protein.shape)
print(protein_foreground_prob.shape)
print(denoised_proteins.shape)

In [None]:
### Add foreground probability and observed protein values to post_adata (via obs)
# Two obs columns per protein: "<name>_fore_prob" and "<name>_observed".
for i, p in enumerate(parsed_protein_names):
    fore_prob_values = protein_foreground_prob[p].values
    observed_values = combined_protein[:, i]
    post_adata.obs["{}_fore_prob".format(p)] = fore_prob_values
    post_adata.obs["{}_observed".format(p)] = observed_values

In [None]:
### Add normalised protein values to post_adata (via obs)
# One obs column per protein, named exactly like the protein, holding its denoised values.
for i, p in enumerate(parsed_protein_names):
    denoised_column = denoised_proteins[:, i]
    post_adata.obs["{}".format(p)] = denoised_column

In [None]:
### Add normalised gene values to post_adata (via layer)
post_adata.layers["norm_genes"] = denoised_genes

In [None]:
post_adata.obs

In [None]:
post_adata.layers["norm_genes"]

# Get normalised data for the full dataset

In [None]:
all_dataset_full

In [None]:
post_adata_full = sc.AnnData(X=all_dataset_full.X)
post_adata_full.var.index = all_dataset_full.gene_names

In [None]:
sc.pp.normalize_total(post_adata_full, target_sum=1e4, exclude_highly_expressed=True)

In [None]:
sc.pp.log1p(post_adata_full)

In [None]:
post_adata_full.layers["rawDataFull"] = all_dataset_full.X

In [None]:
post_adata_full.obsm["X_totalVI"] = latent
post_adata_full.obsm['X_umap']=post_adata.obsm['X_umap']

In [None]:
print(post_adata_full.layers["rawDataFull"].shape)
print(post_adata_full.X.shape)
print(post_adata.layers["norm_genes"].shape)

In [None]:
##### Prepare for saving #####
normData=post_adata_full.X
normData.shape

In [None]:
### Save
import scipy.io
scipy.io.mmwrite(sampleFolder+"PATH/TO/normData.mtx", normData)

# Plot Genes

In [None]:
### For example
geneSymbol='ALB'

In [None]:
### Plot gene expression - raw data
sc.pl.umap(
    post_adata_full,
    color=geneSymbol,
    show=True,
    layer="rawDataFull",
    vmax='p99'
)

In [None]:
### Plot gene expression - normData full
sc.pl.umap(
    post_adata_full,
    color=geneSymbol,
    show=True,
    vmax='p99'
)

In [None]:
### Plot gene expression - denoised genes
sc.pl.umap(
    post_adata,
    color=geneSymbol,
    show=True,
    layer='norm_genes',
    vmax='p99'
)

# Plot antibodies (ABs)

In [None]:
### Plot expression of each AB
# One UMAP per antibody, coloured by the obs column of the same name, saved as plot_<name>.png.
for i, ABname in enumerate(parsed_protein_names):
    print(i)
    ABplot=sc.pl.umap(
        post_adata,
        color=ABname,
        show=False,
        vmax="p99",
        cmap="coolwarm",
        alpha=0.9,
        return_fig=True
    )
    fileName='plot_'+ABname+'.png'
    ABplot.savefig(sampleFolder+"PATH/TO/"+fileName, dpi=200, bbox_inches='tight')
    # Release the figure once it is on disk; otherwise one open figure per antibody
    # accumulates until the loop ends. The plots are therefore written to file and not
    # rendered inline.
    plt.close(ABplot)

In [None]:
### Plot AB expression - observed
sc.pl.umap(
    post_adata,
    color=['adt-CLEC4F.1_observed'],
    show=True,
    vmax="p99",
    cmap="bwr",
    alpha=0.9
)

In [None]:
### Plot AB expression - foreground probability
sc.pl.umap(
    post_adata,
    color=['adt-CLEC4F.1_fore_prob'],
    show=True,
    vmax="p99",
    cmap="bwr",
    alpha=0.9
)

# Save post_adata

In [None]:
# Store the two annotation columns with plain dtypes (sample as str, louvain as int) before saving.
for obs_column, plain_dtype in (("sample", "str"), ("louvain", "int")):
    post_adata.obs[obs_column] = post_adata.obs[obs_column].astype(plain_dtype)

In [None]:
### Save
import pickle
# The context manager closes (and flushes) the file even if pickling fails part-way.
with open(sampleFolder+"PATH/TO/post_adata.pkl", "wb") as fh:
    pickle.dump(post_adata, fh)

# Reload

In [None]:
### Load data again
import pickle
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(sampleFolder+"PATH/TO/post_adata.pkl", "rb") as fh:
    post_adata = pickle.load(fh)

In [None]:
# After reloading, turn both annotation columns back into categoricals.
for obs_column in ('sample', 'louvain'):
    post_adata.obs[obs_column] = post_adata.obs[obs_column].astype('category')

In [None]:
post_adata.obs

In [None]:
### Create umap
sc.pl.umap(
    post_adata, 
    color="louvain",
    ncols=1,
    alpha=0.9,
    legend_loc="on data"
)

# DE genes/proteins per cluster

In [None]:
### Get clusters
# Louvain labels as an integer array, one entry per cell.
louvain_labels = post_adata.obs.louvain.values
clusters = louvain_labels.astype(int)
print(clusters.shape)

In [None]:
np.array(np.unique(clusters, return_counts=True)).T

In [None]:
### Calculate markers for each cluster
# One-vs-all differential expression per cluster label, in "change" mode with delta=0.2.
de_settings = dict(
    min_cells=1,
    n_samples=5000,
    use_permutation=False,
    mode="change",
    delta=0.2,
)
per_cluster_de, cluster_id = full_posterior.one_vs_all_degenes(cell_labels=clusters, **de_settings)

In [None]:
allGenes_rna = []
allABs_adt = []
# Split every cluster's DE table (sorted by descending lfc_median) into antibody rows,
# i.e. index names containing 'adt', and the remaining RNA rows.
for i, cid in enumerate(cluster_id):
    pcd = per_cluster_de[i].sort_values("lfc_median", ascending=False)
    pro_rows = pcd.index.str.contains('adt')

    data_pro = pcd.iloc[pro_rows]
    data_rna = pcd.iloc[~pro_rows]

    allABs_adt.append(data_pro)
    allGenes_rna.append(data_rna)

# Unfiltered per-cluster tables stacked into one frame each.
allABsTable = pd.concat(allABs_adt)
allGenesTable = pd.concat(allGenes_rna)

## DE genes

In [None]:
### Get DE genes: function
def getDEgenes_perCluster(per_cluster_de, cluster_id):
    """Collect the filtered RNA marker rows of every cluster into one table.

    Parameters
    ----------
    per_cluster_de
        Sequence of DataFrames, one per cluster, indexed by feature name. The columns read
        here are ``lfc_median``, ``lfc_mean``, ``bayes_factor``, ``non_zeros_proportion1``,
        ``raw_normalized_mean1``, ``raw_normalized_mean2`` and ``clusters``.
    cluster_id
        Iterable with one entry per cluster. Only its length matters; the tables are taken
        from ``per_cluster_de`` by position.

    Returns
    -------
    pandas.DataFrame
        Rows that pass all filters, with an added ``score`` column, sorted by ``clusters``
        (ascending) and then ``lfc_median`` (descending).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``cluster_id`` is empty.
    """
    filtered_rna = []
    for i, _ in enumerate(cluster_id):
        pcd = per_cluster_de[i].sort_values("lfc_median", ascending=False)

        # Keep features whose median log fold change exceeds 0.5.
        pcd = pcd[pcd.lfc_median > 0.5]

        # Rows whose index contains 'adt' are antibodies; keep the others (RNA).
        pro_rows = pcd.index.str.contains('adt')
        data_rna = pcd.iloc[~pro_rows]
        data_rna = data_rna[data_rna["bayes_factor"] > 1]
        data_rna = data_rna[data_rna["non_zeros_proportion1"] > 0.30]

        # assign() returns a new frame, so the score column is never written into a
        # filtered slice (which triggers pandas' SettingWithCopyWarning).
        data_rna = data_rna.assign(
            score=data_rna["raw_normalized_mean1"]/data_rna["raw_normalized_mean2"]*data_rna["lfc_mean"]
        )

        filtered_rna.append(data_rna)

    toReturn=pd.concat(filtered_rna)
    toReturn=toReturn.sort_values(['clusters', 'lfc_median'], ascending=[True, False])
    return(toReturn)

In [None]:
### Get DE genes
clustermarkers = getDEgenes_perCluster(per_cluster_de, cluster_id)
# Columns of interest shown in the overview below.
columns_oi = [
    'proba_de',
    'bayes_factor',
    'lfc_mean',
    'lfc_median',
    'raw_normalized_mean1',
    'raw_normalized_mean2',
    'clusters',
    'score',
]
clustermarkers[columns_oi]

In [None]:
clustermarkers.shape

In [None]:
np.array(np.unique(clustermarkers['clusters'], return_counts=True)).T

In [None]:
### Get markers of certain cluster
tmp=clustermarkers[clustermarkers['clusters']==1]
tmp[columns_oi]

### Write to Excel

In [None]:
import xlsxwriter
# Create a Pandas Excel writer using XlsxWriter as the engine.
fileName = sampleFolder+'PATH/TO/DEgenes_totalVI.xlsx'

clusterIDs=np.array(np.unique(clustermarkers.clusters))
# The context manager writes and closes the workbook on exit; ExcelWriter.save() is
# deprecated and no longer available in pandas 2.
with pd.ExcelWriter(fileName, engine='xlsxwriter') as writer:
    # One sheet per cluster, named "cl<cluster id>".
    for clusterID in clusterIDs:
        tmp=clustermarkers[clustermarkers.clusters==clusterID]
        tmp.to_excel(writer, sheet_name='cl'+str(clusterID))

## DE proteins

In [None]:
### Get DE proteins: function
def getDEproteins_perCluster(per_cluster_de, cluster_id):
    """Collect the filtered antibody (protein) marker rows of every cluster into one table.

    Parameters
    ----------
    per_cluster_de
        Sequence of DataFrames, one per cluster, indexed by feature name. The columns read
        here are ``lfc_median``, ``lfc_mean``, ``bayes_factor``, ``raw_mean1``,
        ``raw_mean2`` and ``clusters``.
    cluster_id
        Iterable with one entry per cluster. Only its length matters; the tables are taken
        from ``per_cluster_de`` by position.

    Returns
    -------
    pandas.DataFrame
        Rows that pass all filters, with an added ``score`` column, sorted by ``clusters``
        (ascending) and then ``lfc_median`` (descending).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``cluster_id`` is empty.
    """
    filtered_pro = []
    for i, _ in enumerate(cluster_id):
        pcd = per_cluster_de[i].sort_values("lfc_median", ascending=False)

        # Keep features whose median log fold change exceeds 0.5.
        pcd = pcd[pcd.lfc_median > 0.5]

        # Rows whose index contains 'adt' are the antibodies.
        pro_rows = pcd.index.str.contains('adt')
        data_pro = pcd.iloc[pro_rows]
        data_pro = data_pro[data_pro["bayes_factor"] > 0.7]

        # assign() returns a new frame, so the score column is never written into a
        # filtered slice (which triggers pandas' SettingWithCopyWarning).
        data_pro = data_pro.assign(
            score=data_pro["raw_mean1"]/data_pro["raw_mean2"]*data_pro["lfc_mean"]
        )

        filtered_pro.append(data_pro)

    toReturn=pd.concat(filtered_pro)
    toReturn=toReturn.sort_values(['clusters', 'lfc_median'], ascending=[True, False])
    return(toReturn)

In [None]:
### Get DE proteins: function
def getDEproteins_perCluster_lessStrict(per_cluster_de, cluster_id):
    """Collect antibody (protein) marker rows per cluster, filtered on proba_de and score.

    Unlike ``getDEproteins_perCluster`` this variant does not filter on ``lfc_median`` or
    ``bayes_factor``; it keeps rows with ``proba_de > 0.05`` and ``score >= 1``.

    Parameters
    ----------
    per_cluster_de
        Sequence of DataFrames, one per cluster, indexed by feature name. The columns read
        here are ``lfc_median``, ``lfc_mean``, ``proba_de``, ``raw_mean1``, ``raw_mean2``
        and ``clusters``.
    cluster_id
        Iterable with one entry per cluster. Only its length matters; the tables are taken
        from ``per_cluster_de`` by position.

    Returns
    -------
    pandas.DataFrame
        Rows that pass all filters, with an added ``score`` column, sorted by ``clusters``
        (ascending) and then ``lfc_median`` (descending).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``cluster_id`` is empty.
    """
    filtered_pro = []
    for i, _ in enumerate(cluster_id):
        pcd = per_cluster_de[i].sort_values("lfc_median", ascending=False)

        pcd = pcd[pcd.proba_de > 0.05]

        # Rows whose index contains 'adt' are the antibodies.
        pro_rows = pcd.index.str.contains('adt')
        data_pro = pcd.iloc[pro_rows]

        # assign() returns a new frame, so the score column is never written into a
        # filtered slice (which triggers pandas' SettingWithCopyWarning).
        data_pro = data_pro.assign(
            score=data_pro["raw_mean1"]/data_pro["raw_mean2"]*data_pro["lfc_mean"]
        )
        data_pro = data_pro[data_pro.score >= 1]

        filtered_pro.append(data_pro)

    toReturn=pd.concat(filtered_pro)
    toReturn=toReturn.sort_values(['clusters', 'lfc_median'], ascending=[True, False])
    return(toReturn)

In [None]:
### Get DE proteins
# Uses the less strict variant (proba_de and score filters).
clusterproteins = getDEproteins_perCluster_lessStrict(per_cluster_de, cluster_id)
# Columns of interest shown in the overview below; this rebinds columns_oi from the gene section.
columns_oi = [
    'proba_de',
    'bayes_factor',
    'lfc_mean',
    'lfc_median',
    'raw_mean1',
    'raw_mean2',
    'clusters',
    'score',
]
clusterproteins[columns_oi]

In [None]:
np.array(np.unique(clusterproteins['clusters'], return_counts=True)).T

In [None]:
### Get proteins of certain cluster
tmp=clusterproteins[clusterproteins['clusters']==1]
tmp[columns_oi]

### Write to Excel

In [None]:
import xlsxwriter
# Create a Pandas Excel writer using XlsxWriter as the engine.
fileName = sampleFolder+'PATH/TO/DEantibodies_totalVI.xlsx'

clusterIDs=np.array(np.unique(clusterproteins.clusters))
# The context manager writes and closes the workbook on exit; ExcelWriter.save() is
# deprecated and no longer available in pandas 2.
with pd.ExcelWriter(fileName, engine='xlsxwriter') as writer:
    # One sheet per cluster, named "cl<cluster id>".
    for clusterID in clusterIDs:
        tmp=clusterproteins[clusterproteins.clusters==clusterID]
        tmp.to_excel(writer, sheet_name='cl'+str(clusterID))