# Loading and preparing data 

In [None]:
import scvi
import scanpy as sc

import pandas as pd
import numpy as np
import os

import matplotlib as mpl

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Global plotting defaults: 7x7 figures, 1200 dpi when saving, font size 12,
# no axes frame, white background.
sc.set_figure_params(
    figsize=(7, 7),
    dpi_save=1200,
    fontsize=12,
    frameon=False,
    facecolor='white',
)
# Also set the figure background to white directly in matplotlib's rcParams.
mpl.rcParams.update({'figure.facecolor': 'white'})

In [None]:
# Work from the project directory so that the relative 'data/...' and
# 'result/...' paths used in the cells below resolve against it.
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# Set the scvi-tools `num_threads` setting to 32.
scvi.settings.num_threads=32

In [None]:
# Python warnings: silence every warning category for the rest of the session.
# NOTE: this is a blanket filter, so deprecation and numerical warnings from
# the libraries used below are hidden as well.
import warnings
warnings.simplefilter('ignore')

## Set up rpy2

In [None]:
# Point R_HOME at the R installation inside the 'r.4.1.0' conda environment.
# This is set before the rpy2 imports in the following cell.
os.environ['R_HOME'] = '/nobackup/peer/fdeckert/miniconda3/envs/r.4.1.0/lib/R/'

In [None]:
import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
# Load the rpy2 IPython extension into this notebook session.
%load_ext rpy2.ipython

# Import AnnData

In [None]:
# Re-load data: read the AnnData object stored as qc.h5ad in the ANALYSIS
# folder (path is relative to the working directory set above).
adata = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/qc.h5ad')

In [None]:
# Subset genes by expression: keep a gene only if at least 3 cells have a
# value >= 1 for it in adata.X.
n_expressing_cells = (adata.X >= 1).sum(axis=0)
# Flatten to 1-D so the mask has one entry per gene whether the column sum
# came back as a plain array or as a (1, n_genes) matrix.
keep_gene = np.ravel(n_expressing_cells) >= 3
adata = adata[:, keep_gene].copy()

# Train SCVI model

In [None]:
# When True, the next cell loads the saved scVI model from scvi/run_1/ instead
# of training a new one; set to False to retrain and overwrite the saved model.
cache_scvi = True

In [None]:
# Train the first scVI model on `adata`, or restore the saved run when
# `cache_scvi` is set.
if not cache_scvi:

    # Register the batch key and the covariates the model should use.
    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key='sample_group',
        categorical_covariate_keys=['infection'],
        continuous_covariate_keys=['msS_scale_RNA', 'msG2M_scale_RNA', 'pMt_RNA'],
    )

    model = scvi.model.SCVI(
        adata,
        n_latent=30,
        n_hidden=128,
        n_layers=2,
        gene_likelihood='nb',
    )

    # Fixed epoch budget. A cell-count heuristic,
    # min(round(20000 / n_obs * 400), 400), used to be computed here and was
    # then immediately overwritten by this value, so it never took effect and
    # has been removed.
    max_epochs = 100

    # Validation is checked every epoch; the training-validation cell below
    # reads the '*_validation' entries of model.history.
    model.train(max_epochs=max_epochs, check_val_every_n_epoch=1)

    model.save('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_1/', overwrite=True)

else:

    # Restore the previously saved model and attach it to the current adata.
    model = scvi.model.SCVI.load('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_1/', adata=adata)

# Training validation 

In [None]:
# Plot model history: train vs. validation curves, one panel per metric.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# (axis, panel title, train history key, validation history key)
history_panels = (
    (axes[0], 'Reconstruction Loss', 'reconstruction_loss_train', 'reconstruction_loss_validation'),
    (axes[1], 'ELBO', 'elbo_train', 'elbo_validation'),
)

for ax, panel_title, train_key, validation_key in history_panels:
    # Each history entry is indexed again by a column of the same name.
    ax.plot(model.history[train_key][train_key], label='train')
    ax.plot(model.history[validation_key][validation_key], label='validation')
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()

# Dim reduction on latent space

In [None]:
# When True, the next cell reads the saved scvi/run_1/adata.h5ad instead of
# recomputing the latent representation, neighbours, Leiden clusters and UMAP.
cache_dim = True

In [None]:
# Neighbour graph, clustering and UMAP on the scVI latent space (run_1),
# or the saved result when `cache_dim` is set.
if cache_dim:

    # Replace adata with the saved object written by the branch below.
    adata = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_1/adata.h5ad')

else:

    # Store the model's latent representation under obsm['latent'].
    adata.obsm['latent'] = model.get_latent_representation()

    # 30-neighbour graph on the latent space, then Leiden (resolution 1.0)
    # and UMAP on that graph.
    sc.pp.neighbors(adata, use_rep='latent', n_neighbors=30)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata)

    adata.write_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_1/adata.h5ad')

In [None]:
# UMAP panels (four per row), one for each listed obs column.
sc.pl.umap(
    adata,
    color=[
        'leiden',
        'sample_group',
        'facs',
        'infection',
        'label_main_immgen',
        'label_main_haemopedia',
        'solo_label',
        'nCount_RNA',
        'nFeature_RNA',
        'pMt_RNA',
        'pRb_RNA',
        'pHb_RNA',
        'msS_scale_RNA',
        'msG2M_scale_RNA',
        'msCC_scale_RNA',
    ],
    ncols=4,
    wspace=0.5,
    size=100,
    frameon=False,
    legend_loc='on data',
)

# Add Hist module score 

In [None]:
# Read the module-score table (first CSV column is the index) and left-join it
# onto adata.obs by index, so every cell in adata.obs is kept.
# NOTE(review): cells missing from module.csv get NaN in the new columns, and
# re-running this cell adds suffixed duplicate columns - confirm both are fine.
module = pd.read_csv('result/module/module.csv', index_col=0)
adata.obs = pd.merge(
    adata.obs,
    module,
    how='left',
    left_index=True,
    right_index=True,
)

# Remove low-quality clusters

In [None]:
# Drop the cells assigned to Leiden clusters 12, 17 and 18; keep all others.
adata = adata[~adata.obs['leiden'].isin(['12', '17', '18'])].copy()

# Train SCVI model

In [None]:
# When True, the next cell loads the saved scVI model from scvi/run_2/ instead
# of training a new one; set to False to retrain and overwrite the saved model.
cache_scvi = True

In [None]:
# Train the second scVI model on the cluster-filtered `adata`, or restore the
# saved run when `cache_scvi` is set. Compared with run_1 this adds
# 'msHist_scale_RNA' as a continuous covariate and trains for 150 epochs.
if not cache_scvi:

    # Register the batch key and the covariates the model should use.
    # NOTE(review): 'msHist_scale_RNA' presumably comes from the module.csv
    # merge above - confirm it has no missing values before training.
    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key='sample_group',
        categorical_covariate_keys=['infection'],
        continuous_covariate_keys=['msS_scale_RNA', 'msG2M_scale_RNA', 'msHist_scale_RNA', 'pMt_RNA'],
    )

    model = scvi.model.SCVI(
        adata,
        n_latent=30,
        n_hidden=128,
        n_layers=2,
        gene_likelihood='nb',
    )

    # Fixed epoch budget. A cell-count heuristic,
    # min(round(20000 / n_obs * 400), 400), used to be computed here and was
    # then immediately overwritten by this value, so it never took effect and
    # has been removed.
    max_epochs = 150

    # Validation is checked every epoch; the training-validation cell below
    # reads the '*_validation' entries of model.history.
    model.train(max_epochs=max_epochs, check_val_every_n_epoch=1)

    model.save('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_2/', overwrite=True)

else:

    # Restore the previously saved model and attach it to the current adata.
    model = scvi.model.SCVI.load('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_2/', adata=adata)

# Training validation 

In [None]:
# Plot model history: train vs. validation curves, one panel per metric.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# (axis, panel title, train history key, validation history key)
history_panels = (
    (axes[0], 'Reconstruction Loss', 'reconstruction_loss_train', 'reconstruction_loss_validation'),
    (axes[1], 'ELBO', 'elbo_train', 'elbo_validation'),
)

for ax, panel_title, train_key, validation_key in history_panels:
    # Each history entry is indexed again by a column of the same name.
    ax.plot(model.history[train_key][train_key], label='train')
    ax.plot(model.history[validation_key][validation_key], label='validation')
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()

# Dim reduction on latent space

In [None]:
# When True, the next cell reads the saved scvi/run_2/adata.h5ad instead of
# recomputing the latent representation, neighbours, Leiden clusters and UMAP.
cache_dim = True

In [None]:
# Neighbour graph, clustering and UMAP on the scVI latent space (run_2),
# or the saved result when `cache_dim` is set.
if cache_dim:

    # Replace adata with the saved object written by the branch below.
    adata = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_2/adata.h5ad')

else:

    # Store the model's latent representation under obsm['latent'].
    adata.obsm['latent'] = model.get_latent_representation()

    # 30-neighbour graph on the latent space, then Leiden (resolution 1.0)
    # and UMAP on that graph.
    sc.pp.neighbors(adata, use_rep='latent', n_neighbors=30)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata)

    adata.write_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/scvi/run_2/adata.h5ad')

In [None]:
# UMAP panels (four per row), one for each listed obs column.
sc.pl.umap(
    adata,
    color=[
        'leiden',
        'sample_group',
        'facs',
        'infection',
        'label_main_immgen',
        'label_main_haemopedia',
        'solo_label',
        'nCount_RNA',
        'nFeature_RNA',
        'pMt_RNA',
        'pRb_RNA',
        'pHb_RNA',
        'msS_scale_RNA',
        'msG2M_scale_RNA',
        'msCC_scale_RNA',
        'msHist_scale_RNA',
    ],
    ncols=4,
    wspace=0.5,
    size=100,
    frameon=False,
    legend_loc='on data',
)