# Annotate data with Tusi et al., 2018 data and scANVI 

In [10]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
from scipy import sparse

import torch
import scvi

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [11]:
# Work from the project directory so the relative 'data/...' paths used in this notebook resolve
project_dir = '/research/peer/fdeckert/FD20200109SPLENO'
os.chdir(project_dir)

# Import SPLENO data 

In [None]:
# Create anndata from the count mtx and name the variables after the index of the gene table
adata = sc.read_mtx('data/object/components/slots/seurat_counts.mtx')
adata.var_names = pd.read_table('data/object/components/slots/seurat_counts_genes.csv', index_col=0).index
# Add obs from meta_data (first CSV column becomes the obs index)
adata.obs = pd.read_csv('data/object/components/meta_data/seurat_meta.csv', index_col=0)
# Set layers: store an independent copy of the matrix. Assigning adata.X directly would make
# the 'counts' layer an alias of X, so any in-place change to X would also change the layer.
adata.layers['counts'] = adata.X.copy()

# Filter input data 

In [None]:
# Keep only cells with treatment 'CpG' and tissue 'Progenitor'; copy so adata is not a view
is_cpg = adata.obs['treatment'] == 'CpG'
is_progenitor = adata.obs['tissue'] == 'Progenitor'
adata = adata[is_cpg & is_progenitor].copy()

In [None]:
# Expose the 'rna_snn_res.0.8' column under the shorter name 'cluster' and show the cluster sizes
adata.obs['cluster'] = adata.obs['rna_snn_res.0.8'].copy()
adata.obs['cluster'].value_counts()

In [None]:
# Filter out cluster 19 (B-cell), 8, 13 (Myeloid)
excluded_clusters = [19, 8, 13]
keep_cell = np.isin(adata.obs['rna_snn_res.0.8'], excluded_clusters, invert=True)
# .copy() turns the subset into a standalone AnnData rather than a view of the previous object,
# consistent with the treatment/tissue filter; obs columns are added to this object later on.
adata = adata[keep_cell].copy()

In [None]:
# Cells per cluster (displayed as the cell output)
adata.obs['cluster'].value_counts()

# Define seed cells by SingleR score

In [None]:
# Box plot of 'fine_delta_score_p' grouped by 'fine_labels_p', then the number of cells per label
score_by_label = adata.obs[['fine_labels_p', 'fine_delta_score_p']]
boxplot = score_by_label.boxplot(by='fine_labels_p', grid=False, rot=90, fontsize=10)
adata.obs['fine_labels_p'].value_counts()

In [None]:
# Seed-selection thresholds
n_seed_cells = 10      # number of top-scoring cells taken per label
min_label_cells = 10   # a label needs at least this many cells to contribute seeds

# Select cellid by highest delta score within each label.
# nlargest() yields a (label, cellid) index; reset_index(level=0) moves the label out of the
# index so that only the cell ids remain.
s_cellid = adata.obs.groupby('fine_labels_p')['fine_delta_score_p'].nlargest(n_seed_cells).reset_index(level=0).index

# Select cellid by label frequency: cells whose label occurs at least min_label_cells times
f_cellid = adata.obs.groupby('fine_labels_p').filter(lambda x: len(x) >= min_label_cells).index

# Intersection of both selections: a seed cell must be top-scoring AND carry a frequent label
seed_cellid = s_cellid.intersection(f_cellid)

# Add seed_labels to obs: seed cells keep their label, every other cell becomes 'Unknown'
is_seed = adata.obs.index.isin(seed_cellid)
adata.obs['seed_labels'] = np.where(is_seed, adata.obs['fine_labels_p'], 'Unknown')
adata.obs['seed_labels'].value_counts()

# Transfer of annotation with scANVI

In [8]:
# Register adata with scvi-tools: batches from obs['sample_name'], labels from obs['seed_labels']
scvi.data.setup_anndata(
    adata,
    batch_key='sample_name',
    labels_key='seed_labels',
)

INFO     Using batches from adata.obs["sample_name"]
INFO     Using labels from adata.obs["seed_labels"]
INFO     Using data from adata.X
INFO     Computing library size prior per batch
INFO     Successfully registered anndata object containing 8170 cells, 15864 vars, 2 batches,
         6 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra
         continuous covariates.
INFO     Please do not further modify adata until model is trained.


In [None]:
# Build the scVI model on the registered adata, train it and store it on disk.
# The 100 is passed positionally to train() — presumably the number of epochs; confirm against
# the installed scvi-tools version.
scvi_model = scvi.model.SCVI(
    adata,
    n_latent=30,
    n_layers=2,
)
scvi_model.train(100)
scvi_model.save(
    'data/scvi/model/spleno_progenitor_cpg_scvi',
    overwrite=True,
)
# NOTE(review): the disabled load call below points at the 'nacl' model while this cell saves
# the 'cpg' model — confirm the path before re-enabling it.
# scvi_model.load('data/scvi/model/spleno_progenitor_nacl_scvi', adata)

In [None]:
# Derive the scANVI model from the scVI model; 'Unknown' is the value given to non-seed cells
# in obs['seed_labels'], passed here as the second argument.
# The 25 is passed positionally to train() — presumably the number of epochs; confirm against
# the installed scvi-tools version.
scanvi_model = scvi.model.SCANVI.from_scvi_model(scvi_model, 'Unknown')
scanvi_model.train(25)
scanvi_model.save(
    'data/scvi/model/spleno_progenitor_cpg_scanvi',
    overwrite=True,
)
# NOTE(review): the disabled load call below points at the 'nacl' model while this cell saves
# the 'cpg' model — confirm the path before re-enabling it.
# scanvi_model.load('data/scvi/model/spleno_progenitor_nacl_scanvi', adata)

# Transfer results to adata 

In [None]:
# Store the scANVI outputs on adata: predictions in obs, latent representation in obsm
scanvi_labels = scanvi_model.predict(adata)
adata.obs['C_scANVI'] = scanvi_labels
scanvi_latent = scanvi_model.get_latent_representation(adata)
adata.obsm['X_scANVI'] = scanvi_latent

In [None]:
# Soft predictions from scANVI: one row per cell, one column per class
y_pred = scanvi_model.predict(adata, soft=True)
# np.asarray accepts both a NumPy array and a DataFrame, whereas the previous tuple slicing
# (y_pred[0:, 0:]) only works on an array. Going through the plain array also drops any
# cell-name index, so the row order of adata is what aligns the scores below.
pred = pd.DataFrame(data=np.asarray(y_pred))
# Per-cell score = highest value across the classes
adata.obs['p_score_scANVI'] = pred.max(axis=1).to_numpy()

# Dimensional reduction 

In [None]:
# Build the neighbor graph on the scANVI latent space stored in obsm['X_scANVI'],
# then compute the UMAP embedding (which must run after neighbors)
sc.pp.neighbors(
    adata,
    use_rep='X_scANVI',
)
sc.tl.umap(adata)

In [None]:
# UMAP coloured by Gata1 and Gata2.
# NOTE(review): scanpy documents a `save` string as being appended to its default figure
# name — confirm the resulting file name is the intended one.
sc.pl.umap(
    adata,
    color=['Gata1', 'Gata2'],
    save='umap_marker_scanvi_spleno_prog_cpg.png',
    wspace=0.5,
)

In [None]:
# UMAP coloured by the scANVI prediction next to the original 'fine_labels_p' annotation.
# NOTE(review): scanpy documents a `save` string as being appended to its default figure
# name — confirm the resulting file name is the intended one.
sc.pl.umap(
    adata,
    color=['C_scANVI', 'fine_labels_p'],
    save='umap_label_scanvi_spleno_prog_cpg.png',
    wspace=0.5,
)

In [None]:
# UMAP coloured by the scANVI prediction score next to 'fine_delta_score_p'.
# NOTE(review): scanpy documents a `save` string as being appended to its default figure
# name — confirm the resulting file name is the intended one.
sc.pl.umap(
    adata,
    color=['p_score_scANVI', 'fine_delta_score_p'],
    save='umap_score_scanvi_spleno_prog_cpg.png',
    wspace=0.5,
)

# Save results

In [None]:
# Write the annotated adata to disk as h5ad
output_path = 'data/scvi/spleno_progenitor_cpg_scanvi.h5ad'
adata.write_h5ad(output_path)
# To reload the saved object instead of recomputing:
# adata = sc.read_h5ad('data/scvi/spleno_progenitor_cpg_scanvi.h5ad')