#Annotation Notebook
This notebook allows you to annotate your data with a number of annotation methods using the tabula sapiens dataset as a reference

Press shift+enter to execute each code block

Usage:
Here, we setup the Google Colab Environment, download the data, and connect to Google drive

Ad instructins on putting data on google drive etc.

# Setup 

In [1]:
#@title Import packages
#@markdown Here we install the necessary packages
import os
import warnings
import sys
warnings.simplefilter(action='ignore', category=FutureWarning)
import scanorama
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata
import scvi
import scanpy as sc
import umap
from scipy import sparse
import scipy

sc.set_figure_params(figsize=(6, 6), frameon=False)
sc.settings.n_jobs=2


In [None]:
# TODO
# Add code to download annotation code and datasets

In [4]:
#@title Connect to Google Drive
#@markdown Link this colab session to Google drive
from google.colab import drive
#mount google drive
drive.mount('/content/drive')

  and should_run_async(code)


ModuleNotFoundError: No module named 'google.colab'

In [5]:
#@title Import annotation code
import sys
sys.path.append('/content/drive/MyDrive/ts_evaluation') #TODO change this path
import importlib
import annotation 

importlib.reload(annotation)
from annotation import (process_query, 
                        save_results,
                        subsample_dataset,
                        run_scvi,
                        run_scanvi,
                        run_scanorama,
                        run_onclass,
                        run_knn_on_scvi,
                        run_knn_on_scanorama,
                        run_svm_on_hvg,
                        run_rf_on_hvg,
                        run_bbknn, 
                        run_knn_on_bbknn)

  and should_run_async(code)


ModuleNotFoundError: No module named 'sklearn.externals.six'

## Load your data

Set the following arguments:

In [None]:
# TODO set location to save results
save_folder = '/content/drive/MyDrive/Thymus_eval/notebook_evaluation'

# TODO set paths query dataset
query_path = '/content/drive/MyDrive/Lung_eval/Lung_lca_processed.h5ad'
# query_path = '/content/drive/MyDrive/Thymus_eval/thymus_query_processed.h5ad'
# query_path = '/content/drive/MyDrive/PBMC/pbmc_query.h5ad'

# TODO set batch key of input anndata
query_batch_key = 'method'


### setup for evaluation ###
ref_batch_key = 'method'
ref_path = '/content/drive/MyDrive/Lung_eval/Lung_ts_processed.h5ad'
# ref_path = '/content/drive/MyDrive/Thymus_eval/thymus_TS_processed.h5ad'
# ref_path = '/content/drive/MyDrive/PBMC/multi.new.h5ad'

#key to the label locations
#this is just for evaulation
ref_labels_key = 'cell_ontology_type'
query_labels_key = 'cell_ontology_type'

unknown_celltype_label = 'unknown'

#want to make this false in the future
training_method = 'offline'
results_adata_path = os.path.join(save_folder, 'results.h5ad')

cl_obo_file = '/content/drive/MyDrive/ts_evaluation/cl.obo'
cl_ontology_file = '/content/drive/MyDrive/ts_evaluation/cl.ontology'


!mkdir -p {save_folder}


In [None]:
ref = anndata.read(ref_path)
query = anndata.read(query_path)

#delete later
query.obs[query_labels_key] = 'unknown'

# combine the query and reference datasets
adata = process_query(ref, 
                      query,
                      ref_labels_key,
                      ref_batch_key,
                      query_labels_key,
                      query_batch_key,
                      unknown_celltype_label = 'unknown',
                      training_method='offline')

All ref genes are in query dataset. Can use pretrained models
[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"_batch"[0m[1m][0m                                 
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"_labels"[0m[1m][0m                                 
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                 
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Successfully registered anndata object containing [1;34m107775[0m cells, [1;34m4000[0m   
         vars, [1;34m4[0m batches, [1;34m24[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              
[34mINFO    [0m Please do not further modify adata until model is trained.             


In [None]:
adata

AnnData object with n_obs × n_vars = 107775 × 4000
    obs: 'cell_id', 'method', 'donor', 'cell_ontology_type', 'donor_method', 'cell_ontology_id', '_labels', '_batch', 'n_counts', 'n_genes', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variance

# Annotation methods

## bbknn

We run bbknn to perform batch correction, then run knn with the learned neighbors

In [None]:
adata = run_bbknn(adata)
ref_idx = ref.obs_names
query_idx = query.obs_names
run_knn_on_bbknn(adata, 
                 train_idx=ref_idx,
                 test_idx=query_idx, 
                 labels_key='_labels',
                 result_key='knn_on_bbknn_pred')

Running bbknn
Classifying with knn on bbknn distances


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[result_key][test_idx] = knn_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
adata

AnnData object with n_obs × n_vars = 268409 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'knn_on_bbknn_pred'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'GeneID-

In [None]:
save_results(adata, results_adata_path, obs_keys=['knn_on_bbknn_pred'])

##scanorama
We run scanorama here 

In [None]:
def cluster_and_subsample_cells(adata, use_rep, n_samples=100000):
  sc.pp.neighbors(adata, use_rep=use_rep)
  sc.tl.leiden(adata, key_added="leiden")
  train_idx = subsample_dataset(adata,'leiden', n_total_samples=n_samples)
  return train_idx

adata = anndata.read(results_adata_path)
adata = adata[query.obs_names].copy()
train_idx = cluster_and_subsample_cells(adata, use_rep='X_scvi')
ref_and_subsetted_query = np.concatenate((ref.obs_names.values,train_idx.values))
adata = adata[ref_and_subsetted_query].copy()

['0' '1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21'
 '22' '23' '24' '25' '26' '27' '28' '29' '3' '30' '31' '32' '33' '34' '35'
 '36' '37' '38' '39' '4' '40' '41' '42' '5' '6' '7' '8' '9']
Sampling 2325 per label


In [None]:
ref = anndata.read(ref_path)
query = anndata.read(query_path)

#delete later
query.obs[query_labels_key] = 'unknown'

# combine the query and reference datasets
adata = process_query(ref, 
                      query,
                      ref_labels_key,
                      ref_batch_key,
                      query_labels_key,
                      query_batch_key,
                      unknown_celltype_label = 'unknown',
                      training_method='offline')

All ref genes are in query dataset. Can use pretrained models
[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"_batch"[0m[1m][0m                                 
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"_labels"[0m[1m][0m                                 
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                 
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Successfully registered anndata object containing [1;34m268409[0m cells, [1;34m4000[0m   
         vars, [1;34m4[0m batches, [1;34m24[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              
[34mINFO    [0m Please do not further modify adata until model is trained.             


In [None]:
print(ref)

AnnData object with n_obs × n_vars = 12582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'GeneName-1', 'GeneID-1', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'


In [None]:
print(adata)

AnnData object with n_obs × n_vars = 268409 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'GeneID-1

In [None]:
ref_and_subsetted_query = np.concatenate((ref.obs_names.values,subsetted_query_idx.values))


In [None]:
ref_and_subsetted_query

array(['AAACCCACACCTGAAT_TSP2_Thymus_NA_10X_1_1-12-0',
       'AAACCCAGTTTAGACC_TSP2_Thymus_NA_10X_1_1-12-0',
       'AAACGAAAGGATACCG_TSP2_Thymus_NA_10X_1_1-12-0', ...,
       '5478STDY7935100-AGGTCCGCAATGTTGC-1',
       'WSSS8062670-CTCTGGTTCCAGTAGT-1',
       'T06_TH_TOT_5GEX_3_S11-CAGCTGGTCCCGGATG-1'], dtype=object)

In [None]:
adata[subsetted_adata.obs_names]

View of AnnData object with n_obs × n_vars = 112582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', '

In [None]:
adata[ref_and_subsetted_query]

View of AnnData object with n_obs × n_vars = 112582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', '

In [None]:
subsetted_adata = adata[ref_and_subsetted_query].copy()


In [None]:
subsetted_adata[subsetted_query_idx]

View of AnnData object with n_obs × n_vars = 100000 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', '

In [None]:
subsetted_adata = run_scanorama(subsetted_adata)


Found 4000 genes among all datasets
[[0.         0.25049718 0.19655287 0.87159533]
 [0.         0.         0.64584906 0.14007782]
 [0.         0.         0.         0.35992218]
 [0.         0.         0.         0.        ]]
Processing datasets (0, 3)
Processing datasets (1, 2)
Processing datasets (2, 3)
Processing datasets (0, 1)
Processing datasets (0, 2)
Processing datasets (1, 3)


In [None]:
subsetted_adata[subsetted_query_idx]


View of AnnData object with n_obs × n_vars = 12582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'G

In [None]:
train_idx = ref.obs_names
test_idx = subsetted_query_idx
run_knn_on_scanorama(subsetted_adata, train_idx, test_idx)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[result_key][test_idx] = knn_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
subsetted_adata

AnnData object with n_obs × n_vars = 112582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'knn_on_scanorama_pred'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0'

In [None]:
adata.obs['knn_on_scanorama_pred'] = 'na'

In [None]:
adata.obs['knn_on_scanorama_pred'][subsetted_adata.obs_names] = subsetted_adata.obs['knn_on_scanorama_pred']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
from scipy import sparse

adata.obsm['X_scanorama'] = sparse.csr_matrix((adata.n_obs, 50))

In [None]:
import pandas as pd
df = pd.DataFrame(index = adata.obs_names, data = np.zeros(shape=(adata.n_obs, 50)))
df.loc[subsetted_adata.obs_names]=subsetted_adata.obsm['X_scanorama']

In [None]:
adata[subsetted_adata.obs_names].obsm['X_scanorama'] = subsetted_adata.obsm['X_scanorama']

In [None]:
adata.obsm['X_scanorama']= df

In [None]:
x_scanorama = sparse.csr_matrix(df.values)
adata.obsm['X_scanorama']=x_scanorama

In [None]:
adata.obsm['X_scanorama']

<268409x50 sparse matrix of type '<class 'numpy.float64'>'
	with 5629100 stored elements in Compressed Sparse Row format>

In [None]:
adata = adata[subsetted_adata.obs_names].copy()

In [None]:
tmp_path = results_adata_path[:-12] + "subsetted_results.h5ad"

In [None]:
res = anndata.read(results_adata_path)

In [None]:
res = res[subsetted_adata.obs_names].copy()

In [None]:
res.write(tmp_path, 'gzip')

In [None]:
tmp_path

'/content/drive/MyDrive/Thymus_eval/notebook_evaluation/subsetted_results.h5ad'

In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['knn_on_scanorama_pred'], 
             obsm_keys=['X_scanorama'])

... storing 'knn_on_scanorama_pred' as categorical


In [None]:
adata

AnnData object with n_obs × n_vars = 268409 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'knn_on_scvi_pred'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'Ge

In [None]:
res.obs_names[0] in subsetted_adata.obs_names

True

In [None]:
# adata_subset = adata.copy()
# adata_subset.obsm['X_scanorama']

# import pandas as pd
# df = pd.DataFrame(index = adata.obs_names, data = np.zeros(shape=(adata.n_obs, 50)))
# df.loc[adata_subset.obs_names]=adata_subset.obsm['X_scanorama']
# from scipy import sparse
# x_scanorama = sparse.csr_matrix(df.values)
# adata.obsm['X_scanorama']=x_scanorama

# adata.obs['knn_on_scanorama_pred'] = adata_subset.obs['knn_on_scanorama_pred']
# adata.obs['knn_on_scanorama_pred']
# save_results(adata, 
#              results_adata_path, 
#              obs_keys=['knn_on_scanorama_pred'], 
#              obsm_keys=['X_scanorama'])

In [None]:
#TODO check and run subsampling
adata = run_scanorama(adata)

Found 4000 genes among all datasets
[[0.         0.24361949 0.20931389 0.87548638]
 [0.         0.         0.64352374 0.11867704]
 [0.         0.         0.         0.3540856 ]
 [0.         0.         0.         0.        ]]
Processing datasets (0, 3)
Processing datasets (1, 2)
Processing datasets (2, 3)
Processing datasets (0, 1)
Processing datasets (0, 2)
Processing datasets (1, 3)


In [None]:
adata

AnnData object with n_obs × n_vars = 112582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'GeneID-1

In [None]:
train_idx = ref.obs_names
test_idx = subsetted_query_idx.values
run_knn_on_scanorama(adata, train_idx, test_idx)

KeyError: ignored

In [None]:
adata

AnnData object with n_obs × n_vars = 112582 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'GeneID-1

In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['knn_on_scanorama_pred'], 
             obsm_keys=['X_scanorama'])

In [None]:
# adata_subset = adata.copy()
# adata_subset.obsm['X_scanorama']

# import pandas as pd
# df = pd.DataFrame(index = adata.obs_names, data = np.zeros(shape=(adata.n_obs, 50)))
# df.loc[adata_subset.obs_names]=adata_subset.obsm['X_scanorama']
# from scipy import sparse
# x_scanorama = sparse.csr_matrix(df.values)
# adata.obsm['X_scanorama']=x_scanorama

# adata.obs['knn_on_scanorama_pred'] = adata_subset.obs['knn_on_scanorama_pred']
# adata.obs['knn_on_scanorama_pred']
# save_results(adata, 
#              results_adata_path, 
#              obs_keys=['knn_on_scanorama_pred'], 
#              obsm_keys=['X_scanorama'])

## scvi offline

In [None]:
importlib.reload(annotation)
from annotation import (process_query,run_scvi)
run_scvi(adata, n_latent=50, dropout_rate=0.1, dispersion='gene-batch')

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 1/30:   0%|          | 0/30 [00:03<?, ?it/s]

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/usr/lib/python




In [None]:
train_idx = ref.obs_names
test_idx = query.obs_names
run_knn_on_scvi(adata,
                train_idx,
                test_idx, 
                obsm_key='X_scvi',
                result_key='knn_on_scvi_pred')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[result_key][test_idx] = knn_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
sc.pp.neighbors(adata, use_rep="X_scvi")
sc.tl.umap(adata)

In [None]:
sc.pl.umap(
    adata,
    color=["method", "cell_ontology_type"],
    ncols=1
)

In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['knn_on_scvi_pred'], 
             obsm_keys=['X_scvi'])


## scvi online

In [None]:
importlib.reload(annotation)
from annotation import run_scvi
scvi.data.setup_anndata(ref, batch_key='_batch', labels_key='_labels')

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"_batch"[0m[1m][0m                                 
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"_labels"[0m[1m][0m                                 
[34mINFO    [0m Using data from adata.X                                                
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Successfully registered anndata object containing [1;34m32704[0m cells, [1;34m4000[0m    
         vars, [1;34m2[0m batches, [1;34m23[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              
[34mINFO    [0m Please do not further modify adata until model is trained.             


In [None]:
run_scvi(ref, n_latent=50, dropout_rate=0.1, dispersion='gene-batch', 
         save_folder='/content/drive/MyDrive/Lung_eval/scvi_model')

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


suh
Epoch 1/245:   0%|          | 0/245 [00:00<?, ?it/s]



Epoch 245/245: 100%|██████████| 245/245 [03:29<00:00,  1.17it/s, loss=1.01e+03, v_num=1]


In [None]:
def train_online_scvi(adata, 
                      pretrained_scvi_path,
                      max_epochs=None,
                      save_path=None,
                      overwrite=True):
  if max_epochs is None:
    n_cells = adata.n_obs
    max_epochs = np.min([round((20000 / n_cells) * 400), 400])
    max_epochs = int(max_epochs/2)
  
  model = scvi.model.SCVI.load_query_data(adata,
                                          pretrained_scvi_path)
  model.train(max_epochs=max_epochs, 
              train_size=1.0)
  adata.obsm['X_scvi'] = model.get_latent_representation()
  if save_path is not None:
    model.save(save_path, overwrite=overwrite)


In [None]:
train_online_scvi(query,
                  '/content/drive/MyDrive/Lung_eval/scvi_model', 
                  save_path='/content/drive/MyDrive/Lung_eval/scarches_model')

[34mINFO    [0m Using data from adata.X                                                
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'local_l_mean'[0m, [32m'local_l_var'[0m,  
         [32m'labels'[0m[1m][0m                                                              
[34mINFO    [0m Successfully registered anndata object containing [1;34m75071[0m cells, [1;34m4000[0m    
         vars, [1;34m4[0m batches, [1;34m24[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 1/53:   0%|          | 0/53 [00:00<?, ?it/s]



Epoch 53/53: 100%|██████████| 53/53 [07:12<00:00,  8.15s/it, loss=1.21e+03, v_num=1]


In [None]:
query

AnnData object with n_obs × n_vars = 75071 × 4000
    obs: 'cell_id', 'method', 'donor', 'cell_ontology_type', 'donor_method', 'cell_ontology_id', 'n_counts', 'n_genes', '_batch', '_labels', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0'
    uns: '_scvi'
    obsm: 'X_scvi'

In [None]:
def run_knn_on_scvi(adata,
                    train_idx,
                    test_idx, 
                    obsm_key='X_scvi',
                    result_key='knn_on_scvi_pred'):
  
  if obsm_key not in adata.obsm.keys(): 
    print('Please train scVI first or pass in a valid obsm_key.')
  
  train_X = adata[train_idx].obsm[obsm_key]
  test_X = adata[test_idx].obsm[obsm_key]

  train_Y = adata[train_idx].obs['_labels'].to_numpy()
  knn = KNeighborsClassifier(n_neighbors = 15, weights='uniform')
  knn.fit(train_X, train_Y)
  knn_pred= knn.predict(test_X)

  #save_results
  adata.obs[result_key] = 'na'
  adata.obs[result_key][test_idx] = knn_pred

In [None]:
obsm_key = 'X_scvi'
train_X = ref.obsm[obsm_key]
train_Y = ref.obs['_labels'].to_numpy()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 15, weights='uniform')


In [None]:
knn.fit(train_X, train_Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [None]:
test_X = query.obsm[obsm_key]
knn_pred= knn.predict(test_X)

In [None]:
query.obs['knn_on_scvi_scarches_pred'] = knn_pred

In [None]:
query.write('/content/drive/MyDrive/Lung_eval/scarches_results.h5ad', 'gzip')

... storing 'cell_ontology_type' as categorical
... storing '_batch' as categorical
... storing '_labels' as categorical
... storing 'knn_on_scvi_scarches_pred' as categorical


## scanvi offline

In [None]:
adata

AnnData object with n_obs × n_vars = 33135 × 3463
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', 'cell_ontology_id', '_labels', '_batch', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
    var: 'feature_types.0.0-0-0', 'n_cells.0.0-0-0', 'gene_symbol.0.0-0-0', 'n_cells.1.0-0-0', 'n_cells.0-0-0', 'n_cells.1.1-0-0', 'feature_types.0.0.0.1-0-0', 'gene_symbol.0.0.0.1-0-0', 'n_cells.1.0.0.1-0-0', 'n_cells.1.0.1-0-0', 'n_cells-0-0', 'len-0-0', 'ensembl_id-0-0', 'contamination_prop-0-0-0', 'contamination_prop-1-0-0', 'contamination_prop-10-0-0', 'contamination_prop-11-0-0', 'contamination_prop-12-0-0', 'contamination_prop-13-0-0', 'contamination_prop-14-0-0', 'contamination_prop-2-0-0', 'contamination_prop-3-0-0', 'contamination_prop-4-0-0', 'contamination_prop-5-0-0', 'contamination_prop-6-0-0', '

In [None]:
run_scanvi(adata, n_latent=100, dropout_rate=0.1, 
           n_epochs_semisupervised=1,
           n_epochs_unsupervised=1, 
           save_folder = 'thymus_scanvi')

[34mINFO    [0m Training Unsupervised Trainer for [1;34m1[0m epochs.                            
[34mINFO    [0m Training SemiSupervised Trainer for [1;34m1[0m epochs.                          


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 1/1:   0%|          | 0/1 [00:00<?, ?it/s]



Epoch 1/1: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s, loss=1.18e+03, v_num=1]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Epoch 1/1: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it, loss=1.25e+03, v_num=1]


In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['scanvi_pred'])

## scanvi online

In [None]:
scvi.data.setup_anndata(ref, batch_key='_batch', labels_key='_labels')

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"_batch"[0m[1m][0m                                 
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"_labels"[0m[1m][0m                                 
[34mINFO    [0m Using data from adata.X                                                
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Successfully registered anndata object containing [1;34m12582[0m cells, [1;34m4000[0m    
         vars, [1;34m2[0m batches, [1;34m23[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              
[34mINFO    [0m Please do not further modify adata until model is trained.             


In [None]:
run_scanvi(ref, n_latent=100, dropout_rate=0.1, 
           n_epochs_semisupervised=None,
           n_epochs_unsupervised=None, 
           save_folder = 'thymus_scanvi')


[34mINFO    [0m Training Unsupervised Trainer for [1;34m400[0m epochs.                          
[34mINFO    [0m Training SemiSupervised Trainer for [1;34m400[0m epochs.                        


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 1/400:   0%|          | 0/400 [00:00<?, ?it/s]



Epoch 400/400: 100%|██████████| 400/400 [03:33<00:00,  1.88it/s, loss=964, v_num=1]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Epoch 400/400: 100%|██████████| 400/400 [06:06<00:00,  1.09it/s, loss=1.06e+03, v_num=1]


In [None]:
def train_online_scanvi(adata, 
                        pretrained_scanvi_path,
                        max_epochs=None,
                        trainer_kwargs=None,
                        n_samples_per_label=100,
                        save_path=None,
                        overwrite=True):
  if max_epochs is None:
    n_cells = adata.n_obs
    max_epochs = np.min([round((20000 / n_cells) * 400), 400])
    max_epochs = int(max_epochs/2)

  model = scvi.model.SCANVI.load_query_data(adata, 
                                            pretrained_scanvi_path, 
                                            freeze_classifier=True)
  model.train(max_epochs = max_epochs, 
              train_size =1.0,
              n_samples_per_label=n_samples_per_label)
  if save_path is not None:
    model.save(save_path, overwrite=overwrite)

In [None]:
train_online_scanvi(query,
                    pretrained_scanvi_path='thymus_scanvi/scanvi_model', 
                    semisupervised_trainer_kwargs=dict(weight_decay=0.0),
                    save_path='/content/drive/MyDrive/Thymus_eval/scanvi_online')

[34mINFO    [0m Using data from adata.X                                                
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'local_l_mean'[0m, [32m'local_l_var'[0m,  
         [32m'labels'[0m[1m][0m                                                              
[34mINFO    [0m Successfully registered anndata object containing [1;34m255827[0m cells, [1;34m4000[0m   
         vars, [1;34m4[0m batches, [1;34m24[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              
[34mINFO    [0m Training Unsupervised Trainer for [1;34m31[0m epochs.                           
[34mINFO    [0m Training SemiSupervised Trainer for [1;34m10[0m epochs.                         


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 1/10:   0%|          | 0/10 [00:00<?, ?it/s]



Epoch 10/10: 100%|██████████| 10/10 [02:14<00:00, 13.46s/it, loss=1.8e+03, v_num=1]


In [None]:
m = scvi.model.SCANVI.load('/content/drive/MyDrive/Thymus_eval/scanvi_online/', query)
adata.obs['scanvi_online_pred']= 'na'
adata.obs['scanvi_online_pred'][query.obs_names] = m.predict()

[34mINFO    [0m Using data from adata.X                                                
[34mINFO    [0m Computing library size prior per batch                                 
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'local_l_mean'[0m, [32m'local_l_var'[0m,  
         [32m'labels'[0m[1m][0m                                                              
[34mINFO    [0m Successfully registered anndata object containing [1;34m255827[0m cells, [1;34m4000[0m   
         vars, [1;34m4[0m batches, [1;34m24[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra    
         categorical covariates and [1;34m0[0m extra continuous covariates.              


In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['scanvi_online_pred'])

... storing 'scanvi_online_pred' as categorical


## SVM on HVGs

In [None]:
train_idx = subsample_dataset(adata[ref.obs_names],ref_labels_key)
run_svm_on_hvg(adata, train_idx, query.obs_names)

(1827, 4000)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['svm_pred'][test_idx] = svm_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


AnnData object with n_obs × n_vars = 268409 × 4000
    obs: 'donor', 'method', 'donor_method', 'cell_ontology_type', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'svm_pred'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'GeneName-1', 'GeneID-1', 'mean

In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['svm_pred'])

... storing 'svm_pred' as categorical


## Random Forest on HVGs

In [None]:
train_idx = subsample_dataset(adata[ref.obs_names],ref_labels_key)
run_rf_on_hvg(adata, train_idx, query.obs_names, save_key='rf_on_hvg_pred')

Training random forest classifier with ['CD8-positive, alpha-beta T cell' 'CD8-positive, alpha-beta T cell'
 'CD8-positive, alpha-beta T cell' ... 'thymocyte' 'thymocyte' 'thymocyte'] cells


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[save_key][test_idx] = rf_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['rf_on_hvg_pred'])

... storing 'rf_on_hvg_pred' as categorical


## OnClass

In [None]:
print(ref_path)
print(query_path)

run_onclass(adata=adata, 
            max_iter=2,
            ref_adata_path=ref_path,
            query_adata_path=query_path, 
            cl_obo_file=cl_obo_file, 
            cl_ontology_file=cl_ontology_file)

/content/drive/MyDrive/PBMC/multi.new.h5ad
/content/drive/MyDrive/PBMC/pbmc_query.h5ad
init OnClass
100.000000 precentage of labels are in the Cell Ontology


  d['descr'] = dtype_to_descr(array.dtype)


Cost after epoch 1: loss:3180.003 acc: 0.875 auc: 0.996
Cost after epoch 2: loss:1536.538 acc: 0.883 auc: 0.996
training finished
no label file is provided
5527
number of intersection genes 20713
5527 5527


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['onclass_pred'][test_adata.obs_names]=test_adata.obs['onclass_pred']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


AnnData object with n_obs × n_vars = 26080 × 4000
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'cell_ontology_type', 'method', 'cell_ontology_id', '_labels', '_batch', 'batch', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'onclass_pred'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'mean', 'std', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'log1p', 'hvg', '_scvi'
    layers: 'counts'

In [None]:
save_results(adata, 
             results_adata_path, 
             obs_keys=['onclass_pred'])

... storing 'onclass_pred' as categorical


# Statistics

In [None]:
# save results to this directory
res_path = '/content/drive/MyDrive/Thymus_eval/output/'
sc.settings.figdir = res_path

## Compute Agreement and Disagreement

In [None]:
# TODO change to path of results
# results = anndata.read('/content/drive/MyDrive/Lung_eval/new_api_ts_as_ref/results.h5ad')
results = anndata.read(results_adata_path)

In [None]:
results.obs['cell_ontology_type'] = adata.obs['cell_ontology_type']

In [None]:
methods = ['knn_on_bbknn_pred',
 'knn_on_scanorama_pred',
 'knn_on_scvi_pred',
 'scanvi_pred',
 'onclass_pred',
 'svm_on_hvg_pred',
 'rf_on_hvg_pred']

labels = results.obs['cell_ontology_type'].str.lower()

### make all celltype names lower case to accommadate onclass predictions

In [None]:
for x in methods:
    results.obs[x] = [y.lower() for y in results.obs[x]]

In [None]:
ref_cells = results.obs['_labels'].values!='unknown'

### Overall agreement


In [None]:
agreement = results.obs[methods].apply(MajorityCount, axis=1)
agreement[ref_cells] = 0 
results.obs["majority_pred"] = results.obs[methods].apply(MajorityVote, axis=1)
results.obs["agreement"] = agreement.values.round(2).astype(str)

In [None]:
#@markdown this computes the percentage of cells that the algorithms agree/disagree on 
agreement_counts = pd.DataFrame(
    np.unique(results[~ref_cells].obs["agreement"], return_counts=True)
).T

agreement_counts[0] = np.round(agreement_counts[0].astype(float) * 7).astype(int)
agreement_counts["freq"] = agreement_counts[1] / np.sum(agreement_counts[1])
agreement_counts.columns = ["n_agree", "count", "freq"]
display(agreement_counts)

Unnamed: 0,n_agree,count,freq
0,1,1,1.33207e-05
1,2,57,0.000759281
2,3,586,0.00780594
3,4,1849,0.02463
4,5,3155,0.0420269
5,6,13097,0.174462
6,7,56326,0.750303


### which algorithm tends to be the disagreement? 

In [None]:
agreement_counts.to_csv(res_path + 'agreement_counts.csv')

FileNotFoundError: ignored

### per celltype agreement

In [None]:
#@ This computes what ground truth celltypes the algorithm disagrees on the most
disagreement = {}
for x in np.unique(labels[~ref_cells]):
  temp = pd.DataFrame(np.unique(results[(~ref_cells) & (labels == x)].obs["agreement"],return_counts=True))
  temp.columns = temp.loc[0]
  temp = temp.loc[1]
  disagreement[x] = temp

disagreement = pd.DataFrame.from_dict(disagreement).T
disagreement.fillna(0, inplace=True)
disagreement = (disagreement.T / disagreement.sum(axis=1)).T.sort_values('1.0')
disagreement.style.format('{0:,.2f}').background_gradient(axis=0) 
disagreement.to_csv(res_path + "per_celltype_agreement.csv")

KeyError: ignored

In [None]:
disagreement

NaN


In [None]:
### import cell ontology 


### import cell ontology 


In [None]:
import obonet

obo = "/content/drive/MyDrive/ts_evaluation/" + "cl.obo"
f = open(obo, "r")
co = obonet.read_obo(f)
f.close()

celltype_dict = {}
for x in co.nodes:
    celltype_dict[co.nodes[x]["name"].lower()] = x


In [None]:
results.obsm['X_umap'] = results.obsm['X_scvi_50latent_0.1dropout_genebatchdispersion_umap']
CM = {}
ACC1 = {}
ACC2 = {}
MEAN = {}
for name in methods + ['majority_pred']:
    pred = results.obs[name].values

    df, prop = PredictionEval(
      pred[~ref_cells], labels[~ref_cells], res_path, name=name
    )

    acc1, acc2, acc_detail, mean1, mean2 = EvalSummary(
        df, prop.loc[df.index, "prop"].values, co
    )

    print(name, "weighted mean accuracy:", mean2, "unweighted mean accuracy", np.mean(acc2))

    acc_bar = accuracy_barplot(
        df, acc1, acc2, prop.loc[df.index, "prop"].values, name, res_path
    )
    ACC1[name] = acc1
    ACC2[name] = acc2
    MEAN[name] = [mean1, mean2, np.mean(acc1), np.mean(acc2)]
    CM[name] = df
    sc.pl.umap(
        results[~ref_cells],
        color=[name],
        save = 'scVI_pred_%s.pdf'%name,
        title = 'UMAP scVI Prediction %s'%name
    )


## Consensus Statistics

In [None]:
labels = results.obs["cell_ontology_type"].astype(str).str.lower()

In [None]:
# TODO add consensus statistics code

## Summary of cell type compositions

In [None]:
# TODO add cell type composition code here