# Annotation Notebook


This notebook lets you annotate your own data with the tabula sapiens dataset.

By default, it will run the following methods: `onclass`, `scanVI`, `svm`, and `singleCellNet`. Compute permitting, we suggest running all the methods. If your dataset exceeds 100k cells, total runtime will be around 2-3 hours on GPU. (CHECK THIS)


## Arguments:
- **annotation_method**: list from [`"onclass"`, `"scanvi"`, `"svm"`, `"singlecellnet"`]
- **tissue**: `None` or one of [`Bladder`, `Blood`, `Bone_Marrow`, `Kidney`, `Large_Intestine`, `Lung`, `Lymph_Node`, `Muscle`, `Pancreas`, `Skin`,`Small_Intestine`, `Spleen`, `Thymus`, `Trachea`, `Vasculature`]. If `None`, will use the entire tabula sapiens dataset.
- **input_anndata**: path to your input anndata
- **output_folder_name**: folder in `/data` to save outputs to. Should be unexisting directory
- **use_gpu**: if `True`, will use the GPU for training. Note: runtimes are significantly longer on CPU
- **use_10X_only**: If `True`, only uses the 10X data from tabula sapiens. This should only equal True if `input_anndata` is 10X. Based on our observations, scanVI will perform better if only using 10X data. Should not be True if you have smartseq2 data.
- **batch_correction_conditions**: List from [`"donor"`, `"method"`] or `None`

Optional arguments for scanVI:
- **scvi_model**: path to pretrained scvi model. Default: `None`.
- **scanvi_model**: path to pretrained scanvi mode. Default: `None`.
- **n_scvi_epochs**: n_epochs to train scvi for. Default: `400` 
- **n_scanvi_epochs**: n_epochs to train scanvi for. Default: `15`


In [3]:
%load_ext autoreload
%autoreload 2
import os
from annotation import setup_dataset, svm_annotation, onclass_annotation, singlecellnet_annotation

# annotation_method = ["onclass", 'scanvi', 'svm']
annotation_method = ['onclass']

tissue = 'Lung'
input_anndata = 'data/Lung_test.h5ad'
output_folder_name = 'lung_evaluation'
use_gpu = True
use_10X_only= True
batch_correction_conditions = ['donor', 'method']
annotation_key = 'manual_annotation'

#refernce dataset filepath
tabula_sapiens_filepath = 'data/Lung_ref.h5ad'

#scVI arguments:
scvi_model = None
scanvi_model= None
n_scvi_epochs= 400
n_scanvi_epochs = 15

output_folder = os.path.join('data', output_folder_name)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
else:
    raise ValueError("{} already exists. Please provide an unexisting directory to save outputs".format(output_folder))


ValueError: data/lung_evaluation already exists. Please provide an unexisting directory to save outputs

In [5]:
full = setup_dataset(input_anndata, tabula_sapiens_filepath, tissue, use_10X_only, batch_correction_conditions)

import scanpy as sc
sc.pp.highly_variable_genes(full, flavor='seurat_v3', subset=True, n_top_genes=4000)
print(full)

if 'scanvi' in annotation_method:
    scanvi_annotation(
        full_dataset= full,
        batch_key='batch_indices', 
        output_folder = output_folder,
        ts_label_key = annotation_key,
        scvi_model=None,
        scanvi_model=None,
        n_scvi_epochs = 72,
        n_scanvi_epochs = 5,
        use_gpu = use_gpu)
    
if "onclass" in annotation_method:
    onclass_annotation(input_anndata, 
                       output_folder, 
                       ref_anndata_path= 'data/OnClass_data/data_used_for_training/tabula-muris-senis-facs_cell_ontology_test.h5ad',
                       ref_label_key = 'manual_cell_ontology_class')

if "svm" in annotation_method:
    svm_annotation( 
    full,
    batch_key='_batch_indices', 
    ts_label_key = annotation_key,
    output_folder= output_folder)

if "singlecellnet" in annotation_method:
    singlecellnet_annotation()

  if not is_categorical(df_full[k]):
Trying to set attribute `.obs` of view, copying.
  df.loc[: int(n_top_genes), 'highly_variable'] = True
  if not is_categorical(df_full[k]):


AnnData object with n_obs × n_vars = 98366 × 4000
    obs: 'cell_id', 'method', 'donor', 'manual_annotation', 'donor_method', 'tissue', '_batch', '_batch_indices', 'batch', '_dataset'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'
Embed the cell ontology
init OnClass
Here, we used the pretrain cell type e

FileNotFoundError: [Errno 2] No such file or directory: 'chenling_cl.obo'

In [None]:
if "onclass" in annotation_method:
    onclass_annotation(input_anndata, 
                       output_folder, 
                       ref_anndata_path ='data/Lung_ref.h5ad',
                       ref_label_key = 'manual_annotation')


In [6]:
annotation_key

'manual_annotation'

In [9]:
full_dataset = full

In [11]:
def subsample_dataset(train_data, labels_key, n_samples):
    sample_idx = []
    labels, counts = np.unique(train_data.obs[labels_key], return_counts=True)
    for i, label in enumerate(labels):
        label_locs = np.where(train_data.obs[labels_key] == label)[0]        
        if counts[i] < n_samples:
            sample_idx.append(label_locs)
        else:
            label_subset = np.random.choice(label_locs, n_samples, replace = False)
            sample_idx.append(label_subset)
    sample_idx = np.concatenate(sample_idx)
    return sample_idx


In [14]:
import numpy as np
train_data = full_dataset[full_dataset.obs['_dataset'] == 'tabula_sapiens']
train_idx = subsample_dataset(train_data, annotation_key, 100)
train_data = train_data[train_idx].copy()
test_data = full_dataset[full_dataset.obs['_dataset'] == 'user']

In [15]:
train_data

AnnData object with n_obs × n_vars = 3826 × 4000
    obs: 'cell_id', 'method', 'donor', 'manual_annotation', 'donor_method', 'tissue', '_batch', '_batch_indices', 'batch', '_dataset'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersecti

In [17]:
train_X = train_data.X
test_X = test_data.X
train_Y = train_data.obs[annotation_key]


In [18]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [19]:
clf.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
test_Y = clf.predict(test_X)

In [22]:
import pandas as pd

In [24]:
pd.DataFrame(index = test_data.obs_names, data = test_Y).to_csv('data/lung_evaluation/random_forest_pred.csv')

In [8]:
if "svm" in annotation_method:
    svm_annotation( 
    full,
    batch_key='_batch_indices', 
    ts_label_key = annotation_key,
    output_folder= output_folder)

> /home/annotation.py(203)svm_annotation()
-> train_data = train_data[train_idx].copy()


(Pdb)  n


> /home/annotation.py(204)svm_annotation()
-> print(train_data)


(Pdb)  


AnnData object with n_obs × n_vars = 3826 × 4000
    obs: 'cell_id', 'method', 'donor', 'manual_annotation', 'donor_method', 'tissue', '_batch', '_batch_indices', 'batch', '_dataset'
    var: 'feature_types.0.0-0', 'n_cells.0.0-0', 'gene_symbol.0.0-0', 'n_cells.1.0-0', 'n_cells.0-0', 'n_cells.1.1-0', 'feature_types.0.0.0.1-0', 'gene_symbol.0.0.0.1-0', 'n_cells.1.0.0.1-0', 'n_cells.1.0.1-0', 'n_cells-0', 'len-0', 'ensembl_id-0', 'contamination_prop-0-0', 'contamination_prop-1-0', 'contamination_prop-10-0', 'contamination_prop-11-0', 'contamination_prop-12-0', 'contamination_prop-13-0', 'contamination_prop-14-0', 'contamination_prop-2-0', 'contamination_prop-3-0', 'contamination_prop-4-0', 'contamination_prop-5-0', 'contamination_prop-6-0', 'contamination_prop-7-0', 'contamination_prop-8-0', 'contamination_prop-9-0', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersecti

(Pdb)  


suhh
> /home/annotation.py(206)svm_annotation()
-> test_data = full_dataset[full_dataset.obs['_dataset'] == 'user']


(Pdb)  c




  view_to_actual(adata)
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'SVM_pred' as categorical


In [None]:
# it took 6:30 for unsupervised training of scvi
# it took 5:30 for semi supervised training of scanvi


In [81]:
import scvi
full = setup_dataset(input_anndata, tabula_sapiens_filepath, tissue, use_10X_only, batch_correction_conditions)
sc.pp.highly_variable_genes(full, flavor='seurat_v3', subset=True, n_top_genes=4000)

train_data = full[full.obs['_dataset'] == 'tabula_sapiens'].copy()
test_data = full[full.obs['_dataset'] == 'user'].copy()
    

  if not is_categorical(df_full[k]):
Trying to set attribute `.obs` of view, copying.
  df.loc[: int(n_top_genes), 'highly_variable'] = True
  if not is_categorical(df_full[k]):


In [82]:
np.unique(train_data.X[:100].todense())

matrix([[ 0.,  0.,  0., ..., 68., 76., 83.]], dtype=float32)

In [143]:
import numpy as np

def subsample_dataset(train_data, labels_key, n_samples):
    sample_idx = []
    labels, counts = np.unique(train_data.obs[labels_key], return_counts=True)
    for i, label in enumerate(labels):
        label_locs = np.where(train_data.obs[labels_key] == label)[0]        
        if counts[i] < n_samples:
            sample_idx.append(label_locs)
        else:
            label_subset = np.random.choice(label_locs, n_samples, replace = False)
            sample_idx.append(label_subset)
    sample_idx = np.concatenate(sample_idx)
    return sample_idx

In [144]:
sample_idx = subsample_dataset(train_data, labels_key = 'manual_annotation', n_samples = 100)

In [147]:
train_data.obs['scanvi_labels'] = 'unknown'
train_data.obs['scanvi_labels'][sample_idx] = train_data.obs['manual_annotation'][sample_idx]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [153]:
scvi.data.setup_anndata(train_data, batch_key = 'donor_method', labels_key = 'scanvi_labels')

[34mINFO[0m      Using batches from adata.obs[1m[[0m[32m"donor_method"[0m[1m][0m                                       
[34mINFO[0m      Using labels from adata.obs[1m[[0m[32m"scanvi_labels"[0m[1m][0m                                       
[34mINFO[0m      Using data from adata.X                                                            
[34mINFO[0m      Computing library size prior per batch                                             
[34mINFO[0m      Successfully registered anndata object containing [1;34m65662[0m cells, [1;34m4000[0m vars, [1;34m3[0m        
          batches, [1;34m43[0m labels, and [1;34m0[0m proteins. Also registered [1;34m0[0m extra categorical covariates 
          and [1;34m0[0m extra continuous covariates.                                                 
[34mINFO[0m      Please do not further modify adata until model is trained.                         


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):


In [155]:
model = scvi.model.SCANVI(train_data,
                          unlabeled_category = 'unknown', 
                          use_cuda = True, 
                          n_layers=3, 
                          n_latent=50, 
                          dispersion='gene-batch')

In [162]:
unsupervised_trainer_kwargs = dict(data_loader_kwargs=dict(pin_memory=True))
semisupervised_trainer_kwargs = dict(data_loader_kwargs=dict(pin_memory=True))
semisupervised_train_kwargs = dict(batch_size = 1024)
unsupervised_train_kwargs = dict(batch_size = 1024)

In [163]:
model.train(train_size=1.0,n_epochs_kl_warmup=10,unsupervised_trainer_kwargs=unsupervised_trainer_kwargs ,
            semisupervised_trainer_kwargs = semisupervised_trainer_kwargs,
            semisupervised_train_kwargs=semisupervised_train_kwargs,
            unsupervised_train_kwargs = unsupervised_train_kwargs)


[34mINFO[0m      Training Unsupervised Trainer for [1;34m122[0m epochs.                                      
[34mINFO[0m      Training SemiSupervised Trainer for [1;34m10[0m epochs.                                     
[34mINFO[0m      KL warmup for [1;34m10[0m epochs                                                            
Training...:   1%|          | 1/122 [04:50<9:46:37, 290.89s/it]
Training...:   4%|▍         | 5/122 [19:05<7:23:41, 227.54s/it]

KeyboardInterrupt: 

In [11]:
full = setup_dataset(input_anndata, tabula_sapiens_filepath, tissue, use_10X_only, batch_correction_conditions)


Trying to set attribute `.obs` of view, copying.


In [14]:
full.X

<102665x4000 sparse matrix of type '<class 'numpy.float32'>'
	with 24318139 stored elements in Compressed Sparse Row format>

In [12]:
np.unique(full.obs['_dataset'])

array(['user'], dtype=object)

In [13]:
input_anndata

'data/Lung_test.h5ad'

In [15]:
tabula_sapiens_filepath


'data/Lung_ref.h5ad'