# AIDO.Cell Quickstart

This demo walks through installing AIDO.Cell and using it to embed new single-cell data.

__Requirements__:
- A100 GPU or equivalent

### Install ModelGenerator and extra dependencies for tutorials

In [None]:
!git clone https://github.com/genbio-ai/ModelGenerator.git
%cd ModelGenerator
!pip install -e .
!pip install -r experiments/AIDO.Cell/requirements.txt

### Grab some data from GEO and load into anndata

In [None]:
%%bash
# Download the GSE214695 archive from GEO and unpack it into
# ~/ModelGenerator/experiments/AIDO.Cell/data.
# Stop at the first failing command so a failed cd or download is reported by the cell
# instead of being passed on to tar and then hidden by a successful last command.
set -euo pipefail
cd ~/ModelGenerator/experiments/AIDO.Cell
mkdir -p data
cd data
# Fetch over HTTPS rather than plain HTTP.
wget -nv -O GSE214695.tar 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE214695&format=file'
tar -xvf GSE214695.tar

In [None]:
from pathlib import Path

import anndata as ad
import scanpy as sc

# The download cell unpacks the GEO files under ~/ModelGenerator/experiments/AIDO.Cell/data.
# Prefer that explicit location so loading does not depend on the kernel's current working
# directory; fall back to the relative 'data' directory when it is not there.
data_dir = Path.home() / 'ModelGenerator' / 'experiments' / 'AIDO.Cell' / 'data'
if not data_dir.is_dir():
    data_dir = Path('data')

# Read the 10x matrix files whose names start with the given sample prefix.
adata = sc.read_10x_mtx(data_dir, prefix='GSM6614348_HC-1_')
# Quality filtering (both calls modify `adata` in place): keep cells with at least 500
# detected genes and genes detected in at least 3 cells.
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3)
# No more normalization needed, AIDO.Cell uses raw counts

### Preprocess the anndata for AIDO.Cell

In [None]:
# NOTE(review): cell_utils is presumably the helper module that ships next to this notebook
# in experiments/AIDO.Cell, so it must be importable from the kernel's path — confirm.
import cell_utils
# Align the filtered AnnData to the AIDO.Cell gene set. According to the log this call prints,
# it removes genes AIDO.Cell cannot use, sets pretraining genes missing from the data to zero,
# reorders genes to the AIDO.Cell ordering, and gathers an attention mask for the non-zero genes.
# Returns the aligned AnnData and that attention mask.
aligned_adata, attention_mask = cell_utils.align_adata(adata)

###########  Aligning data to AIDO.Cell  ###########
AIDO.Cell was pretrained on a fixed set of 19264 genes.
Aligning your data to the AIDO.Cell gene set...
2428 in your data that cannot be used by AIDO.Cell. Removing these.
['A1BG-AS1' 'A2M-AS1' 'AAED1' ... 'ZNRD1' 'ZNRF3-AS1' 'ZSCAN16-AS1']
5837 genes in the AIDO.Cell pretraining set missing in your data.
AIDO.Cell is trained with zero-masking. Setting these to zero for AIDO.Cell to ignore.
['A2ML1' 'A3GALT2' 'A4GNT' ... 'ZSWIM5' 'ZYG11A' 'ZZZ3']
13427 non-zero genes remaining.
Reordering genes to match AIDO.Cell gene ordering
Gathering attention mask for nonzero genes
####################  Finished  ####################


### Get AIDO.Cell embeddings

In [None]:
# Embed
import anndata as ad
import numpy as np
import torch
import sys
from modelgenerator.tasks import Embed

device = 'cuda'
batch_size = 2

# Build the Embed task on the aido_cell_3m backbone, put it in eval mode,
# then move it to the GPU and cast it to half precision.
model = Embed.from_config({
        "model.backbone": "aido_cell_3m",
        "model.batch_size": batch_size
    }).eval()
model = model.to(device).to(torch.float16)

# Take the first `batch_size` cells, densify the count matrix, and match the model's
# dtype and device.
batch_np = aligned_adata[:batch_size].X.toarray()
batch_tensor = torch.from_numpy(batch_np).to(torch.float16).to(device)

# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    batch_transformed = model.transform({'sequences': batch_tensor})
    full_embs = model(batch_transformed)

# Full Embeddings
print('FULL EMBEDDING')
print('(batch_size, genes, embedding_dim)')
print(full_embs.shape)
print(full_embs)
print('-------------------------------------')

# Non-Zero Genes Embeddings
# Keep only the gene positions flagged in the attention mask returned by align_adata.
# `embs` ends up bound to this filtered tensor, as before.
print('NON-ZERO GENES EMBEDDING')
embs = full_embs[:, attention_mask.astype(bool), :]
print('(batch_size, genes, embedding_dim)')
print(embs.shape)
print(embs)

FULL EMBEDDING
(batch_size, genes, embedding_dim)
torch.Size([2, 19264, 128])
tensor([[[-2.0430,  0.4229, -1.6641,  ..., -0.9346,  0.3691,  1.6074],
         [-0.6450, -1.9004, -2.7969,  ..., -1.5557,  0.9419, -0.5210],
         [-1.0693, -1.5303, -0.9526,  ..., -0.6470,  0.6484,  0.8975],
         ...,
         [ 0.5708, -1.8574, -2.6406,  ..., -0.3594, -0.2087,  0.9453],
         [ 0.0121,  0.0419,  0.3096,  ..., -0.4370,  1.3516, -0.4097],
         [-1.1113, -1.5303, -1.0635,  ..., -1.0801,  1.4648, -0.9688]],

        [[-2.2988,  1.0430, -2.3164,  ..., -0.2478,  0.5171,  0.1464],
         [-0.8042, -1.9922, -2.7480,  ..., -1.4678,  0.6299, -0.7510],
         [-0.0687, -2.2207, -0.0922,  ..., -1.4395,  0.0156,  0.8447],
         ...,
         [ 0.0627, -1.3369, -2.4355,  ..., -0.0134,  0.0335,  1.0449],
         [ 0.1595,  0.0429,  0.3174,  ..., -0.1583,  1.0918, -0.3188],
         [-0.6709, -1.0010, -1.5508,  ..., -1.0186,  0.9917, -0.7573]]],
       device='cuda:0', dtype=torch.fl