# Celltype Classification with Transcriptformer

This notebook showcases how the Transcriptformer embeddings can be used to train a classifier for cell type classification.

In [1]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os

import hydra
from omegaconf import DictConfig, OmegaConf


from transcriptformer.model.inference import run_inference
from transcriptformer.datasets import tabula_sapiens
import yaml

In [2]:
# !python ./../download_artifacts.py tf-sapiens

In [3]:
# --- Dataset and base inference configuration --------------------------------
adata = tabula_sapiens(tissue="ear", version="v2")
cfg = OmegaConf.load("./../conf/inference_config.yaml")
logging.debug(OmegaConf.to_yaml(cfg))

# Every checkpoint artifact referenced below lives under this directory.
cfg.model.checkpoint_path = "./checkpoints/tf_sapiens"
checkpoint_dir = cfg.model.checkpoint_path

# --- Configuration stored alongside the checkpoint ---------------------------
# Read the checkpoint's config.json and wrap it in an OmegaConf object.
config_path = os.path.join(checkpoint_dir, "config.json")
with open(config_path) as f:
    config_dict = json.load(f)
mlflow_cfg = OmegaConf.create(config_dict)

# Overlay the YAML config on top of the checkpoint config: `cfg` is passed
# last, so its values take priority wherever both define the same key.
cfg = OmegaConf.merge(mlflow_cfg, cfg)

# --- Artifact paths derived from the single checkpoint directory -------------
weights_file = os.path.join(checkpoint_dir, "model_weights.pt")
vocab_dir = os.path.join(checkpoint_dir, "vocabs")

cfg.model.inference_config.load_checkpoint = weights_file
# Both vocabulary-related settings point at the same "vocabs" folder.
cfg.model.data_config.aux_vocab_path = vocab_dir
cfg.model.data_config.esm2_mappings_path = vocab_dir


In [12]:
# Copy the gene identifiers from the var index into an explicit
# `feature_id` column of adata.var.
gene_ids = adata.var_names.values
adata.var["feature_id"] = gene_ids

# Use the "decontXcounts" layer as the main expression matrix (adata.X).
decontx_counts = adata.layers["decontXcounts"]
adata.X = decontx_counts

In [15]:
# Display the `feature_id` column of adata.var to inspect the gene identifiers.
adata.var["feature_id"]

ensg
ENSG00000000003    ENSG00000000003
ENSG00000000419    ENSG00000000419
ENSG00000000457    ENSG00000000457
ENSG00000000460    ENSG00000000460
ENSG00000000938    ENSG00000000938
                        ...       
ENSG00000290117    ENSG00000290117
ENSG00000290122    ENSG00000290122
ENSG00000290126    ENSG00000290126
ENSG00000290127    ENSG00000290127
ENSG00000290146    ENSG00000290146
Name: feature_id, Length: 21897, dtype: object

In [14]:
# Raise the root logger's threshold to ERROR so the inference run is less verbose.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

# Run inference on the in-memory AnnData object (passed as a one-element list).
# NOTE(review): the output saved with this notebook shows this call raising
# "ValueError: No valid data was loaded from any files...". The gene IDs printed
# just before the error carry version suffixes (e.g. 'ENSG00000000003.15') while
# the printed vocabulary keys do not (e.g. 'ENSG00000000003') -- presumably the
# gene-ID column that run_inference reads must hold un-versioned IDs; confirm
# against the data config before relying on this cell.
inference_inputs = [adata]
adata_output = run_inference(cfg, data_files=inference_inputs)

['ENSG00000000003.15' 'ENSG00000000419.14' 'ENSG00000000457.14' ...
 'ENSG00000290126.1' 'ENSG00000290127.1' 'ENSG00000290146.1']
{'unknown': 0, '[PAD]': 1, '[START]': 2, '[END]': 3, '[RD]': 4, '[CELL]': 5, '[MASK]': 6, 'ENSG00000000003': 7, 'ENSG00000000005': 8, 'ENSG00000000419': 9, 'ENSG00000000457': 10, 'ENSG00000000460': 11, 'ENSG00000000938': 12, 'ENSG00000000971': 13, 'ENSG00000001036': 14, 'ENSG00000001084': 15, 'ENSG00000001167': 16, 'ENSG00000001460': 17, 'ENSG00000001461': 18, 'ENSG00000001497': 19, 'ENSG00000001561': 20, 'ENSG00000001617': 21, 'ENSG00000001626': 22, 'ENSG00000001629': 23, 'ENSG00000001630': 24, 'ENSG00000001631': 25, 'ENSG00000002016': 26, 'ENSG00000002330': 27, 'ENSG00000002549': 28, 'ENSG00000002586': 29, 'ENSG00000002587': 30, 'ENSG00000002726': 31, 'ENSG00000002745': 32, 'ENSG00000002746': 33, 'ENSG00000002822': 34, 'ENSG00000002834': 35, 'ENSG00000002919': 36, 'ENSG00000002933': 37, 'ENSG00000003056': 38, 'ENSG00000003096': 39, 'ENSG00000003137': 40, '

ValueError: No valid data was loaded from any files. Check if files exist and contain valid data after filtering.