In [2]:
import os
import celltypist
import scanpy
import subprocess

# Ask Git for the top-level directory of the repository that contains the
# current working directory; check_output returns the printed path as bytes.
rev_parse_output = subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
git_root = rev_parse_output.strip().decode('utf-8')

# Switch the working directory to the repository root so that the relative
# "data/..." paths used further down resolve from there.
os.chdir(git_root)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the reference data
adata_reference = scanpy.read("data/201.load_sc/reference.h5ad")

# Filter out broad_type == 'RCC' cells *before* preprocessing, so no time is
# spent normalising cells that are discarded anyway. normalize_total (with its
# default settings) and log1p act on each cell independently, so the values of
# the retained cells are the same as when filtering afterwards.
# .copy() materialises the subset: slicing an AnnData yields a view, and the
# in-place preprocessing below should run on a real object.
keep_cells = adata_reference.obs['broad_type'] != 'RCC'
adata_reference = adata_reference[keep_cells, :].copy()

# Scale every cell to 10,000 total counts, then log1p-transform (both in place).
scanpy.pp.normalize_total(adata_reference, target_sum=1e4)
scanpy.pp.log1p(adata_reference)

# Train the celltypist model.
# feature_selection=True makes training a two-round procedure (an SGD round to
# pick features, then a second round on the selected genes — see the log output
# of this cell). That procedure is stochastic, so a fixed seed is passed to make
# the saved model reproducible between runs.
# NOTE(review): random_state relies on celltypist.train forwarding extra
# keyword arguments to the underlying scikit-learn classifiers — confirm
# against the installed celltypist version.
rcc_model = celltypist.train(
    adata_reference,
    labels='broad_type',
    n_jobs=40,
    feature_selection=True,
    random_state=0,
)

# Create the directory if it doesn't exist
os.makedirs("data/202.annotation", exist_ok=True)

# Save the trained model
rcc_model.write("data/202.annotation/rcc_model.pkl")

🍳 Preparing data before training
✂️ 12 non-expressed genes are filtered out
🔬 Input data has 256456 cells and 19724 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 1770 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression


In [3]:
# Inputs for the annotation step: the query dataset and the model file that
# was written to data/202.annotation/ by the training step.
query_h5ad = "data/201.load_sc/sc_pre.h5ad"
trained_model_pkl = "data/202.annotation/rcc_model.pkl"

# Load the query cells and apply the same preprocessing as for the reference:
# scale every cell to 10,000 total counts, then log1p-transform (in place).
adata_sc = scanpy.read(query_h5ad)
scanpy.pp.normalize_total(adata_sc, target_sum=1e4)
scanpy.pp.log1p(adata_sc)

# Predict a label per cell with the saved model; majority_voting=True adds the
# over-clustering + majority-vote refinement reported in the cell's log output.
predictions = celltypist.annotate(adata_sc, model=trained_model_pkl, majority_voting=True)

🔬 Input data has 404408 cells and 44806 genes
🔗 Matching reference genes in the model
🧬 1770 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 30
🗳️ Majority voting the predictions
✅ Majority voting done!


In [5]:
# Write the predicted-label table to CSV, keeping both the row index and the
# column header in the file.
labels_csv = "data/202.annotation/predicted_labels.csv"
predicted_labels = predictions.predicted_labels
predicted_labels.to_csv(labels_csv, index=True, header=True)