In [4]:
import os
import celltypist
import scanpy
import subprocess

# Ask Git for the top-level directory of the repository that contains the
# current working directory.
git_toplevel_cmd = ["git", "rev-parse", "--show-toplevel"]
git_toplevel_raw = subprocess.check_output(git_toplevel_cmd)

# The command prints the path followed by a newline: trim the raw bytes first,
# then decode them into a str.
git_root = git_toplevel_raw.strip().decode('utf-8')

# Switch the working directory to the repository root so that the relative
# "data/..." paths used by the following cells are resolved from there.
os.chdir(git_root)

In [8]:
# Load the labelled reference dataset that the classifier is fitted on.
adata_reference = scanpy.read("data/201.load_sc/reference.h5ad")

# Rescale every cell to a total of 1e4 counts and log1p-transform the result.
# Both calls modify `adata_reference` in place.
scanpy.pp.normalize_total(adata_reference, target_sum=1e4)
scanpy.pp.log1p(adata_reference)

# Fit a CellTypist model that predicts the `broad_type` label, with feature
# selection enabled and 40 parallel jobs requested.
rcc_model = celltypist.train(
    adata_reference,
    labels='broad_type',
    n_jobs=40,
    feature_selection=True,
)

# Ensure the output folder is present (no error if it already exists), then
# store the fitted model inside it.
os.makedirs("data/202.annotation", exist_ok=True)
rcc_model.write("data/202.annotation/rcc_model.pkl")

🍳 Preparing data before training
✂️ 10 non-expressed genes are filtered out
🔬 Input data has 270855 cells and 19726 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 1836 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression
✅ Model training done!


In [11]:
# Load the query dataset and give it the same preprocessing that was applied
# to the reference before training: 1e4 total counts per cell, then log1p.
adata_sc = scanpy.read("data/201.load_sc/sc_pre.h5ad")
scanpy.pp.normalize_total(adata_sc, target_sum=1e4)
scanpy.pp.log1p(adata_sc)

# Annotate the query cells with the model file written by the training cell,
# with majority voting switched on.
trained_model_path = "data/202.annotation/rcc_model.pkl"
predictions = celltypist.annotate(
    adata_sc,
    model=trained_model_path,
    majority_voting=True,
)

🔬 Input data has 505629 cells and 44806 genes
🔗 Matching reference genes in the model
🧬 1836 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
  from .autonotebook import tqdm as notebook_tqdm
⛓️ Over-clustering input data with resolution set to 30
🗳️ Majority voting the predictions
✅ Majority voting done!


In [19]:
# Export the annotation table to CSV, keeping both the row index and the
# column header line in the file.
predicted_labels_csv = "data/202.annotation/predicted_labels.csv"
predictions.predicted_labels.to_csv(predicted_labels_csv, index=True, header=True)

In [17]:
# Display the annotation table as the cell output; the rendered columns are
# predicted_labels, over_clustering and majority_voting, one row per cell ID.
predictions.predicted_labels

Unnamed: 0,predicted_labels,over_clustering,majority_voting
PD43824_5739STDY7958805_AAACGGGAGCTGTCTA-1,B-cell,76,B-cell
PD43824_5739STDY7958805_AACACGTGTAAACACA-1,B-cell,76,B-cell
PD43824_5739STDY7958805_AACCATGTCAGTCCCT-1,Plasma,192,Plasma
PD43824_5739STDY7958805_AAGACCTGTGCAGTAG-1,B-cell,357,B-cell
PD43824_5739STDY7958805_AAGGCAGCAAACAACA-1,B-cell,76,B-cell
...,...,...,...
GSM7028039_RCC5_TTTGTCACAAGGGTCA-1,RCC,36,RCC
GSM7028039_RCC5_TTTGTCAGTACAGCAG-1,RCC,256,RCC
GSM7028039_RCC5_TTTGTCAGTGCTTCTC-1,RCC,161,RCC
GSM7028039_RCC5_TTTGTCAGTTATCCGA-1,Myeloid,438,Myeloid
