In [1]:
import os
import celltypist
import scanpy
import subprocess
import polars as pl

# Ask Git for the top-level directory of the repository this notebook lives in.
git_toplevel = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    stdout=subprocess.PIPE,
    check=True,  # raise CalledProcessError if not inside a Git work tree
)
git_root = git_toplevel.stdout.strip().decode("utf-8")

# Switch the working directory to the repository root so that the relative
# "data/..." paths used by the following cells resolve from there.
os.chdir(git_root)

In [2]:
# Train a CellTypist model on the T-cell subtypes of the reference dataset
# and save it to data/205.sub_cluster/T_model.pkl.

# Read the reference data.
adata_reference = scanpy.read("data/201.load_sc/reference.h5ad")

# Table whose first column lists the 'annotation' labels to keep.
T_cells = pl.read_csv("data/205.sub_cluster/T_markers_ct.tsv", separator='\t')
T_cells = T_cells.to_numpy()

# Subset BEFORE normalising. normalize_total (with a fixed target_sum) and
# log1p act on each cell / each value independently, so the retained cells end
# up with exactly the same values, but no work is spent on cells that are
# thrown away. A single combined mask plus .copy() also hands celltypist a real
# AnnData object rather than a view of a view.
is_t_cell = adata_reference.obs['broad_type'] == 'T-cell'
is_selected_annotation = adata_reference.obs['annotation'].isin(T_cells[:, 0])
adata_reference = adata_reference[is_t_cell & is_selected_annotation].copy()

# Normalise each cell to 10,000 total counts, then log1p-transform.
# NOTE(review): assumes reference.h5ad stores raw counts in .X — confirm.
scanpy.pp.normalize_total(adata_reference, target_sum=1e4)
scanpy.pp.log1p(adata_reference)

# Train the celltypist model, using 'annotation' as the label column.
T_model = celltypist.train(adata_reference, labels='annotation', n_jobs=40, feature_selection=True)

# Create the output directory if it doesn't exist.
os.makedirs("data/205.sub_cluster", exist_ok=True)

# Save the trained model.
T_model.write("data/205.sub_cluster/T_model.pkl")

🍳 Preparing data before training
✂️ 424 non-expressed genes are filtered out
🔬 Input data has 126632 cells and 19312 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 2082 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression
✅ Model training done!


In [None]:
# Annotate the query T cells with the trained model, one dataset at a time.

adata_sc = scanpy.read("data/201.load_sc/sc_pre.h5ad")

# The first column of this table holds the obs index values (cell names) to keep.
T_cell_names = pl.read_csv("data/204.cluster/T_cell_names.tsv", separator="\t")
T_cell_names = T_cell_names.to_numpy()
# Filter to the cells listed in T_cell_names.
adata_sc = adata_sc[adata_sc.obs.index.isin(T_cell_names[:, 0])]

# Load the model once up front; passing the path to annotate() would re-read
# the pickle from disk for every dataset in the loop.
T_model_loaded = celltypist.models.Model.load(model="data/205.sub_cluster/T_model.pkl")

# Start from an empty list on every run so re-executing this cell never mixes
# in results from a previous execution.
predictions_list = []

for dataset in adata_sc.obs["dataset"].unique():
    # Slicing returns a view; take an explicit copy so the in-place
    # normalisation below operates on its own data instead of triggering an
    # implicit view-to-copy conversion.
    adata_subset = adata_sc[adata_sc.obs["dataset"] == dataset, :].copy()
    scanpy.pp.normalize_total(adata_subset, target_sum=1e4)
    scanpy.pp.log1p(adata_subset)
    predictions = celltypist.annotate(
        adata_subset, model=T_model_loaded, majority_voting=True
    )
    # Keep only the per-cell label table of each result.
    predictions_list.append(predictions.predicted_labels)


🔬 Input data has 22167 cells and 44806 genes
🔗 Matching reference genes in the model
🧬 2082 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 15
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 5749 cells and 44806 genes
🔗 Matching reference genes in the model
🧬 2082 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 785 cells and 44806 genes
🔗 Matching reference genes in the model
🧬 2082 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct 

ShapeError: unable to append to a DataFrame of width 3 with a DataFrame of width 1

In [47]:
# Keep only the label tables that have more than one column. Concatenating
# tables of different widths fails with a polars ShapeError, so single-column
# tables are discarded here.
# NOTE(review): any cells in the discarded tables are absent from the result — confirm this is intended.
predictions_list = list(filter(lambda labels: labels.shape[1] > 1, predictions_list))

# Convert each pandas table to polars (carrying the index over as a column)
# and stack them vertically into a single frame.
all_predictions = pl.concat(
    [pl.from_pandas(labels, include_index=True) for labels in predictions_list]
)

In [48]:
# Name the first column "cell_name". It is selected by position rather than by
# the literal name "None": that name only appears when the pandas index was
# unnamed, and it no longer exists once this cell has run, so a by-name rename
# would fail on a named index or on a second execution.
# NOTE(review): assumes the index carried over from pandas is the first column — confirm.
first_column = all_predictions.columns[0]
all_predictions = all_predictions.rename({first_column: "cell_name"})
all_predictions.write_csv("data/205.sub_cluster/predicted_labels_T.csv")