In [1]:
# Data source (download the .h5ad from this CELLxGENE collection):
# https://cellxgene.cziscience.com/collections/d5d0df8f-4eee-49d8-a221-a288f50a1590

# Load the dataset for inspection.
import scanpy as sc
import numpy as np
import pandas as pd

# Location of the downloaded .h5ad file read below.
H5AD_PATH = '/data/project/kim89/search_new_data/bone.h5ad'
adata = sc.read_h5ad(H5AD_PATH)

In [2]:
# Overview of the AnnData container. Each entry is printed as a heading,
# the listing itself, and a trailing blank line.
#   obs    - per-cell metadata columns
#   var    - per-gene metadata columns
#   obsm   - low-dimensional embeddings (PCA, UMAP, ...)
#   layers - additional data layers
overview = [
    ("✅ obs columns:", adata.obs.columns.tolist()),
    ("✅ var columns:", adata.var.columns.tolist()),
    ("✅ obsm keys:", list(adata.obsm.keys())),
    ("✅ layers keys:", list(adata.layers.keys())),
]
for heading, listing in overview:
    print(heading)
    print(listing)
    print()

# Report whether a .raw attribute is attached and, if it is, its shape.
has_raw = adata.raw is not None
print("✅ raw exists:")
print(has_raw)
if has_raw:
    print("raw shape:", adata.raw.shape)
print()

# Type and shape of the main X matrix.
x_matrix = adata.X
print("✅ X matrix type:", type(x_matrix))
print("✅ X shape:", x_matrix.shape)

✅ obs columns:
['AuthorCellType', 'AuthorCellType_Broad', 'Shannon.Diversity.Normalized', 'nCount_RNA', 'nFeature_RNA', 'Study', 'donor_id', 'Sorting', 'S.Score', 'G2M.Score', 'CyclePhase', 'scrublet_scores', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'self_reported_ethnicity_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'cell_type_ontology_term_id', 'Donor_Age_Group', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid']

✅ var columns:
['HCA_Hay2018', 'Oetjen2018', 'Granja2019', 'Mende2022', 'Setty2019', 'Ainciburu2023', 'HVG_intersect3000', 'nCells_Detected', 'nDatasets_Detected', 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type']

✅ obsm keys:
['X_harmony', 'X_pca', 'X_umap']

✅ layers keys:
[]

✅ raw exists:
True
ra

In [None]:
# Total number of cells (first dimension of the AnnData object).
total_cells = adata.shape[0]
print(f"Total cell counts : {total_cells:,}")

# Cells per sample, using donor_id as the sample identifier.
# value_counts() on a categorical column also lists categories that have
# zero rows (e.g. after adata has been subset); drop those so empty donors
# cannot pull the per-sample average down.
sample_counts = adata.obs['donor_id'].value_counts()
sample_counts = sample_counts[sample_counts > 0]

# Average number of cells per sample.
avg_cells_per_sample = sample_counts.mean()
print(f"\nAvg. cells per sample : {avg_cells_per_sample:.2f}")

# Distinct values of the `cell_type` column.
ontology_cell_types = adata.obs['cell_type'].unique().tolist()
print(f"\nNumber of unique cell types : {len(ontology_cell_types)}")

# Distinct values of the `AuthorCellType` column. This count gets its own
# label so it cannot be confused with the `cell_type` count printed above.
author_cell_types = adata.obs['AuthorCellType'].unique().tolist()
print(f"\nNumber of unique AuthorCellType labels : {len(author_cell_types)}")

# The original cell left `cell_types` bound to the AuthorCellType list;
# keep that binding for any later cell that still reads it.
cell_types = author_cell_types

# Candidate class labels (disease status; here taken from the 'disease' column).
classes = adata.obs['disease'].unique().tolist()
print("\nClasses :")
print(classes)


Total cell counts : 263,159

Avg. cells per sample : 5847.98

Cell types :
['small pre-B-II cell', 'immature B cell', 'late pro-B cell', 'large pre-B-II cell', 'common lymphoid progenitor', 'hematopoietic multipotent progenitor cell', 'mature B cell', 'pro-B cell', 'basophilic erythroblast', 'plasmacytoid dendritic cell, human', 'common myeloid progenitor', 'megakaryocyte progenitor cell', 'central memory CD4-positive, alpha-beta T cell', 'natural killer cell', 'T cell', 'plasma cell', 'basophil mast progenitor cell', 'granulocyte monocyte progenitor cell', 'pre-conventional dendritic cell', 'hematopoietic oligopotent progenitor cell', 'hematopoietic stem cell', 'megakaryocyte-erythroid progenitor cell', 'erythroid progenitor cell', 'promonocyte', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'CD16-negative, CD56-bright natural killer cell, human', 'effector memory CD8-positive, alpha-beta T cell', 'CD8-positive, alpha-beta memory T cell', 'effector memory CD4-positive, alpha

In [None]:
# List every obs column, then show the distinct values of selected columns.
print(adata.obs.columns)

# (header, obs column) pairs, printed in this order. Headers are kept
# verbatim, including which of them start with a blank line.
columns_to_inspect = [
    ("\n🔹 disease unique values:", 'disease'),
    ("🔹 donor_id unique values:", 'donor_id'),
    ("\n🔹 AuthorCellType unique values:", 'AuthorCellType'),
    ("\n🔹 cell_type_ontology_term_id unique values:", 'cell_type_ontology_term_id'),
    ("\n🔹 cell_type unique values:", 'cell_type'),
]
for header, column in columns_to_inspect:
    print(header)
    print(adata.obs[column].unique())

Index(['AuthorCellType', 'AuthorCellType_Broad',
       'Shannon.Diversity.Normalized', 'nCount_RNA', 'nFeature_RNA', 'Study',
       'donor_id', 'Sorting', 'S.Score', 'G2M.Score', 'CyclePhase',
       'scrublet_scores', 'assay_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'is_primary_data', 'self_reported_ethnicity_ontology_term_id',
       'suspension_type', 'tissue_ontology_term_id', 'sex_ontology_term_id',
       'cell_type_ontology_term_id', 'Donor_Age_Group', 'tissue_type',
       'cell_type', 'assay', 'disease', 'sex', 'tissue',
       'self_reported_ethnicity', 'development_stage', 'observation_joinid'],
      dtype='object')

🔹 disease unique values:
['normal']
Categories (1, object): ['normal']
🔹 donor_id unique values:
['HCA_BM_BM2', 'HCA_BM_BM4', 'HCA_BM_BM8', 'HCA_BM_BM1', 'HCA_BM_BM6', ..., 'Oetjen2018_S', 'Oetjen2018_Sk', 'Oetjen2018_T', 'Oetjen2018_U', 'Oetjen2018_W']
Length: 45
Categories (45, object): ['Ainciburu202

In [None]:

# Per-donor cell counts and label distribution.
#
# Column names as they exist in this dataset's obs table. The previous
# version of this cell read 'patient' and 'disease__ontology_label', which
# are not among this dataset's obs columns, so it failed with KeyError.
DONOR_COLUMN = 'donor_id'
DISEASE_COLUMN = 'disease'
# Donors with fewer cells than this are reported and then excluded from the
# final label distribution.
MIN_CELLS_PER_DONOR = 500

print(adata.obs[DISEASE_COLUMN].value_counts())
# Per-cell donor ID and disease label.
patient_ids = adata.obs[DONOR_COLUMN]
labels = adata.obs[DISEASE_COLUMN]

# Number of cells per donor.
patient_counts = patient_ids.value_counts()

# Donors below the cell-count threshold.
under_500 = patient_counts[patient_counts < MIN_CELLS_PER_DONOR]

print(f"{MIN_CELLS_PER_DONOR}개 미만 셀을 가진 환자 수:", len(under_500))
print(under_500)

# Integer-encode the disease label. Only 'normal' was observed in this
# dataset's `disease` column; the cardiomyopathy entries are kept unchanged
# from the original mapping and are simply unused here.
label_map = {
    'normal': 0,
    'hypertrophic cardiomyopathy': 1,
    'dilated cardiomyopathy': 2
}
# NOTE: this writes a new column into adata.obs (same as the original cell).
adata.obs['label_mapped'] = labels.map(label_map)

# Values missing from label_map become NaN in `label_mapped`; report them
# instead of letting them disappear silently from the distribution below.
unmapped = sorted(set(labels.dropna().unique()) - set(label_map))
if unmapped:
    print("WARNING: disease values without an integer label:", unmapped)

# One label per donor (first non-null label among that donor's cells).
# observed=True restricts the groups to donors that actually have cells.
df = pd.DataFrame({'patient': patient_ids, 'label': adata.obs['label_mapped']})
patient_label = df.groupby('patient', observed=True)['label'].first()

# Label distribution over donors that meet the cell-count threshold.
print(f"label 분포 ({MIN_CELLS_PER_DONOR}개 이상 셀 보유한 환자 기준):")
print(patient_label[~patient_label.index.isin(under_500.index)].value_counts())