In [1]:
# Dataset source (download from CELLxGENE):
# https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f

# Load the downloaded .h5ad file so its contents can be inspected below.
import scanpy as sc
import numpy as np
import pandas as pd

h5ad_path = '/data/project/kim89/search_new_data/kidney.h5ad'
adata = sc.read_h5ad(h5ad_path)

In [2]:
# For each metadata slot print a header, its listing, and a trailing blank line.
for header, listing in (
    # obs: per-cell metadata columns
    ("✅ obs columns:", adata.obs.columns.tolist()),
    # var: per-gene metadata columns
    ("✅ var columns:", adata.var.columns.tolist()),
    # obsm: low-dimensional embeddings (PCA, UMAP, ...)
    ("✅ obsm keys:", list(adata.obsm.keys())),
    # layers: additional data layers
    ("✅ layers keys:", list(adata.layers.keys())),
):
    print(header)
    print(listing)
    print()

# raw: report whether a raw matrix is attached and, if so, its shape
raw_view = adata.raw
has_raw = raw_view is not None
print("✅ raw exists:")
print(has_raw)
if has_raw:
    print("raw shape:", raw_view.shape)
print()

# X: container type and dimensions of the main matrix
x_matrix = adata.X
print("✅ X matrix type:", type(x_matrix))
print("✅ X shape:", x_matrix.shape)

✅ obs columns:
['library_id', 'nCount_RNA', 'nFeature_RNA', 'percent.er', 'percent.mt', 'subclass.l2', 'subclass.l1', 'class', 'experiment_id', 'suspension_type', 'assay_ontology_term_id', 'donor_id', 'specimen', 'disease_category', 'disease_ontology_term_id', 'eGFR', 'diabetes_history', 'hypertension', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'region', 'percent.cortex', 'percent.medulla', 'tissue_type', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'Age_binned', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid']

✅ var columns:
['feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type']

✅ obsm keys:
['X_umap']

✅ layers keys:
[]

✅ raw exists:
True
raw shape: (304989, 36368)

✅ X matrix type: <class 'scipy.sparse._csr.csr_matrix'>
✅ X shape: (304989, 36368)


In [19]:
# 📌 Total number of cells (number of observations in adata)
total_cells = adata.n_obs
print(f"Total cell counts : {total_cells:,}")

# 📌 Number of samples and cells per sample (samples keyed by donor_id)
donor_ids = adata.obs['donor_id']
sample_counts = donor_ids.value_counts()
print("\nSamples :", sample_counts, sep="\n")

# 📌 Mean number of cells per sample
avg_cells_per_sample = sample_counts.mean()
print(f"\nAvg. cells per sample : {avg_cells_per_sample:.2f}")

# 📌 Cell types (list of unique values)
cell_type_labels = adata.obs['cell_type']
cell_types = cell_type_labels.unique().tolist()
print("\nCell types :", cell_types, sep="\n")
print(f"\nNumber of unique cell types : {len(cell_types)}")


Total cell counts : 304,989

Samples :
donor_id
3535        19257
3593        10434
KRP460       9119
18-312       7757
28-10109     7682
            ...  
29-10006      297
163-6         231
29-10401      198
163-5         118
28-12510      114
Name: count, Length: 80, dtype: int64

Avg. cells per sample : 3812.36

Cell types :
['kidney interstitial alternatively activated macrophage', 'kidney distal convoluted tubule epithelial cell', 'epithelial cell of proximal tubule', 'T cell', 'kidney loop of Henle thick ascending limb epithelial cell', 'kidney collecting duct intercalated cell', 'kidney interstitial fibroblast', 'kidney connecting tubule epithelial cell', 'kidney collecting duct principal cell', 'endothelial cell', 'B cell', 'mature NK T cell', 'mononuclear phagocyte', 'podocyte', 'parietal epithelial cell', 'mast cell', 'plasmacytoid dendritic cell, human', 'plasma cell', 'non-classical monocyte', 'conventional dendritic cell', 'kidney loop of Henle thin descending limb epithe

In [20]:
# 📌 Classes: values usable as labels (e.g. disease status); taken from the 'disease' column here
disease_labels = adata.obs['disease']
classes = disease_labels.unique().tolist()
print("\nClasses :", classes, sep="\n")

# 📌 Number of samples per disease (count of distinct donor_id values)
donors_by_disease = adata.obs.groupby('disease', observed=True)['donor_id']
disease_sample_counts = donors_by_disease.nunique()
print("\n📊 Disease-wise Sample Counts (donor_id):", disease_sample_counts, sep="\n")



Classes :
['normal', 'chronic kidney disease', 'acute kidney failure']

📊 Disease-wise Sample Counts (donor_id):
disease
acute kidney failure      16
chronic kidney disease    39
normal                    25
Name: donor_id, dtype: int64


In [None]:
# print(adata.obs.columns)


# # Inspect the values of each column
# print("\n🔹 disease unique values:")
# print(adata.obs['disease'].unique())


# print("🔹 donor_id unique values:")
# print(adata.obs['donor_id'].unique())


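# NOTE: 'AuthorCellType' is not among the obs columns printed for this dataset,
# so the lookup below would raise KeyError if uncommented.
# print("\n🔹 AuthorCellType unique values:")
# print(adata.obs['AuthorCellType'].unique())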


# print("\n🔹 cell_type_ontology_term_id unique values:")
# print(adata.obs['cell_type_ontology_term_id'].unique())


# print("\n🔹 cell_type unique values:")
# print(adata.obs['cell_type'].unique())

In [None]:

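# NOTE: this cell appears to be left over from a different dataset: 'patient' and
# 'disease__ontology_label' are not among the obs columns printed for this one.
# print(adata.obs['disease__ontology_label'].value_counts())
# # Extract patient IDs and label information
# patient_ids = adata.obs['patient']
# labels = adata.obs['disease__ontology_label']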

# # Count the number of cells per patient
# patient_counts = patient_ids.value_counts()

# # Select patients with fewer than 500 cells
# under_500 = patient_counts[patient_counts < 500]

# print("500개 미만 셀을 가진 환자 수:", len(under_500))
# print(under_500)

# # Check the distribution per label
# adata.obs['label_mapped'] = labels.map({
#     'normal': 0,
#     'hypertrophic cardiomyopathy': 1,
#     'dilated cardiomyopathy': 2
# })
# df = pd.DataFrame({'patient': patient_ids, 'label': adata.obs['label_mapped']})
# patient_label = df.groupby('patient')['label'].first()

# print("label 분포 (500개 이상 셀 보유한 환자 기준):")
# print(patient_label[~patient_label.index.isin(under_500.index)].value_counts())