In [21]:
# Download
# https://cellxgene.cziscience.com/collections/0f528c8a-a25c-4840-8fa3-d156fa11086f

# Load the downloaded dataset for inspection
import os

import scanpy as sc
import numpy as np
import pandas as pd

# Location of the .h5ad file. The original absolute path is kept as the default,
# so the cell behaves exactly as before unless the KIDNEY_H5AD_PATH environment
# variable is set (an empty value also falls back to the default).
KIDNEY_H5AD_PATH = (
    os.environ.get('KIDNEY_H5AD_PATH')
    or '/data/project/kim89/search_new_data/kidney.h5ad'
)
adata = sc.read_h5ad(KIDNEY_H5AD_PATH)

In [22]:
# Print a structural overview of `adata`, one titled section per component.
# Each entry pairs the header line with the value printed beneath it:
#   obs    - cell metadata
#   var    - gene metadata
#   obsm   - low-dimensional embeddings (PCA, UMAP, etc.)
#   layers - additional data layers
overview_sections = [
    ("✅ obs columns:", adata.obs.columns.tolist()),
    ("✅ var columns:", adata.var.columns.tolist()),
    ("✅ obsm keys:", list(adata.obsm.keys())),
    ("✅ layers keys:", list(adata.layers.keys())),
]
for section_header, section_value in overview_sections:
    print(section_header)
    print(section_value)
    print()

# raw: whether raw data is attached, and its shape when it is
has_raw = adata.raw is not None
print("✅ raw exists:")
print(has_raw)
if has_raw:
    print("raw shape:", adata.raw.shape)
print()

# X matrix info: container type and dimensions
x_matrix = adata.X
print("✅ X matrix type:", type(x_matrix))
print("✅ X shape:", x_matrix.shape)

✅ obs columns:
['orig.ident', 'nCount_RNA', 'SpecimenID', 'LibraryID', 'SampleID', 'Run', 'clusterNumber', 'subclass.l1', 'dataSource', 'diseasetype', 'Race', 'clusterClass', 'assay_ontology_term_id', 'sampletype', 'disease_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'tissue_ontology_term_id', 'tissue_type', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'Age_binned', 'cell_type_ontology_term_id', 'author_cell_type', 'diabetes_history', 'hypertension', 'eGFR', 'is_primary_data', 'disease_category', 'nFeature_RNA', 'percent.mt', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid']

✅ var columns:
['index', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type']

✅ obsm keys:
['X_pca', 'X_umap']

✅ layers keys:
[]

✅ raw exists:
True
raw shape: (225177, 31332)

✅ X matrix type: <class 'scipy.sparse._cs

In [23]:
# 📌 Total number of cells (first dimension of `adata`)
total_cells = adata.n_obs
print(f"Total cell counts : {total_cells:,}")

# 📌 Number of samples and number of cells in each sample (keyed by SampleID)
sample_id_column = adata.obs['SampleID']
sample_counts = sample_id_column.value_counts()
print("\nSamples :", sample_counts, sep="\n")

# 📌 Mean number of cells per sample
avg_cells_per_sample = sample_counts.mean()
print(f"\nAvg. cells per sample : {avg_cells_per_sample:.2f}")


Total cell counts : 225,177

Samples :
SampleID
29-10006    11461
33-10005     8904
31-10000     8870
28-12263     7527
31-10154     6568
            ...  
27-10053      728
55            597
124           574
33-10006      457
28-10051      300
Name: count, Length: 77, dtype: int64

Avg. cells per sample : 2924.38


In [27]:
# 📌 Cell types: list of the distinct values in the `cell_type` column
cell_type_column = adata.obs['cell_type']
cell_types = cell_type_column.unique().tolist()
print("\nCell types :", cell_types, sep="\n")
print(f"\nNumber of unique cell types : {len(cell_types)}")

# 📌 Author cell types: list of the distinct values in the `author_cell_type` column
author_cell_type_column = adata.obs['author_cell_type']
author_cell_types = author_cell_type_column.unique().tolist()
print("\nAuthor Cell types :", author_cell_types, sep="\n")
print(f"\nNumber of unique 'Author cell types' : {len(author_cell_types)}")


Cell types :
['kidney collecting duct principal cell', 'kidney interstitial cell', 'kidney loop of Henle thin descending limb epithelial cell', 'endothelial cell', 'epithelial cell of proximal tubule', 'kidney loop of Henle thick ascending limb epithelial cell', 'kidney loop of Henle thin ascending limb epithelial cell', 'kidney collecting duct intercalated cell', 'cytotoxic T cell', 'non-classical monocyte', 'conventional dendritic cell', 'T cell', 'kidney interstitial alternatively activated macrophage', 'mature NK T cell', 'plasma cell', 'mononuclear phagocyte', 'monocyte', 'B cell', 'natural killer cell', 'plasmacytoid dendritic cell, human', 'mast cell', 'kidney distal convoluted tubule epithelial cell', 'podocyte', 'parietal epithelial cell', 'kidney connecting tubule epithelial cell']

Number of unique cell types : 25

Author Cell types :
['IMCD', 'VSMC/P', 'dVSMC', 'PC', 'DTL1', 'EC-LYM', 'CNT-PC', 'cycEPI', 'aTAL1', 'ATL', 'EC-AEA', 'IC-A', 'T-CYT', 'EC-PTC', 'dPC', 'aPT', 'n

In [25]:
# 📌 Classes: distinct values of the `disease` column, usable as labels
# (e.g. disease status)
disease_column = adata.obs['disease']
classes = disease_column.unique().tolist()
print("\nClasses :", classes, sep="\n")

# 📌 Samples per disease, counted as the number of distinct donor_id values
# within each disease group (only disease categories that actually occur)
donor_ids_by_disease = adata.obs.groupby('disease', observed=True)['donor_id']
disease_sample_counts = donor_ids_by_disease.nunique()
print("\n📊 Disease-wise Sample Counts (donor_id):", disease_sample_counts, sep="\n")



Classes :
['normal', 'acute kidney failure', 'chronic kidney disease']

📊 Disease-wise Sample Counts (donor_id):
disease
acute kidney failure      14
chronic kidney disease    37
normal                    26
Name: donor_id, dtype: int64


In [None]:
# print(adata.obs.columns)


# # Inspect the unique values of each column
# print("\n🔹 disease unique values:")
# print(adata.obs['disease'].unique())


# print("🔹 donor_id unique values:")
# print(adata.obs['donor_id'].unique())


# print("\n🔹 author_cell_type unique values:")
# print(adata.obs['author_cell_type'].unique())


# print("\n🔹 cell_type_ontology_term_id unique values:")
# print(adata.obs['cell_type_ontology_term_id'].unique())


# print("\n🔹 cell_type unique values:")
# print(adata.obs['cell_type'].unique())

In [None]:

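# NOTE(review): the columns used below ('disease__ontology_label', 'patient') are not
# among the obs columns printed earlier in this notebook, and the mapped labels are
# cardiomyopathy classes. This block looks like a leftover from another dataset and
# would need adapting (e.g. to 'disease' / 'donor_id') before being re-enabled.
# print(adata.obs['disease__ontology_label'].value_counts())
# # Extract patient IDs and label information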
# patient_ids = adata.obs['patient']
# labels = adata.obs['disease__ontology_label']

# # Count the number of cells per patient
# patient_counts = patient_ids.value_counts()

# # Select patients with fewer than 500 cells
# under_500 = patient_counts[patient_counts < 500]

# print("500개 미만 셀을 가진 환자 수:", len(under_500))
# print(under_500)

# # Check the distribution per label
# adata.obs['label_mapped'] = labels.map({
#     'normal': 0,
#     'hypertrophic cardiomyopathy': 1,
#     'dilated cardiomyopathy': 2
# })
# df = pd.DataFrame({'patient': patient_ids, 'label': adata.obs['label_mapped']})
# patient_label = df.groupby('patient')['label'].first()

# print("label 분포 (500개 이상 셀 보유한 환자 기준):")
# print(patient_label[~patient_label.index.isin(under_500.index)].value_counts())