In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

In [2]:
from tqdm import tqdm
import scipy.sparse as sp

## Split data in hECA-10M

In [None]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/dataHub/datasets/datasets/ECA_GO/model_hub/eca_heart_liver_lung_blood_bm.h5ad")

In [None]:
# split dataset
n_cells = adata.shape[0]
# Get 200k cells from hECA-10M
np.random.seed(0)
indices = np.random.permutation(n_cells)
adata = adata[indices[:200000]]

In [None]:
n_cells = adata.shape[0]
indices = np.random.permutation(n_cells)
# Split 80% for training and 20% for validation
n_train = int(0.8 * n_cells)
adata_train = adata[indices[:n_train]]
adata_test = adata[indices[n_train:]]

In [None]:
adata_train

View of AnnData object with n_obs × n_vars = 160000 × 42117
    obs: 'study_id', 'donor_gender', 'cell_type', 'organ', 'age_bin', 'donor_age', 'region', 'seq_tech'

In [None]:
adata_train.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/hECA-20K_train.h5ad")

In [None]:
adata_test

View of AnnData object with n_obs × n_vars = 40000 × 42117
    obs: 'study_id', 'donor_gender', 'cell_type', 'organ', 'age_bin', 'donor_age', 'region', 'seq_tech'

In [None]:
adata_test.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/hECA-20K_test.h5ad")

# Preprocessing datasets

## AHCA

In [3]:
def preprocess_for_Geneformer(adata):
    df_ref_gene = pd.read_table("./total_gene_list_42117.txt",index_col=0)
    adata.var['ensembl_id'] = df_ref_gene['Ensembl gene ID'] #这里直接用的是42117个基因，有需要的话需要重新弄一下
    sc.pp.calculate_qc_metrics(adata,percent_top = None, 
                                       log1p = False, 
                                       inplace = True)
    adata.obs['n_counts'] = adata.obs['total_counts']

In [4]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/dataHub/datasets/datasets/ECA_GO/model_hub/bone_marrow_AHCA_uniformed.h5ad")
preprocess_for_Geneformer(adata)

In [5]:
adata

AnnData object with n_obs × n_vars = 3230 × 42117
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'RNA_snn_res.orig', 'seurat_clusters', 'Color_of_tissues', 'Cell_type_in_each_tissue', 'tSNE_1', 'tSNE_2', 'Cell_type_in_merged_data', 'cell_type', 'organ', 'n_genes_by_counts', 'total_counts', 'n_counts'
    var: 'ensembl_id', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [5]:
adata.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/BoneMarrow_AHCA.h5ad")

## Suo2022

In [4]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/Test_data/Liver/Liver_test.h5ad")

In [5]:
adata.X = np.array(adata.X.todense())

In [6]:
for i in tqdm(range(adata.shape[0])):
    adata.X[i,:] = np.exp(adata.X[i,:])-1
adata.X = sp.csr_matrix(adata.X)

100%|██████████| 137181/137181 [00:14<00:00, 9505.48it/s] 


In [12]:
preprocess_for_Geneformer(adata)

In [13]:
adata.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Liver_Suo2022.h5ad")

## Simonson2023

In [14]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/Test_data/Heart/Simonson.h5ad")

In [18]:
adata.X = np.array(adata.X.todense())

In [19]:
for i in tqdm(range(adata.shape[0])):
    adata.X[i,:] = np.exp(adata.X[i,:])-1
adata.X = sp.csr_matrix(adata.X)

100%|██████████| 60345/60345 [00:02<00:00, 23145.51it/s]


In [20]:
preprocess_for_Geneformer(adata)

In [21]:
adata.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Heart_Simonson2023.h5ad")

## Finetune train

In [22]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/hECA-20K_train.h5ad")

In [24]:
adata.X = np.array(adata.X.todense())
for i in tqdm(range(adata.shape[0])):
    adata.X[i,:] = np.exp(adata.X[i,:])-1
adata.X = sp.csr_matrix(adata.X)

100%|██████████| 160000/160000 [00:13<00:00, 12243.15it/s]


In [25]:
preprocess_for_Geneformer(adata)

In [26]:
adata.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/hECA-20K_train.h5ad")

## Finetune test

In [11]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/test/hECA-20K_test.h5ad")

In [12]:
adata.X = np.array(adata.X.todense())
for i in tqdm(range(adata.shape[0])):
    adata.X[i,:] = np.exp(adata.X[i,:])-1
adata.X = sp.csr_matrix(adata.X)

  adata.X[i,:] = np.exp(adata.X[i,:])-1
100%|██████████| 40000/40000 [00:07<00:00, 5232.34it/s]


In [29]:
preprocess_for_Geneformer(adata)

In [30]:
adata.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/hECA-20K_test.h5ad")

# Intestine

In [12]:
adata = sc.read_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Intestine_heca1.h5ad")

In [14]:
adata.X.max()

8.618953649731345

In [15]:
adata.X = np.array(adata.X.todense())
for i in tqdm(range(adata.shape[0])):
    adata.X[i,:] = np.exp(adata.X[i,:])-1
adata.X = sp.csr_matrix(adata.X)

100%|██████████| 55214/55214 [00:08<00:00, 6442.01it/s]


In [18]:
preprocess_for_Geneformer(adata)

In [34]:
adata = adata[adata.obs.cell_type!="Unclassified"]

In [38]:
adata2k = sc.read_h5ad("/nfs/public/cell_gpt_data/Intestine_heca1_2000.h5ad")
adata.obs['train_test'] = adata2k.obs.train_test[adata.obs.index]
adata.obs.train_test.value_counts()

  adata.obs['train_test'] = adata2k.obs.train_test[adata.obs.index]


train_test
True     43929
False    10983
Name: count, dtype: int64

In [39]:
adata_train = adata[adata.obs.train_test == True].copy()
adata_test = adata[adata.obs.train_test == False].copy()

In [46]:
adata_train.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Finetune/finetune.h5ad")
adata_test.write_h5ad("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Test/test.h5ad")

# Tokenizing dataset

In [47]:
from geneformer import TranscriptomeTokenizer

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
  from .autonotebook import tqdm as notebook_tqdm


In [33]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ": "organ"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/BoneMarrow", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "BoneMarrow_AHCA", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/BoneMarrow/BoneMarrow_AHCA.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/BoneMarrow/BoneMarrow_AHCA.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 3230/3230 [00:02<00:00, 1477.69 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3230/3230 [00:00<00:00, 39111.51 examples/s]


In [34]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ": "organ"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Heart/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "Heart_Simonson2023", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Heart/Heart_Simonson2023.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Heart/Heart_Simonson2023.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 60345/60345 [00:23<00:00, 2578.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 60345/60345 [00:00<00:00, 61776.86 examples/s] 


In [35]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ": "organ"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Liver/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "Liver_Suo2022", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Liver/Liver_Suo2022.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Liver/Liver_Suo2022.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 137181/137181 [02:30<00:00, 912.78 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 137181/137181 [00:04<00:00, 28472.05 examples/s] 


In [36]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ": "organ"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/test/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "MultiOrgan_finetune_test", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/test/hECA-20K_test.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/test/hECA-20K_test.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 40000/40000 [00:33<00:00, 1191.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 40000/40000 [00:00<00:00, 43700.87 examples/s] 


In [37]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ": "organ"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/train/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "MultiOrgan_finetune_train", 
                 file_format="h5ad")



Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/train/hECA-20K_train.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/finetune/train/hECA-20K_train.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 160000/160000 [01:41<00:00, 1569.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160000/160000 [00:03<00:00, 44773.74 examples/s] 


In [48]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Finetune/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "Intestine_Finetune", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Finetune/finetune.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Finetune/finetune.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 43929/43929 [00:11<00:00, 3889.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 43929/43929 [00:06<00:00, 6968.45 examples/s]  


In [49]:
tk = TranscriptomeTokenizer({"cell_type": "cell_type"}, nproc=32)
tk.tokenize_data("/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Test/", 
                 "/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/tokenized", 
                 "Intestine_Test", 
                 file_format="h5ad")

Tokenizing /nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Test/test.h5ad


  for i in adata.var["ensembl_id"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]


/nfs/public/cell_gpt_data/Geneformer_4_recomb/dataset/test/Intestine/Test/test.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map (num_proc=32): 100%|██████████| 10983/10983 [00:02<00:00, 4601.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10983/10983 [00:00<00:00, 126961.75 examples/s]
