In [26]:
import os
import anndata
import cell2sentence as cs
from cell2sentence import tasks, CSData
from datasets import load_from_disk
from cell2sentence.csmodel import CSModel


In [2]:
# Load AnnData Objects
human_restricted = anndata.read_h5ad("/home/dor3/palmer_scratch/C2S_Files_Daphne/Cross_Species_Datasets/mouse_human_pancreas_tissue_Baron_et_al/processed_homolog_intersected_data/human_pancreas_one_sample_preprocessed_homolog_intersected_adata.h5ad")
mouse_restricted = anndata.read_h5ad("/home/dor3/palmer_scratch/C2S_Files_Daphne/Cross_Species_Datasets/mouse_human_pancreas_tissue_Baron_et_al/processed_homolog_intersected_data/mouse_pancreas_preprocessed_homolog_intersected_adata.h5ad")

In [30]:
c2s_save_dir = "/home/sr2464/palmer_scratch/C2S_Files_Syed/c2s_api_testing"  # C2S dataset will be saved into this directory
human_c2s_save_name = "human_pancreas_tissue_c2s_embeddings_cinema_ot"  # This will be the name of our C2S dataset on disk

human_csdata = cs.CSData.csdata_from_arrow(
    arrow_dataset=human_arrow_ds, 
    vocabulary=human_vocab,
    save_dir=c2s_save_dir,
    save_name=human_c2s_save_name,
    dataset_backend="arrow"
)

Saving the dataset (0/1 shards):   0%|          | 0/12113 [00:00<?, ? examples/s]

In [31]:
mouse_c2s_save_name = "mouse_pancreas_tissue_c2s_embeddings_cinema_ot"  # This will be the name of our C2S dataset on disk
mouse_csdata = cs.CSData.csdata_from_arrow(
    arrow_dataset=mouse_arrow_ds, 
    vocabulary=mouse_vocab,
    save_dir=c2s_save_dir,
    save_name=mouse_c2s_save_name,
    dataset_backend="arrow"
)

Saving the dataset (0/1 shards):   0%|          | 0/12113 [00:00<?, ? examples/s]

In [33]:
# Get cell sentences
human_csdata = CSData(vocab=None, data_path='./data/human_arrow', dataset_backend='arrow')
mouse_csdata = CSData(vocab=None, data_path='./data/mouse_arrow', dataset_backend='arrow')

In [34]:
# Ensure the AnnData object has the same order of cells as the cell sentences list
human_cell_sentences = human_csdata.get_sentence_strings()
if len(human_cell_sentences) != human_restricted.shape[0]:
    raise ValueError("The number of cell sentences does not match the number of cells in the AnnData object.")

mouse_cell_sentences = mouse_csdata.get_sentence_strings()
if len(mouse_cell_sentences) != mouse_restricted.shape[0]:
    raise ValueError("The number of cell sentences does not match the number of cells in the AnnData object.")

ValueError: The number of cell sentences does not match the number of cells in the AnnData object.

In [12]:
# Add cell sentences to the AnnData object
human_restricted.obs['cell_sentence'] = human_cell_sentences

# Optionally, print a few sentences to verify
print(human_restricted.obs[['cell_sentence']].head())

                                                                 cell_sentence
human1_lib1.final_cell_0001  REG1A-1 REG1A PRSS2-2 PRSS2 PRSS2-3 PRSS2-1 CE...
human1_lib1.final_cell_0002  REG1A-1 REG1A REG1B-1 REG1B PRSS2-2 PRSS2 PRSS...
human1_lib1.final_cell_0003  REG1A-1 REG1A PRSS2-1 PRSS2-3 PRSS2-2 PRSS2 RE...
human1_lib1.final_cell_0004  REG1A-1 REG1A REG1B-1 REG1B PRSS2-1 PRSS2-3 PR...
human1_lib1.final_cell_0005  REG1A-1 REG1A PRSS2-2 PRSS2-1 PRSS2-3 PRSS2 RE...


In [13]:
# Add cell sentences to the AnnData object
mouse_restricted.obs['cell_sentence'] = mouse_cell_sentences

# Optionally, print a few sentences to verify
print(mouse_restricted.obs[['cell_sentence']].head())

                                                                 cell_sentence
mouse1_lib1.final_cell_0001  INS2 INS1 IAPP PCSK2 SPP1 SQSTM1 HSPA5 CHGA MT...
mouse1_lib1.final_cell_0002  KRT8 ACTG1 ACTB FTH1 ACTN1 RPL13 RPS2 F3 HSPA5...
mouse1_lib1.final_cell_0003  SST IAPP PYY GNAS RBP4 INS2 PCSK2 SCG2 CHGB RE...
mouse1_lib1.final_cell_0004  VIM SERPINE1 ACTB TMSB4X ACTG1 S100A6 ACTN1 AN...
mouse1_lib1.final_cell_0005  SST IAPP PYY RBP4 GNAS PCSK2 SCG2 GPX3 HSPA5 C...


In [14]:
# Convert and Save to Arrow Dataset
def prepare_and_save_csdata(adata, arrow_path, random_state=1234):
    adata = adata.T if adata.shape[1] > adata.shape[0] else adata
    if not os.path.exists(os.path.dirname(arrow_path)):
        os.makedirs(os.path.dirname(arrow_path))
    
    arrow_ds, vocab = CSData.adata_to_arrow(adata, random_state=random_state)
    arrow_ds.save_to_disk(arrow_path)
    return arrow_ds, vocab

In [15]:
# Ensure Data Path
arrow_dir = './data/'
os.makedirs(arrow_dir, exist_ok=True)

In [16]:
# Paths to Arrow Datasets
arrow_path_human = os.path.join(arrow_dir, 'human_arrow')
arrow_path_mouse = os.path.join(arrow_dir, 'mouse_arrow')

In [17]:
# Ensure required keys and convert to Arrow dataset
required_keys = ['cell_sentence', 'organism']

# Ensure organism key is present for both human and mouse
if 'organism' not in human_restricted.obs:
    human_restricted.obs['organism'] = 'Homo sapiens'
if 'organism' not in mouse_restricted.obs:
    mouse_restricted.obs['organism'] = 'Mus musculus'

In [18]:
# Convert and Save
print("Converting human AnnData to Arrow dataset...")
human_arrow_ds, human_vocab = prepare_and_save_csdata(human_restricted, arrow_path_human)
print(f"Arrow dataset for human is saved at: {arrow_path_human}")

print("Converting mouse AnnData to Arrow dataset...")
mouse_arrow_ds, mouse_vocab = prepare_and_save_csdata(mouse_restricted, arrow_path_mouse)
print(f"Arrow dataset for mouse is saved at: {arrow_path_mouse}")

Converting human AnnData to Arrow dataset...


100%|██████████| 12113/12113 [00:01<00:00, 6609.76it/s]


Saving the dataset (0/1 shards):   0%|          | 0/12113 [00:00<?, ? examples/s]

Arrow dataset for human is saved at: ./data/human_arrow
Converting mouse AnnData to Arrow dataset...


100%|██████████| 12113/12113 [00:01<00:00, 7037.83it/s]


Saving the dataset (0/1 shards):   0%|          | 0/12113 [00:00<?, ? examples/s]

Arrow dataset for mouse is saved at: ./data/mouse_arrow


In [19]:
# Verify Arrow Datasets
print(f"Human Arrow dataset path exists? {os.path.exists(arrow_path_human)}")
print(f"Mouse Arrow dataset path exists? {os.path.exists(arrow_path_mouse)}")

Human Arrow dataset path exists? True
Mouse Arrow dataset path exists? True


In [20]:
# Load and Verify
loaded_human_arrow_ds = load_from_disk(arrow_path_human)
print(f"Human Arrow dataset loaded successfully with {len(loaded_human_arrow_ds)} entries.")

loaded_mouse_arrow_ds = load_from_disk(arrow_path_mouse)
print(f"Mouse Arrow dataset loaded successfully with {len(loaded_mouse_arrow_ds)} entries.")

Human Arrow dataset loaded successfully with 12113 entries.
Mouse Arrow dataset loaded successfully with 12113 entries.


In [28]:
# Create CSData Objects
human_csdata_new = CSData(vocab=human_vocab, data_path=arrow_path_human, dataset_backend='arrow')
mouse_csdata_new = CSData(vocab=mouse_vocab, data_path=arrow_path_mouse, dataset_backend='arrow')

In [24]:
# Define CSModel
cell_type_prediction_model_path = "/home/sr2464/palmer_scratch/C2S_Files_Syed/multicell_pretraining_v2_important_models/pythia-410m-multicell_v2_2024-07-28_13-55-51_checkpoint-7600_cell_type_pred"
save_dir = "/home/sr2464/palmer_scratch/C2S_Files_Syed/c2s_api_testing/csmodel_tutorial_2"
save_name = "cell_embedding_prediction_pythia_410M_1"
csmodel = CSModel(
    model_name_or_path=cell_type_prediction_model_path,
    save_dir=save_dir,
    save_name=save_name
)

Using device: cuda


In [29]:
# Embed Cells
print("Embedding human cells...")
human_embeddings = tasks.embed_cells(csdata=human_csdata_new, csmodel=csmodel, n_genes=200)
print("Human cell embeddings shape: ", human_embeddings.shape)

print("Embedding mouse cells...")
mouse_embeddings = tasks.embed_cells(csdata=mouse_csdata_new, csmodel=csmodel, n_genes=200)
print("Mouse cell embeddings shape: ", mouse_embeddings.shape)

Embedding human cells...
Reloading model from path on disk: /home/sr2464/palmer_scratch/C2S_Files_Syed/c2s_api_testing/csmodel_tutorial_2/cell_embedding_prediction_pythia_410M_1


KeyError: 'organism'