In [1]:
import scanpy as sc

# Load the downsampled Li2023b AnnData from the raw_h5ad sample folder.
# NOTE(review): the path points at raw_h5ad/, so this is the input to the
# TF-IDF/tokenization cell rather than already-processed data — confirm.
input_path = '../data/sample/raw_h5ad/Li2023b_downsampled_10000_cells.h5ad'
adata = sc.read_h5ad(input_path)

In [2]:
from epiagent.preprocessing import global_TFIDF
from epiagent.tokenization import tokenization
import numpy as np
import os

# Document-frequency array consumed by the TF-IDF step below.
cCRE_document_frequency = np.load('../data/cCRE_document_frequency.npy')

# Step 1 - TF-IDF. The return value is unused, so `adata` is presumably
# updated in place (verify against epiagent.preprocessing.global_TFIDF).
print("Applying TF-IDF...")
global_TFIDF(adata, cCRE_document_frequency)

# Step 2 - tokenization. Per its own log message, this adds a
# 'cell_sentences' column to adata.obs.
print("Tokenizing the data...")
tokenization(adata)

# Step 3 - persist the processed AnnData, creating the target folder if needed.
processed_output_dir = "../data/sample/processed_h5ad/"
processed_output_path = os.path.join(
    processed_output_dir,
    "Li2023b_downsampled_10000_cells_cellsentenced.h5ad",
)
os.makedirs(processed_output_dir, exist_ok=True)
adata.write(processed_output_path)
print(f"Processed data saved at {processed_output_path}")

Applying TF-IDF...


Tokenizing the data...
Tokenization complete: 'cell_sentences' column added to adata.obs.
Processed data saved at ../data/sample/processed_h5ad/Li2023b_downsampled_10000_cells_cellsentenced.h5ad


In [3]:
# Print the AnnData summary to confirm that tokenization added the
# 'cell_sentences' column to adata.obs.
print(adata)

AnnData object with n_obs × n_vars = 9999 × 1355445
    obs: 'sample', 'cell_barcode', 'brain_region', 'diagnosis', 'donor_ID', 'sex', 'age', 'TSS_enrichment', 'num_reads_in_TSS', 'num_reads_in_promoter', 'num_reads_in_blacklist', 'prmoter_ratio', 'nucleosome_ratio', 'num_unique_fragments', 'num_mononucleosome_fragments', 'num_dinucleosome_fragments', 'num_multinucleosome_fragments', 'doublet_score', 'doublet_enrichment', 'blacklist_ratio', 'ArchR_clusters_full_dataset', 'annotation_cell_class', 'annotation_major_cell_type', 'annotation_cell_subtype', 'cell_type', 'cross_validation', 'cell_sentences'


### DI (data imputation): inspect the fine-tuned model's imputed signals

In [1]:
import scanpy as sc

# Load the AnnData stored under ./output/finetune_DI.
# NOTE(review): judging by the file name, this holds imputed signals from a
# model fine-tuned for 50 epochs — confirm against the fine-tuning script.
# This rebinds `input_path` and `adata`, which earlier cells also assign.
input_path = './output/finetune_DI/imputed_signals_finetuned_epoch_50.h5ad'
adata = sc.read_h5ad(input_path)

In [2]:
# Show the AnnData summary (dimensions plus the obs/uns/obsm/varm/obsp keys).
adata

AnnData object with n_obs × n_vars = 10001 × 50000
    obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'batch_key', 'cell_type', 'cell_state', 'cross_validation', 'cell_sentences'
    uns: 'cell_type_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [5]:
# Check the dimensions of the PCA embedding stored in obsm['X_pca'].
adata.obsm['X_pca'].shape

(10001, 50)

In [3]:
# Display the UMAP coordinates stored in obsm['X_umap'].
# NumPy abbreviates the middle rows when showing a large array.
adata.obsm['X_umap']

array([[ 6.5349927, 10.657569 ],
       [-6.528096 ,  1.0514829],
       [10.891603 ,  8.127394 ],
       ...,
       [-4.6673927,  9.065848 ],
       [ 2.5069582,  8.075196 ],
       [-7.9387836,  3.0977898]], dtype=float32)