In [None]:
cd .. 

## Pseudotime ordering of cells for each time point

In [None]:
import anndata
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import scanpy as sc
import scFates as scf
import seaborn as sns
import os
import gc
import matplotlib.pyplot as plt

In [None]:
def order(adata, root_gene, copy=True, *, n_comps=50, n_nodes=30, ndims_rep=2,
          n_jobs=20, n_map=100, seed=42):
    """Compute a pseudotime ordering of `adata` rooted at `root_gene`.

    The pipeline is: PCA -> scFates curve fit on the leading principal
    components -> root selection from `root_gene` -> pseudotime computation.

    Parameters
    ----------
    adata
        Annotated data object to order.
    root_gene
        Gene identifier passed to `scf.tl.root` to choose the root of the curve.
    copy
        If True (default), work on `adata.copy()` and leave the input untouched.
        If False, the steps are applied to the object that was passed in.
    n_comps
        Number of principal components computed by `sc.pp.pca`.
    n_nodes
        Number of nodes of the fitted curve (`Nodes` argument of `scf.tl.curve`).
    ndims_rep
        Number of leading dimensions of `X_pca` used for the curve fit.
    n_jobs
        Number of parallel jobs for `scf.tl.pseudotime`. The default of 20 is
        kept for backward compatibility; lower it on smaller machines.
    n_map
        `n_map` argument forwarded to `scf.tl.pseudotime`.
    seed
        Random seed forwarded to `scf.tl.pseudotime`.

    Returns
    -------
    The ordered object: a new copy when `copy=True`, otherwise the input itself.
    """
    if copy:
        adata = adata.copy()

    # The curve is fit on `X_pca`, so the PCA has to be computed first.
    sc.pp.pca(adata, n_comps=n_comps)
    scf.tl.curve(adata, Nodes=n_nodes, use_rep="X_pca", ndims_rep=ndims_rep)

    scf.tl.root(adata, root_gene)
    scf.tl.pseudotime(adata, n_jobs=n_jobs, n_map=n_map, seed=seed)

    return adata

### Pseudotime Ordering for PBMC COVID Vaccine CITE-seq data

Zhang, Bingjie, et al. "Multimodal single-cell datasets characterize antigen-specific CD8+ T cells across SARS-CoV-2 vaccination and infection". Nature Immunology, https://www.nature.com/articles/s41590-023-01608-9.

Data (`PBMC_vaccine_CITE.rds`) downloaded from: https://zenodo.org/record/7555405 and converted to `h5ad` following the tutorial at https://mojaveazure.github.io/seurat-disk/articles/convert-anndata.html

In [None]:
# NOTE(review): absolute, machine-specific path -- point this at your local copy
# of the converted `PBMC_vaccine_CITE.rds` file before running.
PBMC_H5AD_PATH = '/home/paperspace/data/CompBio/PBMC_vaccine_CITE_covid.h5ad'

# `read_h5ad` is the explicit reader; `anndata.read` is its deprecated alias.
pbmc = anndata.read_h5ad(PBMC_H5AD_PATH)

#### Order

In [None]:
root_gene = 'CD8A'
pbmc_list = []

# PBMC samples for CITE-seq and ASAP-seq were collected at four time points:
# immediately before (Day 0) vaccination, after primary vaccination (Day 2, Day 10),
# and seven days after boost vaccination (Day 28).
#
# These correspond to `.obs['timepoint']` of 0, 1, 2, 3.
N_TIMEPOINTS = 4

for i in tqdm(range(N_TIMEPOINTS)):
    subset = pbmc[pbmc.obs['timepoint'] == i]
    # Fail early with a clear message if the selection is empty (e.g. because
    # `timepoint` is not stored as the integers 0-3) rather than deep inside PCA.
    if subset.n_obs == 0:
        raise ValueError(f"No cells found for .obs['timepoint'] == {i}")
    # `order` works on a copy, so `pbmc` itself is left unmodified.
    adata = order(subset, root_gene)
    pbmc_list.append(adata)

In [None]:
# Make sure the output directory exists before writing into it.
os.makedirs('data/pseudotime_adatas', exist_ok=True)

for i, adata in enumerate(tqdm(pbmc_list)):
    adata.uns['timepoint'] = i
    # Drop 'epg' due to write errors. A default is given so that re-running this
    # cell (the objects are modified in place) does not raise KeyError.
    adata.uns.pop('epg', None)
    del adata.raw  # write error
    adata.write(f'data/pseudotime_adatas/adata_PBMC_{i}.h5ad')

### Pseudotime Ordering for Human Lung Cell Atlas snRNA-seq

Sikkema, L., et al. "An Integrated Cell Atlas of the Human Lung in Health and Disease". Nature Medicine, https://www.nature.com/articles/s41591-023-02327-2.

Data downloaded from https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293

We removed subjects with age='nan'.

The following cell types were used in this study:

- fibroblast of lung
- AT2
- endothelial cells (includes 'vein', 'pulmonary artery', 'capillary', 'lymphatic vessel')

In [None]:
# Age brackets as (low, high) pairs; both bounds are inclusive.
__age_groups__ = [
    (0, 25),
    (26, 35),
    (36, 45),
    (46, 55),
    (56, 65),
    (66, 100),
]

# Cell-type labels that have a dedicated selection rule in this notebook.
cell_types = ['fibroblast of lung', 'AT2', 'Endothelial']

In [None]:
# Pick cell type from the list above (`cell_types`).
# Any other value is matched verbatim against `.obs['cell_type']` by the
# filtering cell below.
CELL_TYPE = 'Endothelial'

In [None]:
# Assuming we have removed subjects with age='nan'.
# `read_h5ad` is the explicit reader; `anndata.read` is its deprecated alias.
hlca = anndata.read_h5ad('data/adatas/HLCA.h5ad')

In [None]:
# Restrict the atlas to the chosen cell type. Each supported label is looked up
# in a different `.obs` annotation column.
if CELL_TYPE in (cell_types[0], 'Fibroblasts'):
    # 'Fibroblasts' is accepted as well: this branch renames CELL_TYPE below, so
    # without it a re-run of this cell would fall through to the default branch
    # and filter on cell_type == 'Fibroblasts' instead.
    hlca = hlca[hlca.obs['cell_type'] == 'fibroblast of lung']
    CELL_TYPE = 'Fibroblasts'  # to remove spaces for saving
elif CELL_TYPE == cell_types[1]:
    hlca = hlca[hlca.obs['ann_level_3'] == 'AT2']
elif CELL_TYPE == cell_types[2]:
    hlca = hlca[hlca.obs['ann_level_1'] == 'Endothelial']
else:  # default to 'cell_type'
    hlca = hlca[hlca.obs['cell_type'] == CELL_TYPE]

# Fail early with a clear message instead of deep inside the ordering step.
if hlca.n_obs == 0:
    raise ValueError(f"No cells selected for CELL_TYPE={CELL_TYPE!r}")

subjects, idx = np.unique(hlca.obs['subject_ID'], return_index=True)
# Per-cell age as floats; used below to split the cells into age groups.
age = hlca.obs['age'].to_numpy().astype(float)

gc.collect()

#### Order

In [None]:
root = 'CDKN1A'

# Translate the gene symbol into the matching `var_names` entry. Indexing
# `var_names` directly avoids slicing the whole AnnData just to read one name.
root_matches = hlca.var_names[(hlca.var['feature_name'] == root).to_numpy()]
if len(root_matches) == 0:
    raise KeyError(f"Root gene {root!r} not found in hlca.var['feature_name']")
root_gene = root_matches[0]

hlca_list = []

for low, high in tqdm(__age_groups__):
    # Both bounds are inclusive.
    in_group = (age >= low) & (age <= high)
    n_cells = int(in_group.sum())
    # Require more than 50 cells per age group.
    assert n_cells > 50, f"Only {n_cells} cells in age group ({low}, {high})"
    adata = order(hlca[in_group], root_gene)
    hlca_list.append(adata)

In [None]:
# The age groups must partition the cells: every cell belongs to exactly one
# group. A mismatch means some ages fall outside or between the brackets.
n_ordered = sum(adata.n_obs for adata in hlca_list)
assert n_ordered == hlca.n_obs, (
    f"Age groups cover {n_ordered} cells but {hlca.n_obs} cells were selected"
)

In [None]:
# Make sure the output directory exists before writing into it.
os.makedirs('data/pseudotime_adatas', exist_ok=True)

for i, adata in enumerate(tqdm(hlca_list)):
    # Stored as a list [low, high] of the inclusive age bounds for this group.
    adata.uns['age_group'] = list(__age_groups__[i])
    # Drop 'epg' due to write errors. A default is given so that re-running this
    # cell (the objects are modified in place) does not raise KeyError.
    adata.uns.pop('epg', None)
    adata.write(f'data/pseudotime_adatas/adata_{CELL_TYPE}_{i}.h5ad')