# Learn pseudotime analysis

- https://scanpy.readthedocs.io/en/stable/tutorials.html
- https://scanpy-tutorials.readthedocs.io/en/latest/paga-paul15.html

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
import warnings
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', RuntimeWarning)
warnings.simplefilter('ignore', FutureWarning)

In [3]:
# Set scanpy's logging verbosity.
# NOTE(review): the scanpy docs describe the levels as errors (0), warnings (1),
# info (2), hints (3) — confirm for the pinned scanpy version.
sc.settings.verbosity = 3
# Configure scanpy's figure parameters: resolution of 75 dpi and a white face color.
sc.settings.set_figure_params(dpi=75, facecolor='white')
# Print the versions of scanpy and its dependencies (captured in the cell output below).
sc.logging.print_header()

scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.5.3 scikit-learn==1.2.2 statsmodels==0.13.5 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.8


Download the raw UMI count table for GEO series GSE72857:

```bash
%%bash

wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE72nnn/GSE72857/suppl/GSE72857_umitab.txt.gz
```

## Load data

In [9]:
def paul15(filename='data/paul15.h5'):
    """Load the ``paul15`` dataset from an HDF5 file into an AnnData object.

    The file is expected to contain the datasets ``data.debatched``,
    ``data.debatched_rownames`` (gene names), ``data.debatched_colnames``
    (cell names), ``cluster.id`` and ``info.genes_strings``.

    Parameters
    ----------
    filename : str
        Path to the HDF5 file. Defaults to ``'data/paul15.h5'``.

    Returns
    -------
    anndata.AnnData
        Cells x genes matrix (float32) restricted to the informative genes
        that are present in the data, with

        * ``obs_names`` set to the cell names stored in the file,
        * ``obs['paul15_clusters']`` holding ``'<cluster id><cell type>'``
          labels (converted to categorical),
        * ``uns['iroot']`` set to the root cell index 840.
    """
    # Local imports: neither package is imported at the top of the notebook.
    import h5py
    import anndata as ad

    with h5py.File(filename, 'r') as f:
        # Coercing to float32 for backwards compatibility
        X = f['data.debatched'][()].astype(np.float32)
        gene_names = f['data.debatched_rownames'][()].astype(str)
        cell_names = f['data.debatched_colnames'][()].astype(str)
        clusters = f['cluster.id'][()].flatten().astype(int)
        infogenes_names = f['info.genes_strings'][()].astype(str)

    # each row has to correspond to an observation, therefore transpose
    adata = ad.AnnData(X.transpose(), dtype=X.dtype)
    adata.var_names = gene_names
    # `obs_names` is the AnnData field for observation (cell) labels; assigning
    # to a non-existent `row_names` attribute would silently discard them.
    adata.obs_names = cell_names
    # names reflecting the cell type identifications from the paper
    cell_type = 6 * ['Ery']
    cell_type += 'MEP Mk GMP GMP DC Baso Baso Mo Mo Neu Neu Eos Lymph'.split()
    # cluster ids are 1-based, hence the `i - 1` lookup
    adata.obs['paul15_clusters'] = [f'{i}{cell_type[i-1]}' for i in clusters]
    # make string annotations categorical (optional); public equivalent of `_sanitize`
    adata.strings_to_categoricals()
    # just keep the first of the two equivalent names per gene
    adata.var_names = [gn.split(';')[0] for gn in adata.var_names]
    # remove 10 corrupted gene names
    infogenes_names = np.intersect1d(infogenes_names, adata.var_names)
    # restrict data array to the 3461 informative genes
    adata = adata[:, infogenes_names].copy()
    # usually we'd set the root cell to an arbitrary cell in the MEP cluster
    # adata.uns['iroot'] = np.flatnonzero(adata.obs['paul15_clusters'] == '7MEP')[0]
    # here, set the root cell as in Haghverdi et al. (2016)
    # note that other than in Matlab/R, counting starts at 0
    adata.uns['iroot'] = 840

    return adata

In [10]:
# Path of the .h5ad file that analysis results are written to.
results_file = 'write/paul15.h5ad'
# Create the output directory if needed. `exist_ok=True` avoids the
# check-then-create race and keeps the directory name derived from one place.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

In [11]:
# Load the dataset using paul15's default file path ('data/paul15.h5').
adata = paul15()