In [3]:
import numpy as np

import src.load as load
from src.utils.experiment import (
    fit_and_dump_GCS,
    fit_and_dump_DT,
    fit_and_dump_topDE,
    fit_and_dump_ReliefF,
    fit_and_dump_scGeneFit,
    fit_and_dump_Fval,
    fit_and_dump_MI,
    fit_and_dump_mRMR,
    fit_and_dump_CEM,
    fit_and_dump_RankCorr,
    get_feat_list,
)

import os, gc
import anndata
from pathlib import Path
import scipy.sparse as sp
from tqdm import tqdm
from sklearn.model_selection import train_test_split as tts

<h3>Data</h3>

Three datasets were used in this study.

1. **Idiopathic Pulmonary Fibrosis (IPF)**: 96,301 cells and 4,443 highly variable genes. (https://www.science.org/doi/10.1126/sciadv.aba1983)
2. **Mouse Cortex (MC)**: 3,005 cells and 20,006 genes. After removing genes expressed in fewer than 10 cells, we are left with 16,484 genes. (https://www.science.org/doi/10.1126/science.aaa1934)
3. **Human Cell Atlas (HCA)**: 84,363 cells and 2,968 highly variable genes. (https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02210-0)

These datasets can be downloaded as .h5ad files from https://drive.google.com/drive/folders/1fLt3QGI2XFIz-4ZjOwk9poFpmqq1Zcgt?usp=sharing.

Since IPF and HCA are large, we only work with highly variable genes to reduce the runtime of the methods.
The IPF h5ad file contains highly variable genes only, so we only need to set `high_var` to `True` for HCA.
`filter_genes` will be set to True for both MC and HCA.

For scGeneFit only, we also set `high_var=True` for MC. Without it, the algorithm takes too long to run across all random seeds and coverage factors.

All data is total-count normalized (`normalize = True`), log1p-transformed (`log = True`), and scaled to zero mean and unit variance (`scale = True`). scGeneFit performed poorly when the data was scaled, so we skip scaling for this method. Scanpy performs the preprocessing in the backend.

Finally, we remove all classes with fewer than 50 cells. This leaves 33 classes for IPF and 75 classes for HCA (tissue/cell type pairs).

In [4]:
# Pick the dataset and make sure its output directory exists.
dataset = Path('IPF')
root = 'data' / dataset
os.makedirs('data', exist_ok=True)
os.makedirs(root, exist_ok=True)

# Preprocessing switches forwarded to load.dataset below.
normalize = True
log = True
scale = True # Set to False if testing scGeneFit
# Gene filtering is enabled for every dataset except IPF.
filter_genes = str(dataset) != 'IPF'
# Highly-variable-gene selection is only requested for HCA.
high_var = str(dataset) == 'HCA'
#high_var = True # True for MC if running scGeneFit

# Load the dataset with the chosen preprocessing, then drop every class
# that has too few cells (threshold: 50).
adata, key = load.dataset(
    str(dataset),
    filter_genes=filter_genes,
    high_var=high_var,
    normalize=normalize,
    log=log,
    scale=scale,
)
adata = load.remove_low_count_ct(adata, key, 50)
print("Range:", adata.X.min(), adata.X.max())

# Some methods raise an error on sparse input, so swap in a dense copy
# when the data is unscaled and still sparse.
# NOTE: only `obs` is carried over to the new AnnData object; `var` and any
# other annotations on the original object are not copied.
if sp.issparse(adata.X) and not scale:
    dense_adata = anndata.AnnData(adata.X.toarray())
    dense_adata.obs = adata.obs
    adata = dense_adata
    gc.collect()

7 unique cell types.
adata.shape=(3005, 20006)
Removed low count classes.
adata.shape=(3005, 16484)
7 celltype combinations.
Range: -8.1035595 10.0


<h2>GVars</h2>

In [5]:
# File names used when dumping results for each run.
json_filename = 'report.json'
feature_importances_filename = 'feature_importances_.txt'

# Metadata describing the data and preprocessing; handed to the fitting
# helpers as `extras`. `n_train` is a placeholder (half the cells) that the
# experiment loop overwrites with the actual training-set size.
params = dict(
    n_train=adata.shape[0] // 2,
    n_features_in=adata.shape[1],
    n_classes=len(np.unique(adata.obs[key])),
    filter_genes=filter_genes,
    normalize=normalize,
    log=log,
    scale=scale,
    high_var=high_var,
)

Select methods to fit. GreedyCover must be run first in order to determine the number of features to select using other methods.

In [18]:
# Names of the feature-selection methods to run.
# Comment out an entry to skip that method.
to_fit = [
    'GreedyCover',
    'DT', 'TopDE', 'ReliefF',
    'Fval', 'MI', 'mRMR',
    'CrossEntropy', 'RankCorr',
    # 'scGeneFit',  # run this separately without scaled data for better results
]

`TopDE`, `CrossEntropy`, and `RankCorr` may require some manual tuning of their parameters, since the number of features to select cannot be fixed in advance.

In [None]:
random_seeds = np.arange(42, 47)
rs_root = 'data' / dataset / 'RS'
# Feature budgets for all methods are read from one fixed GreedyCover run:
# the run of the first random seed (rs42 with the seeds above).
greedy_root = rs_root / f'rs{random_seeds[0]}' / 'GreedyCover' / json_filename

# Methods that take exactly the shared keyword arguments; they are run in
# this order when their name is listed in `to_fit`.
budgeted_methods = {
    'DT': fit_and_dump_DT,
    'TopDE': fit_and_dump_topDE,
    'ReliefF': fit_and_dump_ReliefF,
    'scGeneFit': fit_and_dump_scGeneFit,
    'Fval': fit_and_dump_Fval,
    'MI': fit_and_dump_MI,
    'mRMR': fit_and_dump_mRMR,
}

for random_seed in tqdm(random_seeds):
    # Stratified 50/50 split, re-drawn for every random seed.
    x_train, x_test = tts(
        adata, random_state=random_seed,
        stratify=adata.obs[key], train_size=0.5)
    params['n_train'] = x_train.shape[0]
    gc.collect()
    root = rs_root / f'rs{random_seed}'

    coverage_list = np.arange(1, 20).tolist()
    kwargs = {
        'root': root,
        'data': x_train,
        'key': key,
        'extras': params,
        'coverage_list': coverage_list,
        'json_filename': json_filename,
    }
    if 'GreedyCover' in to_fit:
        fit_and_dump_GCS(**kwargs)

    # Reload coverage_list based on greedy root
    # so that all methods have the same number of features
    # selected across all runs.
    coverage_list, max_features_list = get_feat_list(greedy_root)
    kwargs['coverage_list'] = coverage_list
    kwargs['max_features_list'] = max_features_list

    for method_name, fit_and_dump in budgeted_methods.items():
        if method_name in to_fit:
            fit_and_dump(**kwargs)

    if 'RankCorr' in to_fit:
        fit_and_dump_RankCorr(
            **kwargs, lamb_list=(np.arange(1, 16) / 4.5).tolist())

    # The method is listed as 'CrossEntropy' in `to_fit`; the previous check
    # only looked for 'CEM', so it was silently skipped. Accept both names.
    if 'CrossEntropy' in to_fit or 'CEM' in to_fit:
        # separate coverage list for CEM since n features varies a lot
        # Built as its own dict so the shared kwargs are left untouched.
        cem_kwargs = {
            **kwargs,
            'smoothing_parameter': None,
            'coverage_list': np.arange(10, 15).tolist(),
        }
        fit_and_dump_CEM(**cem_kwargs)