In [1]:
import numpy as np

import src.load as load
from src.utils.experiment import (
    fit_and_dump_GCS,
    fit_and_dump_DT,
    fit_and_dump_topDE,
    fit_and_dump_ReliefF,
    fit_and_dump_scGeneFit,
    fit_and_dump_Fval,
    fit_and_dump_MI,
    fit_and_dump_mRMR,
    fit_and_dump_CEM,
    get_feat_list,
)

import os, gc
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split as tts

In [2]:
# Pick the dataset and make sure its output folder (data/<dataset>) exists.
dataset = Path('MC')
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)
root = data_dir / dataset
root.mkdir(parents=True, exist_ok=True)

# Preprocessing switches -- adjust per dataset.
filter_genes = True  # set to true for MC, HCA, false for IPF
normalize = True
log = True
scale = True
# NOTE(review): the guidance below says "false for MC", yet the dataset chosen
# above is MC and this flag is True -- confirm which one is intended.
high_var = True  # set to true for HCA, false for MC, IPF

# Load the data with the switches above.
load_options = dict(
    normalize=normalize,
    log=log,
    scale=scale,
    high_var=high_var,
    filter_genes=filter_genes,
)
adata, key = load.dataset(str(dataset), **load_options)

# Presumably drops cell types with fewer than this many cells -- confirm
# against load.remove_low_count_ct.
low_count_threshold = 50
adata = load.remove_low_count_ct(adata, key, low_count_threshold)

# Sanity check on the value range of the expression matrix after preprocessing.
x_min, x_max = adata.X.min(), adata.X.max()
print("Range:", x_min, x_max)

7 unique cell types.
adata.shape=(3005, 20006)
Removed low count classes.
adata.shape=(3005, 5034)
7 celltype combinations.
Range: -1.8969457 10.0


<h2>Global variables (GVars)</h2>

In [3]:
# File names shared by the cells below.
json_filename = 'report.json'
feature_importances_filename = 'feature_importances_.txt'

# Metadata about the data and the preprocessing switches; the experiment loop
# passes this dict to the fitters as `extras`.
n_cells, n_genes = adata.shape
params = dict(
    n_train=n_cells // 2,  # placeholder; the experiment loop overwrites it per split
    n_features_in=n_genes,
    n_classes=len(np.unique(adata.obs[key])),
    filter_genes=filter_genes,
    normalize=normalize,
    log=log,
    scale=scale,
    high_var=high_var,
)

In [4]:
# Every method name the experiment loop recognises, in the order it runs them.
available_methods = (
    'GreedyCover',
    'DT',
    'TopDE',
    'ReliefF',
    'scGeneFit',
    'Fval',
    'MI',
    'mRMR',
    'CEM',
)

# Methods to run in this session; add names from `available_methods` to enable them.
selected_methods = {'scGeneFit'}

to_fit = [name for name in available_methods if name in selected_methods]

In [None]:
# Run every method listed in `to_fit` on five stratified 50/50 splits of
# `adata`, one split per random seed (42..46). Each seed gets its own `root`
# (<rs_root>/rs<seed>) that is handed to the fitters.
random_seeds = np.arange(42, 47)
rs_root = Path('data') / dataset / 'RS'
# Feature counts for the non-greedy methods are read from the report of one
# fixed GreedyCover run (seed 42).
# NOTE(review): when 'GreedyCover' is not in `to_fit`, this report presumably
# has to exist from an earlier run -- confirm how get_feat_list handles a
# missing file.
greedy_root = rs_root / 'rs42' / 'GreedyCover' / json_filename

# Methods that share the greedy-derived feature counts, in execution order.
shared_budget_fitters = {
    'DT': fit_and_dump_DT,
    'TopDE': fit_and_dump_topDE,
    'ReliefF': fit_and_dump_ReliefF,
    'scGeneFit': fit_and_dump_scGeneFit,
    'Fval': fit_and_dump_Fval,
    'MI': fit_and_dump_MI,
    'mRMR': fit_and_dump_mRMR,
}

for random_seed in tqdm(random_seeds):
    # Only the training half is passed to the fitters in this cell.
    x_train, x_test = tts(
        adata,
        train_size=0.5,
        stratify=adata.obs[key],
        random_state=random_seed,
    )
    params['n_train'] = x_train.shape[0]
    gc.collect()
    # NOTE(review): this rebinds the notebook-level `root` that the dataset
    # cell set to data/<dataset>.
    root = rs_root / f'rs{random_seed}'

    # Coverage values 1..9 are used for the greedy cover run itself.
    coverage_list = list(range(1, 10))
    kwargs = dict(
        root=root,
        data=x_train,
        key=key,
        extras=params,
        coverage_list=coverage_list,
        json_filename=json_filename,
    )

    if 'GreedyCover' in to_fit:
        fit_and_dump_GCS(**kwargs)

    # Reload coverage_list based on greedy root so that all methods
    # have the same number of features selected across all runs.
    coverage_list, max_features_list = get_feat_list(greedy_root)
    kwargs.update(
        coverage_list=coverage_list,
        max_features_list=max_features_list,
    )

    for method_name, fit_and_dump in shared_budget_fitters.items():
        if method_name in to_fit:
            fit_and_dump(**kwargs)

    if 'CEM' in to_fit:
        # CEM gets its own coverage list since its number of features varies a lot.
        coverage_list = np.arange(10, 15)
        kwargs.update(
            smoothing_parameter=None,
            coverage_list=coverage_list.tolist(),
        )
        fit_and_dump_CEM(**kwargs)

  0%|                                                                                      | 0/5 [00:00<?, ?it/s]

[14, 17, 19, 23, 26, 29, 33, 36, 39, 43, 46, 49, 53, 56, 59, 63, 66, 70, 74, 77, 81, 86, 89, 91, 95, 98, 101, 105, 108, 111, 114, 119, 123, 126, 129, 134, 139]
Fitting scGeneFit.
Solving a linear program with 5034 variables and 7 constraints
Time elapsed: 172.54728603363037 seconds
Solving a linear program with 5034 variables and 9 constraints
Time elapsed: 154.36224794387817 seconds
Solving a linear program with 5034 variables and 7 constraints
Time elapsed: 162.20732522010803 seconds
Solving a linear program with 5034 variables and 7 constraints
Time elapsed: 137.73728203773499 seconds
Solving a linear program with 5034 variables and 7 constraints
Time elapsed: 152.38596200942993 seconds
Solving a linear program with 5034 variables and 9 constraints
Time elapsed: 226.6019163131714 seconds
Solving a linear program with 5034 variables and 7 constraints
Time elapsed: 147.25155568122864 seconds
Solving a linear program with 5034 variables and 8 constraints
Time elapsed: 114.7731041908264