In [24]:
import os, json, gc, datetime

import anndata
import numpy as np
from pathlib import Path
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

import src.load as load
from src import Classifier
from src.utils.general import load_reports

<h3>Data</h3>

Three datasets were used in this study.

1. **Idiopathic Pulmonary Fibrosis (IPF)**: 96,301 cells and 4,443 highly variable genes. (https://www.science.org/doi/10.1126/sciadv.aba1983)
2. **Mouse Cortex (MC)**: 3,005 cells and 20,006 genes. After filtering genes expressed in at least 10 cells, we are left with 16,484 genes. (https://www.science.org/doi/10.1126/science.aaa1934)
3. **Human Cell Atlas (HCA)**: 84,363 cells and 2,968 highly variable genes. (https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02210-0)

These datasets can be downloaded as .h5ad files from https://drive.google.com/drive/folders/1fLt3QGI2XFIz-4ZjOwk9poFpmqq1Zcgt?usp=sharing.

Since IPF and HCA are large, we only work with highly variable genes to reduce the runtime of the methods.
The IPF h5ad file contains highly variable genes only, so we only need to set `high_var` to `True` for HCA.
`filter_genes` will be set to True for both MC and HCA.

For scGeneFit only, we also set `high_var=True` for MC. Without it, the algorithm takes too long to run across all random seeds and coverage factors.

All data is total count normalized (`normalize = True`), log1p transformed (`log = True`) and scaled to unit variance and zero-mean (`scale = True`). Scanpy performs the preprocessing in the backend.

Finally, we remove all classes with fewer than 50 cells. This leads to 33 classes for IPF and 75 classes for HCA (tissue/cell type pairs).

In [22]:
# Dataset to process; choose from IPF, HCA, MC.
# Kept as a Path so it can be joined directly into directory paths
# (e.g. 'data' / dataset); use str(dataset) where a plain name is needed.
dataset = Path('HCA')

In [26]:
# Per-dataset working directory; created on first use.
root = 'data' / dataset
root.mkdir(parents=True, exist_ok=True)

# Load data
scale = True # set to False if running scGeneFit
adata, key = load.dataset(
    str(dataset),
    normalize=True,
    log=True,
    scale=scale,
    # Highly variable genes are selected only for HCA by default.
    # Set high_var manually to True for scGeneFit if MC.
    high_var=(str(dataset) == 'HCA'),
    # Gene filtering is applied to every dataset except IPF.
    filter_genes=(str(dataset) != 'IPF'),
)
# Drop every class that has fewer than 50 cells.
adata = load.remove_low_count_ct(adata, key, 50)
print("Range:", adata.X.min(), adata.X.max())

# Unscaled data may still be stored sparse; rebuild it as a dense AnnData.
# NOTE(review): only `obs` is carried over to the new object (`var` and any
# other annotations are dropped) -- confirm this is intended.
if (not scale) and sp.issparse(adata.X):
    __mat = adata.X.toarray()
    __ad = anndata.AnnData(__mat)
    __ad.obs = adata.obs
    adata = __ad

7 unique cell types.
adata.shape=(3005, 20006)
Removed low count classes.
adata.shape=(3005, 5034)
7 celltype combinations.
Range: 0.0 8.04193


<h2>Train and Test</h2>

By default, we train a Logistic Regression model to predict the class label. The number of iterations is capped at 600 to speed up the experiments, although our experiments show that no significant improvement occurs if trained for more iterations.

To use any other classifier, simply pass it to `base_classifier`. It must follow the scikit-learn classifier interface (e.g. `fit` and `predict`), as the default `LogisticRegression` does.

In [27]:
def train_and_test(
        x_train, y_train,
        x_test, y_test,
        *,
        feature_selector=None,
        features=None,
        json_path=None,
        confusion_matrix_dir=None,
        key=None,
        base_classifier=None
):
    """Train and test a classifier, then dump or report the results.

    Parameters
    ----------
    x_train, y_train
        Training data and labels, passed to ``Classifier.fit``.
    x_test, y_test
        Held-out data and labels, passed to ``dump`` / ``report``.
    feature_selector
        Optional selector forwarded to ``Classifier``. When given, its
        ``get_support(indices=True)`` is stored as the solution.
    features
        Optional features forwarded to ``Classifier``. Stored as the
        solution when no ``feature_selector`` is given.
    json_path, confusion_matrix_dir
        Destinations for the classification results and confusion
        matrices. Results are dumped only when BOTH are provided;
        otherwise ``Classifier.report`` is called on the test set.
    key
        Identifier forwarded to ``dump``. Defaults to the current POSIX
        timestamp (a float) when None.
    base_classifier
        Classifier to wrap. Defaults to
        ``LogisticRegression(max_iter=600, verbose=0, n_jobs=-1)``.

    Returns
    -------
    The ``Classifier`` wrapper after ``fit`` has been called on it.
    """
    # use logistic regression if None
    if base_classifier is None:
        base_classifier = LogisticRegression(
            max_iter=600, verbose=0, n_jobs=-1)
    # build a classifier wrapper with extra functionality
    wrap_classifier = Classifier(
        base_classifier,
        feature_selector=feature_selector,
        features=features,
    )
    # fit and save results
    wrap_classifier.fit(x_train, y_train)

    if json_path is not None and confusion_matrix_dir is not None:
        if key is None:
            # `datetime` is imported as a module in this notebook, so the
            # class has to be reached as `datetime.datetime`.
            key = datetime.datetime.now().timestamp()
        if feature_selector is not None:
            solution = feature_selector.get_support(indices=True)
        else:
            solution = features
        wrap_classifier.dump(
            x_train, y_train,
            x_test, y_test,
            json_path=json_path,
            confusion_matrix_dir=confusion_matrix_dir,
            key=key,
            extras={'solution': solution})
    else:
        wrap_classifier.report(x_test, y_test)
    print()
    return wrap_classifier

<h2>Run Classification</h2>

Note: You must first run the `FeatureSelection` notebook to create `report.json` files with selected features.

Read the `report.json` files containing selected features from the corresponding directories, and run classification. We split the data into train and test sets of equal size in a stratified fashion.

In [29]:
# Feature-selection methods whose reports are evaluated.
# The list is passed to `load_reports` and then zipped with the returned
# reports, and each name is also used as the per-method output directory,
# so the order and spelling here must match what is stored on disk.
method_list = [
    'GreedyCover',
    'DT',
    'TopDE',
    'ReliefF',
    'MI',
    'mRMR',
    'Fval',
    'CrossEntropy',
    'RankCorr',
    'scGeneFit', # run this separately without scaled data for better results
]

In [None]:
# One stratified 50/50 train/test split per random seed (42 through 46).
for rs in range(42, 47):
    x_train, x_test = tts(
        adata,
        train_size=0.5,
        stratify=adata.obs[key],
        random_state=rs,
    )
    gc.collect()

    # Reports holding the selected features for this random seed
    root = 'data' / dataset / 'RS' / f'rs{rs}'
    reports = load_reports(root, lreports=True, method_list=method_list)

    # Reports are paired with methods positionally
    # (presumably returned in `method_list` order -- verify in load_reports).
    for report, method in zip(reports, method_list):
        basedir = root / method
        # Each report maps a coverage key to an entry holding its 'solution'.
        for coverage, entry in report.items():
            solution = entry['solution']

            _ = train_and_test(
                x_train.X, x_train.obs[key],
                x_test.X, x_test.obs[key],
                features=solution,
                json_path=basedir / 'logisticR.json',
                confusion_matrix_dir=basedir / 'cm',
                key=coverage,
            )