In [1]:
from src import Classifier
import src.load as load

import os, json, gc
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from warnings import simplefilter
import datetime
# Suppress every FutureWarning so that deprecation notices do not flood the
# cell outputs below. Other warning categories are left untouched.
simplefilter(action='ignore', category=FutureWarning)

In [2]:
# The dataset name doubles as the name of its output folder under data/.
dataset = Path('IPF')
root = 'data' / dataset
# A single makedirs call creates data/ as well as data/<dataset>.
os.makedirs(root, exist_ok=True)

# Preprocessing switches -- adjust these per dataset.
normalize = True
log = True
scale = True
filter_genes = False  # set to True for MC, HCA and False for IPF
high_var = False  # set to True for HCA and False for MC, IPF

# Load the data; `key` is later used to look up the labels in adata.obs.
adata, key = load.dataset(
    str(dataset),
    normalize=normalize,
    log=log,
    scale=scale,
    high_var=high_var,
    filter_genes=filter_genes,
)
# NOTE(review): presumably drops classes with fewer than 50 samples --
# confirm against src.load.remove_low_count_ct.
adata = load.remove_low_count_ct(adata, key, 50)
# Quick sanity check of the value range after preprocessing.
print("Range:", adata.X.min(), adata.X.max())

adata.shape=(96301, 4443)
Removed low count classes.
adata.shape=(96196, 4443)
33 label combinations.
Range: -2.4008408 10.0


<h2>Train and Test</h2>

In [3]:
def train_and_test(
        x_train, y_train,
        x_test, y_test,
        *,
        feature_selector=None,
        features=None,
        json_path=None,
        confusion_matrix_dir=None,
        key=None,
        base_classifier=None,
        scale=False,
        normalize=False):
    """Fit a wrapped classifier and either dump or report its results.

    The base estimator is wrapped in a ``Classifier`` (from ``src``) and
    fitted on the training data. If both ``json_path`` and
    ``confusion_matrix_dir`` are given, the results are written out via
    ``Classifier.dump``; otherwise ``Classifier.report`` is called on the
    test data only.

    Parameters
    ----------
    x_train, y_train : training samples and labels, passed to ``fit``.
    x_test, y_test : held-out samples and labels.
    feature_selector : optional object forwarded to ``Classifier``. When
        set, its ``get_support(indices=True)`` is stored as the solution.
    features : optional feature subset forwarded to ``Classifier``. It is
        stored as the solution when ``feature_selector`` is None.
    json_path : destination of the JSON results (dump mode only).
    confusion_matrix_dir : destination directory for confusion matrices
        (dump mode only).
    key : identifier under which the results are dumped. Defaults to the
        current POSIX timestamp (a float) when None.
    base_classifier : estimator to wrap. Defaults to
        ``LogisticRegression(max_iter=600, verbose=0, n_jobs=-1)``.
    scale, normalize : flags forwarded unchanged to ``Classifier``.

    Returns
    -------
    The fitted ``Classifier`` wrapper.
    """
    # use logistic regression if None
    if base_classifier is None:
        base_classifier = LogisticRegression(
            max_iter=600, verbose=0, n_jobs=-1)
    # build a classifier wrapper with extra functionality
    wrap_classifier = Classifier(
        base_classifier,
        feature_selector=feature_selector,
        features=features,
        scale=scale,
        normalize=normalize
    )
    # fit and save results
    wrap_classifier.fit(x_train, y_train)

    # Dumping needs both destinations; with only one (or none) we fall back
    # to a plain report on the test split.
    if json_path is not None and confusion_matrix_dir is not None:
        if key is None:
            # `datetime` is the module here (``import datetime``), so the
            # class has to be reached through it; calling
            # ``datetime.timestamp`` on the module raises AttributeError.
            key = datetime.datetime.now().timestamp()
        # Record which features were used: the selector's picks when a
        # selector is supplied, otherwise the explicit feature list.
        if feature_selector is not None:
            solution = feature_selector.get_support(indices=True)
        else:
            solution = features
        wrap_classifier.dump(
            x_train, y_train,
            x_test, y_test,
            json_path=json_path,
            confusion_matrix_dir=confusion_matrix_dir,
            key=key,
            extras={'solution': solution})
    else:
        wrap_classifier.report(x_test, y_test)
    print()
    return wrap_classifier

<h2>Run Classification</h2>

In [None]:
for rs in range(46, 47):
    # 50/50 split stratified on the label column, seeded by rs.
    x_train, x_test = tts(
        adata,
        train_size=0.5,
        stratify=adata.obs[key],
        random_state=rs,
    )
    gc.collect()

    # Per-seed results folder: data/<dataset>/RS/rs<seed>.
    root = 'data' / dataset / 'RS' / f'rs{rs}'
    # Feature-selection methods whose reports are evaluated. Others that can
    # be enabled here: GreedyCover, DT, TopDE, ReliefF, Fval, MI, mRMR,
    # CrossEntropy.
    method_list = ['scGeneFit']

    # Read every report up front so a missing file fails before any training.
    report_paths = [root / m / "report.json" for m in method_list]
    reports = []
    for report_path in report_paths:
        with open(report_path, "r") as f:
            reports.append(json.load(f))

    # One classifier per (method, coverage) entry, restricted to the
    # feature subset stored under that entry's 'solution'.
    for method, report in zip(method_list, reports):
        basedir = root / method
        for coverage, entry in report.items():
            solution = entry['solution']
            _ = train_and_test(
                x_train.X, x_train.obs[key],
                x_test.X, x_test.obs[key],
                features=solution,
                json_path=basedir / 'logisticR.json',
                confusion_matrix_dir=basedir / 'cm',
                key=coverage,
                scale=False,
                normalize=False,
            )