In [None]:
import numpy as np
import pandas as pd

from scipy.stats import f_oneway, mannwhitneyu, shapiro

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

import time
import warnings

#### Load data

In [2]:
# Read the omics tables and the diagnosis labels. `cnv` comes from data/clean/,
# while `cnv_full` is read from the top-level data directory.
exp, cnv, cnv_full, met, labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../../data/clean/gene_expression.csv',
        r'../../data/clean/cnv.csv',
        r'../../data/cnv.csv',
        r'../../data/clean/metabolomics.csv',
        r'../../data/labels.csv',
    )
)

In [3]:
# Remove the stray "Unnamed: 0" column from every table (presumably the row
# index written out by an earlier `to_csv` -- TODO confirm).
# Rebinding with errors="ignore" keeps this cell idempotent: running it a
# second time no longer raises KeyError because the column is already gone.
exp = exp.drop(columns="Unnamed: 0", errors="ignore")
cnv = cnv.drop(columns="Unnamed: 0", errors="ignore")
cnv_full = cnv_full.drop(columns="Unnamed: 0", errors="ignore")
met = met.drop(columns="Unnamed: 0", errors="ignore")
labels = labels.drop(columns="Unnamed: 0", errors="ignore")

#### Split labels into classes

In [4]:
# One frame per baseline diagnosis group used in this analysis
# (rows with DX_bl == "AD" are not selected).
cn, emci, lmci = (
    labels[labels["DX_bl"] == group].reset_index(drop=True)
    for group in ("CN", "EMCI", "LMCI")
)

# Binary target: CN -> 0, both MCI groups (EMCI and LMCI) -> 1.
encoding = {"CN": 0, "EMCI": 1, "LMCI": 1}
targets = pd.concat([cn, emci, lmci]).reset_index(drop=True)
targets = targets.assign(DX_bl=targets["DX_bl"].map(encoding).astype("int16"))

In [5]:
# Keep only subjects that have a label, and order every table by PTID so that
# row k refers to the same subject in each omic matrix and in the targets.
labelled_ids = targets[["PTID"]]

exp = exp.merge(labelled_ids, how='inner', on="PTID").sort_values(by="PTID", ignore_index=True)
cnv = cnv.merge(labelled_ids, how='inner', on="PTID").sort_values(by="PTID", ignore_index=True)
cnv_full = cnv_full.merge(labelled_ids, how='inner', on="PTID").sort_values(by="PTID", ignore_index=True)
met = met.merge(labelled_ids, how='inner', on="PTID").sort_values(by="PTID", ignore_index=True)
targets = targets.sort_values(by="PTID", ignore_index=True)

# The matrices below are stacked column-wise and paired with the targets purely
# by row position, so a missing or duplicated subject in any table would
# silently attach features to the wrong label. Fail loudly instead.
for omic_name, omic_frame in (("exp", exp), ("cnv_full", cnv_full), ("met", met)):
    if not np.array_equal(omic_frame["PTID"].to_numpy(), targets["PTID"].to_numpy()):
        raise ValueError(
            f"PTID rows of '{omic_name}' do not line up with `targets` "
            f"({len(omic_frame)} vs {len(targets)} rows); "
            "restrict all tables to a common, duplicate-free set of subjects first."
        )

# Numeric columns only (this leaves out the string PTID column).
exp_np = exp.select_dtypes(include=np.number).to_numpy()
cnv_full_np = cnv_full.select_dtypes(include=np.number).to_numpy()
met_np = met.select_dtypes(include=np.number).to_numpy()
targets_np = targets[['DX_bl']].to_numpy().ravel()

# log2(x + 1) transform of the CNV values.
cnv_full_np_log = np.log2(cnv_full_np + 1)

In [6]:
# Labels, and the feature matrix with columns laid out as
# [expression | log2 CNV | metabolomics].
y = targets_np
X = np.hstack([exp_np, cnv_full_np_log, met_np])

#### Run classification

In [16]:
# Outer stratified k-fold evaluation. Inside every fold, feature selection
# (ANOVA filter + random-forest importance + correlation pruning) and the
# hyper-parameter grid search are fitted on that fold's training rows only.
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
AUC, ACC = np.zeros(n_folds), np.zeros(n_folds)

# One prediction table per fold. `test_out` is rebuilt from this list at the
# end of every fold, so it always contains all folds finished so far (the
# previous code threw away the result of pd.concat and kept fold 0 only).
fold_outputs = []

for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):

    X_train = X[train_idx]
    X_test  = X[test_idx]
    y_train = y[train_idx]
    y_test  = y[test_idx]

    # Per-omic p-value threshold and inclusive (first, last) column bounds of
    # the omic's block inside X. Rebuilt every fold because the selection
    # results of the fold are stored in these dicts.
    exp_meta = {'pval_thresh': 0.2,  'idx_bnds': (0, exp_np.shape[1] - 1)}
    cnv_meta = {'pval_thresh': 0.01, 'idx_bnds': (exp_meta['idx_bnds'][1] + 1, exp_meta['idx_bnds'][1] + cnv_full_np.shape[1])}
    met_meta = {'pval_thresh': 0.2,  'idx_bnds': (cnv_meta['idx_bnds'][1] + 1, cnv_meta['idx_bnds'][1] + met_np.shape[1])}
    omic_metas = (exp_meta, cnv_meta, met_meta)

    # Boolean column mask of each omic block, built once and reused by both filters.
    for omic_meta in omic_metas:
        first_col, last_col = omic_meta['idx_bnds']
        block_mask = np.zeros(X_train.shape[1], dtype=bool)
        block_mask[first_col: last_col + 1] = True
        omic_meta['mask'] = block_mask

    # SELECT FEATURES
    # Filter 1: per-feature one-way ANOVA between the two classes.
    pvals = np.ones(X_train.shape[1])
    for j in range(X_train.shape[1]):
        pos = X_train[y_train == 1, j]
        neg = X_train[y_train == 0, j]

        # f_oneway warnings are silenced here; NaN p-values are handled below.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _, p = f_oneway(pos, neg)

        pvals[j] = p

    # A NaN p-value is treated as "not significant".
    pvals = np.nan_to_num(pvals, nan=1.0)

    for omic_meta in omic_metas:
        omic_meta['sig_idx'] = np.where(
            (pvals < omic_meta['pval_thresh']) & omic_meta['mask']
        )[0]

        print(omic_meta['sig_idx'])
        print(len(omic_meta['sig_idx']))
    print()

    # Filter 2: features that an untuned random forest actually uses.
    forest = RandomForestClassifier(random_state=0, class_weight='balanced', n_jobs=-1)
    forest.fit(X_train, y_train)
    for omic_meta in omic_metas:
        omic_meta['rf_idx'] = np.where(
            (forest.feature_importances_ > 0.0) & omic_meta['mask']
        )[0]

        print(omic_meta['rf_idx'])
        print(len(omic_meta['rf_idx']))
    print()

    # Keep the features that pass both filters. Sorting makes the column order
    # explicit instead of depending on set iteration order.
    for omic_meta in omic_metas:
        omic_meta['intxn'] = sorted(set(omic_meta['sig_idx']) & set(omic_meta['rf_idx']))
        print(omic_meta['intxn'])
        print(len(omic_meta['intxn']))

    # Standardise each omic block with statistics from the training rows only.
    for omic_meta in omic_metas:
        omic_meta['sel_train'] = X_train[:, omic_meta['intxn']]
        omic_meta['sel_test'] = X_test[:, omic_meta['intxn']]
        omic_meta['scaler'] = StandardScaler()

        # StandardScaler cannot be fitted on zero columns, so empty blocks are passed through.
        if len(omic_meta['intxn']) > 0:
            omic_meta['sel_train'] = omic_meta['scaler'].fit_transform(omic_meta['sel_train'])
            omic_meta['sel_test'] = omic_meta['scaler'].transform(omic_meta['sel_test'])

    X_train = np.concatenate([omic_meta['sel_train'] for omic_meta in omic_metas], axis=1)
    X_test = np.concatenate([omic_meta['sel_test'] for omic_meta in omic_metas], axis=1)

    # Remove highly correlated features: a feature is dropped when its absolute
    # correlation with any LATER feature exceeds the cutoff, so the last member
    # of each correlated group is the one that survives.
    cutoff = 0.8
    to_drop = []
    # With fewer than two features there is nothing to compare, and
    # np.corrcoef would return a scalar that np.triu cannot handle.
    if X_train.shape[1] > 1:
        corr_matrix = np.abs(np.corrcoef(X_train, rowvar=False))
        upper = np.triu(corr_matrix, k=1)  # strictly-upper triangle, zeros elsewhere
        to_drop = [row for row in range(upper.shape[0]) if np.any(upper[row] > cutoff)]
    X_train = np.delete(X_train, to_drop, axis=1)
    X_test = np.delete(X_test, to_drop, axis=1)

    print(X_train.shape)
    print(X_test.shape)


    # CLASSIFICATION
    # Tune the random-forest hyper-parameters with an exhaustive grid search
    # (GridSearchCV runs its own inner cross-validation on the training rows).
    start_time = time.time()
    param_grid = {
        'n_estimators': np.arange(40, 240, 20),
        'max_features': np.arange(0.2, 1, 0.1),
        'max_depth': np.arange(1, 10, 1),
        'max_samples': np.arange(0.2, 1, 0.1)
    }

    model_grid = GridSearchCV(
        RandomForestClassifier(criterion='gini', random_state=0, class_weight="balanced", n_jobs=-1),
        param_grid
    ).fit(X_train, y_train)

    model = model_grid.best_estimator_
    elapsed_time = time.time() - start_time



    # PRINT RESULTS
    print(f"Elapsed time to tune RF: {elapsed_time:.3f} seconds")
    print(f"Best parameters: {model.get_params()}")

    test_prob = model.predict_proba(X_test)
    test_predict = model.predict(X_test)
    AUC[i] = roc_auc_score(y_test, test_prob[:, 1])
    ACC[i] = accuracy_score(y_test, test_predict)
    print("Test accuracy: %f" % ACC[i])
    print("Test      AUC: %f" % AUC[i])

    # Prepare the result output: class probabilities, predicted label and true
    # label for this fold's test rows, appended to the running table.
    fold_out = pd.DataFrame(test_prob)
    fold_out['predict'] = test_predict
    fold_out['class_label'] = y_test
    fold_outputs.append(fold_out)
    test_out = pd.concat(fold_outputs, axis=0)

    print()

[    0     2     5 ... 17834 17835 17840]
2907
[ 17907  17996  18009  18271  18331  18415  18449  18581  19417  19440
  19493  19886  20108  20640  21208  21357  21495  21638  22408  22590
  23048  23090  23092  24324  24348  24349  24350  24773  24956  25153
  25246  25396  25656  25897  25923  25968  26036  26260  26650  27200
  27210  27219  27226  27443  27536  27540  27548  27748  27782  27907
  28429  28500  28802  29538  29843  29844  29968  30195  30208  30282
  30777  30833  31034  31043  31074  31094  31266  31274  31275  31277
  31278  31450  31491  31603  31767  31818  31822  32890  32981  33176
  33342  33390  33510  33807  33808  33868  33965  34019  34271  34678
  35161  35420  35505  35748  36498  36557  37047  37286  37684  38019
  38055  39104  39155  39368  39861  40237  40257  40387  40558  40592
  40664  40811  40812  40826  41711  41849  41927  42004  42235  42264
  42617  42655  43129  43160  43282  43393  44035  45010  45106  45126
  45220  45288  45328  45782  

In [12]:
# Interactive check: display the current value of the fold counter `i` from the classification loop.
i

2

In [13]:
# Persist the per-sample predictions collected in `test_out`.
# NOTE(review): the file name advertises thresholds 0.1-0.05-0.1, "emci-cn" and
# "fold-1", whereas the classification cell uses p-value thresholds
# 0.2 / 0.01 / 0.2 and also encodes LMCI as the positive class -- confirm the
# intended name before relying on it.
results_csv = r'../../results/multiomics_emci-cn_rf_pval-0.1-0.05-0.1_fold-1.csv'
test_out.to_csv(results_csv)