In [1]:
import numpy as np
import pandas as pd

from scipy.stats import f_oneway

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

import time
import warnings

#### Load data

In [2]:
exp = pd.read_csv(r'../../data/clean/gene_expression.csv')
cnv = pd.read_csv(r'../../data/clean/cnv.csv')
met = pd.read_csv(r'../../data/clean/metabolomics.csv')
labels = pd.read_csv(r'../../data/labels.csv')

In [3]:
exp.drop(columns="Unnamed: 0", inplace=True)
cnv.drop(columns="Unnamed: 0", inplace=True)
met.drop(columns="Unnamed: 0", inplace=True)
labels.drop(columns="Unnamed: 0", inplace=True)

#### Split labels into classes

In [30]:
emci = labels[labels["DX_bl"] == "EMCI"].reset_index(drop=True)
cn = labels[labels["DX_bl"] == "CN"].reset_index(drop=True)
lmci = labels[labels["DX_bl"] == "LMCI"].reset_index().drop(columns='index')
# ad = labels[labels["DX_bl"] == "AD"].reset_index().drop(columns='index')

targets = pd.concat([cn, emci, lmci]).reset_index(drop=True)
encoding = {"CN": 0, "EMCI": 1, "LMCI": 2}
targets.loc[:, "DX_bl"] = targets["DX_bl"].map(encoding).astype("int16")

In [31]:
targets["DX_bl"]

0      0
1      0
2      0
3      0
4      0
      ..
601    2
602    2
603    2
604    2
605    2
Name: DX_bl, Length: 606, dtype: object

#### Get fold 1

In [27]:
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
AUC, ACC = np.zeros(n_folds), np.zeros(n_folds)

for i, (train, test) in enumerate(kf.split(targets["PTID"], targets["DX_bl"])):
    # scale and merge data
    ptids_test = targets[["PTID"]].loc[test, :]
    ptids_train = targets[["PTID"]].loc[train, :]

    exp_sc = StandardScaler()
    cnv_sc = StandardScaler()
    met_sc = StandardScaler()

    exp_train = exp.merge(ptids_train, how='inner', on="PTID")
    cnv_train = cnv.merge(ptids_train, how='inner', on="PTID")
    met_train = met.merge(ptids_train, how='inner', on="PTID")

    exp_train.loc[:, exp_train.columns != "PTID"] = exp_sc.fit_transform(exp_train.loc[:, exp_train.columns != "PTID"])
    cnv_train.loc[:, cnv_train.columns != "PTID"] = cnv_sc.fit_transform(cnv_train.loc[:, cnv_train.columns != "PTID"])
    met_train.loc[:, met_train.columns != "PTID"] = met_sc.fit_transform(met_train.loc[:, met_train.columns != "PTID"])

    exp_test = exp.merge(ptids_test, how='inner', on="PTID")
    cnv_test = cnv.merge(ptids_test, how='inner', on="PTID")
    met_test = met.merge(ptids_test, how='inner', on="PTID")

    exp_test.loc[:, exp_test.columns != "PTID"] = exp_sc.transform(exp_test.loc[:, exp_test.columns != "PTID"])
    cnv_test.loc[:, cnv_test.columns != "PTID"] = cnv_sc.transform(cnv_test.loc[:, cnv_test.columns != "PTID"])
    met_test.loc[:, met_test.columns != "PTID"] = met_sc.transform(met_test.loc[:, met_test.columns != "PTID"])

    master_train = exp_train.merge(targets, how='inner', on="PTID")\
                            .merge(cnv_train, how='inner', on="PTID")\
                            .merge(met_train, how='inner', on="PTID")
    master_test  = exp_test .merge(targets, how='inner', on="PTID")\
                            .merge(cnv_test, how='inner', on="PTID")\
                            .merge(met_test, how='inner', on="PTID")

    X_train = master_train.drop(columns=["DX_bl", "PTID"]).to_numpy()
    X_test = master_test.drop(columns=["DX_bl", "PTID"]).to_numpy()
    y_train = master_train[["DX_bl"]].to_numpy().ravel().astype('int')
    y_test = master_test[["DX_bl"]].to_numpy().ravel().astype('int')

    break

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.

#### Run classification

In [7]:
for i, (train, test) in enumerate(kf.split(targets["PTID"], targets["DX_bl"])):
    # THE INDICES IN train AND test ARE TAKEN FROM targets, NOT ptid_list

    print(f"Started scaling and merging data for fold {i+1}... ", end='')

    # scale and merge data
    ptids_test = targets[["PTID"]].loc[test, :]
    ptids_train = targets[["PTID"]].loc[train, :]

    exp_sc = StandardScaler()
    cnv_sc = StandardScaler()
    met_sc = StandardScaler()

    exp_train = exp.merge(ptids_train, how='inner', on="PTID")
    # cnv_train = cnv.merge(ptids_train, how='inner', on="PTID")
    # met_train = met.merge(ptids_train, how='inner', on="PTID")

    exp_train.loc[:, exp_train.columns != "PTID"] = exp_sc.fit_transform(exp_train.loc[:, exp_train.columns != "PTID"])
    # cnv_train.loc[:, cnv_train.columns != "PTID"] = cnv_sc.fit_transform(cnv_train.loc[:, cnv_train.columns != "PTID"])
    # met_train.loc[:, met_train.columns != "PTID"] = met_sc.fit_transform(met_train.loc[:, met_train.columns != "PTID"])

    exp_test = exp.merge(ptids_test, how='inner', on="PTID")
    # cnv_test = cnv.merge(ptids_test, how='inner', on="PTID")
    # met_test = met.merge(ptids_test, how='inner', on="PTID")

    exp_test.loc[:, exp_test.columns != "PTID"] = exp_sc.transform(exp_test.loc[:, exp_test.columns != "PTID"])
    # cnv_test.loc[:, cnv_test.columns != "PTID"] = cnv_sc.transform(cnv_test.loc[:, cnv_test.columns != "PTID"])
    # met_test.loc[:, met_test.columns != "PTID"] = met_sc.transform(met_test.loc[:, met_test.columns != "PTID"])

    master_train = exp_train.merge(targets, how='inner', on="PTID")  # \
                            # .merge(cnv_train, how='inner', on="PTID")\
                            # .merge(met_train, how='inner', on="PTID")
    master_test  = exp_test .merge(targets, how='inner', on="PTID")  # \
                            # .merge(cnv_test, how='inner', on="PTID")\
                            # .merge(met_test, how='inner', on="PTID")

    X_train = master_train.drop(columns=["DX_bl", "PTID"]).to_numpy()
    X_test = master_test.drop(columns=["DX_bl", "PTID"]).to_numpy()
    y_train = master_train[["DX_bl"]].to_numpy().ravel().astype('int')
    y_test = master_test[["DX_bl"]].to_numpy().ravel().astype('int')

    break


Started scaling and merging data for fold 1... 

In [18]:
np.savez(r'../../data/testing/fold_1.npz', X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [6]:
for i, (train, test) in enumerate(kf.split(targets["PTID"], targets["DX_bl"])):
    # THE INDICES IN train AND test ARE TAKEN FROM targets, NOT ptid_list

    print(f"Started scaling and merging data for fold {i+1}... ", end='')

    # scale and merge data
    ptids_test = targets[["PTID"]].loc[test, :]
    ptids_train = targets[["PTID"]].loc[train, :]

    exp_sc = StandardScaler()
    cnv_sc = StandardScaler()
    met_sc = StandardScaler()

    exp_train = exp.merge(ptids_train, how='inner', on="PTID")
    # cnv_train = cnv.merge(ptids_train, how='inner', on="PTID")
    # met_train = met.merge(ptids_train, how='inner', on="PTID")

    exp_train.loc[:, exp_train.columns != "PTID"] = exp_sc.fit_transform(exp_train.loc[:, exp_train.columns != "PTID"])
    # cnv_train.loc[:, cnv_train.columns != "PTID"] = cnv_sc.fit_transform(cnv_train.loc[:, cnv_train.columns != "PTID"])
    # met_train.loc[:, met_train.columns != "PTID"] = met_sc.fit_transform(met_train.loc[:, met_train.columns != "PTID"])

    exp_test = exp.merge(ptids_test, how='inner', on="PTID")
    # cnv_test = cnv.merge(ptids_test, how='inner', on="PTID")
    # met_test = met.merge(ptids_test, how='inner', on="PTID")

    exp_test.loc[:, exp_test.columns != "PTID"] = exp_sc.transform(exp_test.loc[:, exp_test.columns != "PTID"])
    # cnv_test.loc[:, cnv_test.columns != "PTID"] = cnv_sc.transform(cnv_test.loc[:, cnv_test.columns != "PTID"])
    # met_test.loc[:, met_test.columns != "PTID"] = met_sc.transform(met_test.loc[:, met_test.columns != "PTID"])

    master_train = exp_train.merge(targets, how='inner', on="PTID")  # \
                            # .merge(cnv_train, how='inner', on="PTID")\
                            # .merge(met_train, how='inner', on="PTID")
    master_test  = exp_test .merge(targets, how='inner', on="PTID")  # \
                            # .merge(cnv_test, how='inner', on="PTID")\
                            # .merge(met_test, how='inner', on="PTID")

    X_train = master_train.drop(columns=["DX_bl", "PTID"]).to_numpy()
    X_test = master_test.drop(columns=["DX_bl", "PTID"]).to_numpy()
    y_train = master_train[["DX_bl"]].to_numpy().ravel().astype('int')
    y_test = master_test[["DX_bl"]].to_numpy().ravel().astype('int')

    print("Done")

    # scaling and merging complete

    # calculate p-values for each feature
    p_vals = np.ones((X_train.shape[1]))
    # select relevant features
    for j in range(X_train.shape[1]):
        pos = X_train[y_train == 1, j]
        neg = X_train[y_train == 0, j]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _, p = f_oneway(pos, neg)
        p_vals[j] = p
    p_vals = np.nan_to_num(p_vals, nan=1.0)
    stat_sig_idx = np.where(p_vals < 0.05)[0]


    # Feature importance based on feature permutation
    forest = RandomForestClassifier(random_state=0, class_weight='balanced')
    forest.fit(X_train, y_train)
    forest_imp_idx = np.where(forest.feature_importances_ > 0.0)[0]


    # select overlap genes (p<0.1 and importance by RF)
    overlap = list(set(stat_sig_idx) & set(forest_imp_idx))
    # print(len(overlap))
    X_train = X_train[:, overlap]
    X_test = X_test[:, overlap]


    # tuning RF prediction parameters by exhaust grid search to optimize a prediction RF model
    start_time = time.time()
    print(f"Started tuning RF for fold {i+1}... ", end='')
    param_grid = {'n_estimators': np.arange(50, 200, 10),
                'max_features': np.arange(0.2, 1, 0.1),
                'max_depth': np.arange(1, 10, 1),
                'max_samples': np.arange(0.2, 1, 0.1)}
    model_grid = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=0),
                                        param_grid, n_jobs=-1).fit(X_train, y_train)
    model = model_grid.best_estimator_
    elapsed_time = time.time() - start_time

    print("Done")

    print(f"Elapsed time to tune RF: {elapsed_time:.3f} seconds")
    print(f"Best parameters: {model.get_params()}")

    # predict using the model built
    test_prob = model.predict_proba(X_test)
    test_predict = model.predict(X_test)
    AUC[i] = roc_auc_score(y_test, test_prob[:, 1])
    ACC[i] = accuracy_score(y_test, test_predict)
    print("Test accuracy: %f" % ACC[i])
    print("Test      AUC: %f" % AUC[i])

    # prepare the result output
    if i == 0:
        test_out = pd.DataFrame(test_prob)
        test_out['predict'] = test_predict
        test_out['class_label'] = y_test
    else:
        temp = pd.DataFrame(test_prob)
        temp['predict'] = test_predict
        temp['class_label'] = y_test
        pd.concat([test_out, temp], axis=0)

    print()

Started scaling and merging data for fold 1... Done
Started tuning RF for fold 1... Done
Elapsed time to tune RF: 1771.105 seconds
Best parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 0.5000000000000001, 'max_leaf_nodes': None, 'max_samples': 0.6000000000000001, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Test accuracy: 0.506024
Test      AUC: 0.518648

Started scaling and merging data for fold 2... Done
Started tuning RF for fold 2... Done
Elapsed time to tune RF: 1843.636 seconds
Best parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 0.2, 'max_leaf_nodes': None, 'max_samples': 0.6000000000000001, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples

In [9]:
test_out.to_csv(r'../../results/multiomics_emci-cn_rf_sigdiff-and-rf_pval0-1')