In [14]:
import numpy as np
import pandas as pd

from scipy.stats import f_oneway

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

import time
import warnings

#### Load data

In [15]:
# Read the four cleaned input tables. Each read is its own statement so that
# a missing file only prevents the assignments that follow it.
expr = pd.read_csv(filepath_or_buffer=r'../../data/clean/gene_expression.csv')
cnv = pd.read_csv(filepath_or_buffer=r'../../data/clean/cnv.csv')
metab = pd.read_csv(filepath_or_buffer=r'../../data/clean/metabolomics.csv')
labels = pd.read_csv(filepath_or_buffer=r'../../data/clean/labels.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../../data/clean/labels.csv'

In [None]:
# Remove the "Unnamed: 0" column from every table (presumably a row index
# that was written out when the CSVs were saved — confirm against the
# cleaning step).
#
# The frames are reassigned rather than mutated in place, and a missing column
# is tolerated, so this cell can be re-run without raising KeyError.
expr = expr.drop(columns="Unnamed: 0", errors="ignore")
cnv = cnv.drop(columns="Unnamed: 0", errors="ignore")
metab = metab.drop(columns="Unnamed: 0", errors="ignore")
labels = labels.drop(columns="Unnamed: 0", errors="ignore")

#### Split labels into classes

In [None]:
# Row masks for the two diagnosis groups used in this notebook.
is_emci = labels["DX_bl"].eq("EMCI")
is_cn = labels["DX_bl"].eq("CN")

emci = labels.loc[is_emci].reset_index(drop=True)
cn = labels.loc[is_cn].reset_index(drop=True)

# EMCI rows first, then CN rows; each part keeps its own 0-based index.
targets = pd.concat([emci, cn])
# lmci = labels[labels["DX_bl"] == "LMCI"].reset_index().drop(columns='index')
# ad = labels[labels["DX_bl"] == "AD"].reset_index().drop(columns='index')

In [None]:
# lmci_1 = lmci.sample(frac=0.5, random_state=42)
# lmci_2 = lmci.drop(lmci_1.index)

# emci_1 = emci.sample(frac=0.5, random_state=42)
# emci_2 = emci.drop(emci_1.index)

# mci_1 = pd.concat([emci_1, lmci_1]).reset_index().drop(columns='index')
# mci_2 = pd.concat([emci_1, lmci_2]).reset_index().drop(columns='index')
# mci_3 = pd.concat([emci_2, lmci_1]).reset_index().drop(columns='index')
# mci_4 = pd.concat([emci_2, lmci_2]).reset_index().drop(columns='index')

#### Merge all datasets

In [None]:
# Inner-join everything on the patient id: expression data, the PTID/HighQual
# columns of the CNV table, metabolomics, and finally the EMCI/CN labels.
master = (
    expr
    .merge(cnv.filter(regex=r"(PTID|HighQual)"), on="PTID", how='inner')
    .merge(metab, on="PTID", how='inner')
    .merge(targets, on="PTID", how='inner')
)

#### Run classification

In [21]:
n_folds = 5

# Every column except the patient id and the diagnosis is used as a feature.
X = master.drop(columns=["PTID", "DX_bl"]).to_numpy()

# Encode the diagnosis strings as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(master["DX_bl"].to_numpy())

# Stratified, shuffled folds with a fixed seed.
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-fold test metrics, filled in by the cross-validation loop.
AUC = np.zeros(n_folds)
ACC = np.zeros(n_folds)

In [24]:

# Hyper-parameter grid for the final random forest. It does not depend on the
# fold, so it is built once instead of on every iteration.
param_grid = {'n_estimators': np.arange(50, 200, 10),
              'max_features': np.arange(0.2, 1, 0.1),
              'max_depth': np.arange(1, 10, 1),
              'max_samples': np.arange(0.2, 1, 0.1)}

# Significance cut-off for the per-feature ANOVA filter.
# NOTE(review): an earlier comment in this cell and the results filename both
# mention 0.1, while the code uses 0.05 — confirm which value is intended.
p_threshold = 0.05

# One prediction frame per fold; concatenated into `test_out` after the loop.
fold_outputs = []

for i, (train, test) in enumerate(kf.split(X, y)):
    # kf.split yields positional index arrays, not pandas objects, so patient
    # ids are looked up in `master` (the frame X was built from).
    # Not used further below; kept for inspection.
    ptids_train = master["PTID"].iloc[train].to_list()

    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # One-way ANOVA p-value per feature, class 1 vs class 0, on the training fold.
    n_features = X_train.shape[1]
    p_vals = np.ones(n_features)
    pos_rows = X_train[y_train == 1]
    neg_rows = X_train[y_train == 0]
    with warnings.catch_warnings():
        # f_oneway may warn on some columns; those p-values are handled below.
        warnings.simplefilter("ignore")
        for j in range(n_features):
            _, p_vals[j] = f_oneway(pos_rows[:, j], neg_rows[:, j])
    # Treat undefined p-values as "not significant".
    p_vals = np.nan_to_num(p_vals, nan=1.0)
    stat_sig_idx = np.where(p_vals < p_threshold)[0]

    # Impurity-based feature importance from a random forest fitted on the
    # training fold; keep every feature the forest actually used.
    forest = RandomForestClassifier(random_state=0, class_weight='balanced')
    forest.fit(X_train, y_train)
    forest_imp_idx = np.where(forest.feature_importances_ > 0.0)[0]

    # Keep features that pass both filters (sorted, so column order is stable).
    overlap = np.intersect1d(stat_sig_idx, forest_imp_idx)
    if overlap.size == 0:
        raise ValueError(
            f"Fold {i}: no feature passed both the ANOVA filter and the "
            "random-forest importance filter"
        )
    # print(len(overlap))
    X_train = X_train[:, overlap]
    X_test = X_test[:, overlap]

    # Exhaustive grid search (default inner CV and scoring) for the final model.
    # NOTE(review): unlike the feature-selection forest above, this estimator
    # does not set class_weight='balanced' — confirm that is intended.
    start_time = time.time()
    model_grid = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=0),
                              param_grid, n_jobs=-1).fit(X_train, y_train)
    model = model_grid.best_estimator_
    elapsed_time = time.time() - start_time
    print(f"Elapsed time to tune RF: {elapsed_time:.3f} seconds")
    print(f"Best parameters: {model.get_params()}")

    # Evaluate the tuned model on the held-out fold.
    test_prob = model.predict_proba(X_test)
    test_predict = model.predict(X_test)
    AUC[i] = roc_auc_score(y_test, test_prob[:, 1])
    ACC[i] = accuracy_score(y_test, test_predict)
    print("Test accuracy: %f" % ACC[i])
    print("Test      AUC: %f" % AUC[i])

    # Collect this fold's class probabilities, predictions and true labels.
    fold_out = pd.DataFrame(test_prob)
    fold_out['predict'] = test_predict
    fold_out['class_label'] = y_test
    fold_outputs.append(fold_out)

    print()

# Stack the per-fold frames once, so `test_out` covers every fold.
test_out = pd.concat(fold_outputs, axis=0, ignore_index=True)

TypeError: '<' not supported between instances of 'int' and 'str'

In [None]:
# TODO: consider widening the grid-search ranges for max_features and max_samples.

In [11]:
# Display the parameters of `model`, i.e. the best estimator left over from the
# last iteration of the cross-validation loop (earlier folds are overwritten).
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 0.2,
 'max_leaf_nodes': None,
 'max_samples': 0.8000000000000003,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 140,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [9]:
# Write the collected test-fold predictions to the results directory.
# NOTE(review): the target name has no file extension — confirm that is intended.
test_out.to_csv(path_or_buf=r'../../results/multiomics_emci-cn_rf_sigdiff-and-rf_pval0-1')