In [1]:
import numpy as np
import pandas as pd

from scipy.stats import f_oneway, mannwhitneyu, shapiro

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score

import time
import warnings

#### Load data

In [None]:
# Read the five input tables in one pass. Three of them (gene expression, cnv,
# metabolomics) come from data/clean/; cnv_full and labels are read from data/
# directly.
exp, cnv, cnv_full, met, labels = [
    pd.read_csv(csv_path)
    for csv_path in (
        r'../../data/clean/gene_expression.csv',
        r'../../data/clean/cnv.csv',
        r'../../data/cnv.csv',
        r'../../data/clean/metabolomics.csv',
        r'../../data/labels.csv',
    )
]

In [None]:
# Remove the "Unnamed: 0" column from every loaded table.
# NOTE(review): presumably a row index written out by an earlier to_csv call —
# confirm against the export step.
# errors="ignore" keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of a KeyError.
for frame in (exp, cnv, cnv_full, met, labels):
    frame.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

#### Split labels into classes

In [None]:
# One frame per baseline diagnosis (column "DX_bl"), each re-indexed from 0.
# All three use reset_index(drop=True) so no helper "index" column is ever
# created and then dropped.
cn = labels[labels["DX_bl"] == "CN"].reset_index(drop=True)
emci = labels[labels["DX_bl"] == "EMCI"].reset_index(drop=True)
lmci = labels[labels["DX_bl"] == "LMCI"].reset_index(drop=True)
# Rows with DX_bl == "AD" are not selected: the line that built an `ad` subset
# was disabled, so `targets` below only holds CN / EMCI / LMCI subjects.

# Binary target: CN -> 0, while EMCI and LMCI share class 1.
encoding = {"CN": 0, "EMCI": 1, "LMCI": 1}

targets = pd.concat([cn, emci, lmci]).reset_index(drop=True)
# Map and cast in one assignment; every row carries one of the three keys of
# `encoding`, so the mapping leaves no missing values before the int16 cast.
targets["DX_bl"] = targets["DX_bl"].map(encoding).astype("int16")

In [None]:
# Keep, in every omics table, only the rows whose PTID appears in `targets`.
exp = exp.merge(targets[["PTID"]], how='inner', on="PTID")
cnv = cnv.merge(targets[["PTID"]], how='inner', on="PTID")
cnv_full = cnv_full.merge(targets[["PTID"]], how='inner', on="PTID")
met = met.merge(targets[["PTID"]], how='inner', on="PTID")

# The merges above compare each table with `targets` only. A patient missing
# from one table would still be present in the others and in `targets`, and the
# row-wise pairing of the matrices below with `targets_np` would silently
# break. Restrict everything to the patients present in all tables that are
# turned into feature matrices.
shared_ptids = set(targets["PTID"]).intersection(
    exp["PTID"], cnv_full["PTID"], met["PTID"]
)


def _keep_shared_sorted(frame):
    """Return the rows of `frame` whose PTID is in `shared_ptids`, ordered by PTID
    with a fresh 0..n-1 index."""
    kept = frame[frame["PTID"].isin(shared_ptids)]
    return kept.sort_values(by="PTID", ignore_index=True)


exp = _keep_shared_sorted(exp)
cnv = _keep_shared_sorted(cnv)
cnv_full = _keep_shared_sorted(cnv_full)
met = _keep_shared_sorted(met)
targets = _keep_shared_sorted(targets)

# Fail loudly if row i of a feature table is not the patient of targets row i
# (e.g. because of duplicated PTIDs in one table).
for omic_name, omic_frame in (("exp", exp), ("cnv_full", cnv_full), ("met", met)):
    assert np.array_equal(omic_frame["PTID"].to_numpy(), targets["PTID"].to_numpy()), (
        f"{omic_name} rows are not aligned with targets; "
        "check for duplicated or missing PTIDs"
    )

# Numeric columns only, as plain arrays.
# NOTE(review): this keeps every numeric column; confirm that none of them is
# an identifier or other non-feature column.
exp_np = exp.select_dtypes(include=np.number).to_numpy()
cnv_full_np = cnv_full.select_dtypes(include=np.number).to_numpy()
met_np = met.select_dtypes(include=np.number).to_numpy()
targets_np = targets[['DX_bl']].to_numpy().ravel()

# log2(x + 1) transform of the full CNV matrix.
# NOTE(review): entries <= -1 would become NaN or -inf here; confirm the CNV
# values are non-negative.
cnv_full_np_log = np.log2(cnv_full_np + 1)

In [None]:
# Per-omic settings: the feature matrix plus the p-value and variance cut-offs
# used by the selection steps in this cell.
exp_meta = {'data': exp_np,          'pval_thresh': 0.2,  'var_thresh': 0.01}
cnv_meta = {'data': cnv_full_np_log, 'pval_thresh': 0.01, 'var_thresh': 0.01}
met_meta = {'data': met_np,          'pval_thresh': 0.2,  'var_thresh': 0.01}

# Row positions of the training and held-out samples. Every sample is used for
# training and the held-out part is empty. `train_idx` has to be an array of
# positions (not the sample count) to be usable as a row index.
train_idx = np.arange(len(targets_np))
test_idx = []


# VARIANCE
for omic_meta in (exp_meta, cnv_meta, met_meta):
    # Index the feature matrix stored under 'data'; indexing the settings dict
    # itself with row positions raises a KeyError.
    omic_meta['train'] = omic_meta['data'][train_idx]
    omic_meta['test'] = omic_meta['data'][test_idx]

# NOTE(review): 'train' / 'test' are taken before the variance filter, so they
# keep all columns while 'data' is reduced below — confirm this is intended.
for omic_meta in (exp_meta, cnv_meta, met_meta):
    var_sel = VarianceThreshold(omic_meta['var_thresh'])
    omic_meta['data'] = var_sel.fit_transform(omic_meta['data'])
    # Keep the fitted selector alongside the data it was fitted on.
    omic_meta['var_sel'] = var_sel


# Inclusive (first, last) column range of each omic inside X_train. These must
# exist before the significance loop reads them, and they are added to the
# existing dicts (instead of rebuilding the dicts afterwards) so that the
# 'sig_idx' entries written below are still there for the later steps.
# NOTE(review): this assumes X_train's columns are laid out as
# [expression | cnv | metabolomics] with the widths of exp_np, cnv_full_np and
# met_np — X_train / y_train are not defined in this part of the notebook,
# confirm against the cell that builds them.
exp_meta.update({'pval_thresh': 0.2, 'idx_bnds': (0, exp_np.shape[1] - 1)})
exp_last = exp_meta['idx_bnds'][1]
cnv_meta.update({'pval_thresh': 0.01, 'idx_bnds': (exp_last + 1, exp_last + cnv_full_np.shape[1])})
cnv_last = cnv_meta['idx_bnds'][1]
met_meta.update({'pval_thresh': 0.2, 'idx_bnds': (cnv_last + 1, cnv_last + met_np.shape[1])})


# SIGNIFICANCE
# One-way ANOVA p-value per feature, comparing class-1 rows with class-0 rows.
pvals = np.ones(X_train.shape[1])
for j in range(X_train.shape[1]):
    pos = X_train[y_train == 1, j].ravel()
    neg = X_train[y_train == 0, j].ravel()

    # f_oneway warnings are silenced here; NaN results are handled right after
    # the loop.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _, p = f_oneway(pos, neg)

    pvals[j] = p

# A NaN p-value is treated as "not significant".
pvals = np.nan_to_num(pvals, nan=1.0)

for omic_meta in (exp_meta, cnv_meta, met_meta):
    # Boolean mask that is True only on this omic's column range.
    first_col, last_col = omic_meta['idx_bnds']
    mask = np.zeros(pvals.shape, dtype=bool)
    mask[first_col: last_col + 1] = True

    # X_train column indices of this omic whose p-value is below its cut-off.
    omic_meta['sig_idx'] = np.where(
        np.logical_and(pvals < omic_meta['pval_thresh'], mask)
    )[0]

    print(omic_meta['sig_idx'])
    print(len(omic_meta['sig_idx']))
print()


# Fit one class-weighted random forest on all X_train columns, then record per
# omic the columns that received a non-zero importance.
forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=0)
forest.fit(X_train, y_train)

for omic_meta in (exp_meta, cnv_meta, met_meta):
    first_col, last_col = omic_meta['idx_bnds']

    # Mask with one entry per p-value, switched on over this omic's inclusive
    # column range only.
    mask = np.full_like(pvals, False)
    mask[first_col: last_col + 1] = True

    used_by_forest = forest.feature_importances_ > 0.0
    omic_meta['rf_idx'] = np.where(np.logical_and(used_by_forest, mask))[0]

    print(omic_meta['rf_idx'])
    print(len(omic_meta['rf_idx']))
print()


# Per omic, keep the X_train columns chosen by BOTH the p-value filter
# ('sig_idx') and the random forest ('rf_idx').
for omic_meta in (exp_meta, cnv_meta, met_meta):
    in_both = set(omic_meta['sig_idx']) & set(omic_meta['rf_idx'])
    # Sort the indices: a bare set has no meaningful order, and the column
    # order matters later because the correlation filter decides which feature
    # of a correlated pair to drop by position.
    omic_meta['intxn'] = sorted(in_both)
    print(omic_meta['intxn'])
    print(len(omic_meta['intxn']))


# Cut each omic's selected columns out of X_train / X_test and standardise
# them with a scaler fitted on the training part only.
omic_metas = (exp_meta, cnv_meta, met_meta)

for omic_meta in omic_metas:
    chosen_cols = omic_meta['intxn']
    train_part = X_train[:, chosen_cols]
    test_part = X_test[:, chosen_cols]
    scaler = StandardScaler()

    # The scaler is left unfitted when no column was selected for this omic.
    if len(chosen_cols) > 0:
        train_part = scaler.fit_transform(train_part)
        test_part = scaler.transform(test_part)

    omic_meta.update({'sel_train': train_part, 'sel_test': test_part, 'scaler': scaler})

# X_train / X_test are replaced by the reduced, scaled matrices
# (expression, then cnv, then metabolomics); the indices stored under 'intxn'
# refer to the columns of the matrices as they were before this point.
X_train = np.concatenate([meta['sel_train'] for meta in omic_metas], axis=1)
X_test = np.concatenate([meta['sel_test'] for meta in omic_metas], axis=1)

# Remove highly correlated features from the dataset.
# Absolute pairwise correlation between the columns of X_train. atleast_2d
# covers the single-feature case, where corrcoef returns a scalar instead of a
# 1x1 matrix and the triangle extraction below would fail.
corr_matrix = np.atleast_2d(np.abs(np.corrcoef(X_train, rowvar=False)))

# Strict upper triangle: every feature pair appears once, the diagonal and the
# lower triangle are zero.
upper = np.triu(corr_matrix, k=1)

# Drop feature i when it is correlated above `cutoff` with any later feature
# j > i (row i of `upper`), i.e. the later feature of a correlated pair is the
# one that is kept. NaN correlations never exceed the cutoff.
cutoff = 0.8
to_drop = np.where((upper > cutoff).any(axis=1))[0].tolist()
X_train = np.delete(X_train, to_drop, axis=1)
X_test = np.delete(X_test, to_drop, axis=1)