In [1]:
import numpy as np
import pandas as pd

from scipy.stats import f_oneway, mannwhitneyu, shapiro

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score

import time
import warnings

#### Load data

In [2]:
exp = pd.read_csv(r'../../data/clean/gene_expression.csv')
cnv = pd.read_csv(r'../../data/clean/cnv.csv')
cnv_full = pd.read_csv(r'../../data/cnv.csv')
met = pd.read_csv(r'../../data/clean/metabolomics.csv')
labels = pd.read_csv(r'../../data/labels.csv')

In [3]:
exp.drop(columns="Unnamed: 0", inplace=True)
cnv.drop(columns="Unnamed: 0", inplace=True)
cnv_full.drop(columns="Unnamed: 0", inplace=True)
met.drop(columns="Unnamed: 0", inplace=True)
labels.drop(columns="Unnamed: 0", inplace=True)

#### Split labels into classes

In [4]:
emci = labels[labels["DX_bl"] == "EMCI"].reset_index(drop=True)
cn = labels[labels["DX_bl"] == "CN"].reset_index(drop=True)
lmci = labels[labels["DX_bl"] == "LMCI"].reset_index().drop(columns='index')
# ad = labels[labels["DX_bl"] == "AD"].reset_index().drop(columns='index')

targets = pd.concat([cn, emci, lmci]).reset_index(drop=True)
encoding = {"CN": 0, "EMCI": 1, "LMCI": 1}
targets.loc[:, "DX_bl"] = targets["DX_bl"].map(encoding)
targets = targets.astype({"DX_bl": "int16"})

In [5]:
exp = exp.merge(targets[["PTID"]], how='inner', on="PTID")
cnv = cnv.merge(targets[["PTID"]], how='inner', on="PTID")
cnv_full = cnv_full.merge(targets[["PTID"]], how='inner', on="PTID")
met = met.merge(targets[["PTID"]], how='inner', on="PTID")

exp.sort_values(by="PTID", ignore_index=True, inplace=True)
cnv.sort_values(by="PTID", ignore_index=True, inplace=True)
cnv_full.sort_values(by="PTID", ignore_index=True, inplace=True)
met.sort_values(by="PTID", ignore_index=True, inplace=True)
targets.sort_values(by="PTID", ignore_index=True, inplace=True)

exp_np = exp.select_dtypes(include=np.number).to_numpy()
cnv_full_np = cnv_full.select_dtypes(include=np.number).to_numpy()
met_np = met.select_dtypes(include=np.number).to_numpy()
targets_np = targets[['DX_bl']].to_numpy().ravel()

cnv_full_np_log = np.log2(cnv_full_np + 1)

In [7]:
np.min(exp_np)

0.293653371820192

In [8]:
exp_meta = {'data': exp_np,          'pval_thresh': 0.2,  'var_thresh': 0.01}
cnv_meta = {'data': cnv_full_np_log, 'pval_thresh': 0.01, 'var_thresh': 0.01}
met_meta = {'data': met_np,          'pval_thresh': 0.2,  'var_thresh': 0.01}

train_idx = np.arange(targets_np.shape[0])
test_idx = []


# VARIANCE
for omic_meta in (exp_meta, cnv_meta, met_meta):
    omic_meta['train'] = omic_meta[train_idx]
    omic_meta['test'] = omic_meta[test_idx]

for omic_meta in (exp_meta, cnv_meta, met_meta):
    var_sel = VarianceThreshold(omic_meta['var_thresh'])
    omic_meta['data'] = var_sel.fit_transform(omic_meta['data'])
    omic_meta['var_sel'] = var_sel


# SIGNIFICANCE
for omic_meta in (exp_meta, cnv_meta, met_meta):
    pvals = np.ones((omic_meta['train'].shape[1]))
    for j in range(omic_meta['train'].shape[1]):
        pos = omic_meta['train'][targets_np[train_idx] == 1, j].ravel()
        neg = omic_meta['train'][targets_np[train_idx] == 0, j].ravel()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _, p = f_oneway(pos, neg)

        pvals[j] = p

    pvals = np.nan_to_num(pvals, nan=1.0)

    omic_meta['sig_idx'] = np.nonzero(pvals < omic_meta['pval_thresh'])[0]
    omic_meta['train'] = omic_meta['train'][omic_meta['sig_idx']]
    omic_meta['test'] = omic_meta['test'][omic_meta['sig_idx']]


exp_meta['idx_bnds': (0, exp_meta['train'].shape[1] - 1)]
cnv_meta['idx_bnds': (exp_meta['idx_bnds'][1] + 1, exp_meta['idx_bnds'][1] + cnv_meta['train'].shape[1])]
met_meta['idx_bnds': (cnv_meta['idx_bnds'][1] + 1, cnv_meta['idx_bnds'][1] + met_meta['train'].shape[1])]

X_train = np.concatenate(exp_meta['train'], cnv_meta['train'], met_meta['train'])
X_test = np.concatenate(exp_meta['test'], cnv_meta['test'], met_meta['test'])


# CORRELATION
# remove Highly Correlated Features from the dataset
corr_matrix = np.abs(np.corrcoef(X_train, rowvar=False))
upper = np.where(
    np.logical_and(
        corr_matrix, np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    ), corr_matrix, 0
)

# drop features with high similarity
cutoff = 0.8
drop_mask = [any(upper[:, column] > cutoff) for column in range(upper.shape[1])]


# RANDOM FOREST
forest = RandomForestClassifier(random_state=0, class_weight='balanced', n_jobs=-1)
forest.fit(X_train, targets_np[train_idx])
for omic_meta in (exp_meta, cnv_meta, met_meta):
    omic_meta['rf_idx'] = np.nonzero(forest.feature_importances_ > 0.0)[0]
    omic_meta['train'] = omic_meta['train'][omic_meta['rf_idx']]
    omic_meta['test'] = omic_meta['test'][omic_meta['rf_idx']]

    print(omic_meta['rf_idx'])
    print(len(omic_meta['rf_idx']))
print()




KeyError: 606

In [12]:
targets_np.shape

(606,)