In [None]:
import os
# import sys
import numpy as np
import scipy as sp
import pandas as pd
import scipy.linalg as alg
# import matplotlib.pyplot as plt
from time import time
from tqdm import tqdm
from skrebate import ReliefF
from sklearn.model_selection import KFold
from sklearn.svm import SVR as SupportVectorRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr, spearmanr

## Path & Filename...
# HOMEPATH is a Windows environment variable; os.getenv returns None when it is unset,
# and os.path.join(None, ...) then fails with an opaque TypeError. Fall back to the
# current user's home directory in that case.
_home_dir = os.getenv("HOMEPATH") or os.path.expanduser("~")
# PATH keeps its trailing separator so that `PATH + "<file>"` keeps working.
PATH = os.path.join(_home_dir, "Google Drive\\Study\\ECE 5332-009 - Topics in EE, Data Science\\BeatAML\\")
os.chdir(PATH)

## Read data...
# Every table is a CSV whose first row holds the column names.
RNA = pd.read_csv(os.path.join(PATH, "rnaseq.csv"), header = 0)
DNA = pd.read_csv(os.path.join(PATH, "dnaseq.csv"), header = 0)
CLI_NUM = pd.read_csv(os.path.join(PATH, "clinical_numerical.csv"), header = 0)
CLI_CAT = pd.read_csv(os.path.join(PATH, "clinical_categorical.csv"), header = 0)
CLI_CAT_LEG = pd.read_csv(os.path.join(PATH, "clinical_categorical_legend.csv"), header = 0)

AUC = pd.read_csv(os.path.join(PATH, "aucs.csv"), header = 0)
RESP = pd.read_csv(os.path.join(PATH, "response.csv"), header = 0)

## Process...
# Index the expression table by its "Symbol" column, then drop the first two columns
# (presumably the identifier columns -- TODO confirm against rnaseq.csv).
RNA.index = RNA.Symbol
RNA = RNA.iloc[:, 2:]


In [None]:
## Function definitions...
def RF(X_train, y_train, X_test, seed = 0):
    """Fit a random-forest regressor on the training data and predict the test data.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : target values passed to ``fit``.
    X_test  : feature matrix passed to ``predict``.
    seed    : forwarded to the estimator as ``random_state`` (default 0).

    Returns
    -------
    The fitted model's predictions for ``X_test``.
    """
    # The split criterion is deliberately left at the estimator default (mean squared
    # error). Spelling it as 'mse' is rejected by scikit-learn >= 1.2, where it was
    # renamed to 'squared_error', and the new spelling is rejected by releases < 1.0.
    mdl = RandomForestRegressor(n_estimators = 200, min_samples_leaf = 5,
                                random_state = seed)
    y_pred = mdl.fit(X_train, y_train).predict(X_test)
    return y_pred

def SVR(X_train, y_train, X_test):
    """Train an RBF-kernel support vector regressor and predict the test samples.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : target values passed to ``fit``.
    X_test  : feature matrix passed to ``predict``.

    Returns
    -------
    The fitted model's predictions for ``X_test``.
    """
    # Hyper-parameters are fixed; collected in one place for readability.
    svr_params = dict(kernel = "rbf", degree = 3, gamma = "scale", tol = 1e-3, C = 100)
    regressor = SupportVectorRegressor(**svr_params)
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)

def EN(X_train, y_train, X_test):
    """Train an Elastic Net regressor and predict the test samples.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : target values passed to ``fit``.
    X_test  : feature matrix passed to ``predict``.

    Returns
    -------
    The fitted model's predictions for ``X_test``.
    """
    # Hyper-parameters are fixed; random_state is pinned to 0.
    en_params = dict(fit_intercept = True, l1_ratio = 0.8, alpha = 0.9, tol = 1e-3,
                     random_state = 0)
    regressor = ElasticNet(**en_params)
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)

def EVAL_PERF(y_label, y_pred, alpha = 0.05):
    """Score predictions against the true values with four summary metrics.

    Parameters
    ----------
    y_label : array-like of true values (squeezed to drop length-1 axes).
    y_pred  : array-like of predicted values, same length as ``y_label``.
    alpha   : significance level; a correlation whose p-value is not below
              ``alpha`` is reported as 0.

    Returns
    -------
    (PCC, SCC, NRMSE, NMAE)
        PCC   : Pearson correlation, or 0 if not significant.
        SCC   : Spearman correlation, or 0 if not significant.
        NRMSE : root-mean-squared error divided by the population standard
                deviation (``ddof = 0``) of ``y_label``.
        NMAE  : mean absolute error divided by the mean absolute deviation of
                ``y_label`` from its mean.
    """
    y_label, y_pred = np.array(y_label).squeeze(), np.array(y_pred).squeeze()

    # A NaN p-value compares False against alpha, so such a correlation is also zeroed.
    PCC, pval = pearsonr(y_label, y_pred)
    PCC = PCC if pval < alpha else 0
    SCC, pval = spearmanr(y_label, y_pred)
    SCC = SCC if pval < alpha else 0

    residual = y_label - y_pred
    # The square root is required: without it this is MSE / std, which is neither an
    # RMSE nor dimensionless.
    NRMSE = np.sqrt((residual**2).mean()) / y_label.std(ddof = 0)
    NMAE  = np.abs(residual).mean() / np.abs(y_label - y_label.mean()).mean()
    return PCC, SCC, NRMSE, NMAE


In [None]:
## Per-drug cross-validated benchmark of RF / SVR / EN on ReliefF-selected features.
drug_list = AUC.inhibitor.unique().tolist()
# Keep only the rows of RNA whose variance across columns exceeds 0.1.
RNA_filt = RNA.iloc[(RNA.var(axis = 1) > 0.1).to_numpy(), :]

p_top = 1000                                   # number of top-ranked features fed to the models
FS = ReliefF(n_features_to_select = p_top, n_neighbors = 10, n_jobs = 2)
feature_ranks = {kk: [ ] for kk in drug_list}  # per drug: one feature ranking per CV fold

METRICS = ["PCC", "SCC", "NRMSE", "NMAE"]      # order matches the tuple returned by EVAL_PERF
RESULTS_RF  = pd.DataFrame(dtype = float, index = drug_list, columns = METRICS)
RESULTS_SVR = pd.DataFrame(dtype = float, index = drug_list, columns = METRICS)
RESULTS_EN  = pd.DataFrame(dtype = float, index = drug_list, columns = METRICS)

# NOTE(review): only the first drug is processed -- looks like a debugging limit; confirm.
for drug in drug_list[:1]:
    print("Chosen drug =", drug)
    y_data = AUC.iloc[(AUC.inhibitor == drug).tolist(), :]
    # Rows of X_data are the lab_ids measured for this drug; y_data becomes their AUC values.
    X_data, y_data = RNA_filt.loc[:, y_data.lab_id].T, y_data["auc"].to_numpy()

    Y_pred = pd.DataFrame(dtype = float, index = X_data.index, columns = ["Actual", "RF", "SVR", "EN"])
    # random_state is omitted: it has no effect when shuffle is False, and
    # scikit-learn >= 0.24 raises ValueError if it is passed in that case.
    CV = KFold(n_splits = 3, shuffle = False)
    for train_idx, test_idx in tqdm(CV.split(X_data)):
        X_train, y_train = X_data.iloc[train_idx, :], y_data[train_idx]
        X_test,  y_test  = X_data.iloc[test_idx, :],  y_data[test_idx]

        ## Perform ReliefF...
        # Feature selection is fitted on the training fold only.
        dt = time()
        FS.fit(X_train.values, y_train)
        dt = time() - dt
        print("Elapsed time = %0.4f sec." % dt)
        feat_top = FS.top_features_
        feature_ranks[drug].append(feat_top)
        X_train, X_test = X_train.iloc[:, feat_top[:p_top]], X_test.iloc[:, feat_top[:p_top]]

        ## Perform prediction...
        test_idx_lab = X_data.index[test_idx]
        Y_pred.loc[test_idx_lab, "Actual"] = y_test
        Y_pred.loc[test_idx_lab, "RF"]     = RF(X_train, y_train, X_test, seed = 0)
        Y_pred.loc[test_idx_lab, "SVR"]    = SVR(X_train, y_train, X_test)
        # Was RF(...) by copy-paste, which made the "EN" column duplicate the RF results.
        Y_pred.loc[test_idx_lab, "EN"]     = EN(X_train, y_train, X_test)
    #### CV loop ends.

    # Metrics are computed once per drug over the pooled out-of-fold predictions.
    RESULTS_RF.loc[drug, :]  = EVAL_PERF(Y_pred["Actual"], Y_pred["RF"])
    RESULTS_SVR.loc[drug, :] = EVAL_PERF(Y_pred["Actual"], Y_pred["SVR"])
    RESULTS_EN.loc[drug, :]  = EVAL_PERF(Y_pred["Actual"], Y_pred["EN"])

    print(RESULTS_RF.loc[drug, :])
    print(RESULTS_SVR.loc[drug, :])
    print(RESULTS_EN.loc[drug, :])
    ####
        



In [None]:
# %qtconsole --style monokai