In [7]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành - Tien-Thanh Bui (@bu1th4nh)
# Title: playground_data.ipynb
# Date: 2024/11/07 14:39:32
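# Description: Scratch notebook that loads multi-omics and clinical/survival test data from
#              S3-compatible storage and tries out classifier evaluation on a sample matrix.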
# 
# (c) 2024 bu1th4nh. All rights reserved. 
# Written with dedication in the University of Central Florida, EPCOT and the Magic Kingdom.
# -----------------------------------------------------------------------------------------------

from s3fs import S3FileSystem
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, average_precision_score
from sklearn.svm import SVC

from tqdm import tqdm

import os

# S3 connection settings. Each value can be overridden through an environment variable so
# that credentials do not have to be edited into (and committed with) the notebook. The
# fallbacks are the values this notebook originally hard-coded, so behavior is unchanged
# when the variables are not set.
# NOTE(review): the fallback key/secret look like local development credentials for the
# endpoint on localhost:9000 -- if they are used anywhere else, rotate them and drop the
# defaults from this cell.
key = os.environ.get('S3_KEY', 'bu1th4nh')
secret = os.environ.get('S3_SECRET', 'ariel.anna.elsa')
endpoint_url = os.environ.get('S3_ENDPOINT_URL', 'http://localhost:9000')

# Filesystem handle used for directory listings (s3.ls) later in the notebook.
s3 = S3FileSystem(
    anon=False,
    endpoint_url=endpoint_url,
    key=key,
    secret=secret,
    use_ssl=False
)

# Options forwarded to pandas readers through their `storage_options` argument.
storage_options = {
    'key': key,
    'secret': secret,
    'endpoint_url': endpoint_url,
}
# Singular alias kept because a later cell refers to `storage_option`.
storage_option = storage_options

# Dataset locations (bucket/prefix) read by the cells below.
DATA_PATH = 's3://datasets/LungCancer/processed_3_omics_mRNA_miRNA_methDNA'
SA_TARG_PATH = 's3://datasets/LungCancer/survivalanalysis_testdata_3_omics_mRNA_miRNA_methDNA'



In [None]:
def evaluate_one_target(H, testdata, methods_list, target, random_state=None):
    """Fit and score every requested classifier on every split listed in ``testdata``.

    Parameters
    ----------
    H : pandas.DataFrame
        Feature table indexed by sample ID; rows are selected with ``H.loc[sample_ids]``.
    testdata : pandas.DataFrame
        One row per test split. Each row must provide the columns ``train_sample_ids``,
        ``train_ground_truth``, ``test_sample_ids`` and ``test_ground_truth``.
    methods_list : list of str
        Classifier names to evaluate. Supported names are ``"SVM"``, ``"Random Forest"``,
        ``"Logistic Regression"`` and ``"AdaBoost"``.
    target : str
        Name of the target being evaluated; only used in the progress-bar label.
    random_state : int or None, optional
        Seed forwarded to every classifier so a run can be reproduced. The default
        (``None``) leaves the classifiers unseeded, as before this parameter existed.

    Returns
    -------
    dict
        Maps each method name to a DataFrame indexed like ``testdata`` with the columns
        ``pred``, ``prob``, ``ACC``, ``REC``, ``F1``, ``MCC``, ``AUROC`` and ``AUPRC``.
        ``pred`` and ``prob`` hold the per-sample prediction arrays of that split.

    Raises
    ------
    ValueError
        If ``methods_list`` contains a name that is not a supported classifier.
    """
    # Factories are lazy so a fresh, unfitted estimator is built for every split.
    classifier_factories = {
        "SVM": lambda: SVC(probability=True, verbose=False, random_state=random_state),
        "Random Forest": lambda: RandomForestClassifier(verbose=False, random_state=random_state),
        "Logistic Regression": lambda: LogisticRegression(max_iter=1000, verbose=False, random_state=random_state),
        "AdaBoost": lambda: AdaBoostClassifier(random_state=random_state),
    }

    # Reject unknown names up front. Previously an unknown name either crashed with a
    # NameError or silently reused the classifier left over from the previous iteration,
    # recording its scores under the wrong method name.
    unknown_methods = [name for name in methods_list if name not in classifier_factories]
    if unknown_methods:
        raise ValueError(
            f"Unknown classification method(s): {unknown_methods}. "
            f"Supported methods: {list(classifier_factories)}"
        )

    # Prepping the result tables: one per method, one row per test split
    metrics = ['pred', 'prob', 'ACC', 'REC', 'F1', 'MCC', 'AUROC', 'AUPRC']
    results = {
        method: pd.DataFrame(index=testdata.index, columns=metrics)
        for method in methods_list
    }

    # Iterate through each test
    for test_id in tqdm(testdata.index, desc=f"Evaluating target {target} on testdata"):
        # Get sample IDs and labels of this split
        train_sample_ids = testdata.loc[test_id, 'train_sample_ids']
        train_gnd_truth = testdata.loc[test_id, 'train_ground_truth']
        test_sample_ids = testdata.loc[test_id, 'test_sample_ids']
        test_gnd_truth = testdata.loc[test_id, 'test_ground_truth']

        # Get train test X/Y
        X_train = H.loc[train_sample_ids].values
        Y_train = np.array(train_gnd_truth)
        X_test = H.loc[test_sample_ids].values
        Y_test = np.array(test_gnd_truth)

        # Evaluate each method
        for cls_method in methods_list:
            cls = classifier_factories[cls_method]()

            # Fit & predict the model
            cls.fit(X_train, Y_train)
            pred = cls.predict(X_test)
            # Column 1 of predict_proba is used as the positive-class score.
            # NOTE(review): assumes binary labels whose positive class is listed second
            # in the fitted classifier's classes -- confirm against the test data.
            prob = cls.predict_proba(X_test)[:, 1]

            # Metrics (all with scikit-learn's default settings) plus the raw outputs
            split_scores = {
                'pred': pred,
                'prob': prob,
                'ACC': accuracy_score(Y_test, pred),
                'REC': recall_score(Y_test, pred),
                'F1': f1_score(Y_test, pred),
                'MCC': matthews_corrcoef(Y_test, pred),
                'AUROC': roc_auc_score(Y_test, prob),
                'AUPRC': average_precision_score(Y_test, prob),
            }

            # Store the result; `.at` writes one cell at a time, which lets the
            # array-valued 'pred'/'prob' entries live in a single cell.
            for metric_name, value in split_scores.items():
                results[cls_method].at[test_id, metric_name] = value

    return results


In [None]:
# Read the two inputs of this experiment from object storage:
#   H         - the H matrix stored under the SimilarSampleCrossOmicNMF results prefix
#   test_data - the ER clinical test-split table of the BreastCancer dataset
H = pd.read_parquet(
    's3://results/SimilarSampleCrossOmicNMF/brca/k-10-alpha-0-beta-0.01-gamma-overridden/H.parquet',
    storage_options=storage_option,
)
test_data = pd.read_parquet(
    's3://datasets/BreastCancer/clinical_testdata/ER.parquet',
    storage_options=storage_option,
)


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, average_precision_score, precision_score
from sklearn.svm import SVC

from tqdm import tqdm


# Take the training sample IDs and training labels of split 'Test000'
train_sample_ids, train_gnd_truth = (
    test_data.loc['Test000', field]
    for field in ('train_sample_ids', 'train_ground_truth')
)


In [None]:
# Cross-validate a default SVC on the training rows of H, scoring each fold with the
# metrics named below; only test-fold scores are returned.
Ariel = cross_validate(
    SVC(),
    H.loc[train_sample_ids].values,
    train_gnd_truth,
    scoring=dict(
        ACC='accuracy',
        PRE='precision',
        REC='recall',
        F1='f1',
        MCC='matthews_corrcoef',
        AUROC='roc_auc',
        AUPRC='average_precision',
    ),
    n_jobs=-1,  # use every available core
    return_train_score=False,
    # return_estimator=True,
)
Ariel

In [3]:


# Read the bipartite table and the three omics tables stored under DATA_PATH
bipart_data = pd.read_parquet(
    f'{DATA_PATH}/bipart.parquet',
    storage_options=storage_options,
)
methDNA = pd.read_parquet(
    f'{DATA_PATH}/methDNA.parquet',
    storage_options=storage_options,
)
miRNA = pd.read_parquet(
    f'{DATA_PATH}/miRNA.parquet',
    storage_options=storage_options,
)
mRNA = pd.read_parquet(
    f'{DATA_PATH}/mRNA.parquet',
    storage_options=storage_options,
)

# Omics tables in a fixed order (mRNA, miRNA, methDNA) and, in the same order,
# the list of index labels of each table
omics_data = [mRNA, miRNA, methDNA]
features_list = [omic.index.to_list() for omic in omics_data]



In [9]:

# -----------------------------------------------------------------------------------------------
# Obtain survival analysis targets
# -----------------------------------------------------------------------------------------------
# logging.info(f"Retrieving survival analysis targets from {SA_TARG_PATH}")
surv_targets_data = {}
# s3.ls returns paths without the protocol, so it is added back for pandas
surv_target_folder = [f's3://{a}' for a in s3.ls(SA_TARG_PATH)]
for tar in tqdm(surv_target_folder, desc='Preloading target data'):
    # Key each table by its file name up to the first dot
    target_id = tar.split('/')[-1].split('.')[0]
    surv_targets_data[target_id] = pd.read_parquet(tar, storage_options=storage_options)
    print(surv_targets_data[target_id].columns)

# NOTE(review): the target is picked by position (second file listed). In the recorded
# output that is the table with the 'Overall Survival' columns, but the choice depends
# on the listing order -- consider selecting it by name instead.
survival = surv_targets_data[list(surv_targets_data.keys())[1]]

# Stack the omics tables row-wise, then keep only the columns whose label also appears
# in the survival table's index. A boolean mask keeps the original column order; the
# previous set-based intersection produced an arbitrary order that could change between
# kernel sessions.
H_original_data = pd.concat(omics_data, axis=0)
H_original_data = H_original_data.loc[:, H_original_data.columns.isin(survival.index)]

Preloading target data: 100%|██████████| 2/2 [00:00<00:00, 46.72it/s]

Index(['Disease Free Status', 'Disease Free (Months)'], dtype='object')
Index(['Overall Survival Status', 'Overall Survival (Months)'], dtype='object')



