In [5]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành - Tien-Thanh Bui (@bu1th4nh)
# Title: playground_data.ipynb
# Date: 2024/11/07 14:39:32
# Description: 
# 
# (c) 2024 bu1th4nh. All rights reserved. 
# Written with dedication in the University of Central Florida, EPCOT and the Magic Kingdom.
# -----------------------------------------------------------------------------------------------

import os

import numpy as np
import pandas as pd 
from s3fs import S3FileSystem

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, average_precision_score
from sklearn.svm import SVC

from tqdm import tqdm

# S3 connection settings.
# Credentials are read from the environment so that no secret is stored in the notebook
# (or leaked through its saved outputs / version control). Before starting the kernel:
#     export S3_ACCESS_KEY=...
#     export S3_SECRET_KEY=...
#     export S3_ENDPOINT_URL=...   # optional, defaults to the local endpoint below
_missing_s3_vars = [name for name in ('S3_ACCESS_KEY', 'S3_SECRET_KEY') if not os.environ.get(name)]
if _missing_s3_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_s3_vars)}. "
        "Set them before running this notebook."
    )

key = os.environ['S3_ACCESS_KEY']
secret = os.environ['S3_SECRET_KEY']
endpoint_url = os.environ.get('S3_ENDPOINT_URL', 'http://localhost:9000')

# Filesystem handle for direct S3 access.
s3 = S3FileSystem(
    anon=False,
    endpoint_url=endpoint_url,
    key=key,
    secret=secret,
    # Follow the endpoint scheme; this is False for the default http:// endpoint.
    use_ssl=endpoint_url.startswith('https://'),
)

# Same connection settings in the form expected by pandas' `storage_options=` argument.
storage_option = {
    'key': key,
    'secret': secret,
    'endpoint_url': endpoint_url,
}



In [8]:
def evaluate_one_target(H, testdata, methods_list, target, random_state=None):
    """
    Fit and score every requested classifier on every train/test split in ``testdata``.

    Parameters
    ----------
    H : pd.DataFrame
        Feature table. Rows are selected by label via ``H.loc[sample_ids]`` and passed
        to the classifiers as a plain array.
    testdata : pd.DataFrame
        One row per split. Must provide the columns ``'train_sample_ids'``,
        ``'train_ground_truth'``, ``'test_sample_ids'`` and ``'test_ground_truth'``.
    methods_list : list of str
        Classifier names to evaluate. Supported names: ``"SVM"``, ``"Random Forest"``,
        ``"Logistic Regression"`` and ``"AdaBoost"``.
    target : str
        Name of the prediction target; only used to label the progress bar.
    random_state : int, RandomState instance or None, default None
        Forwarded to every classifier so that runs can be made reproducible.
        ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    dict of str -> pd.DataFrame
        One frame per method, indexed like ``testdata`` with the columns
        ``pred``, ``prob``, ``ACC``, ``REC``, ``F1``, ``MCC``, ``AUROC``, ``AUPRC``.
        ``pred`` and ``prob`` hold whole arrays per cell; the rest hold scalars.

    Raises
    ------
    ValueError
        If ``methods_list`` contains a name that is not supported. Checked before any
        model is trained, so a typo can never silently reuse another method's classifier.
    """
    # Factories are lazy so that only the requested estimators are ever constructed.
    classifier_factories = {
        "SVM": lambda: SVC(probability=True, verbose=False, random_state=random_state),
        "Random Forest": lambda: RandomForestClassifier(verbose=False, random_state=random_state),
        "Logistic Regression": lambda: LogisticRegression(max_iter=1000, verbose=False, random_state=random_state),
        "AdaBoost": lambda: AdaBoostClassifier(random_state=random_state),
    }

    unknown_methods = [name for name in methods_list if name not in classifier_factories]
    if unknown_methods:
        raise ValueError(
            f"Unknown classification method(s): {unknown_methods}. "
            f"Supported methods: {sorted(classifier_factories)}"
        )

    # Prepping the result containers: one (initially empty) frame per method
    metrics = ['pred', 'prob', 'ACC', 'REC', 'F1', 'MCC', 'AUROC', 'AUPRC']
    results = {
        method: pd.DataFrame(index=testdata.index, columns=metrics)
        for method in methods_list
    }

    # Iterate through each test split
    for test_id in tqdm(testdata.index, desc=f"Evaluating target {target} on testdata"):
        # Get sample IDs and labels of this split
        train_sample_ids = testdata.loc[test_id, 'train_sample_ids']
        train_gnd_truth = testdata.loc[test_id, 'train_ground_truth']
        test_sample_ids = testdata.loc[test_id, 'test_sample_ids']
        test_gnd_truth = testdata.loc[test_id, 'test_ground_truth']

        # Get train/test X and Y
        X_train = H.loc[train_sample_ids].values
        Y_train = np.array(train_gnd_truth)
        X_test = H.loc[test_sample_ids].values
        Y_test = np.array(test_gnd_truth)

        # Evaluate each method with a freshly constructed estimator
        for cls_method in methods_list:
            clf = classifier_factories[cls_method]()

            # Fit & predict
            clf.fit(X_train, Y_train)
            pred = clf.predict(X_test)
            # Second probability column; assumes a binary problem where that column
            # is the positive class -- TODO confirm against the label encoding.
            prob = clf.predict_proba(X_test)[:, 1]

            scores = {
                'pred': pred,
                'prob': prob,
                'ACC': accuracy_score(Y_test, pred),
                'REC': recall_score(Y_test, pred),
                'F1': f1_score(Y_test, pred),
                'MCC': matthews_corrcoef(Y_test, pred),
                'AUROC': roc_auc_score(Y_test, prob),
                'AUPRC': average_precision_score(Y_test, prob),
            }

            # Store cell by cell: `.at` lets the array-valued entries live in a single cell
            method_frame = results[cls_method]
            for metric_name, value in scores.items():
                method_frame.at[test_id, metric_name] = value

    return results


In [None]:
# --- Inputs: the H matrix and the ER clinical train/test splits, both read from S3 ---
H = pd.read_parquet(
    's3://results/SimilarSampleCrossOmicNMF/brca/k-10-alpha-0-beta-0.01-gamma-overridden/H.parquet',
    storage_options=storage_option,
)
test_data = pd.read_parquet(
    's3://datasets/BreastCancer/clinical_testdata/ER.parquet',
    storage_options=storage_option,
)

# --- Run the evaluation for the ER target ---
result_pack = evaluate_one_target(
    H,
    testdata=test_data,
    methods_list=["Logistic Regression", "Random Forest"],
    target='ER',
)

# --- Build the staging package ---
data_pack = {'run_id': '1', 'target_id': 'ER', 'summary': {}}

# (label, reducer) pairs, applied in this order to every scalar metric column
summary_stats = (
    ('Mean', np.mean),
    ('Median', np.median),
    ('Std', np.std),
    ('Max', np.max),
    ('Min', np.min),
)

for method, method_frame in result_pack.items():
    # Per-test results, keyed by test id
    data_pack[method] = method_frame.to_dict(orient='index')

    for metric in method_frame.columns:
        # Convention: only upper-case column names are scalar metrics to summarise
        if not str(metric).isupper():
            continue

        metric_values = method_frame[metric].values
        for stat_label, stat_fn in summary_stats:
            data_pack['summary'][f'{method} {stat_label} {metric}'] = stat_fn(metric_values)


Evaluating target ER on testdata:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating target ER on testdata: 100%|██████████| 200/200 [00:22<00:00,  9.02it/s]


In [17]:
# Show the Random Forest results as a nested dict: {test id -> {column -> value}}
rf_frame = result_pack["Random Forest"]
rf_frame.to_dict(orient='index', index=True)

{'Test000': {'pred': array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
         1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 1, 0]),
  'prob': array([0.47, 0.96, 0.89, 0.44, 0.98, 0.99, 0.99, 0.87, 0.98, 1.  , 1.  ,
         1.  , 1.  , 1.  , 1.  , 0.52, 0.43, 0.23, 1.  , 0.98, 1.  , 1.  ,
         0.96, 0.45, 1.  , 0.96, 0.99, 0.89, 0.96, 1.  , 0.97, 1.  , 0.99,
         0.99, 0.96, 0.96, 0.27, 1.  , 0.96, 0.99, 0.98, 0.99, 1.  , 0.96,
         0.99, 0.49, 0.96, 0.05]),
  'ACC': 0.9375,
  'REC': 1.0,
  'F1': 0.961038961038961,
  'MCC': 0.8201995322647244,
  'AUROC': 0.9484029484029484,
  'AUPRC': 0.9766762529920425},
 'Test001': {'pred': array([1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
         1, 1, 1, 1]),
  'prob': array([0.92, 0.36, 0.99, 0.92, 1.  , 0.08, 0.99, 0.89, 0.21, 0.23, 0.95,
         0.82, 0.97, 1.  , 0.15, 0.12, 0