In [1]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành (@bu1th4nh)
# Title: playground_classification.ipynb
# Date: 2024/10/03 15:27:39
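# Description: Trains four scikit-learn classifiers on the matrix H of a selected run to
#              predict ER status, then logs per-classifier ROC/AUC to MLflow.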
# 
# (c) bu1th4nh. All rights reserved
# -----------------------------------------------------------------------------------------------


import os

import mlflow
import numpy as np
import pandas as pd
# Honour the MLFLOW_TRACKING_URI environment variable when it is set, so the notebook can
# target another tracking server without being edited; otherwise use the local server.
mlflow.set_tracking_uri(uri=os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:6969"))
mlflow.set_experiment("SimilarSampleCrossOmicNMF")



# Registry of the runs this notebook can evaluate: key -> (run_name, run_id).
# Switch runs by changing SELECTED_RUN instead of (un)commenting assignments.
KNOWN_RUNS = {
    # Not fixed gamma
    'alpha_2': ('ariel-elsa-aurora-20241002-15.12.43', 'd4cf242b1a3540d5b2cb91dc52dbd991'),  # Alpha = 2
    'alpha_1': ('ariel-moana-mulan-20241002-23.32.36', '1e2f0bbe6ada401aae9d027bcd0fa8b5'),  # Alpha = 1
    # Fixed gamma
    'fixed_gamma': ('merida-mulan-anna-20241003-08.56.41', 'abc7d2cfd6d04ab7b48de90d24e8cfc4'),
    # Baseline: NMF Only
    'baseline_nmf_only': ('rapunzel-rapunzel-ariel-20241006-10.23.03', '7f933693ee25409c8fdb90b04a5a26b8'),
    # Baseline: Raw
    'baseline_raw': ('baseline_rawdata', '62565423ab374538bddded038085c625'),
}

SELECTED_RUN = 'baseline_raw'
run_name, run_id = KNOWN_RUNS[SELECTED_RUN]



# Both locations default to the original machine's layout. Set CROSSOMICS_DATA_DIR and/or
# CROSSOMICS_RESULTS_DIR to run the notebook elsewhere without editing it.
ORGL_PATH = os.environ.get(
    'CROSSOMICS_DATA_DIR',
    '/home/ti514716/Datasets/BreastCancer/processed_crossOmics',
)
# Per-run results directory: <results root>/<run_name>.
RESULT_PATH = os.path.join(
    os.environ.get(
        'CROSSOMICS_RESULTS_DIR',
        '/home/ti514716/Projects/SimilarSampleCrossOmicNMF/results',
    ),
    run_name,
)

## Data Acquisition & Merging

In [None]:
# H is read from the selected run's result directory; its index is matched against the
# clinical table's index below.
H = pd.read_parquet(f'{RESULT_PATH}/H.parquet')
display(H.head())

mRNA = pd.read_parquet(f'{ORGL_PATH}/mRNA.parquet')
miRNA = pd.read_parquet(f'{ORGL_PATH}/miRNA.parquet')
clinical = pd.read_parquet(f'{ORGL_PATH}/clinical.parquet')

display(clinical.head())
# display(mRNA.head())
# display(miRNA.head())


# Sorted on purpose: iterating a set has no guaranteed order (string hashing is randomised
# per interpreter session), and the seeded train/test split later selects rows by position.
# A stable order here is what makes that split reproducible across kernel restarts.
common_samples = sorted(set(H.index).intersection(clinical.index))
print(f'Common samples: {len(common_samples)}')

In [None]:
# Restrict features and labels to the samples present in both tables, in the same order.
data = H.loc[common_samples]
# Explicit copy so the column assignment below never targets a slice of `clinical`.
label = clinical.loc[common_samples, ['ER']].copy()
# Binary target: 1 for 'Positive', 0 for everything else.
# NOTE(review): any value other than 'Positive' (including missing ones) is labelled 0 -
# confirm that is intended for the ER column.
label['ER'] = label['ER'].eq('Positive').astype(int)


display(data.head())
display(label.head())

display(label['ER'].value_counts())

is_positive = label['ER'] == 1
positive_samples = list(label.index[is_positive])
negative_samples = list(label.index[~is_positive])

# Independent copies, so later edits to either class frame cannot touch `data`.
positive_data = data.loc[positive_samples].copy(deep=True)
negative_data = data.loc[negative_samples].copy(deep=True)

## Dataset Preparation

In [4]:
from sklearn.model_selection import train_test_split

# Split each class separately (80/20) so that train and test keep the overall
# positive/negative ratio.
X_train_pos, X_test_pos = train_test_split(positive_data, test_size=0.2, random_state=42)
X_train_neg, X_test_neg = train_test_split(negative_data, test_size=0.2, random_state=42)
Y_train_pos = np.ones(X_train_pos.shape[0])
Y_test_pos = np.ones(X_test_pos.shape[0])
Y_train_neg = np.zeros(X_train_neg.shape[0])
Y_test_neg = np.zeros(X_test_neg.shape[0])

# Stack positives on top of negatives; the label vectors are concatenated in the same order.
train_dataset = pd.concat([X_train_pos, X_train_neg])
test_dataset = pd.concat([X_test_pos, X_test_neg])
Y_train = np.concatenate([Y_train_pos, Y_train_neg])
Y_test = np.concatenate([Y_test_pos, Y_test_neg])

# Attach the label as a column so features and label are shuffled together.
train_dataset['ER'] = Y_train
test_dataset['ER'] = Y_test


# Shuffle so the two classes are interleaved. The shuffle is seeded like the split above;
# without a seed the row order (and anything order-dependent downstream) changes per run.
train_dataset = train_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1, random_state=42).reset_index(drop=True)


# Split X-Y: features as a NumPy array, labels as a Series.
X_train = train_dataset.drop(columns=['ER']).values
Y_train = train_dataset['ER']
X_test = test_dataset.drop(columns=['ER']).values
Y_test = test_dataset['ER']

## Binary Classification

### Canonical ML Classifiers

In [5]:
def SVMClassifier(train_data, train_labl, random_state=42):
    """Fit a scikit-learn ``SVC`` with probability estimates enabled.

    Parameters
    ----------
    train_data : array-like
        Training features, passed to ``SVC.fit`` unchanged.
    train_labl : array-like
        Training labels, passed to ``SVC.fit`` unchanged.
    random_state : int or None, default 42
        Seed forwarded to ``SVC``. With ``probability=True`` the estimator shuffles the
        data internally to calibrate probabilities, so an unseeded fit can give slightly
        different ``predict_proba`` values on each run. Pass ``None`` for an unseeded fit.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    from sklearn.svm import SVC
    return SVC(probability=True, verbose=True, random_state=random_state).fit(train_data, train_labl)

def RandomForestClassifier(train_data, train_labl, random_state=42):
    """Fit a scikit-learn random forest on the given training data.

    This helper shares its name with the scikit-learn class it wraps; the import inside
    the body is local, so within the function the name refers to the scikit-learn class.

    Parameters
    ----------
    train_data : array-like
        Training features, passed to ``fit`` unchanged.
    train_labl : array-like
        Training labels, passed to ``fit`` unchanged.
    random_state : int or None, default 42
        Seed forwarded to the forest so repeated fits on the same data build the same
        trees. Pass ``None`` for an unseeded fit.

    Returns
    -------
    The fitted scikit-learn ``RandomForestClassifier`` instance.
    """
    from sklearn.ensemble import RandomForestClassifier
    return RandomForestClassifier(verbose=True, random_state=random_state).fit(train_data, train_labl)

def LogisticRegressionClassifier(train_data, train_labl):
    """Fit a scikit-learn ``LogisticRegression`` (up to 1000 iterations, verbose output).

    ``train_data`` and ``train_labl`` are handed to ``fit`` unchanged; the value returned
    by ``fit`` (the fitted estimator) is returned to the caller.
    """
    from sklearn.linear_model import LogisticRegression as _LogReg

    estimator = _LogReg(verbose=True, max_iter=1000)
    return estimator.fit(train_data, train_labl)

def AdaBoostClassifier(train_data, train_labl, random_state=42):
    """Fit a scikit-learn AdaBoost classifier on the given training data.

    This helper shares its name with the scikit-learn class it wraps; the import inside
    the body is local, so within the function the name refers to the scikit-learn class.

    Parameters
    ----------
    train_data : array-like
        Training features, passed to ``fit`` unchanged.
    train_labl : array-like
        Training labels, passed to ``fit`` unchanged.
    random_state : int or None, default 42
        Seed forwarded to the estimator so repeated fits on the same data are repeatable.
        Pass ``None`` for an unseeded fit.

    Returns
    -------
    The fitted scikit-learn ``AdaBoostClassifier`` instance.
    """
    from sklearn.ensemble import AdaBoostClassifier
    return AdaBoostClassifier(random_state=random_state).fit(train_data, train_labl)

### Train

In [6]:
# Classifiers to benchmark; this order drives both the training loop and the ROC panels.
cls_methods = [
    "AdaBoost",
    "Logistic Regression",
    "Random Forest",
    "SVM",
]
# Fitted estimators, filled in by the training loop and keyed by the method names above.
model_dict = dict()

In [None]:
# Classification

# Dispatch table: method name -> helper that fits and returns the estimator.
trainers = {
    "SVM": SVMClassifier,
    "Random Forest": RandomForestClassifier,
    "Logistic Regression": LogisticRegressionClassifier,
    "AdaBoost": AdaBoostClassifier,
}

for cls_method in cls_methods:
    print(f"Processing {cls_method}...")
    print(f"Feature size: {X_train.shape[1]}, Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

    # Unknown names fail loudly instead of being skipped.
    if cls_method not in trainers:
        raise ValueError("Invalid classification method")

    cls = trainers[cls_method](X_train, Y_train)
    model_dict[cls_method] = cls


## AUC

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# 2x2 grid: one ROC panel per entry of `cls_methods` (four methods are configured).
fig, ax = plt.subplots(2, 2, figsize=(20, 7))


# AUC per classifier, also logged to the selected MLflow run.
results_AUC = {}
with mlflow.start_run(run_id=run_id):
    for (j, cls_method) in enumerate(cls_methods):
        cls = model_dict[cls_method]
        # Column 1 of predict_proba is the score for the second class (label 1 here).
        predicted = cls.predict_proba(X_test)[:, 1]

        fpr, tpr, _ = roc_curve(Y_test, predicted)
        auc_value = auc(fpr, tpr)

        results_AUC[cls_method] = auc_value
        mlflow.log_metric(f"{cls_method} AUC", auc_value)

        # Plot ROC curve in row-major panel order.
        axxx = ax[j // 2, j % 2]
        axxx.plot(list(fpr), list(tpr), label=f"AUC = {auc_value:.3f}")
        # Diagonal reference: the ROC of a classifier that guesses at random.
        axxx.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
        axxx.set_xlabel("False positive rate")
        axxx.set_ylabel("True positive rate")
        axxx.set_title(f"{cls_method} | {X_test.shape[1]} features | AUC = {auc_value:.3f}")
        # The curve labels are only visible once a legend is drawn.
        axxx.legend(loc="lower right")

    # Keep titles and axis labels of neighbouring panels from overlapping.
    fig.tight_layout()
    # Save before showing, then attach the saved file to the MLflow run.
    fig.savefig(f"{RESULT_PATH}/classification_results.pdf")
    mlflow.log_artifact(f"{RESULT_PATH}/classification_results.pdf")

# plt.show() rather than fig.show(): the latter is meant for GUI backends and emits a
# warning under non-GUI ones.
plt.show()
