In [2]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành - Tien-Thanh Bui (@bu1th4nh)
# Title: main_mofa2.ipynb
# Date: 2024/11/07 12:46:46
# Description: Baseline implementation for MOFA2 against our model
# 
# (c) 2024 bu1th4nh. All rights reserved. 
# Written with dedication in the University of Central Florida, EPCOT and the Magic Kingdom.
# -----------------------------------------------------------------------------------------------

import os
import sys
sys.path.insert(1, '../../')


import mlflow
import pymongo
import logging
import numpy as np
import mofax as mfx
import pandas as pd
from tqdm import tqdm
from s3fs import S3FileSystem
from mofapy2.run.entry_point import entry_point
from typing import List, Dict, Any, Tuple, Union, Literal
from downstream.classification import evaluate_one_target

# Register tqdm's pandas integration and point MLflow at the tracking server.
tqdm.pandas()
mlflow.set_tracking_uri('http://localhost:6969')


# ---- Object storage (S3-compatible) ------------------------------------------------------------
# NOTE(review): credentials should not be committed with the notebook. Every value below can
# be supplied through the environment; the literals are kept only as backward-compatible
# fallbacks and should be rotated and removed once the environment is configured.
key = os.environ.get('S3_KEY', 'bu1th4nh')
secret = os.environ.get('S3_SECRET', 'ariel.anna.elsa')
endpoint_url = os.environ.get('S3_ENDPOINT_URL', 'http://localhost:19000')

# Filesystem handle used for listing / opening objects directly.
s3 = S3FileSystem(
    anon=False, 
    endpoint_url=endpoint_url,
    key=key,
    secret=secret,
    use_ssl=False
)
# Same connection settings, in the form passed as `storage_options=` to pandas parquet I/O.
storage_option = {
    'key': key,
    'secret': secret,
    'endpoint_url': endpoint_url,
}

# ---- MongoDB (evaluation results store) --------------------------------------------------------
# Kept separate from the S3 credentials so the two services can be configured independently.
mongo = pymongo.MongoClient(
    host='mongodb://localhost',
    port=27017,
    username=os.environ.get('MONGO_USERNAME', 'bu1th4nh'),
    password=os.environ.get('MONGO_PASSWORD', 'ariel.anna.elsa'),
)
mongo_db = mongo['SimilarSampleCrossOmicNMF']


# ---- Experiment configuration ------------------------------------------------------------------
# One tuple per dataset, unpacked by the training loop as:
#   (ds_name, general_data_name, res_folder, mongo_collection, mlf_experiment_name)
configs = [
    ('BreastCancer/processed_crossOmics', 'BreastCancer', 'brca', 'BRCA', 'SimilarSampleCrossOmicNMFv3'),
    ('LungCancer/processed', 'LungCancer', 'luad', 'LUAD', 'SimilarSampleCrossOmicNMFv3_LUAD'),
    ('OvarianCancer/processed', 'OvarianCancer', 'ov', 'OV', 'SimilarSampleCrossOmicNMFv3_OV'),
]
# Number of latent factors requested from MOFA.
mofa_latent_dims = 15

def find_run(collection, run_id: str, target_id: str):
    """Fetch the stored document for one (run, target) pair.

    Parameters
    ----------
    collection
        Object exposing ``find_one(filter)``; it is called exactly once.
    run_id : str
        Value matched against the ``run_id`` field.
    target_id : str
        Value matched against the ``target_id`` field.

    Returns
    -------
    Whatever ``collection.find_one`` returns for that filter.
    """
    query = {'run_id': run_id, 'target_id': target_id}
    return collection.find_one(query)

In [5]:
# Baseline pipeline, executed once per selected dataset:
#   1. load the miRNA / mRNA matrices from S3 (features in rows, samples in columns),
#   2. fit MOFA with `mofa_latent_dims` factors and store the factor matrix as H.parquet,
#   3. open (or resume) the MLflow run whose id is persisted in run_id.txt,
#   4. evaluate the factors on every clinical target and upsert the results into MongoDB.

# Aggregations written to each document's `summary`; the key is the label used in the field name.
summary_stats = {
    'Mean': np.mean,
    'Median': np.median,
    'Std': np.std,
    'Max': np.max,
    'Min': np.min,
}

# NOTE: `configs[1:2]` restricts this execution to the second entry (LungCancer) only.
for ds_name, general_data_name, res_folder, mongo_collection, mlf_experiment_name in configs[1:2]:
    DATA_PATH = f's3://datasets/{ds_name}'
    TARG_PATH = f's3://datasets/{general_data_name}/clinical_testdata'
    DR_RES_PATH = f's3://results/SimilarSampleCrossOmicNMF/{res_folder}/baseline_MOFA2'
    RUN_ID_PATH = f"{DR_RES_PATH}/run_id.txt"
    miRNA = pd.read_parquet(f"{DATA_PATH}/miRNA.parquet", storage_options=storage_option)
    mRNA = pd.read_parquet(f"{DATA_PATH}/mRNA.parquet", storage_options=storage_option)

    # MOFA is given a single list of sample names (taken from miRNA), so both views must
    # list the samples in the same order; otherwise rows would be silently misaligned.
    if not mRNA.columns.equals(miRNA.columns):
        if set(mRNA.columns) != set(miRNA.columns):
            raise ValueError(f"miRNA and mRNA of '{ds_name}' do not contain the same samples")
        mRNA = mRNA.loc[:, miRNA.columns]

    mlflow.set_experiment(mlf_experiment_name)
    collection = mongo_db[mongo_collection]

    # Quick shape report followed by a preview of each omics layer.
    print("Dataset: ", ds_name.split('/')[0])
    print("miRNA")
    print(f"Sample size: {miRNA.shape[1]}")
    print(f"Feature size: {miRNA.shape[0]}")
    print("mRNA")
    print(f"Sample size: {mRNA.shape[1]}")
    print(f"Feature size: {mRNA.shape[0]}")

    display(miRNA.head())
    display(mRNA.head())

    # Nested list indexed as [view][group]; each matrix is transposed to samples x features.
    data_mat = [[miRNA.T.values], [mRNA.T.values]]

    Ariel = entry_point()
    Ariel.set_data_matrix(
        data_mat, 
        likelihoods=['gaussian', 'gaussian'], 
        views_names=['miRNA', 'mRNA'],
        features_names=[miRNA.index, mRNA.index],
        samples_names=[miRNA.columns],
    )

    Ariel.set_model_options(
        factors=mofa_latent_dims
    )

    Ariel.set_train_options(
        convergence_mode = "fast",
    )

    Ariel.build()
    Ariel.run()

    # The trained model is written to a local file (overwritten for every dataset) and read
    # straight back through mofax to obtain the factor matrix as a DataFrame.
    Ariel.save("output.hdf5")
    Belle = mfx.mofa_model("output.hdf5").get_factors(factors=range(mofa_latent_dims), df=True)
    
    s3.mkdirs(DR_RES_PATH, exist_ok=True)
    Belle.to_parquet(f"{DR_RES_PATH}/H.parquet", storage_options=storage_option)
    
    # Resume the previous MLflow run when its id was persisted. The handle is closed
    # explicitly, and an empty / whitespace-only file is treated as "no previous run".
    run_id = None
    if s3.exists(RUN_ID_PATH):
        with s3.open(RUN_ID_PATH, 'r') as f:
            run_id = f.read().strip() or None

    
    with (mlflow.start_run(run_id=run_id) if run_id is not None else mlflow.start_run(run_name='baseline_MOFA2')):

        if run_id is None: 
            # First execution for this dataset: record the run parameters once...
            mlflow.log_param("Number of omics layers", 2)
            mlflow.log_param("Omics layers feature size", [mRNA.shape[0], miRNA.shape[0]])
            mlflow.log_param("Sample size", miRNA.shape[1])
            mlflow.log_param("Latent size", mofa_latent_dims)

            # ...and persist the run id so later executions attach to the same run.
            run_id = mlflow.active_run().info.run_id
            with s3.open(RUN_ID_PATH, 'w') as f:
                f.write(run_id)
        H = Belle.copy(deep=True)
        # `s3.ls` entries are prefixed with the protocol so pandas can open them.
        target_folders = [f's3://{a}' for a in s3.ls(TARG_PATH)]

        for target_folder in target_folders:
            # Retrieve test data
            target_id = str(target_folder.split('/')[-1]).split('.')[0]
            # if find_run(collection, run_id, target_id) is not None:
            #     logging.info(f"Run {run_id} on dataset {target_id} already exists. Skipping")
            #     continue
            test_data = pd.read_parquet(target_folder, storage_options=storage_option)

            # Evaluate
            result_pack = evaluate_one_target(H, testdata = test_data, methods_list = ["Logistic Regression", "Random Forest"], target = target_id)

            # Load to staging package
            data_pack = {
                'run_id': run_id,
                'target_id': target_id,
                'summary': {}
            }
            for method in result_pack.keys():
                method_results = result_pack[method]
                data_pack[method] = method_results.to_dict(orient='index')

                for metric in method_results.columns:
                    # Assume all metrics are upper case-noted columns
                    if not str(metric).isupper():
                        continue
                    metric_values = method_results[metric].values
                    for stat_name, stat_fn in summary_stats.items():
                        # Stored as a plain Python float so the document does not depend on
                        # how the database driver treats NumPy scalar types.
                        data_pack['summary'][f'{method} {stat_name} {metric}'] = float(stat_fn(metric_values))

            # # Log to MLFlow
            # # (loop variable renamed from `key` so re-enabling this does not clobber the S3 `key`)
            # for summary_key in data_pack['summary'].keys():
            #     if 'Mean AUROC' in summary_key: mlflow.log_metric(f'{target_id} {summary_key}', data_pack['summary'][summary_key])
            #     if 'Mean MCC' in summary_key: mlflow.log_metric(f'{target_id} {summary_key}', data_pack['summary'][summary_key])
        
        
            # Save to MongoDB (upsert keyed on run + target, so re-runs overwrite in place)
            collection.update_one(
                {'run_id': run_id, 'target_id': target_id},
                {'$set': data_pack},
                upsert=True
            )

    

Dataset:  LungCancer
miRNA
Sample size: 466
Feature size: 277
mRNA
Sample size: 466
Feature size: 10481


Sample,TCGA-05-4384-01,TCGA-05-4390-01,TCGA-05-4396-01,TCGA-05-4405-01,TCGA-05-4410-01,TCGA-05-4415-01,TCGA-05-4417-01,TCGA-05-4424-01,TCGA-05-4425-01,TCGA-05-4427-01,...,TCGA-NJ-A4YG-01,TCGA-NJ-A4YI-01,TCGA-NJ-A4YP-01,TCGA-NJ-A4YQ-01,TCGA-NJ-A55A-01,TCGA-NJ-A55O-01,TCGA-NJ-A55R-01,TCGA-NJ-A7XG-01,TCGA-O1-A52J-01,TCGA-S2-AA1A-01
miRNA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIMAT0000062,15.875872,13.750382,16.025933,14.937915,14.716226,15.018878,14.153767,14.950595,15.739367,13.792453,...,15.107852,15.077873,14.939725,13.586118,14.649898,15.728184,14.388743,14.627557,13.958566,15.270523
MIMAT0000063,13.822868,13.058059,14.58869,14.214259,13.741927,12.606633,13.174706,14.044945,14.523231,13.232921,...,13.701555,13.5922,14.327167,12.381397,13.34455,13.835098,12.853311,13.489058,12.545892,14.034881
MIMAT0000064,10.610422,7.595108,11.115257,11.106765,10.358098,9.216893,9.478037,10.688318,10.838078,9.147103,...,9.968173,10.289848,9.595746,8.937114,11.634541,10.621937,9.282449,10.157641,10.783892,10.965637
MIMAT0000065,7.424421,7.385631,9.066566,7.160014,7.458137,6.326816,7.564645,7.886527,8.355463,7.104644,...,7.39928,7.307456,7.405218,7.538172,7.434089,8.464607,7.220506,6.342038,7.302139,7.380368
MIMAT0000066,10.865542,10.461218,11.470096,10.373828,10.055069,9.836883,9.896856,11.67971,10.195901,9.447986,...,10.491626,10.923376,9.040771,8.90514,9.88468,11.187798,10.469838,10.085967,9.971284,10.430483


Sample,TCGA-05-4384-01,TCGA-05-4390-01,TCGA-05-4396-01,TCGA-05-4405-01,TCGA-05-4410-01,TCGA-05-4415-01,TCGA-05-4417-01,TCGA-05-4424-01,TCGA-05-4425-01,TCGA-05-4427-01,...,TCGA-NJ-A4YG-01,TCGA-NJ-A4YI-01,TCGA-NJ-A4YP-01,TCGA-NJ-A4YQ-01,TCGA-NJ-A55A-01,TCGA-NJ-A55O-01,TCGA-NJ-A55R-01,TCGA-NJ-A7XG-01,TCGA-O1-A52J-01,TCGA-S2-AA1A-01
mRNA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,7.0041,7.3725,4.2046,7.2009,6.3292,5.0419,6.4314,8.0166,5.5883,6.607,...,7.5803,8.4373,6.2003,7.381,6.6497,5.4265,8.1028,7.9482,7.7171,6.8524
A1CF,0.0,0.0,4.0808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.1075,0.7411,0.0,0.7028,0.0,0.0,5.9771,0.0,0.0,0.727
A4GNT,1.5021,0.7126,0.8141,1.2174,0.8193,1.2224,0.0,1.2809,0.0,0.5893,...,1.9586,2.816,0.8174,0.7028,0.9063,2.1701,1.4233,0.0,2.2714,11.6154
AAGAB,10.1096,9.171,10.753,10.321,9.9913,11.0113,9.934,9.7789,10.0062,10.6721,...,9.6779,9.2994,10.3256,10.2476,9.7706,10.0827,9.8012,9.9746,9.8827,9.9829
AAK1,10.0276,8.8739,10.5039,10.1175,9.9202,9.4367,10.2425,10.5627,9.1593,10.4075,...,9.3442,9.5173,9.2223,10.6471,9.9291,10.4092,10.129,9.0962,9.8908,9.5964



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Groups names not provided, using default naming convention:
- group1, group2, ..., groupG

Successfully loaded view='miRNA' group='group0' with N=466 samples and D=277 features...
Successfully loaded view='mRNA' group='group0' with N=466 samples and D=10481 features...


Model options:
- Automatic Relevance Determination prior on the factors: False
- Automatic Relevance Determin

Evaluating target diseasefree_12_12 on testdata: 100%|██████████| 100/100 [00:14<00:00,  7.01it/s]
Evaluating target diseasefree_12_18 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.27it/s]
Evaluating target diseasefree_18_18 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.24it/s]
Evaluating target diseasefree_18_24 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.58it/s]
Evaluating target diseasefree_18_30 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.68it/s]
Evaluating target diseasefree_18_36 on testdata: 100%|██████████| 100/100 [00:12<00:00,  7.76it/s]
Evaluating target diseasefree_24_24 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.53it/s]
Evaluating target diseasefree_24_30 on testdata: 100%|██████████| 100/100 [00:13<00:00,  7.69it/s]
Evaluating target diseasefree_24_36 on testdata: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]
Evaluating target diseasefree_36_36 on testdata: 100%|██████████| 100/100 [00:12<00:00,  7.73it/s]
Evaluating

🏃 View run baseline_MOFA2 at: http://localhost:6969/#/experiments/6/runs/9df066d2ed6d477691a6723c0e555fd3
🧪 View experiment at: http://localhost:6969/#/experiments/6



