In [9]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành (@bu1th4nh)
# Title: playground_classification.ipynb
# Date: 2024/10/03 15:27:39
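# Description: Loads the latent matrix H stored in a SimilarSampleCrossOmicNMF run's results
#              folder and joins it with the clinical receptor-status table (ER/PR/HER2/TN).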
# 
# (c) bu1th4nh. All rights reserved
# -----------------------------------------------------------------------------------------------


import numpy as np
import pandas as pd
import mlflow
# Point the MLflow client at the tracking server on the loopback interface (port 6969),
# then select the experiment used by this notebook.
mlflow.set_tracking_uri("http://127.0.0.1:6969")
mlflow.set_experiment("SimilarSampleCrossOmicNMF")



# ---- Run selection ----------------------------------------------------------------------------
# Exactly one (run_name, run_id) pair is active; the others are kept for quick switching.
#
# Not fixed gamma
#   Alpha = 2
#     run_name, run_id = 'ariel-elsa-aurora-20241002-15.12.43', 'd4cf242b1a3540d5b2cb91dc52dbd991'
#   Alpha = 1
#     run_name, run_id = 'ariel-moana-mulan-20241002-23.32.36', '1e2f0bbe6ada401aae9d027bcd0fa8b5'
#
# Fixed gamma
#     run_name, run_id = 'merida-mulan-anna-20241003-08.56.41', 'abc7d2cfd6d04ab7b48de90d24e8cfc4'
#
# Baseline: NMF Only  (active)
run_name, run_id = (
    'rapunzel-rapunzel-ariel-20241006-10.23.03',
    '7f933693ee25409c8fdb90b04a5a26b8',
)


# ---- Locations --------------------------------------------------------------------------------
# ORGL_PATH   : folder holding the original processed tables (mRNA / miRNA / clinical parquet).
# RESULT_PATH : results folder of the run selected above (named after `run_name`).
ORGL_PATH = '/home/ti514716/Datasets/BreastCancer/processed_crossOmics'
RESULT_PATH = f'/home/ti514716/Projects/SimilarSampleCrossOmicNMF/results/{run_name}'

## Data Acquisition & Merging

In [10]:
# Latent matrix written by the selected run (rows: samples, columns: Latent_* factors).
H = pd.read_parquet(f'{RESULT_PATH}/H.parquet')
display(H.head())

# Original tables: both omics matrices and the clinical annotations, read in that order.
mRNA, miRNA, clinical = (
    pd.read_parquet(f'{ORGL_PATH}/mRNA.parquet'),
    pd.read_parquet(f'{ORGL_PATH}/miRNA.parquet'),
    pd.read_parquet(f'{ORGL_PATH}/clinical.parquet'),
)

# Only the clinical table is previewed; the omics previews are left switched off.
display(clinical.head())
# display(mRNA.head())
# display(miRNA.head())



Unnamed: 0,Latent_000,Latent_001,Latent_002,Latent_003,Latent_004,Latent_005,Latent_006,Latent_007,Latent_008,Latent_009
TCGA-3C-AAAU-01,0.514873,0.064781,0.0,0.342792,0.0,0.0,0.650962,0.0,0.126693,0.44009
TCGA-3C-AALI-01,0.473538,0.173641,0.0,0.370206,0.0,0.128673,0.135606,0.191454,0.64858,0.294466
TCGA-3C-AALJ-01,0.478127,0.184787,0.0,0.377131,0.138077,0.128639,0.384894,0.044703,0.449559,0.200014
TCGA-3C-AALK-01,0.491472,0.22537,0.086863,0.297522,0.232757,0.024056,0.12164,0.469555,0.352698,0.23324
TCGA-4H-AAAK-01,0.464195,0.094795,0.211742,0.315302,0.440416,0.026288,0.228023,0.41115,0.101823,0.280377


Unnamed: 0_level_0,ER,HER2,PR,TN
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-5T-A9QA-01,Positive,Negative,Negative,Negative
TCGA-A1-A0SE-01,Positive,Negative,Positive,Negative
TCGA-A1-A0SH-01,Negative,Negative,Positive,Negative
TCGA-A1-A0SJ-01,Positive,Negative,Positive,Negative
TCGA-A1-A0SM-01,Positive,Positive,Negative,Negative


In [13]:
# Join the latent factors with the clinical labels on the sample-ID index.
# `merge` defaults to an inner join, so samples missing from either table are dropped.
common_data = H.merge(clinical, left_index=True, right_index=True)

# Receptor-status labels -> booleans, vectorised instead of an element-wise `apply`.
# Only the exact label 'Positive' maps to True; every other value maps to False.
# NOTE(review): any label other than 'Positive'/'Negative' (or a missing value), should one
# exist in the clinical table, is therefore counted as negative -- confirm this is intended.
common_data = common_data.assign(**{
    status: common_data[status].eq('Positive')
    for status in ('ER', 'PR', 'HER2', 'TN')
})

# Pack the three receptor flags into a 3-bit code stored in `cluster`:
#   bit 0 (value 1) = ER, bit 1 (value 2) = PR, bit 2 (value 4) = HER2
# e.g. 0 = all three negative, 3 = ER+/PR+/HER2-, 7 = all three positive.
# 'TN' is not part of the code.
common_data['cluster'] = sum(
    common_data[receptor].astype('int64') * (1 << bit)
    for bit, receptor in enumerate(('ER', 'PR', 'HER2'))
)

display(common_data.head())
# Last expression of the cell: number of samples per cluster code.
common_data['cluster'].value_counts()

Unnamed: 0,Latent_000,Latent_001,Latent_002,Latent_003,Latent_004,Latent_005,Latent_006,Latent_007,Latent_008,Latent_009,ER,HER2,PR,TN,cluster
TCGA-5T-A9QA-01,0.648359,0.0,0.015536,0.306503,0.0,0.0,0.612183,0.0,0.350321,0.131367,True,False,False,False,1
TCGA-A2-A04N-01,0.445612,0.097062,0.160196,0.16818,0.434411,0.075364,0.343165,0.510332,0.154063,0.214934,True,False,True,False,3
TCGA-A2-A04U-01,0.648718,0.039016,0.679034,0.133861,0.245143,0.0,0.275807,0.0,0.475604,0.0,False,True,False,False,4
TCGA-A2-A04W-01,0.590845,0.145345,0.102997,0.19563,0.377602,0.0,0.0,0.281321,0.560449,0.223886,False,True,False,False,4
TCGA-A2-A0CK-01,0.556822,0.354502,0.115142,0.37064,0.148959,0.120001,0.433715,0.270306,0.130995,0.0,True,False,True,False,3


cluster
3    135
0     46
7     21
1     17
5     12
4      5
2      2
6      1
Name: count, dtype: int64