In [11]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành (@bu1th4nh)
# Title: testPrep.ipynb
# Date: 2024/10/19 19:32:53
# Description: 
# 
# (c) bu1th4nh. All rights reserved
# -----------------------------------------------------------------------------------------------

from tqdm import tqdm
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import logging
import random
import json
import os
warnings.filterwarnings("ignore")


test_passes = 100
ORGL_PATH = '/home/ti514716/Datasets/BreastCancer/processed_crossOmics_micro'

In [12]:
mRNA = pd.read_parquet(f'{ORGL_PATH}/mRNA.parquet')
miRNA = pd.read_parquet(f'{ORGL_PATH}/miRNA.parquet')
clinical = pd.read_parquet(f'{ORGL_PATH}/clinical.parquet')


common_sample = list(set(mRNA.columns).intersection(miRNA.columns).intersection(clinical.index))
clinical_common = clinical.loc[common_sample, :].copy(deep=True)

for label in clinical_common.columns:
    clinical_common[label] = clinical_common[label].apply(lambda x: 1 if x == 'Positive' else 0)

display(clinical_common.head())
print(clinical_common.columns)

Unnamed: 0_level_0,ER,HER2,PR,TN
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-D8-A1JT-01,1,0,1,0
TCGA-OL-A66K-01,1,0,1,0
TCGA-AR-A1AQ-01,0,0,0,1
TCGA-E2-A15D-01,1,0,1,0
TCGA-E2-A15T-01,1,0,1,0


Index(['ER', 'HER2', 'PR', 'TN'], dtype='object')


In [13]:
test_index = [f'Test{i:03}' for i in range(test_passes)]
tests = pd.DataFrame(index = test_index)
for label in clinical_common.columns:
    positive_samples = list(clinical_common[clinical_common[label] == 1].index)
    negative_samples = list(clinical_common[clinical_common[label] == 0].index)

    print(f'Label: {label}')
    print(f'Positive samples: {len(positive_samples)}')
    print(f'Negative samples: {len(negative_samples)}')

    train_col = []
    test_col = []
    for test_pass in tqdm(range(test_passes), desc=f'Building testcases for {label}'):
        pos_train_idx, pos_test_idx = train_test_split(positive_samples, test_size=0.2)
        neg_train_idx, neg_test_idx = train_test_split(negative_samples, test_size=0.2)

        train_idx = pos_train_idx + neg_train_idx
        test_idx = pos_test_idx + neg_test_idx
        random.shuffle(train_idx)
        random.shuffle(test_idx)

        train_col.append(train_idx)
        test_col.append(test_idx)

    tests[f"{label}_train"] = train_col
    tests[f"{label}_test"] = test_col

display(tests.head())
tests.to_parquet(f'{ORGL_PATH}/testdata_classification.parquet')

Label: ER
Positive samples: 151
Negative samples: 49


Building testcases for ER: 100%|██████████| 100/100 [00:00<00:00, 2426.14it/s]


Label: HER2
Positive samples: 33
Negative samples: 167


Building testcases for HER2: 100%|██████████| 100/100 [00:00<00:00, 2663.49it/s]


Label: PR
Positive samples: 129
Negative samples: 71


Building testcases for PR: 100%|██████████| 100/100 [00:00<00:00, 2714.74it/s]


Label: TN
Positive samples: 42
Negative samples: 158


Building testcases for TN: 100%|██████████| 100/100 [00:00<00:00, 2786.97it/s]


Unnamed: 0,ER_train,ER_test,HER2_train,HER2_test,PR_train,PR_test,TN_train,TN_test
Test000,"[TCGA-E2-A2P6-01, TCGA-A7-A425-01, TCGA-AR-A1A...","[TCGA-AR-A2LJ-01, TCGA-A7-A26I-01, TCGA-A2-A3K...","[TCGA-AR-A24X-01, TCGA-OL-A6VR-01, TCGA-E2-A2P...","[TCGA-GM-A4E0-01, TCGA-AC-A2B8-01, TCGA-B6-A40...","[TCGA-AR-A24N-01, TCGA-D8-A73X-01, TCGA-A2-A0C...","[TCGA-GM-A3XN-01, TCGA-GM-A2DD-01, TCGA-A2-A3X...","[TCGA-E2-A14T-01, TCGA-E2-A14U-01, TCGA-OL-A66...","[TCGA-AR-A1AI-01, TCGA-OL-A66N-01, TCGA-OL-A66..."
Test001,"[TCGA-E2-A15L-01, TCGA-E2-A2P6-01, TCGA-LQ-A4E...","[TCGA-S3-A6ZF-01, TCGA-OL-A5D6-01, TCGA-E2-A15...","[TCGA-BH-A2L8-01, TCGA-AR-A2LH-01, TCGA-E2-A1I...","[TCGA-LQ-A4E4-01, TCGA-A2-A3XV-01, TCGA-EW-A1P...","[TCGA-5T-A9QA-01, TCGA-BH-A5IZ-01, TCGA-OL-A5R...","[TCGA-E2-A15R-01, TCGA-E2-A1B6-01, TCGA-AR-A1A...","[TCGA-B6-A409-01, TCGA-BH-A18T-01, TCGA-OL-A5R...","[TCGA-BH-A0H5-01, TCGA-E2-A15D-01, TCGA-S3-AA1..."
Test002,"[TCGA-AC-A2B8-01, TCGA-BH-A0AU-01, TCGA-BH-A0H...","[TCGA-AC-A2FF-01, TCGA-AR-A2LK-01, TCGA-D8-A1J...","[TCGA-EW-A1P5-01, TCGA-E2-A15L-01, TCGA-AC-A23...","[TCGA-A7-A26F-01, TCGA-AR-A1AQ-01, TCGA-AR-A2L...","[TCGA-A2-A3XT-01, TCGA-E2-A1IO-01, TCGA-D8-A1J...","[TCGA-OL-A66N-01, TCGA-E2-A15R-01, TCGA-AR-A24...","[TCGA-GM-A3XG-01, TCGA-AR-A1AW-01, TCGA-LL-A6F...","[TCGA-E2-A574-01, TCGA-EW-A2FR-01, TCGA-A2-A3X..."
Test003,"[TCGA-A2-A25B-01, TCGA-LL-A5YN-01, TCGA-E2-A15...","[TCGA-E2-A56Z-01, TCGA-E2-A14W-01, TCGA-BH-A18...","[TCGA-A2-A0D3-01, TCGA-B6-A408-01, TCGA-BH-A0B...","[TCGA-E2-A15H-01, TCGA-E2-A14Y-01, TCGA-BH-A0H...","[TCGA-GM-A3XN-01, TCGA-A7-A26H-01, TCGA-GI-A2C...","[TCGA-GM-A2DF-01, TCGA-BH-A0AU-01, TCGA-AO-A1K...","[TCGA-GM-A2DC-01, TCGA-GM-A3XG-01, TCGA-GM-A3X...","[TCGA-GM-A2DF-01, TCGA-AR-A1AW-01, TCGA-A2-A3X..."
Test004,"[TCGA-AR-A24K-01, TCGA-E2-A14T-01, TCGA-LL-A6F...","[TCGA-E2-A10A-01, TCGA-AR-A251-01, TCGA-GM-A4E...","[TCGA-AR-A2LO-01, TCGA-AQ-A54N-01, TCGA-AC-A23...","[TCGA-A7-A3IZ-01, TCGA-AO-A1KP-01, TCGA-MS-A51...","[TCGA-D8-A145-01, TCGA-BH-A2L8-01, TCGA-E2-A57...","[TCGA-GM-A3XN-01, TCGA-A2-A0ES-01, TCGA-S3-AA1...","[TCGA-AR-A1AW-01, TCGA-E2-A1LA-01, TCGA-PE-A5D...","[TCGA-AC-A23H-01, TCGA-S3-AA12-01, TCGA-AR-A2L..."


In [14]:
Ariel = pd.read_parquet(f'{ORGL_PATH}/testdata_classification.parquet')
display(Ariel.head())

Unnamed: 0,ER_train,ER_test,HER2_train,HER2_test,PR_train,PR_test,TN_train,TN_test
Test000,"[TCGA-E2-A2P6-01, TCGA-A7-A425-01, TCGA-AR-A1A...","[TCGA-AR-A2LJ-01, TCGA-A7-A26I-01, TCGA-A2-A3K...","[TCGA-AR-A24X-01, TCGA-OL-A6VR-01, TCGA-E2-A2P...","[TCGA-GM-A4E0-01, TCGA-AC-A2B8-01, TCGA-B6-A40...","[TCGA-AR-A24N-01, TCGA-D8-A73X-01, TCGA-A2-A0C...","[TCGA-GM-A3XN-01, TCGA-GM-A2DD-01, TCGA-A2-A3X...","[TCGA-E2-A14T-01, TCGA-E2-A14U-01, TCGA-OL-A66...","[TCGA-AR-A1AI-01, TCGA-OL-A66N-01, TCGA-OL-A66..."
Test001,"[TCGA-E2-A15L-01, TCGA-E2-A2P6-01, TCGA-LQ-A4E...","[TCGA-S3-A6ZF-01, TCGA-OL-A5D6-01, TCGA-E2-A15...","[TCGA-BH-A2L8-01, TCGA-AR-A2LH-01, TCGA-E2-A1I...","[TCGA-LQ-A4E4-01, TCGA-A2-A3XV-01, TCGA-EW-A1P...","[TCGA-5T-A9QA-01, TCGA-BH-A5IZ-01, TCGA-OL-A5R...","[TCGA-E2-A15R-01, TCGA-E2-A1B6-01, TCGA-AR-A1A...","[TCGA-B6-A409-01, TCGA-BH-A18T-01, TCGA-OL-A5R...","[TCGA-BH-A0H5-01, TCGA-E2-A15D-01, TCGA-S3-AA1..."
Test002,"[TCGA-AC-A2B8-01, TCGA-BH-A0AU-01, TCGA-BH-A0H...","[TCGA-AC-A2FF-01, TCGA-AR-A2LK-01, TCGA-D8-A1J...","[TCGA-EW-A1P5-01, TCGA-E2-A15L-01, TCGA-AC-A23...","[TCGA-A7-A26F-01, TCGA-AR-A1AQ-01, TCGA-AR-A2L...","[TCGA-A2-A3XT-01, TCGA-E2-A1IO-01, TCGA-D8-A1J...","[TCGA-OL-A66N-01, TCGA-E2-A15R-01, TCGA-AR-A24...","[TCGA-GM-A3XG-01, TCGA-AR-A1AW-01, TCGA-LL-A6F...","[TCGA-E2-A574-01, TCGA-EW-A2FR-01, TCGA-A2-A3X..."
Test003,"[TCGA-A2-A25B-01, TCGA-LL-A5YN-01, TCGA-E2-A15...","[TCGA-E2-A56Z-01, TCGA-E2-A14W-01, TCGA-BH-A18...","[TCGA-A2-A0D3-01, TCGA-B6-A408-01, TCGA-BH-A0B...","[TCGA-E2-A15H-01, TCGA-E2-A14Y-01, TCGA-BH-A0H...","[TCGA-GM-A3XN-01, TCGA-A7-A26H-01, TCGA-GI-A2C...","[TCGA-GM-A2DF-01, TCGA-BH-A0AU-01, TCGA-AO-A1K...","[TCGA-GM-A2DC-01, TCGA-GM-A3XG-01, TCGA-GM-A3X...","[TCGA-GM-A2DF-01, TCGA-AR-A1AW-01, TCGA-A2-A3X..."
Test004,"[TCGA-AR-A24K-01, TCGA-E2-A14T-01, TCGA-LL-A6F...","[TCGA-E2-A10A-01, TCGA-AR-A251-01, TCGA-GM-A4E...","[TCGA-AR-A2LO-01, TCGA-AQ-A54N-01, TCGA-AC-A23...","[TCGA-A7-A3IZ-01, TCGA-AO-A1KP-01, TCGA-MS-A51...","[TCGA-D8-A145-01, TCGA-BH-A2L8-01, TCGA-E2-A57...","[TCGA-GM-A3XN-01, TCGA-A2-A0ES-01, TCGA-S3-AA1...","[TCGA-AR-A1AW-01, TCGA-E2-A1LA-01, TCGA-PE-A5D...","[TCGA-AC-A23H-01, TCGA-S3-AA12-01, TCGA-AR-A2L..."


In [15]:
# Prepping the data and result
auc_columns = []
for label in clinical_common.columns:
    for method in ['Cinderella', 'Ariel']:
        auc_columns.append(f"{label}_{method}_AUC")
        auc_columns.append(f"{label}_{method}_TPR")
        auc_columns.append(f"{label}_{method}_FPR")
AUC_result = pd.DataFrame(
    index = Ariel.index,
    columns = auc_columns
)



AUC_result.at['Test000', 'ER_Cinderella_FPR'] = [9, 2, 1, 3]
AUC_result.head()

Unnamed: 0,ER_Cinderella_AUC,ER_Cinderella_TPR,ER_Cinderella_FPR,ER_Ariel_AUC,ER_Ariel_TPR,ER_Ariel_FPR,HER2_Cinderella_AUC,HER2_Cinderella_TPR,HER2_Cinderella_FPR,HER2_Ariel_AUC,...,PR_Cinderella_FPR,PR_Ariel_AUC,PR_Ariel_TPR,PR_Ariel_FPR,TN_Cinderella_AUC,TN_Cinderella_TPR,TN_Cinderella_FPR,TN_Ariel_AUC,TN_Ariel_TPR,TN_Ariel_FPR
Test000,,,"[9, 2, 1, 3]",,,,,,,,...,,,,,,,,,,
Test001,,,,,,,,,,,...,,,,,,,,,,
Test002,,,,,,,,,,,...,,,,,,,,,,
Test003,,,,,,,,,,,...,,,,,,,,,,
Test004,,,,,,,,,,,...,,,,,,,,,,
