In [4]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành (@bu1th4nh)
# Title: testPrep.ipynb
# Date: 2024/10/19 19:32:53
# Description: 
# 
# (c) bu1th4nh. All rights reserved
# -----------------------------------------------------------------------------------------------

from tqdm import tqdm
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import logging
import random
import json
import os
warnings.filterwarnings("ignore")


test_passes = 200
ORGL_PATH = '/home/ti514716/Datasets/BreastCancer/processed_crossOmics'

In [5]:
mRNA = pd.read_parquet(f'{ORGL_PATH}/mRNA.parquet')
miRNA = pd.read_parquet(f'{ORGL_PATH}/miRNA.parquet')
clinical = pd.read_parquet(f'{ORGL_PATH}/clinical.parquet')


common_sample = list(set(mRNA.columns).intersection(miRNA.columns).intersection(clinical.index))
clinical_common = clinical.loc[common_sample, :].copy(deep=True)

for label in clinical_common.columns:
    clinical_common[label] = clinical_common[label].apply(lambda x: 1 if x == 'Positive' else 0)

display(clinical_common.head())
print(clinical_common.columns)

Unnamed: 0_level_0,ER,HER2,PR,TN
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-E2-A15H-01,1,1,1,0
TCGA-AR-A24T-01,1,0,1,0
TCGA-AO-A1KT-01,1,0,1,0
TCGA-GM-A3XG-01,1,0,1,0
TCGA-AQ-A54N-01,0,0,0,1


Index(['ER', 'HER2', 'PR', 'TN'], dtype='object')


In [6]:
test_index = [f'Test{i:03}' for i in range(test_passes)]
tests = pd.DataFrame(index = test_index)
for label in clinical_common.columns:
    positive_samples = list(clinical_common[clinical_common[label] == 1].index)
    negative_samples = list(clinical_common[clinical_common[label] == 0].index)

    print(f'Label: {label}')
    print(f'Positive samples: {len(positive_samples)}')
    print(f'Negative samples: {len(negative_samples)}')

    train_col = []
    test_col = []
    for test_pass in tqdm(range(test_passes), desc=f'Building testcases for {label}'):
        pos_train_idx, pos_test_idx = train_test_split(positive_samples, test_size=0.2)
        neg_train_idx, neg_test_idx = train_test_split(negative_samples, test_size=0.2)

        train_idx = pos_train_idx + neg_train_idx
        test_idx = pos_test_idx + neg_test_idx
        random.shuffle(train_idx)
        random.shuffle(test_idx)

        train_col.append(train_idx)
        test_col.append(test_idx)

    tests[f"{label}_train"] = train_col
    tests[f"{label}_test"] = test_col

display(tests.head())
tests.to_parquet(f'{ORGL_PATH}/testdata_classification.parquet')

Label: ER
Positive samples: 185
Negative samples: 54


Building testcases for ER: 100%|██████████| 200/200 [00:00<00:00, 2613.97it/s]


Label: HER2
Positive samples: 39
Negative samples: 200


Building testcases for HER2: 100%|██████████| 200/200 [00:00<00:00, 2640.05it/s]


Label: PR
Positive samples: 159
Negative samples: 80


Building testcases for PR: 100%|██████████| 200/200 [00:00<00:00, 2661.75it/s]


Label: TN
Positive samples: 46
Negative samples: 193


Building testcases for TN: 100%|██████████| 200/200 [00:00<00:00, 2679.24it/s]


Unnamed: 0,ER_train,ER_test,HER2_train,HER2_test,PR_train,PR_test,TN_train,TN_test
Test000,"[TCGA-A2-A25D-01, TCGA-E2-A15S-01, TCGA-BH-A0B...","[TCGA-GM-A2DD-01, TCGA-E2-A576-01, TCGA-LL-A5Y...","[TCGA-BH-A0AU-01, TCGA-BH-A0C7-01, TCGA-GM-A3X...","[TCGA-EW-A1OZ-01, TCGA-BH-A18T-01, TCGA-E2-A10...","[TCGA-AR-A1AQ-01, TCGA-OL-A5RW-01, TCGA-E2-A15...","[TCGA-LL-A6FR-01, TCGA-E2-A9RU-01, TCGA-EW-A1P...","[TCGA-AO-A1KP-01, TCGA-E2-A572-01, TCGA-AC-A23...","[TCGA-OL-A5RW-01, TCGA-E2-A15L-01, TCGA-OL-A66..."
Test001,"[TCGA-AC-A2B8-01, TCGA-V7-A7HQ-01, TCGA-AQ-A1H...","[TCGA-A2-A25B-01, TCGA-A2-A04W-01, TCGA-AR-A24...","[TCGA-GM-A2DC-01, TCGA-E2-A56Z-01, TCGA-OL-A66...","[TCGA-GM-A2DB-01, TCGA-OL-A6VO-01, TCGA-AO-A1K...","[TCGA-OL-A66H-01, TCGA-EW-A1PG-01, TCGA-A7-A26...","[TCGA-BH-A0HL-01, TCGA-BH-A0HN-01, TCGA-BH-A0B...","[TCGA-E2-A14U-01, TCGA-GM-A3XL-01, TCGA-AC-A23...","[TCGA-BH-A0H5-01, TCGA-OL-A66H-01, TCGA-GM-A5P..."
Test002,"[TCGA-AO-A1KT-01, TCGA-A2-A3XW-01, TCGA-GM-A3N...","[TCGA-E2-A3DX-01, TCGA-E2-A15E-01, TCGA-A2-A0D...","[TCGA-AR-A24P-01, TCGA-A7-A13F-01, TCGA-BH-A18...","[TCGA-AR-A1AP-01, TCGA-EW-A1P5-01, TCGA-AR-A24...","[TCGA-A7-A425-01, TCGA-A2-A3XW-01, TCGA-E2-A15...","[TCGA-A2-A0CK-01, TCGA-LL-A5YM-01, TCGA-LL-A74...","[TCGA-A2-A0CK-01, TCGA-E2-A15I-01, TCGA-BH-A0H...","[TCGA-A7-A26I-01, TCGA-E2-A15K-06, TCGA-GM-A4E..."
Test003,"[TCGA-AR-A250-01, TCGA-A2-A0CO-01, TCGA-AQ-A0Y...","[TCGA-OL-A66P-01, TCGA-A2-A1FX-01, TCGA-EW-A1O...","[TCGA-S3-AA11-01, TCGA-A2-A1G1-01, TCGA-D8-A1J...","[TCGA-GM-A2DC-01, TCGA-E2-A1IE-01, TCGA-A2-A1F...","[TCGA-AO-A1KR-01, TCGA-AR-A1AI-01, TCGA-OL-A5D...","[TCGA-GM-A3XL-01, TCGA-E2-A1LA-01, TCGA-D8-A14...","[TCGA-B6-A409-01, TCGA-EW-A1PD-01, TCGA-D8-A27...","[TCGA-AO-A1KS-01, TCGA-D8-A1JN-01, TCGA-AC-A23..."
Test004,"[TCGA-A2-A0CL-01, TCGA-A2-A0CR-01, TCGA-AR-A24...","[TCGA-E2-A15C-01, TCGA-D8-A73X-01, TCGA-BH-A0B...","[TCGA-5T-A9QA-01, TCGA-EW-A1P1-01, TCGA-EW-A1P...","[TCGA-E2-A572-01, TCGA-A2-A0CL-01, TCGA-E2-A3D...","[TCGA-A2-A25B-01, TCGA-AR-A24K-01, TCGA-D8-A14...","[TCGA-E2-A2P6-01, TCGA-AR-A24L-01, TCGA-GM-A5P...","[TCGA-E2-A1B6-01, TCGA-E2-A15R-01, TCGA-A2-A1F...","[TCGA-BH-A18Q-01, TCGA-E2-A576-01, TCGA-OL-A6V..."


In [7]:
Ariel = pd.read_parquet(f'{ORGL_PATH}/testdata_classification.parquet')
display(Ariel.head())

Unnamed: 0,ER_train,ER_test,HER2_train,HER2_test,PR_train,PR_test,TN_train,TN_test
Test000,"[TCGA-A2-A25D-01, TCGA-E2-A15S-01, TCGA-BH-A0B...","[TCGA-GM-A2DD-01, TCGA-E2-A576-01, TCGA-LL-A5Y...","[TCGA-BH-A0AU-01, TCGA-BH-A0C7-01, TCGA-GM-A3X...","[TCGA-EW-A1OZ-01, TCGA-BH-A18T-01, TCGA-E2-A10...","[TCGA-AR-A1AQ-01, TCGA-OL-A5RW-01, TCGA-E2-A15...","[TCGA-LL-A6FR-01, TCGA-E2-A9RU-01, TCGA-EW-A1P...","[TCGA-AO-A1KP-01, TCGA-E2-A572-01, TCGA-AC-A23...","[TCGA-OL-A5RW-01, TCGA-E2-A15L-01, TCGA-OL-A66..."
Test001,"[TCGA-AC-A2B8-01, TCGA-V7-A7HQ-01, TCGA-AQ-A1H...","[TCGA-A2-A25B-01, TCGA-A2-A04W-01, TCGA-AR-A24...","[TCGA-GM-A2DC-01, TCGA-E2-A56Z-01, TCGA-OL-A66...","[TCGA-GM-A2DB-01, TCGA-OL-A6VO-01, TCGA-AO-A1K...","[TCGA-OL-A66H-01, TCGA-EW-A1PG-01, TCGA-A7-A26...","[TCGA-BH-A0HL-01, TCGA-BH-A0HN-01, TCGA-BH-A0B...","[TCGA-E2-A14U-01, TCGA-GM-A3XL-01, TCGA-AC-A23...","[TCGA-BH-A0H5-01, TCGA-OL-A66H-01, TCGA-GM-A5P..."
Test002,"[TCGA-AO-A1KT-01, TCGA-A2-A3XW-01, TCGA-GM-A3N...","[TCGA-E2-A3DX-01, TCGA-E2-A15E-01, TCGA-A2-A0D...","[TCGA-AR-A24P-01, TCGA-A7-A13F-01, TCGA-BH-A18...","[TCGA-AR-A1AP-01, TCGA-EW-A1P5-01, TCGA-AR-A24...","[TCGA-A7-A425-01, TCGA-A2-A3XW-01, TCGA-E2-A15...","[TCGA-A2-A0CK-01, TCGA-LL-A5YM-01, TCGA-LL-A74...","[TCGA-A2-A0CK-01, TCGA-E2-A15I-01, TCGA-BH-A0H...","[TCGA-A7-A26I-01, TCGA-E2-A15K-06, TCGA-GM-A4E..."
Test003,"[TCGA-AR-A250-01, TCGA-A2-A0CO-01, TCGA-AQ-A0Y...","[TCGA-OL-A66P-01, TCGA-A2-A1FX-01, TCGA-EW-A1O...","[TCGA-S3-AA11-01, TCGA-A2-A1G1-01, TCGA-D8-A1J...","[TCGA-GM-A2DC-01, TCGA-E2-A1IE-01, TCGA-A2-A1F...","[TCGA-AO-A1KR-01, TCGA-AR-A1AI-01, TCGA-OL-A5D...","[TCGA-GM-A3XL-01, TCGA-E2-A1LA-01, TCGA-D8-A14...","[TCGA-B6-A409-01, TCGA-EW-A1PD-01, TCGA-D8-A27...","[TCGA-AO-A1KS-01, TCGA-D8-A1JN-01, TCGA-AC-A23..."
Test004,"[TCGA-A2-A0CL-01, TCGA-A2-A0CR-01, TCGA-AR-A24...","[TCGA-E2-A15C-01, TCGA-D8-A73X-01, TCGA-BH-A0B...","[TCGA-5T-A9QA-01, TCGA-EW-A1P1-01, TCGA-EW-A1P...","[TCGA-E2-A572-01, TCGA-A2-A0CL-01, TCGA-E2-A3D...","[TCGA-A2-A25B-01, TCGA-AR-A24K-01, TCGA-D8-A14...","[TCGA-E2-A2P6-01, TCGA-AR-A24L-01, TCGA-GM-A5P...","[TCGA-E2-A1B6-01, TCGA-E2-A15R-01, TCGA-A2-A1F...","[TCGA-BH-A18Q-01, TCGA-E2-A576-01, TCGA-OL-A6V..."


In [11]:
# Prepping the data and result
auc_columns = []
for label in clinical_common.columns:
    for method in ['Cinderella', 'Ariel']:
        auc_columns.append(f"{label}_{method}_AUC")
        auc_columns.append(f"{label}_{method}_TPR")
        auc_columns.append(f"{label}_{method}_FPR")
AUC_result = pd.DataFrame(
    index = Ariel.index,
    columns = auc_columns
)



AUC_result.at['Test000', 'ER_Cinderella_FPR'] = [9, 2, 1, 3]
AUC_result.head()

Unnamed: 0,ER_Cinderella_AUC,ER_Cinderella_TPR,ER_Cinderella_FPR,ER_Ariel_AUC,ER_Ariel_TPR,ER_Ariel_FPR,HER2_Cinderella_AUC,HER2_Cinderella_TPR,HER2_Cinderella_FPR,HER2_Ariel_AUC,...,PR_Cinderella_FPR,PR_Ariel_AUC,PR_Ariel_TPR,PR_Ariel_FPR,TN_Cinderella_AUC,TN_Cinderella_TPR,TN_Cinderella_FPR,TN_Ariel_AUC,TN_Ariel_TPR,TN_Ariel_FPR
Test000,,,"[9, 2, 1, 3]",,,,,,,,...,,,,,,,,,,
Test001,,,,,,,,,,,...,,,,,,,,,,
Test002,,,,,,,,,,,...,,,,,,,,,,
Test003,,,,,,,,,,,...,,,,,,,,,,
Test004,,,,,,,,,,,...,,,,,,,,,,
