In [75]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# -----------------------------------------------------------------------------------------------
# Author: Bùi Tiến Thành (@bu1th4nh)
# Title: BRCA.ipynb
# Date: 2024/09/12 13:56:42
# Description: Pre-process the data for the Breast Cancer dataset, Cross-omics setting
# 
# (c) bu1th4nh. All rights reserved
# Written with dedication in the University of Central Florida, EPCOT and the Magic Kingdom.
# -----------------------------------------------------------------------------------------------


import s3fs
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Union, Literal
import ydata_profiling as ydp
import matplotlib.pyplot as plt
import os



# ---------------------------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------------------------
# Local-filesystem alternative (the cells below branch on `s3 is None`):
# DATA_PATH = '/home/ti514716/Datasets/BreastCancer'
# storage_options = None
# s3 = None

# Number of train/test splits generated per clinical label
TEST_PASS = 100
# Guards for (re)writing the processed 2-omics / 3-omics outputs
OVERRIDE_EXISTING_2_OMICS = False
OVERRIDE_EXISTING_3_OMICS = True


# Object-store location of the dataset
DATA_PATH = 's3://datasets/BreastCancer'

# Credentials are taken from the environment when available so that they do not have to live
# in the notebook. The literal fallbacks keep the notebook runnable as before.
# NOTE(review): the fallback key/secret are still committed in clear text here - rotate them and
# drop the defaults once BRCA_S3_KEY / BRCA_S3_SECRET / BRCA_S3_ENDPOINT_URL are set everywhere.
storage_options = {
    'key': os.environ.get('BRCA_S3_KEY', 'bu1th4nh'),
    'secret': os.environ.get('BRCA_S3_SECRET', 'ariel.anna.elsa'),
    'endpoint_url': os.environ.get('BRCA_S3_ENDPOINT_URL', 'http://localhost:9000'),
}

# Filesystem handle used for the exists/makedirs checks; pandas receives `storage_options` directly
s3 = s3fs.S3FileSystem(
    key=storage_options['key'],
    secret=storage_options['secret'],
    endpoint_url=storage_options['endpoint_url'],
    use_ssl=False,
)

In [76]:
# Parquet-ize the data
# One-off conversion of the raw text files into Parquet. Every target file is checked on its
# own, so a conversion that was interrupted half-way is completed on the next run instead of
# being skipped just because the output folder already exists.
raw_parquet_dir = f'{DATA_PATH}/raw_parquet'
if s3 is not None:
    if not s3.exists(raw_parquet_dir):
        s3.makedirs(raw_parquet_dir)
else:
    if not os.path.exists(raw_parquet_dir):
        os.makedirs(raw_parquet_dir)

# (raw text file, extra read_csv keyword arguments, parquet file name)
raw_conversions = [
    ('bipartite_targetscan_gene.csv', {}, 'bipartite_targetscan_gene.parquet'),
    ('brca_clinical.csv', {}, 'brca_clinical.parquet'),
    ('DNAMethyl_BRCA_450k', {'sep': '\t'}, 'methDNA.parquet'),  # tab-separated file
    ('miRNA.csv', {}, 'miRNA.parquet'),
    ('mRNA.csv', {}, 'mRNA.parquet'),
]

# True when at least one file had to be converted during this run
parquetize = False
for raw_name, read_kwargs, parquet_name in raw_conversions:
    parquet_path = f'{raw_parquet_dir}/{parquet_name}'
    already_converted = s3.exists(parquet_path) if s3 is not None else os.path.exists(parquet_path)
    if already_converted:
        continue
    parquetize = True
    pd.read_csv(
        f'{DATA_PATH}/raw_textfiles/{raw_name}',
        storage_options=storage_options,
        **read_kwargs,
    ).to_parquet(parquet_path, storage_options=storage_options)


# Data Acquisition

In [77]:
# Load the five raw tables from the Parquet copies (same read order as the names on the left)
raw_bipart_data, raw_clinical, raw_methDNA, raw_miRNA, raw_mRNA = (
    pd.read_parquet(f'{DATA_PATH}/raw_parquet/{table_name}.parquet', storage_options=storage_options)
    for table_name in ('bipartite_targetscan_gene', 'brca_clinical', 'methDNA', 'miRNA', 'mRNA')
)

# Elementary Data Analysis

### mRNA, miRNA, and methylation data

In [None]:
# Preview the first rows of each raw omics table (mRNA, miRNA, methylation - in that order)
for raw_table in (raw_mRNA, raw_miRNA, raw_methDNA):
    display(raw_table.head())

In [None]:
# Report the dimensions of each raw omics table
for omic_name, raw_table in (('mRNA', raw_mRNA), ('miRNA', raw_miRNA), ('methDNA', raw_methDNA)):
    n_rows, n_cols = raw_table.shape
    print(f'{omic_name} shape: {n_rows} rows and {n_cols} columns')

In [None]:
# Column labels of each raw omics table
raw_omics = {'mRNA': raw_mRNA, 'miRNA': raw_miRNA, 'methDNA': raw_methDNA}
for omic_name, raw_table in raw_omics.items():
    print(f'{omic_name} columns: {len(raw_table.columns)}  :  {list(raw_table.columns)}')
print()

# Column labels as sets, for the overlap / difference reports below
column_sets = {omic_name: set(raw_table.columns) for omic_name, raw_table in raw_omics.items()}

# Labels shared by all three tables
Ariel = column_sets['mRNA'] & column_sets['miRNA'] & column_sets['methDNA']
print(f'Intersection all  {len(Ariel)}  :  {Ariel}')
print()

# Pairwise overlaps
for left, right in (('mRNA', 'miRNA'), ('mRNA', 'methDNA'), ('miRNA', 'methDNA')):
    Ariel = column_sets[left] & column_sets[right]
    print(f'Intersection {left} and {right}  {len(Ariel)}  :  {Ariel}')


# Pairwise differences: labels present in the first table but absent from the second
difference_pairs = (
    ('mRNA', 'miRNA'), ('miRNA', 'mRNA'),
    ('methDNA', 'miRNA'), ('miRNA', 'methDNA'),
    ('methDNA', 'mRNA'), ('mRNA', 'methDNA'),
)
for left, right in difference_pairs:
    Ariel = column_sets[left] - column_sets[right]
    print(f'Sample Difference {left} and {right}  {len(Ariel)}  :  {Ariel}')


In [None]:
# Feature identifiers of each raw table, read from the column labelled 'sample'
# (presumably the header the source files give to their first column - confirm against the raw data).
mRNA_for_genes, miRNA_for_genes, methDNA_for_genes = (
    raw_table['sample'] for raw_table in (raw_mRNA, raw_miRNA, raw_methDNA)
)

print(f'mRNAs: {len(mRNA_for_genes)} :  {list(mRNA_for_genes)}')
print(f'miRNAs: {len(miRNA_for_genes)} :  {list(miRNA_for_genes)}')
# Only the first 100 methylation identifiers are printed
print(f'methDNAs: {len(methDNA_for_genes)} :  {list(methDNA_for_genes)[:100]}')

### Bipartite Data: mRNAs and miRNAs

In [None]:
# Preview the first rows of the raw bipartite table
bipart_preview = raw_bipart_data.head()
display(bipart_preview)

In [None]:
# Dimensions of the raw bipartite table
n_rows, n_cols = raw_bipart_data.shape
print(f'bipart_data shape: {n_rows} rows and {n_cols} columns')
print()

# Column labels
bipart_column_labels = list(raw_bipart_data.columns)
print(f'bipart_data columns: {len(bipart_column_labels)}  :  {bipart_column_labels}')
print()

# The 'gene_name' column carries the mRNA identifier of every row
mRNA_bipart = raw_bipart_data['gene_name']
print(f'mRNA: {len(mRNA_bipart)} :  {list(mRNA_bipart)}')

### Clinical Data

In [None]:
# Preview the first rows of the raw clinical table
clinical_preview = raw_clinical.head()
display(clinical_preview)

In [None]:
# Dimensions of the raw clinical table
n_rows, n_cols = raw_clinical.shape
print(f'clinical shape: {n_rows} rows and {n_cols} columns')
print()

# Column labels
clinical_column_labels = list(raw_clinical.columns)
print(f'clinical columns: {len(clinical_column_labels)}  :  {clinical_column_labels}')
print()

# The sample IDs are stored in the column labelled "Unnamed: 0"
sample_clinical = raw_clinical['Unnamed: 0']
print(f'Sample: {len(sample_clinical)} :  {list(sample_clinical)}')

# Aligning data to correct format

### mRNA

In [None]:
# Build the aligned mRNA table from the raw frame. Nothing is modified in place, so the raw
# frame stays intact and the cell can be re-run safely.
# The 'sample' column is renamed to 'mRNA_ID' and becomes the index.
mRNA = raw_mRNA.rename(columns={'sample': 'mRNA_ID'}).set_index('mRNA_ID')

# Rename the 'columns' row as 'Sample'
mRNA.columns.name = 'Sample'

# Fill-NA - 0
mRNA = mRNA.fillna(0)

# Sampling again
display(mRNA.head())

# Zero features: rows whose value is 0 for every sample.
# Computed as one vectorised mask instead of a Python loop with a .loc lookup per row.
zero_mask = (mRNA == 0).all(axis=1).to_numpy()
zero_features = mRNA.index[zero_mask].tolist()
print(f'{len(zero_features)} features in mRNA have all zero values')

# Drop zero features (positional boolean mask: removes exactly the all-zero rows)
mRNA = mRNA.loc[~zero_mask]

# Final shape
print(f'mRNA shape: {mRNA.shape[0]} mRNAs and {mRNA.shape[1]} samples')

# Heatmap
plt.imshow(mRNA, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()

### miRNA

In [None]:
# Build the aligned miRNA table from the raw frame. Nothing is modified in place, so the raw
# frame stays intact and the cell can be re-run safely.
# The 'sample' column is renamed to 'miRNA_ID' and becomes the index.
miRNA = raw_miRNA.rename(columns={'sample': 'miRNA_ID'}).set_index('miRNA_ID')

# Rename the 'columns' row as 'Sample'
miRNA.columns.name = 'Sample'

# Fill-NA - 0
miRNA = miRNA.fillna(0)

# Sampling again
display(miRNA.head())

# Zero features: rows whose value is 0 for every sample.
# Computed as one vectorised mask instead of a Python loop with a .loc lookup per row.
zero_mask = (miRNA == 0).all(axis=1).to_numpy()
zero_features = miRNA.index[zero_mask].tolist()
print(f'{len(zero_features)} features in miRNA have all zero values')

# Drop zero features (positional boolean mask: removes exactly the all-zero rows)
miRNA = miRNA.loc[~zero_mask]

# Final shape
print(f'miRNA shape: {miRNA.shape[0]} miRNAs and {miRNA.shape[1]} samples')

# Heatmap
plt.imshow(miRNA, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()

### DNA Methylation

In [None]:
# Number of highest-variance methylation features kept for the downstream analysis
METHDNA_TOP_VARIANCE_FEATURES = 10480

# Build the aligned methylation table from the raw frame. Nothing is modified in place, so the
# raw frame stays intact and the cell can be re-run safely.
# The 'sample' column is renamed to 'methDNA_ID' and becomes the index.
methDNA = raw_methDNA.rename(columns={'sample': 'methDNA_ID'}).set_index('methDNA_ID')

# Rename the 'columns' row as 'Sample'
methDNA.columns.name = 'Sample'

# Fill-NA - 0
methDNA = methDNA.fillna(0)

# Sampling again
display(methDNA.head())

# Zero features: rows whose value is 0 for every sample.
# One vectorised mask replaces the per-row Python loop, which had to visit every probe.
zero_mask = (methDNA == 0).all(axis=1).to_numpy()
zero_features = methDNA.index[zero_mask].tolist()
print(f'{len(zero_features)} features in methDNA have all zero values')

# Drop zero features (positional boolean mask: removes exactly the all-zero rows)
methDNA = methDNA.loc[~zero_mask]

# Top hi-variant features: rank rows by their variance across samples and keep the first N
top_variance = methDNA.var(axis=1).sort_values(ascending=False).head(METHDNA_TOP_VARIANCE_FEATURES)
methDNA = methDNA.loc[top_variance.index].copy(deep=True)

# Final shape
print(f'methDNA shape: {methDNA.shape[0]} methDNAs and {methDNA.shape[1]} samples')

# Heatmap
plt.imshow(methDNA, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()

### Bipartite Graph

In [None]:
# Build the bipartite table from a deep copy of the raw frame:
# 'gene_name' is renamed to 'mRNA_ID' and used as the row index.
bipart = (
    raw_bipart_data
    .copy(deep=True)
    .rename(columns={'gene_name': 'mRNA_ID'})
    .set_index('mRNA_ID')
)

# Label the column axis as 'miRNA_ID'
bipart.columns.name = 'miRNA_ID'

# Preview the aligned table
display(bipart.head())

# Report the final dimensions
n_mRNAs, n_miRNAs = bipart.shape
print(f'Bipart shape: {n_mRNAs} mRNAs and {n_miRNAs} miRNAs')

# Heatmap of the whole table
plt.imshow(bipart, cmap='hot', interpolation='nearest', aspect='auto')
plt.show()

### Clinical Data

In [None]:
# Build the clinical table from a deep copy of the raw frame:
# the 'Unnamed: 0' column is renamed to 'Sample' and used as the row index.
clinical = (
    raw_clinical
    .copy(deep=True)
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index('Sample')
)

# Preview the aligned table
display(clinical.head())

# Report the final dimensions together with the available clinical columns
n_samples, n_features = clinical.shape
print(f'Clinical shape: {n_samples} samples and {n_features} features as {list(clinical.columns)}')

# Aggregating, Label Engineering & Saving Data

In [None]:
# Recap of the dimensions of every aligned table before aggregation
shape_messages = (
    f'mRNA shape: {mRNA.shape[0]} mRNAs and {mRNA.shape[1]} samples',
    f'miRNA shape: {miRNA.shape[0]} miRNAs and {miRNA.shape[1]} samples',
    f'methDNA shape: {methDNA.shape[0]} methDNAs and {methDNA.shape[1]} samples',
    f'Bipart shape: {bipart.shape[0]} mRNAs and {bipart.shape[1]} miRNAs',
    f'Clinical shape: {clinical.shape[0]} samples and {clinical.shape[1]} features as {list(clinical.columns)}',
)
for shape_message in shape_messages:
    display(shape_message)

### 2-omic: mRNA and miRNA

In [None]:
# Samples present in both the mRNA and the miRNA table
mRNA_sample_set, miRNA_sample_set = set(mRNA.columns), set(miRNA.columns)
common_samples_2omics = list(mRNA_sample_set & miRNA_sample_set)
n_common_samples = len(common_samples_2omics)
print(f'Common samples: {n_common_samples} - {list(common_samples_2omics)}')
print(f'% of common samples wrt mRNA: {n_common_samples / len(mRNA.columns) * 100:.2f}%')
print(f'% of common samples wrt miRNA: {n_common_samples / len(miRNA.columns) * 100:.2f}%')
print('\n')

# mRNAs present both in the expression table and as rows of the bipartite table
mRNA_id_set, bipart_row_set = set(mRNA.index), set(bipart.index)
common_mRNAs = list(mRNA_id_set & bipart_row_set)
n_common_mRNAs = len(common_mRNAs)
print(f'Common mRNAs: {n_common_mRNAs} - {list(common_mRNAs)}')
print(f'% of common mRNAs wrt mRNA: {n_common_mRNAs / len(mRNA.index) * 100:.2f}%')
print(f'% of common mRNAs wrt bipart: {n_common_mRNAs / len(bipart.index) * 100:.2f}%')
print('\n')

# miRNAs present both in the expression table and as columns of the bipartite table
miRNA_id_set, bipart_column_set = set(miRNA.index), set(bipart.columns)
common_miRNAs = list(miRNA_id_set & bipart_column_set)
n_common_miRNAs = len(common_miRNAs)
print(f'Common miRNAs: {n_common_miRNAs} - {list(common_miRNAs)}')
print(f'% of common miRNAs wrt miRNA: {n_common_miRNAs / len(miRNA.index) * 100:.2f}%')
print(f'% of common miRNAs wrt bipart: {n_common_miRNAs / len(bipart.columns) * 100:.2f}%')

In [None]:
# Put every axis into sorted order
common_samples_2omics = sorted(common_samples_2omics)
common_mRNAs = sorted(common_mRNAs)
common_miRNAs = sorted(common_miRNAs)

# Restrict each table to the shared features / samples (independent deep copies)
mRNA_common = mRNA.loc[common_mRNAs, common_samples_2omics].copy(deep=True)
miRNA_common = miRNA.loc[common_miRNAs, common_samples_2omics].copy(deep=True)
bipart_common = bipart.loc[common_mRNAs, common_miRNAs].copy(deep=True)


# Count mRNA features that are zero for every retained sample (reported only, not dropped)
zero_features = [
    feature for feature in mRNA_common.index
    if np.all(mRNA_common.loc[feature].values == 0)
]
print(f'{len(zero_features)} features in mRNA_common have all zero values')

# Same check for the miRNA features
zero_features = [
    feature for feature in miRNA_common.index
    if np.all(miRNA_common.loc[feature].values == 0)
]
print(f'{len(zero_features)} features in miRNA_common have all zero values')


# Dimensions of the aligned 2-omics tables
for shape_message in (
    f'Common mRNA shape: {mRNA_common.shape[0]} mRNAs and {mRNA_common.shape[1]} samples',
    f'Common miRNA shape: {miRNA_common.shape[0]} miRNAs and {miRNA_common.shape[1]} samples',
    f'Common bipart shape: {bipart_common.shape[0]} mRNAs and {bipart_common.shape[1]} miRNAs',
):
    display(shape_message)


In [94]:
# Make sure both 2-omics output folders exist (object store when `s3` is set, local disk otherwise)
for output_folder in ('processed_2_omics_mRNA_miRNA', 'clinical_testdata_2_omics_mRNA_miRNA'):
    output_path = f'{DATA_PATH}/{output_folder}'
    if s3 is not None:
        if not s3.exists(output_path):
            s3.makedirs(output_path)
    elif not os.path.exists(output_path):
        os.makedirs(output_path)

# Write the processed tables only when overriding is explicitly enabled
if OVERRIDE_EXISTING_2_OMICS:
    tables_to_save = (
        (mRNA_common, 'mRNA'),
        (miRNA_common, 'miRNA'),
        (bipart_common, 'bipart'),
        (clinical, 'clinical'),
    )
    for table, table_name in tables_to_save:
        table.to_parquet(f'{DATA_PATH}/processed_2_omics_mRNA_miRNA/{table_name}.parquet', storage_options=storage_options)

In [None]:
import random
from sklearn.model_selection import train_test_split

# Fixed seed: re-running the notebook regenerates exactly the same train/test splits
SPLIT_SEED_2_OMICS = 20240912
split_rng = random.Random(SPLIT_SEED_2_OMICS)

# Samples that appear both in the aligned 2-omics tables and in the clinical table.
# Sorted, because the iteration order of a set of strings can change between kernel sessions,
# which would otherwise make the seeded splits differ from run to run.
common_samples_2omics_for_clinical = sorted(set(common_samples_2omics) & set(clinical.index))
clinical_common_2omics = clinical.loc[common_samples_2omics_for_clinical].replace({'Positive': 1, 'Negative': 0})


print(f'Common samples eligible for threshold engineering: {len(clinical_common_2omics)}')
display(clinical_common_2omics.head())


# One row per test pass; each cell holds a list (sample ids or their 0/1 labels)
split_columns = ['train_sample_ids', 'test_sample_ids', 'train_ground_truth', 'test_ground_truth']
test_index = [f'Test{i:03}' for i in range(TEST_PASS)]
for label in clinical_common_2omics.columns:
    # Samples whose value for this label is neither 1 nor 0 are left out of the splits
    positive_samples = list(clinical_common_2omics[clinical_common_2omics[label] == 1].index)
    negative_samples = list(clinical_common_2omics[clinical_common_2omics[label] == 0].index)

    print(f'Label: {label}')
    print(f'Positive samples: {len(positive_samples)}')
    print(f'Negative samples: {len(negative_samples)}')

    # train_test_split cannot produce a non-empty train and test part from fewer than 2 samples
    if min(len(positive_samples), len(negative_samples)) < 2:
        print(f'Skipping {label}: not enough positive/negative samples for a stratified split')
        continue

    label_dict = {s: 1 for s in positive_samples}
    label_dict.update({s: 0 for s in negative_samples})

    split_records = []
    for test_pass in tqdm(range(TEST_PASS), desc=f'Building testcases for {label}'):
        # Split each class separately (80/20) so both classes are represented in train and test
        pos_train_idx, pos_test_idx = train_test_split(
            positive_samples, test_size=0.2, random_state=split_rng.randrange(2**32)
        )
        neg_train_idx, neg_test_idx = train_test_split(
            negative_samples, test_size=0.2, random_state=split_rng.randrange(2**32)
        )

        train_idx = pos_train_idx + neg_train_idx
        test_idx = pos_test_idx + neg_test_idx
        # Mix the classes so that positives do not all come before negatives
        split_rng.shuffle(train_idx)
        split_rng.shuffle(test_idx)

        split_records.append({
            'train_sample_ids': train_idx,
            'test_sample_ids': test_idx,
            'train_ground_truth': [label_dict[s] for s in train_idx],
            'test_ground_truth': [label_dict[s] for s in test_idx],
        })

    # Build the frame once from the collected rows instead of writing lists cell by cell
    clinical_testdata = pd.DataFrame(split_records, index=test_index, columns=split_columns)

    # NOTE(review): saving is disabled for the 2-omics setting, so the splits built above are
    # discarded - confirm whether that is intended.
    # if OVERRIDE_EXISTING_2_OMICS:
    #     clinical_testdata.to_parquet(f'{DATA_PATH}/clinical_testdata_2_omics_mRNA_miRNA/{label}.parquet', storage_options=storage_options)



### 3-omic: mRNA, miRNA, and DNA methylation

In [None]:
# Samples present in all three omics tables
common_samples_3omics = list(set(mRNA.columns) & set(miRNA.columns) & set(methDNA.columns))
# Print the 3-omics list next to its own count (the 2-omics list was printed here before)
print(f'Common samples: {len(common_samples_3omics)} - {list(common_samples_3omics)}')
print(f'% of common samples wrt mRNA: {len(common_samples_3omics) / len(mRNA.columns) * 100:.2f}%')
print(f'% of common samples wrt miRNA: {len(common_samples_3omics) / len(miRNA.columns) * 100:.2f}%')
print(f'% of common samples wrt methDNA: {len(common_samples_3omics) / len(methDNA.columns) * 100:.2f}%')
# Relies on `common_samples_2omics` computed in the 2-omics section
print(f'% of common samples wrt 2-omics: {len(common_samples_3omics) / len(common_samples_2omics) * 100:.2f}%')
print('\n')

# mRNAs present both in the expression table and as rows of the bipartite table
common_mRNAs = list(set(mRNA.index) & set(bipart.index))
print(f'Common mRNAs: {len(common_mRNAs)} - {list(common_mRNAs)}')
print(f'% of common mRNAs wrt mRNA: {len(common_mRNAs) / len(mRNA.index) * 100:.2f}%')
print(f'% of common mRNAs wrt bipart: {len(common_mRNAs) / len(bipart.index) * 100:.2f}%')
print('\n')

# miRNAs present both in the expression table and as columns of the bipartite table
common_miRNAs = list(set(miRNA.index) & set(bipart.columns))
print(f'Common miRNAs: {len(common_miRNAs)} - {list(common_miRNAs)}')
print(f'% of common miRNAs wrt miRNA: {len(common_miRNAs) / len(miRNA.index) * 100:.2f}%')
print(f'% of common miRNAs wrt bipart: {len(common_miRNAs) / len(bipart.columns) * 100:.2f}%')

In [None]:
# Put every axis into sorted order
common_samples_3omics = sorted(common_samples_3omics)
common_mRNAs = sorted(common_mRNAs)
common_miRNAs = sorted(common_miRNAs)

# Restrict each table to the shared features / samples (independent deep copies);
# the methylation table keeps all of its rows and is only restricted on samples
mRNA_common = mRNA.loc[common_mRNAs, common_samples_3omics].copy(deep=True)
miRNA_common = miRNA.loc[common_miRNAs, common_samples_3omics].copy(deep=True)
methDNA_common = methDNA.loc[:, common_samples_3omics].copy(deep=True)
bipart_common = bipart.loc[common_mRNAs, common_miRNAs].copy(deep=True)


# Count mRNA features that are zero for every retained sample (reported only, not dropped)
zero_features = [
    feature for feature in mRNA_common.index
    if np.all(mRNA_common.loc[feature].values == 0)
]
print(f'{len(zero_features)} features in mRNA_common have all zero values')

# Same check for the miRNA features
zero_features = [
    feature for feature in miRNA_common.index
    if np.all(miRNA_common.loc[feature].values == 0)
]
print(f'{len(zero_features)} features in miRNA_common have all zero values')

# Same check for the methylation features
zero_features = [
    feature for feature in methDNA_common.index
    if np.all(methDNA_common.loc[feature].values == 0)
]
print(f'{len(zero_features)} features in methDNA_common have all zero values')


# Dimensions of the aligned 3-omics tables
for shape_message in (
    f'Common mRNA shape: {mRNA_common.shape[0]} mRNAs and {mRNA_common.shape[1]} samples',
    f'Common miRNA shape: {miRNA_common.shape[0]} miRNAs and {miRNA_common.shape[1]} samples',
    f'Common methDNA shape: {methDNA_common.shape[0]} methDNAs and {methDNA_common.shape[1]} samples',
    f'Common bipart shape: {bipart_common.shape[0]} mRNAs and {bipart_common.shape[1]} miRNAs',
):
    display(shape_message)



In [98]:
# Make sure both 3-omics output folders exist (object store when `s3` is set, local disk otherwise)
for output_folder in ('processed_3_omics_mRNA_miRNA_methDNA', 'clinical_testdata_3_omics_mRNA_miRNA_methDNA'):
    output_path = f'{DATA_PATH}/{output_folder}'
    if s3 is not None:
        if not s3.exists(output_path):
            s3.makedirs(output_path)
    elif not os.path.exists(output_path):
        os.makedirs(output_path)


# Write the processed tables only when overriding is explicitly enabled
if OVERRIDE_EXISTING_3_OMICS:
    tables_to_save = (
        (mRNA_common, 'mRNA'),
        (miRNA_common, 'miRNA'),
        (methDNA_common, 'methDNA'),
        (bipart_common, 'bipart'),
        (clinical, 'clinical'),
    )
    for table, table_name in tables_to_save:
        table.to_parquet(f'{DATA_PATH}/processed_3_omics_mRNA_miRNA_methDNA/{table_name}.parquet', storage_options=storage_options)



In [None]:
import random
from sklearn.model_selection import train_test_split

# Fixed seed: re-running the notebook regenerates exactly the same train/test splits
SPLIT_SEED_3_OMICS = 20240912
split_rng = random.Random(SPLIT_SEED_3_OMICS)

# Samples that appear both in the aligned 3-omics tables and in the clinical table.
# Sorted, because the iteration order of a set of strings can change between kernel sessions,
# which would otherwise make the seeded splits differ from run to run.
common_samples_3omics_for_clinical = sorted(set(common_samples_3omics) & set(clinical.index))
clinical_common_3omics = clinical.loc[common_samples_3omics_for_clinical].replace({'Positive': 1, 'Negative': 0})

print(f'Common samples eligible for threshold engineering: {len(clinical_common_3omics)}')
display(clinical_common_3omics.head())


# One row per test pass; each cell holds a list (sample ids or their 0/1 labels)
split_columns = ['train_sample_ids', 'test_sample_ids', 'train_ground_truth', 'test_ground_truth']
test_index = [f'Test{i:03}' for i in range(TEST_PASS)]
for label in clinical_common_3omics.columns:
    # Samples whose value for this label is neither 1 nor 0 are left out of the splits
    positive_samples = list(clinical_common_3omics[clinical_common_3omics[label] == 1].index)
    negative_samples = list(clinical_common_3omics[clinical_common_3omics[label] == 0].index)

    print(f'Label: {label}')
    print(f'Positive samples: {len(positive_samples)}')
    print(f'Negative samples: {len(negative_samples)}')

    # train_test_split cannot produce a non-empty train and test part from fewer than 2 samples
    if min(len(positive_samples), len(negative_samples)) < 2:
        print(f'Skipping {label}: not enough positive/negative samples for a stratified split')
        continue

    label_dict = {s: 1 for s in positive_samples}
    label_dict.update({s: 0 for s in negative_samples})

    split_records = []
    for test_pass in tqdm(range(TEST_PASS), desc=f'Building testcases for {label}'):
        # Split each class separately (80/20) so both classes are represented in train and test
        pos_train_idx, pos_test_idx = train_test_split(
            positive_samples, test_size=0.2, random_state=split_rng.randrange(2**32)
        )
        neg_train_idx, neg_test_idx = train_test_split(
            negative_samples, test_size=0.2, random_state=split_rng.randrange(2**32)
        )

        train_idx = pos_train_idx + neg_train_idx
        test_idx = pos_test_idx + neg_test_idx
        # Mix the classes so that positives do not all come before negatives
        split_rng.shuffle(train_idx)
        split_rng.shuffle(test_idx)

        split_records.append({
            'train_sample_ids': train_idx,
            'test_sample_ids': test_idx,
            'train_ground_truth': [label_dict[s] for s in train_idx],
            'test_ground_truth': [label_dict[s] for s in test_idx],
        })

    # Build the frame once from the collected rows instead of writing lists cell by cell
    clinical_testdata = pd.DataFrame(split_records, index=test_index, columns=split_columns)

    # One parquet file per clinical label
    if OVERRIDE_EXISTING_3_OMICS:
        clinical_testdata.to_parquet(f'{DATA_PATH}/clinical_testdata_3_omics_mRNA_miRNA_methDNA/{label}.parquet', storage_options=storage_options)


# Mini Dataset

In [None]:
import random

# Size of the mini dataset (upper bounds: a smaller population is used in full)
mini_dataset_mRNA_feature_size = 50
mini_dataset_miRNA_feature_size = 50
mini_dataset_methDNA_feature_size = 50
mini_dataset_samples_size = 200
folder = 'processed_micro'

# Fixed seed: re-running the cell selects the same features and samples
MINI_DATASET_SEED = 20240912
mini_rng = random.Random(MINI_DATASET_SEED)

# Samples available in the aligned omics tables and in the clinical table.
# Sorted, because the iteration order of a set of strings can change between kernel sessions.
common_sample = sorted(set(mRNA_common.columns).intersection(miRNA_common.columns).intersection(clinical.index))
# NOTE(review): clinical_common is built but neither used nor saved below - confirm whether
# the mini dataset should also ship a clinical table.
clinical_common = clinical.loc[common_sample, :].copy(deep=True)


def _sample_sorted(population, size):
    """Return a sorted, seeded random selection of at most `size` distinct items of `population`."""
    population = list(population)
    # Clamp the size: random.sample raises ValueError when asked for more items than available
    return sorted(set(mini_rng.sample(population, k=min(size, len(population)))))


mini_mRNA = _sample_sorted(mRNA_common.index.to_list(), mini_dataset_mRNA_feature_size)
mini_miRNA = _sample_sorted(miRNA_common.index.to_list(), mini_dataset_miRNA_feature_size)
mini_methDNA = _sample_sorted(methDNA_common.index.to_list(), mini_dataset_methDNA_feature_size)
mini_samples = _sample_sorted(common_sample, mini_dataset_samples_size)

# Restrict every table to the selected features / samples
mRNA_mini = mRNA_common.loc[mini_mRNA, mini_samples].copy(deep=True)
miRNA_mini = miRNA_common.loc[mini_miRNA, mini_samples].copy(deep=True)
methDNA_mini = methDNA_common.loc[mini_methDNA, mini_samples].copy(deep=True)
bipart_mini = bipart_common.loc[mini_mRNA, mini_miRNA].copy(deep=True)


# Zero features for mRNA_mini: dropped from the expression table and from the bipartite rows
zero_features = [feature for feature in mRNA_mini.index if np.all(mRNA_mini.loc[feature].values == 0)]
mRNA_mini = mRNA_mini.drop(zero_features, axis=0)
bipart_mini = bipart_mini.drop(zero_features, axis=0)
print(f'{len(zero_features)} features in mRNA_mini have all zero values')

# Zero features for miRNA_mini: dropped from the expression table and from the bipartite columns
zero_features = [feature for feature in miRNA_mini.index if np.all(miRNA_mini.loc[feature].values == 0)]
miRNA_mini = miRNA_mini.drop(zero_features, axis=0)
bipart_mini = bipart_mini.drop(zero_features, axis=1)
print(f'{len(zero_features)} features in miRNA_mini have all zero values')

# Zero features for methDNA_mini
zero_features = [feature for feature in methDNA_mini.index if np.all(methDNA_mini.loc[feature].values == 0)]
methDNA_mini = methDNA_mini.drop(zero_features, axis=0)
print(f'{len(zero_features)} features in methDNA_mini have all zero values')


# Make sure the output folder exists (object store when `s3` is set, local disk otherwise)
if s3 is not None:
    if not s3.exists(f'{DATA_PATH}/{folder}'):
        s3.makedirs(f'{DATA_PATH}/{folder}')
else:
    if not os.path.exists(f'{DATA_PATH}/{folder}'):
        os.makedirs(f'{DATA_PATH}/{folder}')

# Save the mini tables
mRNA_mini.to_parquet(f'{DATA_PATH}/{folder}/mRNA.parquet', storage_options=storage_options)
miRNA_mini.to_parquet(f'{DATA_PATH}/{folder}/miRNA.parquet', storage_options=storage_options)
bipart_mini.to_parquet(f'{DATA_PATH}/{folder}/bipart.parquet', storage_options=storage_options)
methDNA_mini.to_parquet(f'{DATA_PATH}/{folder}/methDNA.parquet', storage_options=storage_options)