In [11]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from glob import glob
from tqdm import tqdm

In [12]:
def split_into_files(lines, num_files):
    # Calculate the number of lines per file
    lines_per_file = len(lines) // num_files
    remainder = len(lines) % num_files
    
    # Split lines into chunks for each file
    chunks = []
    start = 0
    for i in range(num_files):
        # Allocate the remainder evenly
        end = start + lines_per_file + (1 if i < remainder else 0)
        chunks.append(lines[start:end])
        start = end
    
    # Write each chunk to a separate file
    for i, chunk in enumerate(chunks):
        with open(f'sh{i+1}.sh', 'w') as f:
            f.write('\n'.join(chunk))

In [13]:
commands = []
count = 0

mlp_configs = [
    "256,256,128",
    "128,128,64",
    "512,256,128",
    "64,64,32",
    "128,64,32",
    "256,128,64",
    "512,512,256",
    "1024,512,256",
    "256,128,32",
    "128,128,128"
]

classifier_dim_configs = {
    "ROSMAP": mlp_configs,
    "LGG": mlp_configs,
    "KIPAN": mlp_configs,
    "BRCA": mlp_configs
}
for view_list in ["1", "2", "3", "1,2", "1,3", "2,3", "1,2,3"]:
    for dataset in ["BRCA", "ROSMAP", "KIPAN", "LGG"]:
        for classifier_dims in classifier_dim_configs[dataset]:
            exp_save_path = f"exp/{dataset}/{view_list}/{classifier_dims} "
            command = f"python trainer.py --exp_save_path={exp_save_path}" \
                      f"--view_list={view_list} --dataset={dataset} --classifier_dims={classifier_dims}"
            print(command)
            count += 1
            commands.append(command)
print(count)

split_into_files(commands, 4)

python trainer.py --exp_save_path=exp/BRCA/1/256,256,128 --view_list=1 --dataset=BRCA --classifier_dims=256,256,128
python trainer.py --exp_save_path=exp/BRCA/1/128,128,64 --view_list=1 --dataset=BRCA --classifier_dims=128,128,64
python trainer.py --exp_save_path=exp/BRCA/1/512,256,128 --view_list=1 --dataset=BRCA --classifier_dims=512,256,128
python trainer.py --exp_save_path=exp/BRCA/1/64,64,32 --view_list=1 --dataset=BRCA --classifier_dims=64,64,32
python trainer.py --exp_save_path=exp/BRCA/1/128,64,32 --view_list=1 --dataset=BRCA --classifier_dims=128,64,32
python trainer.py --exp_save_path=exp/BRCA/1/256,128,64 --view_list=1 --dataset=BRCA --classifier_dims=256,128,64
python trainer.py --exp_save_path=exp/BRCA/1/512,512,256 --view_list=1 --dataset=BRCA --classifier_dims=512,512,256
python trainer.py --exp_save_path=exp/BRCA/1/1024,512,256 --view_list=1 --dataset=BRCA --classifier_dims=1024,512,256
python trainer.py --exp_save_path=exp/BRCA/1/256,128,32 --view_list=1 --dataset=BRCA

# Determine base modalities

In [14]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

save_path = "exp"
df_performance = pd.DataFrame(columns=['dataset', 'view_list','classifier_dims', 'acc','measure1','measure2','uncertainty'])
for dataset in ["BRCA", "ROSMAP", "LGG", "KIPAN"]:
    for view_list in ["1", "2", "3", "1,2", "1,3", "2,3", "1,2,3"]:
        for classifier_dims in os.listdir(f"{save_path}/{dataset}/{view_list}"):
            df = pd.read_csv(f"{save_path}/{dataset}/{view_list}/{classifier_dims}/best_results.csv")
            pred = df['pred'].to_numpy()
            label = df['label'].to_numpy()
            uncertainty = np.mean(df['u'].to_numpy())
            if dataset in ['ROSMAP', 'LGG']:
                acc = accuracy_score(label, pred)
                measure1 = f1_score(label, pred)
                measure2 = roc_auc_score(label, pred)
            elif dataset in ['BRCA', 'KIPAN']:
                acc = accuracy_score(label, pred)
                measure1 = f1_score(label, pred, average='weighted')
                measure2 = f1_score(label, pred, average='macro')

            df_performance.loc[df_performance.shape[0]] = pd.Series({"dataset": dataset, "view_list": view_list.replace(",","-"), 
                                                                     "classifier_dims": classifier_dims.replace(",", "-"),
                                                                     "acc": acc, "measure1": measure1, "measure2": measure2, "uncertainty": uncertainty})
df_performance.to_csv(f"{save_path}/performance.csv", index=False)
df_performance.head(5)


Unnamed: 0,dataset,view_list,classifier_dims,acc,measure1,measure2,uncertainty
0,BRCA,1,256-128-32,0.80916,0.794509,0.645966,0.460108
1,BRCA,1,256-256-128,0.824427,0.827014,0.767425,0.417269
2,BRCA,1,64-64-32,0.763359,0.742521,0.598248,0.584526
3,BRCA,1,128-128-128,0.820611,0.818739,0.737682,0.456955
4,BRCA,1,512-512-256,0.835878,0.839377,0.77808,0.423758


# select the modality with highest accuracy

In [15]:
result_df = pd.DataFrame(columns=["dataset", "view_list", "classifier_dims", "acc", "measure1", "measure2", "uncertainty"])
for dataset in ["BRCA", "ROSMAP", "LGG", "KIPAN"]:
    df_performance_sub = df_performance[df_performance['dataset']==dataset]
    for view_list in ["1","2","3"]:
        single_modality_df = df_performance_sub[df_performance_sub['view_list'].apply(lambda x: '-' not in x)]
        single_modality_df = single_modality_df[single_modality_df['view_list']==view_list]
        best_single_modality = single_modality_df.loc[single_modality_df['acc'].idxmax()]
        result_df = pd.concat([result_df, best_single_modality.to_frame().T])

    for view_list in ["1-2", "1-3", "2-3"]:
        two_modality_df = df_performance_sub[df_performance_sub['view_list'].apply(lambda x: '-' in x and len(x.split('-')) == 2)]
        two_modality_df = two_modality_df[two_modality_df['view_list']==view_list]
        best_two_modality = two_modality_df.loc[two_modality_df['acc'].idxmax()]
        result_df = pd.concat([result_df, best_two_modality.to_frame().T])

    # Step 3: Keep the row with the view list of 1-2-3 if it exists
    three_modality_df = df_performance_sub[df_performance_sub['view_list'] == '1-2-3']
    best_modality_df = three_modality_df.loc[three_modality_df['acc'].idxmax()]

    # Combine all selected rows into a new dataframe
    result_df = pd.concat([result_df, best_modality_df.to_frame().T])

result_df.to_csv(f"{save_path}/result_df.csv", index=False)
result_df

Unnamed: 0,dataset,view_list,classifier_dims,acc,measure1,measure2,uncertainty
8,BRCA,1,1024-512-256,0.839695,0.843416,0.792608,0.438094
18,BRCA,2,1024-512-256,0.744275,0.725572,0.623286,0.491743
24,BRCA,3,512-512-256,0.717557,0.687362,0.543431,0.561685
38,BRCA,1-2,1024-512-256,0.816794,0.816105,0.772323,0.191025
43,BRCA,1-3,128-128-128,0.79771,0.76991,0.625831,0.414774
57,BRCA,2-3,512-256-128,0.744275,0.705897,0.563426,0.436396
62,BRCA,1-2-3,64-64-32,0.885496,0.887766,0.857951,0.0
70,ROSMAP,1,256-128-32,0.838095,0.841121,0.838235,0.560839
84,ROSMAP,2,512-512-256,0.752381,0.734694,0.754902,0.351275
95,ROSMAP,3,128-64-32,0.752381,0.759259,0.752179,0.560614


In [16]:
result_df = pd.DataFrame(columns=["dataset", "view_list", "classifier_dims", "acc", "measure1", "measure2", "uncertainty"])
for dataset in ["BRCA", "ROSMAP", "LGG", "KIPAN"]:
    df_performance_sub = df_performance[df_performance['dataset']==dataset]
    single_modality_df = df_performance_sub[df_performance_sub['view_list'].apply(lambda x: '-' not in x)]
    best_single_modality = single_modality_df.loc[single_modality_df['acc'].idxmax()]

    # Step 2: Select the best two-modality view containing the best single modality
    best_single_view = best_single_modality['view_list']
    two_modality_df = df_performance_sub[df_performance_sub['view_list'].apply(lambda x: '-' in x and len(x.split('-')) == 2 and best_single_view in x)]
    best_two_modality = two_modality_df.loc[two_modality_df['acc'].idxmax()]

    # Step 3: Keep the row with the view list of 1-2-3 if it exists
    three_modality_df = df_performance_sub[df_performance_sub['view_list'] == '1-2-3']
    best_modality_df = three_modality_df.loc[three_modality_df['acc'].idxmax()]

    # Combine all selected rows into a new dataframe
    result_df_sub = pd.concat([best_single_modality.to_frame().T, best_two_modality.to_frame().T, best_modality_df.to_frame().T])
    result_df = pd.concat([result_df_sub, result_df])

result_df.to_csv(f"{save_path}/result_df.csv", index=False)
result_df

Unnamed: 0,dataset,view_list,classifier_dims,acc,measure1,measure2,uncertainty
228,KIPAN,2,1024-512-256,1.0,1.0,1.0,0.096597
244,KIPAN,1-2,512-512-256,1.0,1.0,1.0,0.008619
277,KIPAN,1-2-3,512-256-128,0.984772,0.984746,0.982156,0.0
141,LGG,1,256-256-128,0.828947,0.8375,0.828136,0.431666
183,LGG,1-3,128-128-128,0.848684,0.853503,0.848406,0.352138
200,LGG,1-2-3,256-128-32,0.828947,0.828947,0.829522,0.0
70,ROSMAP,1,256-128-32,0.838095,0.841121,0.838235,0.560839
111,ROSMAP,1-3,256-256-128,0.866667,0.867925,0.867102,0.376465
137,ROSMAP,1-2-3,512-256-128,0.847619,0.854545,0.84695,0.0
8,BRCA,1,1024-512-256,0.839695,0.843416,0.792608,0.438094


# Set thresh

In [20]:
df_staged = pd.DataFrame(columns=["dataset", "acc", "measure1", "measure2", "t1", "t2"])

for dataset in ["ROSMAP", "BRCA", "LGG", "KIPAN"]:

    view_lists = result_df[result_df['dataset']==dataset]['view_list'].to_list()
    classifier_dims = result_df[result_df['dataset']==dataset]['classifier_dims'].to_list() 

    converted_view_list = [view.replace('-', ',') for view in view_lists]
    print(f"dataset = {dataset}, view_list = {converted_view_list}, classifier_dims = {classifier_dims}")

    view_lists = result_df[result_df['dataset']==dataset]['view_list'].to_list()
    converted_view_list = [view.replace('-', ',') for view in view_lists]
    converted_classifier_dims = [cd.replace('-', ',') for cd in classifier_dims]

    df_uni = pd.read_csv(f"{save_path}/{dataset}/{converted_view_list[0]}/{converted_classifier_dims[0]}/best_results.csv").copy()
    df_bi = pd.read_csv(f"{save_path}/{dataset}/{converted_view_list[1]}/{converted_classifier_dims[1]}/best_results.csv").copy()
    df_tri = pd.read_csv(f"{save_path}/{dataset}/{converted_view_list[2]}/{converted_classifier_dims[2]}/best_results.csv").copy()
    threshold_range_uni = np.linspace(df_uni['u'].min(), df_uni['u'].max(), 100)
    threshold_range_bi = np.linspace(df_bi['u'].min(), df_bi['u'].max(), 100)

    best_accuracy = 0.
    best_threshold_t1 = 0
    best_threshold_t2 = 0

    for t1 in tqdm(threshold_range_uni):
        for t2 in threshold_range_bi:
            df_classified_confident_stage = pd.DataFrame(columns=["patient_id", "stage", "u", "pred", "label"])
            move_forward = True

            # single view
            df_uni['confident'] = df_uni['u'] <= t1
            df_uni_confident = df_uni[df_uni['confident']]
            df_uni_confident = df_uni_confident.copy()
            df_uni_confident.loc[:, "stage"] = view_lists[0]

            df_uni_confident = df_uni_confident[["patient_id", "stage", "u", "pred", "label"]]
            df_classified_confident_stage = pd.concat([df_classified_confident_stage, df_uni_confident], ignore_index=True)

            if df_classified_confident_stage.shape[0] != df_uni.shape[0]:
                # Unconfident predictions - use 2 views
                df_uni_unconfident = df_uni[~df_uni['confident']]
                df_bi_sub = df_bi[df_bi['patient_id'].isin(df_uni_unconfident['patient_id'])]
                df_bi_sub = df_bi_sub.copy()
                df_bi_sub.loc[:, 'confident'] = df_bi_sub['u'] <= t2
                df_bi_confident = df_bi_sub[df_bi_sub['confident']]
                
                if df_bi_confident.shape[0] == 0:
                    # t2 is too small, no subjects in bi-view model is confident
                    continue
                
                df_bi_confident = df_bi_confident.copy()
                df_bi_confident.loc[:, "stage"] = view_lists[1]
                df_bi_confident = df_bi_confident[["patient_id", "stage", "u", "pred", "label"]]
                df_classified_confident_stage = pd.concat([df_classified_confident_stage, df_bi_confident], ignore_index=True)

                if df_classified_confident_stage.shape[0] != df_uni.shape[0]:
                    # Unconfident predictions - use 3 views
                    df_bi_unconfident = df_bi_sub[~df_bi_sub['confident']]
                    df_tri_sub = df_tri[df_tri['patient_id'].isin(df_bi_unconfident['patient_id'])]
                    df_tri_sub = df_tri_sub.copy()
                    df_tri_sub.loc[:, 'stage'] = view_lists[2]
                    df_tri_sub = df_tri_sub[["patient_id", "stage", "u", "pred", "label"]]
                    df_classified_confident_stage = pd.concat([df_classified_confident_stage, df_tri_sub], ignore_index=True)

            assert df_classified_confident_stage.shape[0] == df_uni.shape[0]
            total_accuracy = accuracy_score(df_classified_confident_stage['label'].to_list(), df_classified_confident_stage['pred'].to_list())
            
            # Update best threshold if this one is better
            if total_accuracy > best_accuracy:
                best_accuracy = total_accuracy
                best_threshold_t1 = t1
                best_threshold_t2 = t2
                df_classified_confident_stage.to_csv(f"{save_path}/{dataset}/staged_results.csv", index=False)
            # print(f"acc = {total_accuracy}, t1 = {t1}, t2 = {t2}")

    if dataset in ["ROSMAP", "LGG"]:
        measure1 = f1_score(df_classified_confident_stage['label'].to_list(), df_classified_confident_stage['pred'].to_list())
        measure2 = roc_auc_score(df_classified_confident_stage['label'].to_list(), df_classified_confident_stage['pred'].to_list())
    elif dataset in ["KIPAN", "BRCA"]:
        measure1 = f1_score(df_classified_confident_stage['label'].to_list(), df_classified_confident_stage['pred'].to_list(), average='weighted')
        measure2 = f1_score(df_classified_confident_stage['label'].to_list(), df_classified_confident_stage['pred'].to_list(), average='macro')
    print(f"data_folder = {dataset}, best_accuracy = {best_accuracy}, measure1 = {measure1}, measure2 = {measure2}, best_threshold_t1 = {best_threshold_t1}, best_threshold_t2 = {best_threshold_t2}")
    df_staged.loc[df_staged.shape[0]] = pd.Series({"data_folder": dataset, "acc": best_accuracy, "measure1": measure1, "measure2": measure2, 
                                                    "t1": best_threshold_t1, "t2": best_threshold_t2})
df_staged.to_csv(f"{save_path}/staged_performance.csv", index=False)

dataset = ROSMAP, view_list = ['1', '1,3', '1,2,3'], classifier_dims = ['256-128-32', '256-256-128', '512-256-128']


100%|██████████| 100/100 [00:19<00:00,  5.10it/s]


data_folder = ROSMAP, best_accuracy = 0.8857142857142857, measure1 = 0.8411214953271028, measure2 = 0.8382352941176471, best_threshold_t1 = 0.2672834694385528, best_threshold_t2 = 0.4308858425027192
dataset = BRCA, view_list = ['1', '1,2', '1,2,3'], classifier_dims = ['1024-512-256', '1024-512-256', '64-64-32']


100%|██████████| 100/100 [00:21<00:00,  4.59it/s]


data_folder = BRCA, best_accuracy = 0.8854961832061069, measure1 = 0.8434160159715136, measure2 = 0.7926078782772348, best_threshold_t1 = 0.0979643315076828, best_threshold_t2 = 0.0019763009622693
dataset = LGG, view_list = ['1', '1,3', '1,2,3'], classifier_dims = ['256-256-128', '128-128-128', '256-128-32']


100%|██████████| 100/100 [00:18<00:00,  5.28it/s]


data_folder = LGG, best_accuracy = 0.8486842105263158, measure1 = 0.8375, measure2 = 0.8281358281358281, best_threshold_t1 = 0.1470242589712143, best_threshold_t2 = 0.5043524503707886
dataset = KIPAN, view_list = ['2', '1,2', '1,2,3'], classifier_dims = ['1024-512-256', '512-512-256', '512-256-128']


100%|██████████| 100/100 [00:18<00:00,  5.50it/s]

data_folder = KIPAN, best_accuracy = 1.0, measure1 = 1.0, measure2 = 1.0, best_threshold_t1 = 0.039534717798233, best_threshold_t2 = 0.07177203094629056





In [24]:
df_staged_percent = pd.DataFrame(columns=["dataset", "stage1", "stage2", "stage3"])
for dataset in ["ROSMAP", "BRCA", "LGG", "KIPAN"]:
    df_classified_confident_stage = pd.read_csv(f"{save_path}/{dataset}/staged_results.csv")
    n_subjects = df_classified_confident_stage.shape[0]
    stages = pd.unique(df_classified_confident_stage['stage'])
    stage_strings = []
    for stage in stages:
        stage_string = ""
        if stage.rfind("1")!=-1:
            stage_string += "mRNA|"
        if stage.rfind("2")!=-1:
            stage_string += "methy|"
        if stage.rfind("3")!=-1:
            stage_string += "miRNA|"
        stage_string += f"{df_classified_confident_stage[df_classified_confident_stage['stage']==stage].shape[0]/n_subjects *100 : 0.2f}%"
        stage_strings.append(stage_string)
    if len(stage_strings) == 3:
        df_staged_percent.loc[df_staged_percent.shape[0]] = pd.Series({"dataset": dataset, "stage1": stage_strings[0] , "stage2": stage_strings[1] , "stage3": stage_strings[2]})
    else:
        df_staged_percent.loc[df_staged_percent.shape[0]] = pd.Series({"dataset": dataset, "stage1": stage_strings[0] , "stage2": stage_strings[1] , "stage3": "mRNA|methy|miRNA| 0%"})

df_staged_percent.to_csv(f"{save_path}/staged_percentage.csv", index=False)
df_staged_percent


Unnamed: 0,dataset,stage1,stage2,stage3
0,ROSMAP,mRNA| 0.95%,mRNA|miRNA| 65.71%,mRNA|methy|miRNA| 33.33%
1,BRCA,mRNA| 0.38%,mRNA|methy| 0.38%,mRNA|methy|miRNA| 99.24%
2,LGG,mRNA| 0.66%,mRNA|miRNA| 99.34%,mRNA|methy|miRNA| 0%
3,KIPAN,methy| 0.51%,mRNA|methy| 98.48%,mRNA|methy|miRNA| 1.02%
