In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,sys,time
from utils import *
#from scipy.stats import pearsonr

In [2]:
##===========================================================================
## Run configuration: project name, metadata folder, and output folder.
print(" ")
print("--- Collecting labels and preds ---")

# Project identifier; it is interpolated into the input and output file names.
project = "BRCA"
#project = sys.argv[1]   # alternative when run as a script: project from the CLI

# Folder that holds the slide table and the train/valid/test split indices.
path2meta = "../10metadata/"

# Everything this notebook writes goes under "<project>_results/".
result_dir = project + "_results/"
os.makedirs(result_dir, exist_ok=True)

 
--- Collecting labels and preds ---


In [3]:
## Gene list: the first column of the target-gene table is kept as the gene names.
gene_table = np.loadtxt(f"../12target/{project}_genes.txt", dtype="str")
genes = gene_table[:, 0]
n_genes = len(genes)
print("n_genes:", n_genes)

## Genes are handled in chunks of at most `n_genes_each`; the number of chunks
## is ceil(n_genes / n_genes_each), written with integer arithmetic.
n_genes_each = 22000
n_models = (n_genes - 1) // n_genes_each + 1
print("n_models:", n_models)

## Starting gene offset of each chunk (0, n_genes_each, 2*n_genes_each, ...).
## These offsets are also the last component of the result folder names.
i_steps = np.arange(n_models) * n_genes_each
print("i_steps.shape:", i_steps.shape)
print(i_steps)

n_genes: 18450
n_models: 1
i_steps.shape: (1,)
[0]


In [4]:
##===========================================================================
## Collect the test labels and the averaged test predictions of every run.
##
## Result folders are named "results/result_{ik_fold}_{il_fold}_{i}/", where `i`
## is a gene-chunk offset from `i_steps`.
## NOTE(review): presumably `ik_fold` is the outer (test) fold and `il_fold` the
## inner (train/valid) fold of a nested cross-validation -- confirm.
ik_folds = [0,1,2,3,4]
il_folds = [0,1,2,3,4]

labels_all = [] ; preds_all = []
for ik_fold in ik_folds:
    print(" ")
    print("ik_fold:", ik_fold)

    ##=====================================
    ## labels
    ## Only the run of the first il_fold is read here.
    ## NOTE(review): this assumes test_labels.npy is identical for every il_fold
    ## of a given ik_fold -- confirm against the training script.
    labels = np.concatenate(
        [np.load(f"results/result_{ik_fold}_{il_folds[0]}_{i}/test_labels.npy")
         for i in i_steps],
        axis=1,
    )

    ##=====================================
    ## preds
    ## One slot per entry of `il_folds`. The slot is addressed by the POSITION of
    ## the fold in the list, not by the fold id, so `il_folds` may be any subset
    ## or ordering (e.g. [1, 3]) without raising IndexError or leaving zero-filled
    ## slots that would bias the mean.
    preds_stack = np.zeros((len(il_folds), labels.shape[0], labels.shape[1]))
    for il_pos, il_fold in enumerate(il_folds):
        preds = np.concatenate(
            [np.load(f"results/result_{ik_fold}_{il_fold}_{i}/test_preds.npy")
             for i in i_steps],
            axis=1,
        )
        preds_stack[il_pos,:,:] = preds

    ## Average the predictions over the il_folds.
    preds_mean = np.mean(preds_stack,axis=0)

    ##-----------------------
    ## Optional per-fold dump (disabled):
    #np.save(f"{result_dir}test_actual_ik{ik_fold}.npy", labels)
    #np.save(f"{result_dir}test_pred_ik{ik_fold}.npy", preds_mean)

    ##=====================================
    labels_all.append(labels)
    preds_all.append(preds_mean)

## Stack the per-ik_fold blocks along the first axis, in `ik_folds` order.
labels_all = np.concatenate(labels_all)
preds_all = np.concatenate(preds_all)

print(labels_all.shape, preds_all.shape)
##===========================================================================
np.save(f"{result_dir}labels_all.npy", labels_all)
np.save(f"{result_dir}preds_all.npy", preds_all)

 
ik_fold: 0
 
ik_fold: 1
 
ik_fold: 2
 
ik_fold: 3
 
ik_fold: 4
(1096, 18450) (1096, 18450)


In [5]:
## Build the metadata table of the test slides, ordered fold by fold.
df_meta = pd.read_csv(f"{path2meta}{project}_slide_selected.csv")

## patient split
## NOTE(review): allow_pickle=True executes pickled objects on load; only use it
## with split files produced by this project.
train_valid_test_idx = np.load(f"{path2meta}{project}_train_valid_test_idx.npz", allow_pickle=True)

## Read the "test_idx" member once, then collect the indices of folds 0..4.
test_idx_by_fold = train_valid_test_idx["test_idx"]
test_idx_parts = []
for ik_fold in np.arange(5):
    test_idx = test_idx_by_fold[ik_fold]
    test_idx_parts.append(test_idx)

## Single concatenation of all folds, in fold order.
test_idx_all = np.hstack(test_idx_parts)

## Select those rows by index label and renumber them 0..n-1.
df_test = df_meta.loc[test_idx_all].reset_index(drop=True)

In [6]:
## Put the slide metadata next to the per-gene values (one column per gene),
## then sort by slide name to get the same order as the slide_selected file.
df_actual = (
    pd.concat([df_test, pd.DataFrame(labels_all, columns=genes)], axis=1)
    .sort_values(by="slide_name", ignore_index=True)
)

df_pred = (
    pd.concat([df_test, pd.DataFrame(preds_all, columns=genes)], axis=1)
    .sort_values(by="slide_name", ignore_index=True)
)

In [7]:
## Persist both tables as pickles inside the result directory.
actual_pkl_path = f"{result_dir}{project}_actual.pkl"
pred_pkl_path = f"{result_dir}{project}_pred.pkl"

df_actual.to_pickle(actual_pkl_path)
df_pred.to_pickle(pred_pkl_path)

In [8]:
## Per-gene statistics between labels and predictions (helper comes from utils),
## followed by the number of genes whose coefficient exceeds each threshold.
print("coefs_all:")
coefs, slope, padj = compute_coef_slope_padj(labels_all, preds_all)

summary_thresholds = (0.4, 0.45, 0.5, 0.55, 0.6)
print(*[sum(coefs > threshold) for threshold in summary_thresholds])

## sort genes by coefs (disabled; uses `padj` as returned above)
#i0 = np.argsort(coefs)[::-1]
#np.savetxt(f"{result_dir}R_all.txt", np.array((genes, coefs, slope, padj)).T, fmt="%s %s %s %s")
#np.savetxt(f"{result_dir}R_all_sorted.txt", np.array((genes[i0], coefs[i0], slope[i0], padj[i0])).T, fmt="%s %s %s %s")

coefs_all:
4100 2531 1438 604 180


In [9]:
##----------------
## Number of genes above each coefficient threshold, for 11 evenly spaced
## thresholds from 0.4 to 0.6 inclusive. Counts are stored in a float array.
coef0 = np.linspace(0.4, 0.6, num=11)
n = np.zeros(coef0.shape[0])
for idx in range(n.size):
    threshold = coef0[idx]
    n[idx] = sum(coefs > threshold)
    print(idx, threshold, n[idx])
#np.savetxt(f"{result_dir}n_genes_threshold.txt", np.array((coef0, n)).T, fmt="%s %s")

0 0.4 4100.0
1 0.42000000000000004 3400.0
2 0.44 2790.0
3 0.46 2292.0
4 0.48 1857.0
5 0.5 1438.0
6 0.52 1073.0
7 0.54 738.0
8 0.56 494.0
9 0.58 314.0
10 0.6 180.0
