In [None]:
# Keep assays whose average AUROC exceeds 0.7 and build, for each one, the
# name of its probability file: "<assay_id>_<activity_type>_<unit>_qt_ref_probs.npz".
ASSAYS_TO_CV = [
    "_".join([assay_id, activity_type, unit, "qt_ref_probs.npz"])
    for assay_id, activity_type, unit in A.loc[
        A['auroc_avg'] > 0.7, ['assay_id', 'activity_type', 'unit']
    ].values.tolist()
]

In [None]:
# Collect, for every selected assay, the probability vector stored under the
# "y_prob_ref" key of its .npz file, then correlate the vectors pairwise.
assay_names = []
prob_vectors = []

for f in ASSAYS_TO_CV:
    path = os.path.join(PATH_TO_CORRELATIONS, f)
    if not os.path.exists(path):
        # Assays without a saved probability file are left out of the matrix.
        continue
    # np.load on an .npz keeps the archive open until it is closed; the
    # context manager releases the file handle once the array has been read.
    with np.load(path) as data:
        y_prob_ref = data["y_prob_ref"]

    assay_label = f.replace("_ref_probs.npz", "")
    assay_names.append(assay_label)
    prob_vectors.append(y_prob_ref)

# One row per assay (assumes all vectors have the same length — TODO confirm).
# Transposed so that each assay is a column, i.e. one variable for spearmanr.
P = np.array(prob_vectors)
corr, pvals = spearmanr(P.T)

# With exactly two variables spearmanr returns scalars instead of 2x2
# matrices; expand them so downstream code always receives square matrices.
if np.ndim(corr) == 0:
    corr = np.array([[1.0, corr], [corr, 1.0]])
    pvals = np.array([[0.0, pvals], [pvals, 0.0]])

In [None]:
import matplotlib.pyplot as plt

# Label the Spearman matrix with assay names on both axes.
corr_df = pd.DataFrame(corr, index=assay_names, columns=assay_names)
tick_positions = np.arange(len(assay_names))

# --- Plot heatmap ---
fig, ax = plt.subplots(figsize=(8, 6))
# NOTE(review): the color scale starts at 0, so negative coefficients are
# drawn with the same color as 0 — confirm this is intended.
im = ax.imshow(corr_df.values, vmin=0, vmax=1)

# Ticks on both axes; only the y axis carries assay names, the x tick labels
# are left blank.
ax.set_yticks(tick_positions)
ax.set_yticklabels(assay_names)
ax.set_xticks(tick_positions)
ax.set_xticklabels([], rotation=90)

ax.set_title("Correlation among datasets", pad=10, size=12)
cbar = fig.colorbar(im, ax=ax, fraction=0.045)
cbar.set_label("Spearman Correlation coefficient")

plt.tight_layout()
plt.show()

In [None]:
def _load_assay_dataset(assay_key):
    """Load the feature matrix and binary labels of one assay dataset.

    Parameters
    ----------
    assay_key : str
        Entry of ``ASSAYS_TO_CV``. Its first three "_"-separated fields
        (assay ID, activity type, unit) identify the "<...>_qt.csv.gz" file.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``X`` obtained by mapping ``compound_chembl_id`` through ``ecfps`` and
        ``Y`` taken from the ``bin`` column, both in file order.
    """
    assay_id, activity_type, unit = assay_key.split("_")[:3]
    filename = "_".join([assay_id, activity_type, unit, "qt.csv.gz"])
    df = pd.read_csv(os.path.join(OUTPUT, pathogen_code, 'datasets', filename))
    X = np.array(df['compound_chembl_id'].map(ecfps).to_list())
    Y = np.array(df['bin'].tolist())
    return X, Y


# Read every dataset exactly once. Each one is needed once for training and
# once per trained model for evaluation, so loading inside the nested loop
# would re-read and re-featurize every file N times.
datasets_cv = {a: _load_assay_dataset(a) for a in ASSAYS_TO_CV}

# AUROCs_CV[i][j] = AUROC of the model trained on assay i, evaluated on assay j.
AUROCs_CV = []

for a in ASSAYS_TO_CV:

    assay_id, activity_type, unit = a.split("_")[:3]
    X, Y = datasets_cv[a]

    print(f'Assay ID: {assay_id}, Activity type: {activity_type}, Unit: {unit}')
    print(X.shape, Y.shape)

    # Shuffle with a fixed seed. Fancy indexing returns new arrays, so the
    # cached (unshuffled) copies used for evaluation stay untouched.
    rng = np.random.default_rng(42)
    idx = rng.permutation(len(Y))
    X = X[idx]
    Y = Y[idx]

    # Initialize RF
    RF = init_RF()

    print("Fitting model on full data...")

    # Fit model
    RF.fit(X, Y)

    print("Predicting on the rest of datasets...")

    # The loop below also covers the training assay itself, so the diagonal
    # of AUROCs_CV is a training-set AUROC rather than a held-out estimate.
    AUROCs_MODEL = []

    for dataset in ASSAYS_TO_CV:

        X_test, Y_test = datasets_cv[dataset]

        # Score with the predicted probability of the positive class
        y_pred = RF.predict_proba(X_test)[:, 1]
        auroc = roc_auc_score(Y_test, y_pred)

        AUROCs_MODEL.append(auroc)

    AUROCs_CV.append(AUROCs_MODEL)

In [None]:
# --- Plot heatmap ---
# Each row of AUROCs_CV holds the scores of one trained model across all
# assays, so rows are training assays and columns are evaluation assays.
row_labels = [name.replace("_ref_probs.npz", "") for name in ASSAYS_TO_CV]
tick_positions = np.arange(len(ASSAYS_TO_CV))

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(AUROCs_CV, vmin=0.5, vmax=1, cmap='Spectral_r')

# Ticks on both axes; only the y axis carries assay names, the x tick labels
# are left blank.
ax.set_yticks(tick_positions)
ax.set_yticklabels(row_labels)
ax.set_xticks(tick_positions)
ax.set_xticklabels([], rotation=90)

ax.set_title("Prediction performance among datasets", pad=10, size=12)
cbar = fig.colorbar(im, ax=ax, fraction=0.045)
cbar.set_label("AUROC")

plt.tight_layout()
plt.show()