# Plot

In [1]:
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
import pandas as pd
import numpy as np
import matplotlib as mpl

# Vector-output settings, applied in a single call so they are easy to audit.
mpl.rcParams.update({
    'pdf.fonttype': 42,            # Type 42 (TrueType) fonts in PDF output
    'ps.fonttype': 42,             # same choice for PostScript output
    'svg.fonttype': 'none',        # keep SVG text as text instead of paths
    'pdf.use14corefonts': False,
    'pdf.compression': 9,
})

from IPython.display import display, HTML

import matplotlib.pyplot as plt
import scienceplots

plt.style.use(['science', 'nature'])

from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.preprocessing import MinMaxScaler

class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes.

    The instance ``__dict__`` is aliased to the mapping itself, so ``d.key``
    and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        # Accept exactly what ``dict`` accepts (mapping, pairs, keywords).
        super().__init__(*args, **kwargs)
        # Point attribute storage at the dict so both views stay in sync.
        self.__dict__ = self

2023-07-18 13:35:24.240219: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-18 13:35:24.240234: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


解析xlsx结果文件

In [2]:
def decode_total_df(result_df: pd.DataFrame):
    """Split a searched-result table into the pieces used for plotting.

    ``result_df`` must have two-level columns providing at least
    ``("Model_Information", "Classifier_Name")``, a ``"Feature_Selected"``
    group, and ``("Best_Performance", "rocAUC")`` / ``("Best_Performance", "mmc")``.
    Its row index must have two levels; once the second level is moved into
    the columns it has to be reachable as ``"Model_Type"`` (i.e. that level is
    expected to be named ``Model_Type``) and hold non-negative integer ids.

    The input frame is not modified.

    Returns:
        dict with the keys
        ``feature_df``      - the ``Feature_Selected`` columns, rows renumbered from 0;
        ``model_df``        - one-hot encoding of ``Model_Type`` (columns ``model-<i>``);
        ``auc_df``          - single-column frame holding ``rocAUC``;
        ``mcc_df``          - single-column frame holding ``mmc``;
        ``model_cata_df``   - single-column frame holding the raw ``Model_Type`` ids;
        ``model_name_list`` - classifier names read from the first outer-index group.
    """
    # Classifier names are taken from the first outer-index group only.
    first_group_key = result_df.index.levels[0][0]
    model_name_list = (
        result_df.loc[first_group_key, :]["Model_Information"]["Classifier_Name"].to_list()
    )

    # Build a fresh frame rather than resetting the index in place, so the
    # caller's data can never be touched through a shared slice.
    feature_model_df = result_df["Feature_Selected"].reset_index(level=1)
    model_type = feature_model_df["Model_Type"]

    # The first column is the index level that was just moved in; drop it.
    feature_df = feature_model_df.iloc[:, 1:].reset_index(drop=True)

    # Row i of the identity matrix is the one-hot vector for model id i.
    n_models = model_type.max() + 1
    model_df = pd.DataFrame(
        np.eye(n_models)[model_type.to_numpy()],
        columns=[f"model-{i}" for i in range(n_models)],
    ).reset_index(drop=True)

    best_performance = result_df["Best_Performance"]
    auc_df = best_performance["rocAUC"].to_frame().reset_index(drop=True)
    mcc_df = best_performance["mmc"].to_frame().reset_index(drop=True)
    model_cata_df = model_type.to_frame().reset_index(drop=True)

    return {
        "feature_df": feature_df,
        "model_df": model_df,
        "auc_df": auc_df,
        "mcc_df": mcc_df,
        "model_cata_df": model_cata_df,
        "model_name_list": model_name_list
    }

In [3]:
# Protein-type id. It is interpolated as `T{prot_type}` into the result
# folder, the worksheet name and plot descriptions used by later cells.
# NOTE(review): presumably 2 stands for T2SP, as in the Fig 3b title - confirm.
prot_type = 2
# Base directory that figures are written under. The value already ends with
# a path separator, so joining onto it with "/" produces a doubled "//".
fig_output_dir = f"out/libfeatureselection/T{prot_type}/model/"

读取xlsx

In [4]:
# Both workbooks share one layout: the worksheet is named after the protein
# type, with a two-row column header and a two-column row index.
excel_read_options = dict(
    sheet_name=f"T{prot_type}",
    header=[0, 1],
    index_col=[0, 1],
)

# Results of the Bayes search.
bayes_result_df = pd.read_excel(
    f"out/libfeatureselection/T{prot_type}/model/Bayes/searched_result.xlsx",
    **excel_read_options
)
bayes_result_df_decode = decode_total_df(result_df=bayes_result_df)

# Results of the Onehot search.
onehot_result_df = pd.read_excel(
    f"out/libfeatureselection/T{prot_type}/model/Onehot/searched_result.xlsx",
    **excel_read_options
)
onehot_result_df_decode = decode_total_df(result_df=onehot_result_df)

# Fig 3a

In [None]:
# Worker count for the projections below. A non-empty `n_jobs` environment
# variable takes precedence; otherwise two cores are left free, with a floor
# of one worker.
_n_jobs_env = os.environ.get("n_jobs", "").strip()
n_jobs = (
    int(_n_jobs_env)
    if _n_jobs_env
    # os.cpu_count() returns None when the core count cannot be determined.
    else max(1, (os.cpu_count() or 1) - 2)
)

In [None]:
def feature_2d_plot(
    data: np.ndarray,
    auc: np.ndarray,
    mcc: np.ndarray,
    desc: str,
    path_to_out_dir: str,
    n_jobs: int = n_jobs
):
    """Save side-by-side t-SNE and UMAP scatter plots of ``data`` as one PDF.

    Each row of ``data`` is embedded in 2-D twice (t-SNE, then UMAP); each
    embedding is min-max scaled per axis and drawn in its own panel. Points
    are coloured by ``auc`` (``Reds`` colormap, fixed range 0.6-1.0) and sized
    by ``mcc``.

    NOTE(review): ``mcc`` is passed unscaled as the scatter marker size, so
    markers are tiny and non-positive values may not be drawn at all -
    confirm this is intended.

    Args:
        data: 2-D array with one row per point.
        auc: one value per row of ``data``; ``(n,)`` or ``(n, 1)``.
        mcc: one value per row of ``data``; ``(n,)`` or ``(n, 1)``.
        desc: label used in both panel titles and in the output file name.
        path_to_out_dir: output directory; created if it does not exist.
        n_jobs: worker count forwarded to both TSNE and UMAP.

    Returns:
        None. Writes ``<desc>_Feature_Combinations_projection.pdf`` into
        ``path_to_out_dir``.
    """
    os.makedirs(path_to_out_dir, exist_ok=True)

    # Callers pass single-column frames via `.values`, i.e. (n, 1) arrays;
    # flatten so scatter receives one colour and one size per point.
    auc_values = np.ravel(auc)
    mcc_values = np.ravel(mcc)

    tsne = TSNE(
        n_components=2,
        verbose=0,
        n_jobs=n_jobs,
        random_state=42
    )
    umaper = umap.UMAP(
        n_neighbors=5,
        n_components=2,
        n_epochs=10000,
        min_dist=0.1,
        local_connectivity=1,
        n_jobs=n_jobs,
        random_state=42
    )

    # (title label, scaled 2-D coordinates) for each panel, left to right.
    projections = [
        ("T-SNE", MinMaxScaler().fit_transform(tsne.fit_transform(data))),
        ("UMAP", MinMaxScaler().fit_transform(umaper.fit_transform(data))),
    ]

    fig, axes = plt.subplots(
        nrows=1,
        ncols=2,
        figsize=(19.2 / 2, 10.8 / 4),
    )
    try:
        for ax, (method, coords) in zip(axes, projections):
            ax.scatter(
                coords[:, 0],
                coords[:, 1],
                c=auc_values,
                s=mcc_values,
                cmap="Reds",
                vmin=0.6,
                vmax=1.0,
                alpha=0.5,
            )
            ax.set_title(f"{desc} Feature Combinations {method} projection")
            ax.set_aspect('equal', adjustable='box')

        fig.tight_layout()
        fig.savefig(os.path.join(path_to_out_dir, f"{desc}_Feature_Combinations_projection.pdf"))
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

In [None]:
# Project the Bayes-searched feature combinations, coloured by rocAUC and
# sized by MCC.
feature_2d_plot(
    data=bayes_result_df_decode["feature_df"].values,
    auc=bayes_result_df_decode["auc_df"].values,
    mcc=bayes_result_df_decode["mcc_df"].values,
    desc=f"T{prot_type}",
    # fig_output_dir already ends with a separator; os.path.join avoids the
    # doubled "//" that plain string concatenation would produce.
    path_to_out_dir=os.path.join(fig_output_dir, "scheme_plot")
)

# Fig 3b

In [5]:
import seaborn as sns

In [6]:
# Score cut-offs used when counting models; read below as
# `threshold.rocauc` and `threshold.mcc`.
threshold = AttrDict(rocauc=0.9, mcc=0.8)

统计两组特征选择算法有效的数目

有效模型的判定:Best_Performance 与 5FoldCV_Performance 两组指标中的 rocAUC 和 MCC 均须大于阈值(见上方 `threshold`)

In [7]:
def _count_effective_models(result_df: pd.DataFrame, rocauc_min: float, mcc_min: float) -> pd.Series:
    """Count, per outer-index group, the rows whose scores all beat the cut-offs.

    A row counts only when rocAUC and MCC (stored in the ``mmc`` column) are
    both strictly greater than their cut-offs in ``Best_Performance`` *and*
    in ``5FoldCV_Performance``.
    """
    best = result_df["Best_Performance"]
    cv = result_df["5FoldCV_Performance"]
    is_effective = (
        (best["rocAUC"] > rocauc_min) & (best["mmc"] > mcc_min)
        & (cv["rocAUC"] > rocauc_min) & (cv["mmc"] > mcc_min)
    )
    # Summing the 0/1 mask within each outer-index group gives the row count.
    return is_effective.astype(int).groupby(level=0).sum()


bayes_effectness_model_count = _count_effective_models(
    bayes_result_df, threshold.rocauc, threshold.mcc
)
onehot_effectness_model_count = _count_effective_models(
    onehot_result_df, threshold.rocauc, threshold.mcc
)

fig, ax = plt.subplots()
sns.histplot(
    data=bayes_effectness_model_count, stat='probability', discrete=True,
    label='Comb', color="#dd1940aa", ax=ax,
)
sns.histplot(
    data=onehot_effectness_model_count, stat='probability', discrete=True,
    label='Single', color="#1f4aecaa", ax=ax,
)

# Derive the title from prot_type so it stays correct for other protein types.
ax.set_title(f"Distribution of effective T{prot_type}SP models")
ax.xaxis.set_minor_locator(mpl.ticker.AutoMinorLocator(1))
ax.tick_params(axis='x', length=2, top=False)
# One tick per integer count, with one spare tick past the largest count.
largest_count = int(max(bayes_effectness_model_count.max(), onehot_effectness_model_count.max()))
ax.set_xticks(np.arange(0, largest_count + 2))
ax.set_xlabel("The number of effective models")
ax.set_ylabel("Frequency")
ax.legend()
fig.tight_layout()

# File name kept unchanged so existing references to it keep working.
# os.path.join avoids a doubled "/" because fig_output_dir ends with one.
effectiveness_pdf_path = os.path.join(fig_output_dir, "effectiness.pdf")
fig.savefig(effectiveness_pdf_path)
# Close the figure instead of clearing it, so the notebook does not render
# an empty figure after saving.
plt.close(fig)
effectiveness_pdf_path

'out/libfeatureselection/T2/model//effectiness.pdf'

<Figure size 330x250 with 0 Axes>