In [1]:
import os
# Set before the remaining imports run; the captured output of this cell shows
# TensorFlow being loaded as a side effect of one of them.
# NOTE(review): presumably the flag must be in place before that import to take effect — confirm.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
import pandas as pd
import numpy as np
import matplotlib as mpl

# Font handling and compression for the vector backends (PDF / PS / SVG).
# NOTE(review): fonttype 42 is expected to embed TrueType fonts and svg 'none'
# to keep text as text rather than paths — confirm against the matplotlib rcParams docs.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.use14corefonts'] = False
# mpl.rcParams['pdf.usecorefonts'] = True
mpl.rcParams['pdf.compression'] = 9

from IPython.display import display, HTML

import matplotlib.pyplot as plt
# `scienceplots` is never referenced by name in the visible cells.
# NOTE(review): presumably imported only to register the 'science' / 'nature'
# styles selected just below, so it has to stay ahead of plt.style.use — confirm.
import scienceplots

plt.style.use(['science', 'nature'])

from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.preprocessing import MinMaxScaler

class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes.

    The instance ``__dict__`` is bound to the mapping itself, so ``d.key``
    and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the dict so both access styles stay in sync.
        self.__dict__ = self

2023-07-18 14:40:18.168204: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-18 14:40:18.168218: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def decode_total_df(result_df: pd.DataFrame):
    """Split a search-result table into the pieces used downstream.

    Parameters
    ----------
    result_df : pd.DataFrame
        Frame with a two-level row index whose second level is named
        ``Model_Type`` and holds integer model ids, and two-level columns
        containing the groups ``Feature_Selected``, ``Model_Information``
        (with ``Classifier_Name``) and ``Best_Performance`` (with ``rocAUC``
        and ``mmc``). The frame is not modified.

    Returns
    -------
    dict
        ``feature_df``      - the ``Feature_Selected`` columns, row index reset.
        ``model_df``        - one-hot encoding of ``Model_Type`` with columns
                              ``model-0`` .. ``model-<max id>``.
        ``auc_df``          - single-column frame of ``Best_Performance/rocAUC``.
        ``mcc_df``          - single-column frame of ``Best_Performance/mmc``.
        ``model_cata_df``   - single-column frame of the ``Model_Type`` ids.
        ``model_name_list`` - classifier names of the first index-level-0 group.
    """
    # ``index.levels`` keeps entries for groups that were filtered out of the
    # frame, so prune unused levels first; otherwise ``.loc`` can raise
    # KeyError on a frame that is a row subset of a larger table.
    first_group = result_df.index.remove_unused_levels().levels[0][0]
    model_name_list = (
        result_df.loc[first_group, :]["Model_Information"]["Classifier_Name"].to_list()
    )

    # Non-inplace reset: index level 1 becomes the leading "Model_Type" column
    # of a new frame, leaving the caller's data untouched.
    feature_model_df = result_df["Feature_Selected"].reset_index(level=1)
    model_type = feature_model_df["Model_Type"]
    feature_df = feature_model_df.iloc[:, 1:].reset_index(drop=True)

    # Row i of the identity matrix is the one-hot vector for model id i.
    n_models = model_type.max() + 1
    model_df = pd.DataFrame(
        np.eye(n_models)[model_type.to_numpy()],
        columns=[f"model-{i}" for i in range(n_models)],
    )

    best_performance = result_df["Best_Performance"]
    auc_df = best_performance["rocAUC"].to_frame().reset_index(drop=True)
    mcc_df = best_performance["mmc"].to_frame().reset_index(drop=True)
    model_cata_df = model_type.to_frame().reset_index(drop=True)
    return {
        "feature_df": feature_df,
        "model_df": model_df,
        "auc_df": auc_df,
        "mcc_df": mcc_df,
        "model_cata_df": model_cata_df,
        "model_name_list": model_name_list
    }

In [3]:
# Protein type number; interpolated into every `T{prot_type}` path segment and
# into the Excel sheet name used by the cells below.
prot_type = 6
# Directory of the model search results for this protein type (trailing slash kept).
fig_output_dir = f"out/libfeatureselection/T{prot_type}/model/"
fig_output_dir

'out/libfeatureselection/T6/model/'

In [4]:
# Read the Bayes search results for the current protein type. The sheet is
# named after the type; the first two rows form the column MultiIndex and the
# first two columns form the row MultiIndex.
bayes_result_df = pd.read_excel(
    io=f"out/libfeatureselection/T{prot_type}/model/Bayes/searched_result.xlsx",
    sheet_name=f"T{prot_type}",
    header=[0, 1],
    index_col=[0, 1],
)
bayes_result_df_decode = decode_total_df(bayes_result_df)

In [5]:
# Lower bounds (exclusive) that rocAUC and MCC must exceed in the filter below.
threshold = AttrDict(rocauc=0.75, mcc=0.5)

In [6]:
# Keep every index-level-0 group in which at least five rows beat both
# thresholds on the Best_Performance metrics and on the 5-fold CV metrics.
# Each kept entry is a two-element list: [group key, group frame].
dfs_with_index = [
    [index, gdf]
    for index, gdf in bayes_result_df.groupby(level=0)
    if (
        gdf["Best_Performance"]["rocAUC"].gt(threshold.rocauc)
        & gdf["Best_Performance"]["mmc"].gt(threshold.mcc)
        & gdf["5FoldCV_Performance"]["rocAUC"].gt(threshold.rocauc)
        & gdf["5FoldCV_Performance"]["mmc"].gt(threshold.mcc)
    ).sum() >= 5
]

# Stack the surviving groups and drop rows whose full index tuple repeats.
filted_df = pd.concat([gdf for _, gdf in dfs_with_index])
filted_df = filted_df[~filted_df.index.duplicated()]

In [7]:
# Output locations for this protein type: the JSON payload written by the
# export cell and the CSV of feature schemes written by the last cell.
# Both live in the same FigS6/T{prot_type} directory.
data_out = f"out/libfeatureselection/FigS6/T{prot_type}/data.json"
csv_out = f"out/libfeatureselection/FigS6/T{prot_type}/Feature_Scheme.csv"
data_out, csv_out

('out/libfeatureselection/FigS6/T6/data.json',
 'out/libfeatureselection/FigS6/T6/Feature_Scheme.csv')

In [8]:
# Per-row group key (first element of each index tuple), relabelled as
# "Combination_<n>" where n is the 1-based position of the group in dfs_with_index.
feature_col = pd.Series(
    [row_key[0] for row_key in filted_df.index]
).replace({
    group_key: f"Combination_{position}"
    for position, (group_key, _) in enumerate(dfs_with_index, start=1)
})
feature_col

0       Combination_1
1       Combination_1
2       Combination_1
3       Combination_1
4       Combination_1
            ...      
205    Combination_14
206    Combination_14
207    Combination_14
208    Combination_14
209    Combination_14
Length: 210, dtype: object

In [9]:
import json

# Group key -> "Combination_<n>", n being the 1-based position in dfs_with_index.
combination_labels = {
    group_key: f"Combination_{position}"
    for position, (group_key, _) in enumerate(dfs_with_index, start=1)
}
feature_col = pd.Series([row_key[0] for row_key in filted_df.index]).replace(combination_labels)
model_col = filted_df["Model_Information"]['Classifier_Name'].values

# "TT" values come from Best_Performance, "CV" values from 5FoldCV_Performance.
tt_perf = filted_df['Best_Performance']
cv_perf = filted_df['5FoldCV_Performance']
tt_mcc, tt_auc = tt_perf['mmc'].values, tt_perf['rocAUC'].values
cv_mcc, cv_auc = cv_perf['mmc'].values, cv_perf['rocAUC'].values

# Make sure the target directory exists before opening the file.
os.makedirs(os.path.dirname(data_out), exist_ok=True)

with open(data_out, "w+", encoding="UTF-8") as f:
    # One flat list per column, plus the plot title and the protein type.
    plot_payload = {
        "Feature_Name": feature_col.tolist(),
        "Model_Type": model_col.tolist(),
        "TT_MCC": tt_mcc.tolist(),
        "TT_rocAUC": tt_auc.tolist(),
        "CV_MCC": cv_mcc.tolist(),
        "CV_rocAUC": cv_auc.tolist(),
        "Title": f"T{prot_type} Model Performance of Single Feature",
        "ProtType": prot_type
    }
    json.dump(plot_payload, f)

In [10]:
# Display-name mapping for feature columns, taken from the "Feature" entry of the JSON file.
feature_name_fix_path = "out/libfeatureselection/bubble_plot/Feature_name_Fix.json"
with open(feature_name_fix_path, "r", encoding="UTF-8") as f:
    Feature_name_Fix_Feature = json.load(f)['Feature']

In [11]:
# Create the output directory here as well, so this cell does not depend on
# another cell having created it first.
csv_out_dir = os.path.dirname(csv_out)
if csv_out_dir:
    os.makedirs(csv_out_dir, exist_ok=True)

feature_selected_df = filted_df["Feature_Selected"]

# One row per distinct selection pattern; the second index level is dropped so
# only the group key remains, which is then relabelled "Combination_<n>"
# (n = 1-based position of the group in dfs_with_index).
scheme_filted_df = feature_selected_df.drop_duplicates().reset_index(level=1, drop=True)
scheme_filted_df.index = scheme_filted_df.index.to_series().replace(
    { i[0]:f"Combination_{index}" for index,i in enumerate(dfs_with_index, start=1) }
)

# Display name of every feature column, resolved once instead of once per row.
feature_display_names = (
    feature_selected_df.columns.to_series().replace(Feature_name_Fix_Feature).to_numpy()
)

# For each scheme, join the display names of the columns flagged with 1.
scheme_filted_df.apply(
    lambda row: ", ".join(feature_display_names[(row == 1).to_numpy()].tolist()),
    axis=1
).to_csv(csv_out)