# T1SEstacker-RTX

In [1]:
# Make the project's "src" directory importable before the project-local
# imports below (libpybiofeature, utils) are resolved.
import sys
sys.path.append("src")
import os
# Worker count consumed later in the notebook when `n_jobs` is computed from
# this environment variable.
os.environ["n_jobs"] = "2"
import json

import libpybiofeature

import utils
# NOTE(review): presumably resolves/changes to the project root relative to the
# current directory; the meaning of the `4` argument is not visible here.
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

from Bio import SeqIO

# Seed NumPy's global RNG so NumPy-based randomness is repeatable between runs.
random_seed = 42
np.random.seed(random_seed)

from sklearn.model_selection import StratifiedKFold

In [2]:
import matplotlib as mpl

# Vector-output font settings. Per Matplotlib's rcParams documentation,
# fonttype 42 embeds fonts as TrueType in PDF/PS output and svg.fonttype
# 'none' keeps text as text instead of converting it to paths.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.use14corefonts'] = False
# mpl.rcParams['pdf.usecorefonts'] = True
mpl.rcParams['pdf.compression'] = 9

import matplotlib.pyplot as plt
# NOTE(review): imported only for its side effect; presumably this registers
# the 'science' and 'nature' styles used below.
import scienceplots

plt.style.use(['science', 'nature'])


from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc

def get_evaluation(label: list, pred: list, pro_cutoff: float = None):
    """Compute binary-classification metrics for scores against 0/1 labels.

    Args:
        label: Ground-truth labels; must be encoded as 0 (negative) and
            1 (positive). Any array-like is accepted and flattened to 1-D.
        pred: Predicted scores, one per label. NaN scores are replaced by 0.0.
            Column vectors such as the ``(n, 1)`` output of a Keras
            ``predict`` call are flattened to 1-D.
        pro_cutoff: Score threshold at or above which a sample is called
            positive. When ``None`` the threshold maximising ``tpr - fpr``
            (Youden's J) on the ROC curve of this very data is used.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``f1_score``, ``mmc``
        (Matthews correlation coefficient; the key spelling is kept for
        backward compatibility), ``rocAUC``, ``specificity``, ``sensitivity``
        and ``pro_cutoff`` (the threshold actually applied).
    """
    # Flatten both inputs so every downstream call sees plain 1-D vectors.
    label = np.ravel(label)
    pred = np.ravel(np.nan_to_num(pred, copy=True, nan=0.0))

    fpr, tpr, thresholds = roc_curve(label, pred)
    if pro_cutoff is None:
        pro_cutoff = thresholds[np.argmax(tpr - fpr)]

    pred_l = (pred >= pro_cutoff).astype(int)

    # Pin the label set so the matrix is always 2x2 (tn, fp, fn, tp), even if
    # one class is absent from both the labels and the thresholded predictions.
    tn, fp, fn, tp = confusion_matrix(label, pred_l, labels=[0, 1]).ravel()

    evaluation = {
        "accuracy": accuracy_score(label, pred_l),
        "precision": precision_score(label, pred_l),
        "f1_score": f1_score(label, pred_l),
        "mmc": matthews_corrcoef(label, pred_l),
        "rocAUC": auc(fpr, tpr),
        # Ratios of raw counts; rescaling the matrix beforehand would cancel out.
        "specificity": tn / (tn + fp),
        "sensitivity": tp / (tp + fn),
        'pro_cutoff': pro_cutoff
    }
    return evaluation


def plot_roc_curve(target, pred, path_to_: str):
    """Draw the ROC curve of `pred` against `target` and save it to a file.

    Args:
        target: Ground-truth binary labels.
        pred: Predicted scores, one per label.
        path_to_: Output file path handed to Matplotlib's ``savefig``.

    The figure is always closed afterwards (also when plotting or saving
    fails) so repeated calls do not accumulate open figures.
    """
    fpr, tpr, _ = roc_curve(target, pred)
    roc_auc = auc(fpr, tpr)

    fig = plt.figure(figsize=(19.2 / 4, 10.8 / 4))
    try:
        plt.axis('square')
        plt.plot(
            fpr, tpr, color='red', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc
        )
        # Diagonal reference line of a random classifier.
        plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic (ROC) curve')
        plt.legend(loc="lower right")

        fig.savefig(f"{path_to_}", transparent=True)
    finally:
        # clf() would only empty the figure and keep it registered with pyplot.
        plt.close(fig)

In [3]:
from Bio import SeqIO
def load_AAC_feature(TxSE_args: dict):
    """Build the "AAC" feature set for all four data splits.

    Args:
        TxSE_args: Configuration dict. ``TxSE_args['seq_id']`` is the path of a
            JSON file holding the sequence ids per split, and
            ``TxSE_args['fasta'][split][cls]`` is the FASTA path for
            ``split`` in ``('t', 'v')`` and ``cls`` in ``('p', 'n')``.

    Returns:
        dict with ``"name": "AAC"`` plus the keys ``t_p``, ``t_n``, ``v_p``
        and ``v_n``, each holding the result of
        ``libpybiofeature.featurebuilder.build_acc_feature`` for that split
        (last 60 residues, ``NCF="C"``).
    """
    with open(TxSE_args['seq_id'], 'r', encoding='UTF-8') as handle:
        ids_by_split = json.load(handle)

    features = {"name": "AAC"}
    # Same order as the result keys: t_p, t_n, v_p, v_n.
    for split in ('t', 'v'):
        for cls in ('p', 'n'):
            tag = f"{split}_{cls}"
            features[tag] = libpybiofeature.featurebuilder.build_acc_feature(
                path_to_fasta=TxSE_args['fasta'][split][cls],
                seq_id_list=ids_by_split[split][cls],
                desc=tag,
                NCF="C",
                terlength=60
            )

    return features

def load_DAC_feature(TxSE_args: dict):
    """Build the "DAC" feature set for all four data splits.

    Args:
        TxSE_args: Configuration dict. ``TxSE_args['seq_id']`` is the path of a
            JSON file holding the sequence ids per split, and
            ``TxSE_args['fasta'][split][cls]`` is the FASTA path for
            ``split`` in ``('t', 'v')`` and ``cls`` in ``('p', 'n')``.

    Returns:
        dict with ``"name": "DAC"`` plus the keys ``t_p``, ``t_n``, ``v_p``
        and ``v_n``, each holding the result of
        ``libpybiofeature.featurebuilder.build_dac_feature`` for that split
        (last 60 residues, ``NCF="C"``).
    """
    with open(TxSE_args['seq_id'], 'r', encoding='UTF-8') as handle:
        ids_by_split = json.load(handle)

    features = {"name": "DAC"}
    # Same order as the result keys: t_p, t_n, v_p, v_n.
    for split in ('t', 'v'):
        for cls in ('p', 'n'):
            tag = f"{split}_{cls}"
            features[tag] = libpybiofeature.featurebuilder.build_dac_feature(
                path_to_fasta=TxSE_args['fasta'][split][cls],
                seq_id_list=ids_by_split[split][cls],
                desc=tag,
                NCF="C",
                terlength=60
            )

    return features

import tqdm
from src.libpybiofeature import AC, oneHot

def build_DigitAA_feature(
    path_to_fasta: str,
    seq_id_list: list,
    desc: str = 'undefine',
    NCF='C',
    terlength: int = 60
):
    """Encode the C-terminal residues of every sequence as integer codes.

    Each of the last `terlength` residues is mapped through
    ``oneHot.default_aa_dict``. Sequences shorter than `terlength` are padded
    on the right with 0, and the result always has exactly `terlength`
    columns, even when every sequence in the file is shorter.

    Args:
        path_to_fasta: FASTA file to read.
        seq_id_list: Sequence ids to select (in this order) from the file, or
            ``None`` to return every sequence.
        desc: Label prefix for the progress bar.
        NCF: Must be ``"C"``; only the C-terminal variant is implemented.
        terlength: Must be 60.

    Returns:
        pandas.DataFrame indexed by sequence id with columns
        ``0 .. terlength - 1``.

    Raises:
        AssertionError: If `NCF` or `terlength` has an unsupported value.
        KeyError: If a residue is missing from ``oneHot.default_aa_dict`` or
            an id in `seq_id_list` is not present in the FASTA file.
    """
    assert NCF == "C", "only the C-terminal variant (NCF='C') is supported"
    assert terlength == 60, "only terlength == 60 is supported"

    seq_list = list(SeqIO.parse(path_to_fasta, 'fasta'))

    df = pd.DataFrame(
        [
            [
                oneHot.default_aa_dict[aa]
                for aa in str(seq.seq)[-1 * terlength:]
            ]
            for seq in tqdm.tqdm(seq_list, desc=f'{desc}_DigitAA')
        ],
        index=[seq.id for seq in seq_list],
    )
    # Force a fixed width; missing positions (short sequences) become 0.
    df = df.reindex(columns=range(terlength)).fillna(0)

    if seq_id_list is not None:
        return df.loc[seq_id_list, :]
    return df

def load_DigitAA_feature(TxSE_args: dict):
    """Build the "DigitAA" (integer-coded residue) feature set for all splits.

    Args:
        TxSE_args: Configuration dict. ``TxSE_args['seq_id']`` is the path of a
            JSON file holding the sequence ids per split, and
            ``TxSE_args['fasta'][split][cls]`` is the FASTA path for
            ``split`` in ``('t', 'v')`` and ``cls`` in ``('p', 'n')``.

    Returns:
        dict with ``"name": "DigitAA"`` plus the keys ``t_p``, ``t_n``,
        ``v_p`` and ``v_n``, each holding the DataFrame produced by
        ``build_DigitAA_feature`` for that split.
    """
    with open(TxSE_args['seq_id'], 'r', encoding='UTF-8') as handle:
        ids_by_split = json.load(handle)

    features = {"name": "DigitAA"}
    # Same order as the result keys: t_p, t_n, v_p, v_n.
    for split in ('t', 'v'):
        for cls in ('p', 'n'):
            tag = f"{split}_{cls}"
            features[tag] = build_DigitAA_feature(
                path_to_fasta=TxSE_args['fasta'][split][cls],
                seq_id_list=ids_by_split[split][cls],
                desc=tag,
                NCF="C",
                terlength=60
            )

    return features

def load_BPBAac_feature(TxSE_args: dict):
    """Build the "BPBaac" feature set for all four data splits.

    The positive and negative profiles are constructed from the training
    splits (``t_p`` / ``t_n``) only, written to a JSON file, and then used to
    map the sequences of every split.

    Args:
        TxSE_args: Configuration dict. ``TxSE_args['seq_id']`` is the path of a
            JSON file holding the sequence ids per split,
            ``TxSE_args['fasta'][split][cls]`` is the FASTA path for
            ``split`` in ``('t', 'v')`` and ``cls`` in ``('p', 'n')``, and
            ``TxSE_args['fasta']['cter']`` is forwarded as ``cter``.

    Returns:
        dict with the keys ``t_p``, ``t_n``, ``v_p`` and ``v_n`` (one
        DataFrame per split, indexed by the ids from the seq-id file) plus
        ``"name": "BPBaac"``.
    """
    with open(TxSE_args['seq_id'], 'r', encoding='UTF-8') as f:
        seq_id_dict = json.load(f)

    fasta_args = TxSE_args['fasta']
    data_types = ("t_p", "t_n", "v_p", "v_n")

    # Raw sequence records per split.
    BPBaac_seq_data = {}
    for data_type in data_types:
        split, cls = data_type.split("_")
        BPBaac_seq_data[data_type] = libpybiofeature.libdataloader.fasta_seq_loader.prepare_data(
            path_to_fasta=fasta_args[split][cls],
            seq_id_list=seq_id_dict[split][cls],
        )[0].values.tolist()

    # Profiles are built from the training data only.
    BPBaac_profile = {
        cls: libpybiofeature.BPBaac_psp.mat_constructor(
            fasta_db=BPBaac_seq_data[f"t_{cls}"],
            cter=fasta_args['cter'],
            terlength=60,
            padding_ac='A'
        )
        for cls in ("p", "n")
    }

    profile_path = "out/libfeatureselection/RTX_feature_research/aac/rtx/BPBaac_profile_C60.json"
    # The target directory is not created anywhere else in this function.
    os.makedirs(os.path.dirname(profile_path), exist_ok=True)
    with open(profile_path, "w", encoding='UTF-8') as f:
        json.dump(BPBaac_profile, f)

    # Replace the raw records of each split by their mapped feature table.
    for data_type in data_types:
        split, cls = data_type.split("_")
        BPBaac_seq_data[data_type] = pd.DataFrame(
            [
                libpybiofeature.BPBaac_psp.mat_mapper(
                    seq=str(seq.seq),
                    pmat=BPBaac_profile['p'],
                    nmat=BPBaac_profile['n'],
                    cter=fasta_args['cter'],
                    terlength=60,
                    padding_ac='A'
                ) for seq in BPBaac_seq_data[data_type]
            ],
            index=seq_id_dict[split][cls]
        )
    BPBaac_seq_data['name'] = "BPBaac"

    return BPBaac_seq_data

In [6]:
# Dataset configuration for this run. In `Tx_arg['fasta']`, 't' and 'v' each
# hold a positive ('p') and a negative ('n') FASTA file; the loaders above
# read them together with the sequence-id JSON file.
prot_type, cter_bool = 1, True
Tx_arg = dict(
    type=f'T{prot_type}',
    seq_id="data/T1SE/seq_id.json",
    fasta=dict(
        cter=cter_bool,
        t=dict(
            p="data/T1SE/RTX_filted_prot.fasta",
            n="data/T1SE/n_RTX_filted_prot.fasta",
        ),
        v=dict(
            p="data/T1SE/non-RTX_filted_prot.fasta",
            n="data/T1SE/n_non-RTX_filted_prot.fasta",
        ),
    ),
)

# Output directory of this notebook; created up front.
save_dir = "out/libfeatureselection/T1/T1SEstacker-RTX/"
os.makedirs(save_dir, exist_ok=True)

In [7]:
# Load every feature family for the configured dataset.
aac_data = load_AAC_feature(TxSE_args=Tx_arg)
dac_data = load_DAC_feature(TxSE_args=Tx_arg)

# "AC" = AAC and DAC columns side by side, per split.
ac_data = {
    split_key: pd.concat([aac_data[split_key], dac_data[split_key]], axis=1)
    for split_key in ("t_p", "t_n", "v_p", "v_n")
}

bpb_data = load_BPBAac_feature(TxSE_args=Tx_arg)
digitaa_data = load_DigitAA_feature(TxSE_args=Tx_arg)

# Column labels of the AAC table.
aa_type = [*aac_data['t_p'].columns]

t_p_AAC: 100%|██████████| 74/74 [00:00<00:00, 31281.85it/s]
t_n_AAC: 100%|██████████| 74/74 [00:00<00:00, 35067.05it/s]
v_p_AAC: 100%|██████████| 25/25 [00:00<00:00, 16407.07it/s]
v_n_AAC: 100%|██████████| 25/25 [00:00<00:00, 23358.79it/s]
t_p_DAC: 100%|██████████| 74/74 [00:00<00:00, 9125.56it/s]
t_n_DAC: 100%|██████████| 74/74 [00:00<00:00, 8621.86it/s]
v_p_DAC: 100%|██████████| 25/25 [00:00<00:00, 8217.68it/s]
v_n_DAC: 100%|██████████| 25/25 [00:00<00:00, 7337.83it/s]
t_p_AAC: 100%|██████████| 74/74 [00:00<00:00, 55743.26it/s]
t_n_AAC: 100%|██████████| 74/74 [00:00<00:00, 5754.89it/s]
v_p_AAC: 100%|██████████| 25/25 [00:00<00:00, 12908.73it/s]
v_n_AAC: 100%|██████████| 25/25 [00:00<00:00, 32963.72it/s]


# 构建模型

In [8]:
import gzip
import typing
import pickle
from datetime import datetime

from sklearn.base import ClassifierMixin

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection._search import BaseSearchCV
from skopt import BayesSearchCV

import tensorflow as tf

2023-07-22 13:44:23.737370: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-22 13:44:23.752725: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-22 13:44:23.752837: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## AAC / DAC

In [9]:
from libfeatureselection import model, model_space
# Number of parallel workers: taken from the `n_jobs` environment variable
# when it is set and non-empty, otherwise all CPUs but two. The fallback is
# clamped to at least 1 because os.cpu_count() may return None and machines
# with one or two cores would otherwise yield a non-positive worker count.
n_jobs = (
    int(os.environ["n_jobs"])
    if os.environ.get("n_jobs", "") != ""
    else max(1, (os.cpu_count() or 1) - 2)
)
n_jobs

2023-07-22 13:44:29.437256: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-22 13:44:29.437475: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-22 13:44:29.437586: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (A7LAB): /proc/driver/nvidia/version does not exist


2

In [10]:
# Hyper-parameter search for the selected classifiers on the combined
# AAC + DAC features. One summary row is appended per model, and the
# spreadsheet is rewritten after every model so partial results survive an
# interrupted run.
search_result_in_a_scheme_df = pd.DataFrame()
local_xlsx_path = "out/libfeatureselection/T1/T1SEstacker-RTX/model/ac/searched_result.xlsx"
for model_index in tqdm.tqdm(range(len(model_space.find_space))):
    model_entry = model_space.find_space[model_index]
    if model_entry['name'] not in [
        'SVC',
        'GaussianNB',
        'RandomForestClassifier',
        'DecisionTreeClassifier'
    ]:
        continue
    model_information_summary, searched_result_performance_summary, searched_result_5C_performance_summary = model.MyOptimitzer(
        classifier_name=model_entry['name'],
        classifier_class=model_entry['class'],
        classifier_param_dict=model_entry['param'],
    ).find_best(
        X=pd.concat([ac_data['t_p'], ac_data['t_n']]).values,
        y=np.concatenate([np.ones((ac_data['t_p'].shape[0], )), np.zeros((ac_data['t_n'].shape[0], ))]),
        validation=(
            pd.concat([ac_data['v_p'], ac_data['v_n']]).values,
            np.concatenate([np.ones((ac_data['v_p'].shape[0], )), np.zeros((ac_data['v_n'].shape[0], ))]),
        ),
        # Bayesian search unless the entry explicitly sets 'Bayes' to a
        # value other than True.
        search_method=(
            "BayesSearchCV"
            if "Bayes" not in model_entry
            or model_entry['Bayes'] == True
            else "GridSearchCV"
        ),
        n_jobs=n_jobs
    ).get_summary(
        path_to_dir="out/libfeatureselection/T1/T1SEstacker-RTX/model/ac/"
    )

    # Record the result as one row of search_result_in_a_scheme_df.
    result_series = pd.concat([
        pd.Series(model_information_summary),
        pd.Series(searched_result_performance_summary),
        pd.Series(searched_result_5C_performance_summary),
    ], keys=[
        "Model_Information",
        "Best_Performance",
        "5FoldCV_Performance",
    ])

    result_series.name = model_index

    search_result_in_a_scheme_df = pd.concat([
        search_result_in_a_scheme_df,
        result_series.to_frame().T
    ], axis=0, ignore_index=False)

    search_result_in_a_scheme_df.index = search_result_in_a_scheme_df.index.set_names(
        ["Model_Type",]
    )

    # Cache the results gathered so far. `sheet_name` is passed by keyword
    # because pandas deprecated positional arguments after the writer.
    search_result_in_a_scheme_df.to_excel(
        local_xlsx_path,
        sheet_name="T1SEstacker-RTX",
        freeze_panes=(2, 1)
    )

100%|██████████| 15/15 [00:14<00:00,  1.01it/s]


<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

## BPBAac

In [11]:
# Hyper-parameter search for the SVC on the BPBaac features. The spreadsheet
# is rewritten after every model so partial results survive an interrupted run.
# The model is trained on `bpb_data`; training on `ac_data` here would merely
# repeat the AAC/DAC search under the "bpb" output directory.
search_result_in_a_scheme_df = pd.DataFrame()
local_xlsx_path = "out/libfeatureselection/T1/T1SEstacker-RTX/model/bpb/searched_result.xlsx"
for model_index in tqdm.tqdm(range(len(model_space.find_space))):
    model_entry = model_space.find_space[model_index]
    if model_entry['name'] not in [
        'SVC',
    ]:
        continue
    model_information_summary, searched_result_performance_summary, searched_result_5C_performance_summary = model.MyOptimitzer(
        classifier_name=model_entry['name'],
        classifier_class=model_entry['class'],
        classifier_param_dict=model_entry['param'],
    ).find_best(
        X=pd.concat([bpb_data['t_p'], bpb_data['t_n']]).values,
        y=np.concatenate([np.ones((bpb_data['t_p'].shape[0], )), np.zeros((bpb_data['t_n'].shape[0], ))]),
        validation=(
            pd.concat([bpb_data['v_p'], bpb_data['v_n']]).values,
            np.concatenate([np.ones((bpb_data['v_p'].shape[0], )), np.zeros((bpb_data['v_n'].shape[0], ))]),
        ),
        # Bayesian search unless the entry explicitly sets 'Bayes' to a
        # value other than True.
        search_method=(
            "BayesSearchCV"
            if "Bayes" not in model_entry
            or model_entry['Bayes'] == True
            else "GridSearchCV"
        ),
        n_jobs=n_jobs
    ).get_summary(
        path_to_dir="out/libfeatureselection/T1/T1SEstacker-RTX/model/bpb"
    )

    # Record the result as one row of search_result_in_a_scheme_df.
    result_series = pd.concat([
        pd.Series(model_information_summary),
        pd.Series(searched_result_performance_summary),
        pd.Series(searched_result_5C_performance_summary),
    ], keys=[
        "Model_Information",
        "Best_Performance",
        "5FoldCV_Performance",
    ])

    result_series.name = model_index

    search_result_in_a_scheme_df = pd.concat([
        search_result_in_a_scheme_df,
        result_series.to_frame().T
    ], axis=0, ignore_index=False)

    search_result_in_a_scheme_df.index = search_result_in_a_scheme_df.index.set_names(
        ["Model_Type",]
    )

    # Cache the results gathered so far. `sheet_name` is passed by keyword
    # because pandas deprecated positional arguments after the writer.
    search_result_in_a_scheme_df.to_excel(
        local_xlsx_path,
        sheet_name="T1SEstacker-RTX",
        freeze_panes=(2, 1)
    )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 15/15 [00:02<00:00,  6.23it/s]


<Figure size 480x270 with 0 Axes>

## DNN

In [12]:
def get_DNN_model(seq_length: int, sizeof_ac_dict: int):
    """Build and compile the embedding + single dense unit classifier.

    Args:
        seq_length: Number of integer codes per input sample.
        sizeof_ac_dict: Used both as the embedding vocabulary size
            (``input_dim``) and as the embedding width (``output_dim``).

    Returns:
        A compiled ``tf.keras`` model named ``'simple'`` with one sigmoid
        output, Adam (learning rate 0.01) and binary cross-entropy loss.
    """
    keras = tf.keras

    tokens = keras.layers.Input(shape=(seq_length,), name='Input_Layer')
    embedded = keras.layers.Embedding(
        input_dim=sizeof_ac_dict,
        output_dim=sizeof_ac_dict,
        name="AC_EMBEDED",
    )(tokens)
    flattened = keras.layers.Flatten()(embedded)
    probability = keras.layers.Dense(
        1, activation=keras.activations.sigmoid
    )(flattened)

    network = keras.models.Model(
        inputs=tokens, outputs=probability, name='simple'
    )

    # Instantiated inside compile() in this exact order.
    metric_classes = (
        keras.metrics.BinaryAccuracy,
        keras.metrics.AUC,
        keras.metrics.Precision,
        keras.metrics.TruePositives,
        keras.metrics.TrueNegatives,
        keras.metrics.FalsePositives,
        keras.metrics.FalseNegatives,
    )
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.01),
        loss=keras.losses.binary_crossentropy,
        metrics=[metric_cls() for metric_cls in metric_classes],
    )
    return network


class DNN_Trainer:
    """Train and evaluate the network built by ``get_DNN_model``.

    Usage: ``DNN_Trainer().find_best(X, y, validation).get_summary(path)``.

    ``find_best`` trains one model on the training data and evaluates it on
    the validation data, then runs a stratified 5-fold cross-validation on the
    union of both. ``get_summary`` stores the predictions, plots the ROC curve
    and returns the metrics.
    """

    def __init__(self, ) -> None:
        self.classifier_name = "DNN"
        self.classifier_class = get_DNN_model
        self.classifier_param_dict = {
            "seq_length": 60,
            "sizeof_ac_dict": 20
        }

        self.model = None
        # Each "pair" is [predictions, labels]; the 5C attributes hold one
        # pair per cross-validation fold.
        self.train_best_predicted_pair = None
        self.train_best_5C_predicted_pair = None
        self.best_predicted_pair = None
        self.best_5C_predicted_pair = None
        self.start_to_train_time = datetime.now()
        self.end_of_train_time = None

    def _fit_new_model(self, X: np.ndarray, y: np.ndarray):
        """Build a fresh compiled network and train it for 10 epochs."""
        network = self.classifier_class(**self.classifier_param_dict)
        # `use_multiprocessing` is deliberately not passed: it only applies to
        # generator / Sequence inputs and newer Keras versions reject it.
        network.fit(X, y, epochs=10, verbose=2)
        return network

    @staticmethod
    def _predict(network, X: np.ndarray) -> np.ndarray:
        """Return the network's predictions with NaN replaced by 0.0."""
        return np.nan_to_num(network.predict(X), nan=0.0)

    def find_best(
        self,
        X: np.ndarray,
        y: np.ndarray,
        validation: tuple,
    ):
        """Train on (X, y), score `validation`, then run 5-fold CV.

        Args:
            X: Training features (array-like, e.g. ndarray or DataFrame).
            y: Training labels.
            validation: ``(features, labels)`` of the validation set.

        Returns:
            self, so that ``get_summary`` can be chained.
        """
        # Accept DataFrames as well; everything below works on plain arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        val_X = np.asarray(validation[0])
        val_y = np.asarray(validation[1])

        self.model = self._fit_new_model(X, y)
        self.best_predicted_pair = [self._predict(self.model, val_X), val_y]
        self.train_best_predicted_pair = [self._predict(self.model, X), y]

        # 5-fold cross-validation on the merged training + validation data.
        full_X = np.concatenate([X, val_X])
        full_y = np.concatenate([y, val_y])

        self.best_5C_predicted_pair = []
        self.train_best_5C_predicted_pair = []
        splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for train_id, test_id in splitter.split(full_X, full_y):
            # A new, untrained model for every fold.
            fiveC_model = self._fit_new_model(full_X[train_id], full_y[train_id])

            # Predict and record the held-out and the training part.
            self.best_5C_predicted_pair.append([
                self._predict(fiveC_model, full_X[test_id]),
                full_y[test_id]
            ])
            self.train_best_5C_predicted_pair.append([
                self._predict(fiveC_model, full_X[train_id]),
                full_y[train_id]
            ])

        return self

    def get_summary(self, path_to_dir: str = None):
        """Persist the predictions and return the performance summary.

        Args:
            path_to_dir: Directory for the outputs (created if missing). The
                score pickles and the ROC plot are written there; when the
                environment variable ``SAVE_MODEL`` is ``"1"`` the model's
                constructor arguments and weights are stored as well. With
                ``None`` nothing is written and the paths are reported
                as ``"-"``.

        Returns:
            Tuple ``(model_information, training_testing_performance,
            FiveFold_result)``: run metadata, the metrics on the validation
            set, and the metrics averaged over the five folds
            (``pro_cutoff`` is kept as the list of per-fold values).

        Raises:
            RuntimeError: If ``find_best`` has not been run yet.
        """
        if self.best_predicted_pair is None or self.best_5C_predicted_pair is None:
            raise RuntimeError("find_best() must be called before get_summary()")

        model_path = "-"
        model_score_path = "-"
        if path_to_dir is not None:
            os.makedirs(path_to_dir, exist_ok=True)

            if os.environ.get("SAVE_MODEL") == "1":
                model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
                # Store plain data only: the model can be rebuilt with
                # classifier_class(**classifier_param_dict).set_weights(weights).
                with gzip.open(model_path, "wb") as f:
                    pickle.dump(
                        {
                            "classifier_param_dict": self.classifier_param_dict,
                            "weights": self.model.get_weights(),
                        }, f
                    )

            model_score_path = f"{path_to_dir}/{self.classifier_name}_score.pkl"
            with gzip.open(model_score_path, "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.best_predicted_pair,
                        "best_5C_predicted_pair": self.best_5C_predicted_pair,
                    }, f
                )
            with gzip.open(model_score_path + ".train", "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.train_best_predicted_pair,
                        "best_5C_predicted_pair": self.train_best_5C_predicted_pair,
                    }, f
                )

            plot_roc_curve(
                target=self.best_predicted_pair[1],
                pred=self.best_predicted_pair[0],
                path_to_=f"{path_to_dir}/{self.classifier_name}.pdf"
            )

        model_information = {
            "Classifier_Name": self.classifier_name,
            "Optimitied_Param": dict(),
            "Score": model_score_path,
            "Model_Path": model_path,
            "TimeToStartFit": self.start_to_train_time.strftime("%Y-%m-%d %H:%M:%S")
        }

        training_testing_performance = get_evaluation(
            label=self.best_predicted_pair[1],
            pred=self.best_predicted_pair[0],
        )

        # Evaluate every fold once, then average each metric over the folds.
        fold_performances = [
            get_evaluation(label=fold_label, pred=fold_pred)
            for fold_pred, fold_label in self.best_5C_predicted_pair
        ]
        FiveFold_result = {}
        for key in training_testing_performance:
            value_list = [performance[key] for performance in fold_performances]
            if key == "pro_cutoff":
                # Cut-offs are fold specific; keep all of them.
                FiveFold_result[key] = value_list
            else:
                FiveFold_result[key] = sum(value_list) / len(value_list)

        self.end_of_train_time = datetime.now()
        model_information["TimeOfSummary"] = self.end_of_train_time.strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        model_information["TimeSpend"] = str(
            self.end_of_train_time - self.start_to_train_time
        )

        return model_information, training_testing_performance, FiveFold_result

In [13]:
# Train and evaluate the DNN on the DigitAA features: positives are labelled 1
# and negatives 0, the 't_*' tables form the training set and the 'v_*' tables
# the validation set. The feature tables are passed as DataFrames. The value
# of this expression is the (model_information, validation metrics,
# 5-fold metrics) tuple returned by get_summary.
DNN_Trainer().find_best(
    X=pd.concat([digitaa_data['t_p'], digitaa_data['t_n']]),
    y=np.concatenate([np.ones((digitaa_data['t_p'].shape[0], )), np.zeros((digitaa_data['t_n'].shape[0], ))]),
    validation=(
        pd.concat([digitaa_data['v_p'], digitaa_data['v_n']]),
        np.concatenate([np.ones((digitaa_data['v_p'].shape[0], )), np.zeros((digitaa_data['v_n'].shape[0], ))]),
    )
).get_summary(
    path_to_dir="out/libfeatureselection/T1/T1SEstacker-RTX/"
)

2023-07-22 13:46:07.601818: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
5/5 - 6s - loss: 0.6800 - binary_accuracy: 0.6351 - auc: 0.6902 - precision: 0.6786 - true_positives: 38.0000 - true_negatives: 56.0000 - false_positives: 18.0000 - false_negatives: 36.0000 - 6s/epoch - 1s/step
Epoch 2/10
5/5 - 0s - loss: 0.5499 - binary_accuracy: 0.9392 - auc: 0.9947 - precision: 0.9012 - true_positives: 73.0000 - true_negatives: 66.0000 - false_positives: 8.0000 - false_negatives: 1.0000 - 58ms/epoch - 12ms/step
Epoch 3/10
5/5 - 0s - loss: 0.3737 - binary_accuracy: 1.0000 - auc: 1.0000 - precision: 1.0000 - true_positives: 74.0000 - true_negatives: 74.0000 - false_positives: 0.0000e+00 - false_negatives: 0.0000e+00 - 82ms/epoch - 16ms/step
Epoch 4/10
5/5 - 0s - loss: 0.2021 - binary_accuracy: 1.0000 - auc: 1.0000 - precision: 1.0000 - true_positives: 74.0000 - true_negatives: 74.0000 - false_positives: 0.0000e+00 - false_negatives: 0.0000e+00 - 57ms/epoch - 11ms/step
Epoch 5/10
5/5 - 0s - loss: 0.1004 - binary_accuracy: 1.0000 - auc: 1.0000 - precision: 1.

({'Classifier_Name': 'DNN',
  'Optimitied_Param': {},
  'Score': 'out/libfeatureselection/T1/T1SEstacker-RTX//DNN_score.pkl',
  'Model_Path': '-',
  'TimeToStartFit': '2023-07-22 13:46:07',
  'TimeOfSummary': '2023-07-22 13:47:03',
  'TimeSpend': '0:00:55.723758'},
 {'accuracy': 0.76,
  'precision': 0.8421052631578947,
  'f1_score': 0.7272727272727272,
  'mmc': 0.5356556682297139,
  'rocAUC': 0.8,
  'specificity': 0.88,
  'sensitivity': 0.64,
  'pro_cutoff': 0.23253872},
 {'accuracy': 0.8987179487179487,
  'precision': 0.9251082251082252,
  'f1_score': 0.893901674919005,
  'mmc': 0.8060895619300334,
  'rocAUC': 0.9421315789473684,
  'specificity': 0.9184210526315789,
  'sensitivity': 0.8768421052631579,
  'pro_cutoff': [0.69159526, 0.07496858, 0.5368884, 0.6837912, 0.8814933]})

<Figure size 480x270 with 0 Axes>

## RNN

In [14]:
def get_RNN_model(seq_length: int, sizeof_ac_dict: int):
    """Build and compile the embedding + LSTM + single dense unit classifier.

    Args:
        seq_length: Number of integer codes per input sample.
        sizeof_ac_dict: Used both as the embedding vocabulary size
            (``input_dim``) and as the embedding width (``output_dim``).

    Returns:
        A compiled ``tf.keras`` model named ``'simple_WithRNN'`` with one
        sigmoid output, Adam (learning rate 0.01) and binary cross-entropy
        loss.
    """
    keras = tf.keras

    tokens = keras.layers.Input(shape=(seq_length,), name='Input_Layer')
    embedded = keras.layers.Embedding(
        input_dim=sizeof_ac_dict,
        output_dim=sizeof_ac_dict,
        name="AC_EMBEDED",
    )(tokens)

    # Recurrent layer with 10 units, followed by the original Flatten layer.
    recurrent = keras.layers.LSTM(10)(embedded)
    flattened = keras.layers.Flatten()(recurrent)

    probability = keras.layers.Dense(
        1, activation=keras.activations.sigmoid
    )(flattened)

    network = keras.models.Model(
        inputs=tokens, outputs=probability, name='simple_WithRNN'
    )

    # Instantiated inside compile() in this exact order.
    metric_classes = (
        keras.metrics.BinaryAccuracy,
        keras.metrics.AUC,
        keras.metrics.Precision,
        keras.metrics.TruePositives,
        keras.metrics.TrueNegatives,
        keras.metrics.FalsePositives,
        keras.metrics.FalseNegatives,
    )
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.01),
        loss=keras.losses.binary_crossentropy,
        metrics=[metric_cls() for metric_cls in metric_classes],
    )
    return network


class RNN_Trainer:
    """Train and evaluate the network built by ``get_RNN_model``.

    Usage: ``RNN_Trainer().find_best(X, y, validation).get_summary(path)``.

    ``find_best`` trains one model on the training data and evaluates it on
    the validation data, then runs a stratified 5-fold cross-validation on the
    union of both. ``get_summary`` stores the predictions, plots the ROC curve
    and returns the metrics.
    """

    def __init__(self, ) -> None:
        self.classifier_name = "RNN"
        self.classifier_class = get_RNN_model
        self.classifier_param_dict = {
            "seq_length": 60,
            "sizeof_ac_dict": 20
        }

        self.model = None
        # Each "pair" is [predictions, labels]; the 5C attributes hold one
        # pair per cross-validation fold.
        self.train_best_predicted_pair = None
        self.train_best_5C_predicted_pair = None
        self.best_predicted_pair = None
        self.best_5C_predicted_pair = None
        self.start_to_train_time = datetime.now()
        self.end_of_train_time = None

    def _fit_new_model(self, X: np.ndarray, y: np.ndarray):
        """Build a fresh compiled network and train it for 10 epochs."""
        network = self.classifier_class(**self.classifier_param_dict)
        # `use_multiprocessing` is deliberately not passed: it only applies to
        # generator / Sequence inputs and newer Keras versions reject it.
        network.fit(X, y, epochs=10, verbose=2)
        return network

    @staticmethod
    def _predict(network, X: np.ndarray) -> np.ndarray:
        """Return the network's predictions with NaN replaced by 0.0."""
        return np.nan_to_num(network.predict(X), nan=0.0)

    def find_best(
        self,
        X: np.ndarray,
        y: np.ndarray,
        validation: tuple,
    ):
        """Train on (X, y), score `validation`, then run 5-fold CV.

        Args:
            X: Training features (array-like, e.g. ndarray or DataFrame).
            y: Training labels.
            validation: ``(features, labels)`` of the validation set.

        Returns:
            self, so that ``get_summary`` can be chained.
        """
        # Accept DataFrames as well; everything below works on plain arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        val_X = np.asarray(validation[0])
        val_y = np.asarray(validation[1])

        self.model = self._fit_new_model(X, y)
        self.best_predicted_pair = [self._predict(self.model, val_X), val_y]
        self.train_best_predicted_pair = [self._predict(self.model, X), y]

        # 5-fold cross-validation on the merged training + validation data.
        full_X = np.concatenate([X, val_X])
        full_y = np.concatenate([y, val_y])

        self.best_5C_predicted_pair = []
        self.train_best_5C_predicted_pair = []
        splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for train_id, test_id in splitter.split(full_X, full_y):
            # A new, untrained model for every fold.
            fiveC_model = self._fit_new_model(full_X[train_id], full_y[train_id])

            # Predict and record the held-out and the training part.
            self.best_5C_predicted_pair.append([
                self._predict(fiveC_model, full_X[test_id]),
                full_y[test_id]
            ])
            self.train_best_5C_predicted_pair.append([
                self._predict(fiveC_model, full_X[train_id]),
                full_y[train_id]
            ])

        return self

    def get_summary(self, path_to_dir: str = None):
        """Persist the predictions and return the performance summary.

        Args:
            path_to_dir: Directory for the outputs (created if missing). The
                score pickles and the ROC plot are written there; when the
                environment variable ``SAVE_MODEL`` is ``"1"`` the model's
                constructor arguments and weights are stored as well. With
                ``None`` nothing is written and the paths are reported
                as ``"-"``.

        Returns:
            Tuple ``(model_information, training_testing_performance,
            FiveFold_result)``: run metadata, the metrics on the validation
            set, and the metrics averaged over the five folds
            (``pro_cutoff`` is kept as the list of per-fold values).

        Raises:
            RuntimeError: If ``find_best`` has not been run yet.
        """
        if self.best_predicted_pair is None or self.best_5C_predicted_pair is None:
            raise RuntimeError("find_best() must be called before get_summary()")

        model_path = "-"
        model_score_path = "-"
        if path_to_dir is not None:
            os.makedirs(path_to_dir, exist_ok=True)

            if os.environ.get("SAVE_MODEL") == "1":
                model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
                # Store plain data only: the model can be rebuilt with
                # classifier_class(**classifier_param_dict).set_weights(weights).
                with gzip.open(model_path, "wb") as f:
                    pickle.dump(
                        {
                            "classifier_param_dict": self.classifier_param_dict,
                            "weights": self.model.get_weights(),
                        }, f
                    )

            model_score_path = f"{path_to_dir}/{self.classifier_name}_score.pkl"
            with gzip.open(model_score_path, "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.best_predicted_pair,
                        "best_5C_predicted_pair": self.best_5C_predicted_pair,
                    }, f
                )
            with gzip.open(model_score_path + ".train", "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.train_best_predicted_pair,
                        "best_5C_predicted_pair": self.train_best_5C_predicted_pair,
                    }, f
                )

            plot_roc_curve(
                target=self.best_predicted_pair[1],
                pred=self.best_predicted_pair[0],
                path_to_=f"{path_to_dir}/{self.classifier_name}.pdf"
            )

        model_information = {
            "Classifier_Name": self.classifier_name,
            "Optimitied_Param": dict(),
            "Score": model_score_path,
            "Model_Path": model_path,
            "TimeToStartFit": self.start_to_train_time.strftime("%Y-%m-%d %H:%M:%S")
        }

        training_testing_performance = get_evaluation(
            label=self.best_predicted_pair[1],
            pred=self.best_predicted_pair[0],
        )

        # Evaluate every fold once, then average each metric over the folds.
        fold_performances = [
            get_evaluation(label=fold_label, pred=fold_pred)
            for fold_pred, fold_label in self.best_5C_predicted_pair
        ]
        FiveFold_result = {}
        for key in training_testing_performance:
            value_list = [performance[key] for performance in fold_performances]
            if key == "pro_cutoff":
                # Cut-offs are fold specific; keep all of them.
                FiveFold_result[key] = value_list
            else:
                FiveFold_result[key] = sum(value_list) / len(value_list)

        self.end_of_train_time = datetime.now()
        model_information["TimeOfSummary"] = self.end_of_train_time.strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        model_information["TimeSpend"] = str(
            self.end_of_train_time - self.start_to_train_time
        )

        return model_information, training_testing_performance, FiveFold_result

In [15]:
# Run the RNN trainer end to end and display the summary tuple as the cell
# output. Labels are built to match the concatenation order: ones for the
# positive frame ('t_p' / 'v_p') followed by zeros for the negative frame
# ('t_n' / 'v_n'). The 'v_*' frames are passed as the validation split.
RNN_Trainer().find_best(
    X=pd.concat([digitaa_data['t_p'], digitaa_data['t_n']]),
    y=np.concatenate([np.ones((digitaa_data['t_p'].shape[0], )), np.zeros((digitaa_data['t_n'].shape[0], ))]),
    validation=(
        pd.concat([digitaa_data['v_p'], digitaa_data['v_n']]),
        np.concatenate([np.ones((digitaa_data['v_p'].shape[0], )), np.zeros((digitaa_data['v_n'].shape[0], ))]),
    )
).get_summary(
    path_to_dir="out/libfeatureselection/T1/T1SEstacker-RTX/"
)

Epoch 1/10
5/5 - 12s - loss: 0.6872 - binary_accuracy: 0.5405 - auc_6: 0.5957 - precision_6: 0.5214 - true_positives_6: 73.0000 - true_negatives_6: 7.0000 - false_positives_6: 67.0000 - false_negatives_6: 1.0000 - 12s/epoch - 2s/step
Epoch 2/10
5/5 - 0s - loss: 0.6143 - binary_accuracy: 0.7838 - auc_6: 0.9410 - precision_6: 0.7059 - true_positives_6: 72.0000 - true_negatives_6: 44.0000 - false_positives_6: 30.0000 - false_negatives_6: 2.0000 - 337ms/epoch - 67ms/step
Epoch 3/10
5/5 - 0s - loss: 0.4014 - binary_accuracy: 0.9054 - auc_6: 0.9505 - precision_6: 0.8846 - true_positives_6: 69.0000 - true_negatives_6: 65.0000 - false_positives_6: 9.0000 - false_negatives_6: 5.0000 - 351ms/epoch - 70ms/step
Epoch 4/10
5/5 - 0s - loss: 0.3737 - binary_accuracy: 0.8378 - auc_6: 0.9317 - precision_6: 0.8378 - true_positives_6: 62.0000 - true_negatives_6: 62.0000 - false_positives_6: 12.0000 - false_negatives_6: 12.0000 - 283ms/epoch - 57ms/step
Epoch 5/10
5/5 - 0s - loss: 0.2986 - binary_accuracy

({'Classifier_Name': 'RNN',
  'Optimitied_Param': {},
  'Score': 'out/libfeatureselection/T1/T1SEstacker-RTX//RNN_score.pkl',
  'Model_Path': '-',
  'TimeToStartFit': '2023-07-22 13:47:21',
  'TimeOfSummary': '2023-07-22 13:49:58',
  'TimeSpend': '0:02:37.164119'},
 {'accuracy': 0.62,
  'precision': 0.6666666666666666,
  'f1_score': 0.5581395348837209,
  'mmc': 0.25,
  'rocAUC': 0.6144,
  'specificity': 0.76,
  'sensitivity': 0.48,
  'pro_cutoff': 0.10723479},
 {'accuracy': 0.857948717948718,
  'precision': 0.9579260651629072,
  'f1_score': 0.8333145716072545,
  'mmc': 0.7418831510145412,
  'rocAUC': 0.8758947368421055,
  'specificity': 0.96,
  'sensitivity': 0.7557894736842105,
  'pro_cutoff': [0.4490599, 0.7960867, 0.9003739, 0.9451657, 0.8054305]})

<Figure size 480x270 with 0 Axes>

## SA

In [16]:
def get_SA_model(seq_length: int, sizeof_ac_dict: int):
    """Build and compile the small "self-attention" binary classifier.

    The network embeds a sequence of ``seq_length`` integer tokens, flattens
    the embedding and derives three 20-unit sigmoid projections (Q, K, V)
    from it. The attention weights are ``softmax(Q * K)`` with an
    element-wise product, and they gate V element-wise before a single
    sigmoid output unit.

    Args:
        seq_length: Number of tokens per input sequence.
        sizeof_ac_dict: Used as both the vocabulary size and the output
            width of the embedding layer.

    Returns:
        A compiled ``tf.keras`` model (Adam with learning rate 0.01, binary
        cross-entropy loss, accuracy/AUC/precision and confusion-count
        metrics).
    """
    layers = tf.keras.layers
    sigmoid = tf.keras.activations.sigmoid

    def sigmoid_projection(features):
        # One 20-unit dense projection of the flattened embedding.
        return layers.Dense(20, activation=sigmoid)(features)

    token_input = layers.Input(shape=(seq_length,), name='Input_Layer')
    embedded = layers.Embedding(
        input_dim=sizeof_ac_dict,
        output_dim=sizeof_ac_dict,
        name="AC_EMBEDED",
    )(token_input)
    flat_features = layers.Flatten()(embedded)

    # Creation order (Q, K, V) is kept so Keras assigns the same layer names.
    query = sigmoid_projection(flat_features)
    key = sigmoid_projection(flat_features)
    value = sigmoid_projection(flat_features)

    attention_weights = tf.keras.activations.softmax(
        layers.Multiply()([query, key])
    )
    attended = layers.Multiply()([value, attention_weights])

    probability = layers.Dense(1, activation=sigmoid)(attended)

    model = tf.keras.models.Model(
        inputs=token_input, outputs=probability, name='SelfAttantion')

    metric_classes = (
        tf.keras.metrics.BinaryAccuracy,
        tf.keras.metrics.AUC,
        tf.keras.metrics.Precision,
        tf.keras.metrics.TruePositives,
        tf.keras.metrics.TrueNegatives,
        tf.keras.metrics.FalsePositives,
        tf.keras.metrics.FalseNegatives,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[metric_class() for metric_class in metric_classes],
    )
    return model


class SA_Trainer:
    """Train, cross-validate and summarise the self-attention ("SA") model.

    Typical use is ``SA_Trainer().find_best(X, y, validation).get_summary(dir)``.
    ``find_best`` fills the ``*_predicted_pair`` attributes; ``get_summary``
    writes them to disk and computes the evaluation metrics.
    """

    def __init__(self, ) -> None:
        self.classifier_name = "SA"
        # Factory returning a freshly compiled Keras model.
        self.classifier_class = get_SA_model
        self.classifier_param_dict = {
            "seq_length": 60,
            "sizeof_ac_dict": 20
        }

        # Model fitted on the full training split (set by find_best).
        self.model = None
        # Each "pair" is [predictions, labels]; the "5C" variants are lists
        # with one pair per cross-validation fold.
        self.train_best_predicted_pair = None
        self.train_best_5C_predicted_pair = None
        self.best_predicted_pair = None
        self.best_5C_predicted_pair = None
        self.start_to_train_time = datetime.now()
        self.end_of_train_time = None

    def _fit_new_model(self, X, y):
        """Build a fresh model from the stored parameters and fit it on (X, y)."""
        model = self.classifier_class(
            **self.classifier_param_dict
        )
        model.fit(
            X,
            y,
            epochs=10,
            use_multiprocessing=True,
            steps_per_epoch=None,
            verbose=2
        )
        return model

    @staticmethod
    def _predicted_pair(model, X, y):
        """Return ``[predictions, labels]`` with NaN predictions replaced by 0."""
        return [
            np.nan_to_num(model.predict(X), nan=0.0),
            y
        ]

    def find_best(
        self,
        X: np.ndarray,
        y: np.ndarray,
        validation: tuple,
    ):
        """Fit on the training split, score the validation split, then run 5-fold CV.

        Args:
            X: Training features.
            y: Training labels.
            validation: ``(features, labels)`` of the held-out split.

        Returns:
            ``self``, so that ``get_summary`` can be chained.
        """
        self.model = self._fit_new_model(X, y)
        self.best_predicted_pair = self._predicted_pair(
            self.model, validation[0], validation[1]
        )
        self.train_best_predicted_pair = self._predicted_pair(self.model, X, y)

        # 5-fold cross-validation over training and validation data combined.
        full_X = np.concatenate([
            X, validation[0]
        ])
        full_y = np.concatenate([
            y, validation[1]
        ])

        self.best_5C_predicted_pair = []
        self.train_best_5C_predicted_pair = []
        splitter = StratifiedKFold(
            n_splits=5,
            shuffle=True,
            random_state=42
        )
        for train_id, test_id in splitter.split(full_X, full_y):
            # A new model per fold so no weights leak between folds.
            fiveC_model = self._fit_new_model(
                full_X[train_id], full_y[train_id]
            )

            # Predict the held-out fold first, then the fold's training part.
            self.best_5C_predicted_pair.append(
                self._predicted_pair(
                    fiveC_model, full_X[test_id], full_y[test_id]
                )
            )
            self.train_best_5C_predicted_pair.append(
                self._predicted_pair(
                    fiveC_model, full_X[train_id], full_y[train_id]
                )
            )

        return self

    def get_summary(self, path_to_dir: str = None):
        """Persist the collected predictions and summarise model performance.

        Args:
            path_to_dir: Directory that receives the score pickles, the ROC
                plot and (when the ``SAVE_MODEL`` environment variable is
                ``"1"``) the pickled model. When ``None`` nothing is written
                to disk and both reported paths stay ``"-"``.

        Returns:
            A tuple ``(model_information, training_testing_performance,
            FiveFold_result)``:

            * ``model_information`` - classifier name, output paths and timing.
            * ``training_testing_performance`` - ``get_evaluation`` of the
              validation predictions in ``self.best_predicted_pair``.
            * ``FiveFold_result`` - per-metric mean over the CV folds;
              ``pro_cutoff`` is kept as the list of per-fold cut-offs.

        Side effects:
            Sets ``self.end_of_train_time`` to the current time.
        """
        has_out_dir = path_to_dir is not None
        if has_out_dir:
            # The original called makedirs unconditionally, which raised a
            # TypeError for the documented default ``path_to_dir=None``.
            os.makedirs(path_to_dir, exist_ok=True)

        model_path = "-"
        if has_out_dir and os.environ.get("SAVE_MODEL") == "1":
            model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
            with gzip.open(model_path, "wb") as f:
                # The original dumped ``self.grid_search``, which this class
                # never defines; the fitted estimator lives in ``self.model``.
                pickle.dump(self.model, f)

        model_score_path = "-"
        if has_out_dir:
            model_score_path = f"{path_to_dir}/{self.classifier_name}_score.pkl"
            # Held-out predictions: validation split and the five CV folds.
            with gzip.open(model_score_path, "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.best_predicted_pair,
                        "best_5C_predicted_pair": self.best_5C_predicted_pair,
                    }, f
                )
            # Predictions on the data each model was fitted on; same keys,
            # stored next to the score file with a ".train" suffix.
            with gzip.open(model_score_path + ".train", "wb") as f:
                pickle.dump(
                    {
                        "best_predicted_pair": self.train_best_predicted_pair,
                        "best_5C_predicted_pair": self.train_best_5C_predicted_pair,
                    }, f
                )

            # The plot is only produced when there is a directory to put it in.
            plot_roc_curve(
                target=self.best_predicted_pair[1],
                pred=self.best_predicted_pair[0],
                path_to_=f"{path_to_dir}/{self.classifier_name}.pdf"
            )

        model_information = {
            "Classifier_Name": self.classifier_name,
            "Optimitied_Param": dict(),
            "Score": model_score_path,
            "Model_Path": model_path,
            "TimeToStartFit": self.start_to_train_time.strftime("%Y-%m-%d %H:%M:%S")
        }

        training_testing_performance = get_evaluation(
            label=self.best_predicted_pair[1],
            pred=self.best_predicted_pair[0],
        )

        # Average performance over the 5-fold CV. Each fold is evaluated
        # exactly once (the original re-evaluated every fold for every key).
        fold_performances = [
            get_evaluation(label=fold_pair[1], pred=fold_pair[0])
            for fold_pair in self.best_5C_predicted_pair
        ]
        FiveFold_result = {}
        for key in training_testing_performance:
            value_list = [performance[key] for performance in fold_performances]
            if key == "pro_cutoff":
                # Cut-offs are fold specific; report them all.
                FiveFold_result[key] = value_list
            else:
                FiveFold_result[key] = sum(value_list) / len(value_list)

        self.end_of_train_time = datetime.now()
        model_information["TimeOfSummary"] = self.end_of_train_time.strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        model_information["TimeSpend"] = str(
            self.end_of_train_time - self.start_to_train_time
        )

        return model_information, training_testing_performance, FiveFold_result

In [17]:
# Run the SA trainer end to end: fit on the training split, score the
# validation split, run the 5-fold CV, then write the score pickles and
# display the summary tuple as the cell output. Labels are built to match the
# concatenation order: ones for the positive frame ('t_p' / 'v_p') followed by
# zeros for the negative frame ('t_n' / 'v_n').
SA_Trainer().find_best(
    X=pd.concat([digitaa_data['t_p'], digitaa_data['t_n']]),
    y=np.concatenate([np.ones((digitaa_data['t_p'].shape[0], )), np.zeros((digitaa_data['t_n'].shape[0], ))]),
    validation=(
        pd.concat([digitaa_data['v_p'], digitaa_data['v_n']]),
        np.concatenate([np.ones((digitaa_data['v_p'].shape[0], )), np.zeros((digitaa_data['v_n'].shape[0], ))]),
    )
).get_summary(
    path_to_dir="out/libfeatureselection/T1/T1SEstacker-RTX/"
)

Epoch 1/10
5/5 - 8s - loss: 0.6929 - binary_accuracy: 0.5000 - auc_12: 0.5330 - precision_12: 0.5000 - true_positives_12: 74.0000 - true_negatives_12: 0.0000e+00 - false_positives_12: 74.0000 - false_negatives_12: 0.0000e+00 - 8s/epoch - 2s/step
Epoch 2/10
5/5 - 0s - loss: 0.6804 - binary_accuracy: 0.6689 - auc_12: 0.9825 - precision_12: 0.6016 - true_positives_12: 74.0000 - true_negatives_12: 25.0000 - false_positives_12: 49.0000 - false_negatives_12: 0.0000e+00 - 65ms/epoch - 13ms/step
Epoch 3/10
5/5 - 0s - loss: 0.6524 - binary_accuracy: 0.9797 - auc_12: 0.9990 - precision_12: 0.9610 - true_positives_12: 74.0000 - true_negatives_12: 71.0000 - false_positives_12: 3.0000 - false_negatives_12: 0.0000e+00 - 76ms/epoch - 15ms/step
Epoch 4/10
5/5 - 0s - loss: 0.6138 - binary_accuracy: 0.9730 - auc_12: 0.9961 - precision_12: 1.0000 - true_positives_12: 70.0000 - true_negatives_12: 74.0000 - false_positives_12: 0.0000e+00 - false_negatives_12: 4.0000 - 69ms/epoch - 14ms/step
Epoch 5/10
5/5 

({'Classifier_Name': 'SA',
  'Optimitied_Param': {},
  'Score': 'out/libfeatureselection/T1/T1SEstacker-RTX//SA_score.pkl',
  'Model_Path': '-',
  'TimeToStartFit': '2023-07-22 13:50:05',
  'TimeOfSummary': '2023-07-22 13:51:14',
  'TimeSpend': '0:01:09.007109'},
 {'accuracy': 0.72,
  'precision': 0.7619047619047619,
  'f1_score': 0.6956521739130435,
  'mmc': 0.4457424941602093,
  'rocAUC': 0.776,
  'specificity': 0.8,
  'sensitivity': 0.64,
  'pro_cutoff': 0.44693962},
 {'accuracy': 0.8938461538461538,
  'precision': 0.9428571428571428,
  'f1_score': 0.8885998890876939,
  'mmc': 0.7994379249629955,
  'rocAUC': 0.9428157894736842,
  'specificity': 0.9400000000000001,
  'sensitivity': 0.8494736842105264,
  'pro_cutoff': [0.5304556, 0.59257233, 0.5537683, 0.62785786, 0.5536931]})

<Figure size 480x270 with 0 Axes>

# Voting

In [19]:
# Settings for the stacking ("voting") step below.
# NOTE(review): prot_type is not referenced in the visible cells - presumably
# the secretion-system type this notebook targets; confirm before relying on it.
prot_type = 1
# Prefix of the stacked-score pickle written at the end of the notebook.
job_name = "T1-RTX-rStacker"
# Root of the score pickles read from the "ac/" and "bpb/" sub-directories.
path_to_score_dir = "out/libfeatureselection/T1/T1SEstacker-RTX/model/"
# Directory holding the DNN / RNN / SA score pickles written by get_summary().
path_to_dnnscore_dir = "out/libfeatureselection/T1/T1SEstacker-RTX/"
# Output directory for the stacked-model scores (same location as above).
path_to_model_score_path = "out/libfeatureselection/T1/T1SEstacker-RTX/"

In [20]:
import os
import sys
# Make the project packages under src/ importable.
sys.path.append("src")
# NOTE(review): presumably meant to hide all GPUs and force CPU execution;
# TensorFlow was already used earlier in this notebook, so confirm the
# setting still has an effect this late in the session.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

from libfeatureselection import model_space

In [21]:
# Names of every entry in the model search space, in declaration order.
model_allname_list = [
    item['name']
    for item in model_space.find_space
]
# Lookup table from model name to its full search-space entry.
model_list_dict = { item['name']:item for item in model_space.find_space }

In [22]:
import gzip
import pickle
import numpy as np

In [23]:
from sklearn.metrics import roc_curve
def get_optimal_threshold(target: np.ndarray, predict: np.ndarray, multi_dim: int = None):
    """Return the score cut-off that maximises ``TPR - FPR`` on the ROC curve.

    Args:
        target: Ground-truth binary labels.
        predict: Predicted scores. When ``multi_dim`` is given this must be
            two-dimensional and column ``multi_dim`` is used as the score.
        multi_dim: Column of ``predict`` to evaluate, or ``None`` to use
            ``predict`` as it is.

    Returns:
        The threshold reported by ``roc_curve`` at the point where
        ``tpr - fpr`` is largest (the first such point on ties).
    """
    if multi_dim is not None:
        predict = predict[:, multi_dim]

    # roc_curve rejects NaN scores; treat them as a score of 0.
    predict = np.nan_to_num(
        predict, copy=True, nan=0.0
    )
    fpr, tpr, thresholds = roc_curve(target, predict)
    return thresholds[np.argmax(tpr - fpr)]

def get_threshold_for_dict(_score_dict: dict, multi_dim: int = None):
    """Add ROC-optimal cut-offs and the resulting 0/1 calls to a score dict.

    The dict is modified in place and also returned. Four keys are added:

    * ``best_predicted_pair_pro_cutoff`` / ``best_predicted_binary`` for the
      single ``best_predicted_pair``;
    * ``best_5C_predicted_pair_pro_cutoff`` / ``best_5C_predicted_binary``
      holding one entry per fold of ``best_5C_predicted_pair``.

    Args:
        _score_dict: Dict whose ``best_predicted_pair`` is
            ``[predictions, labels]`` and whose ``best_5C_predicted_pair`` is
            a list of such pairs.
        multi_dim: Column of the prediction arrays that carries the score,
            or ``None`` when the predictions are used as they are.

    Returns:
        The same ``_score_dict`` object.
    """
    def _to_binary(scores, cutoff):
        # Threshold first, then keep only the requested column (if any).
        flags = (scores >= cutoff).astype(int)
        return flags if multi_dim is None else flags[:, multi_dim]

    # Single prediction pair.
    single_pair = _score_dict['best_predicted_pair']
    single_cutoff = get_optimal_threshold(
        target=single_pair[1],
        predict=single_pair[0],
        multi_dim=multi_dim
    )
    _score_dict['best_predicted_pair_pro_cutoff'] = single_cutoff
    _score_dict['best_predicted_binary'] = _to_binary(single_pair[0], single_cutoff)

    # Cross-validation folds: one cut-off and one binary vector per fold.
    fold_pairs = _score_dict['best_5C_predicted_pair']
    fold_cutoffs = [
        get_optimal_threshold(
            target=fold_pair[1],
            predict=fold_pair[0],
            multi_dim=multi_dim
        )
        for fold_pair in fold_pairs
    ]
    _score_dict['best_5C_predicted_pair_pro_cutoff'] = fold_cutoffs
    _score_dict['best_5C_predicted_binary'] = [
        _to_binary(fold_pair[0], fold_cutoff)
        for fold_pair, fold_cutoff in zip(fold_pairs, fold_cutoffs)
    ]

    return _score_dict

In [24]:
def _load_thresholded_scores(path_to_pickle: str, multi_dim: int) -> dict:
    """Load one gzip-pickled score dict and add cut-offs / binary calls to it.

    The file is opened in a ``with`` block so the handle is closed again; the
    original passed ``gzip.open(...)`` straight to ``pickle.load`` and never
    closed any of the sixteen files it opened.
    """
    # NOTE: unpickling executes arbitrary code - only point this at score
    # files produced by this project.
    with gzip.open(path_to_pickle, "rb") as f:
        loaded_scores = pickle.load(f)
    return get_threshold_for_dict(loaded_scores, multi_dim=multi_dim)


def _load_voting_scores(file_suffix: str) -> dict:
    """Collect the thresholded scores of every base model used for stacking.

    Args:
        file_suffix: Appended to each ``*_score.pkl`` file name; ``""`` reads
            the held-out scores and ``".train"`` the training-side scores.

    Returns:
        Dict keyed by model name, in the same order as
        ``voting_model_name_list``.
    """
    collected = {}
    # Models stored under "ac/"; column 1 of their predictions is used
    # (presumably the positive-class probability - confirm against the writer).
    for model_name in [
        'SVC',
        'GaussianNB',
        'RandomForestClassifier',
        'DecisionTreeClassifier'
    ]:
        collected[model_name] = _load_thresholded_scores(
            f"{path_to_score_dir}/ac/{model_name}_score.pkl{file_suffix}",
            multi_dim=1
        )
    # The SVC stored under "bpb/" is exposed under the name "BPBAac".
    collected["BPBAac"] = _load_thresholded_scores(
        f"{path_to_score_dir}/bpb/SVC_score.pkl{file_suffix}",
        multi_dim=1
    )
    # Neural-network scores written by the trainers above; column 0 is used.
    for model_name in [
        'DNN',
        'RNN',
        'SA',
    ]:
        collected[model_name] = _load_thresholded_scores(
            f"{path_to_dnnscore_dir}/{model_name}_score.pkl{file_suffix}",
            multi_dim=0
        )
    return collected


# Scores on the data each base model was fitted on (meta-learner training set).
train_score_dict = _load_voting_scores(".train")
# Scores on held-out data (meta-learner evaluation set).
score_dict = _load_voting_scores("")
# Column order of the stacked vote matrix.
voting_model_name_list = [
    'SVC',
    'GaussianNB',
    'RandomForestClassifier',
    'DecisionTreeClassifier',
    "BPBAac",
    'DNN',
    'RNN',
    'SA',
]

In [None]:
from sklearn.svm import SVC


def _stack_votes(per_model_scores: dict, fold_id: int = None) -> np.ndarray:
    """Stack the base models' 0/1 calls column-wise, one column per model.

    Columns follow ``voting_model_name_list``. With ``fold_id=None`` the
    single-split calls (``best_predicted_binary``) are used, otherwise the
    calls of that cross-validation fold (``best_5C_predicted_binary``).
    """
    if fold_id is None:
        columns = [
            per_model_scores[model_name]['best_predicted_binary']
            for model_name in voting_model_name_list
        ]
    else:
        columns = [
            per_model_scores[model_name]['best_5C_predicted_binary'][fold_id]
            for model_name in voting_model_name_list
        ]
    return np.stack(columns, axis=1)


# Labels are taken from the first model in each dict; this assumes every base
# model was scored on the same samples in the same order.
_train_reference = next(iter(train_score_dict.values()))
_test_reference = next(iter(score_dict.values()))

# Single split: fit the meta-SVC on the training-side votes and replace the
# held-out vote matrix with the meta-SVC's class probabilities.
train_tt_voting_score_pair = [
    _stack_votes(train_score_dict),
    _train_reference['best_predicted_pair'][1],
]
tt_voting_score_pair = [
    _stack_votes(score_dict),
    _test_reference['best_predicted_pair'][1],
]

tt_voting_score_pair[0] = SVC(probability=True).fit(
    train_tt_voting_score_pair[0],
    train_tt_voting_score_pair[1],
).predict_proba(tt_voting_score_pair[0])

# Cross-validation: the same procedure per fold.
fold_count = len(_test_reference['best_5C_predicted_pair'])

# BUG FIX: the original built this list from ``score_dict`` (the held-out
# folds), so each meta-SVC was fitted on exactly the samples and labels it was
# then evaluated on. The training side must come from ``train_score_dict``,
# mirroring the single-split path above.
train_cv_voting_score_pair_list = [
    [
        _stack_votes(train_score_dict, fold_id),
        _train_reference['best_5C_predicted_pair'][fold_id][1],
    ]
    for fold_id in range(fold_count)
]
cv_voting_score_pair_list = [
    [
        _stack_votes(score_dict, fold_id),
        _test_reference['best_5C_predicted_pair'][fold_id][1],
    ]
    for fold_id in range(fold_count)
]
for train_fold_pair, test_fold_pair in zip(
    train_cv_voting_score_pair_list, cv_voting_score_pair_list
):
    test_fold_pair[0] = SVC(probability=True).fit(
        train_fold_pair[0],
        train_fold_pair[1],
    ).predict_proba(test_fold_pair[0])

# Store the stacked scores in the same layout the base models use.
os.makedirs(path_to_model_score_path, exist_ok=True)
with gzip.open(f"{path_to_model_score_path}/{job_name}_score.pkl", "wb") as f:
    pickle.dump(
        {
            "best_predicted_pair": tt_voting_score_pair,
            "best_5C_predicted_pair": cv_voting_score_pair_list,
        }, f
    )