# T1 Voting

> 要不要看看T1SPs的那个AAC各模型，除去LPA、MLP、ERT以及DT四个，剩下来11个模型集成一下，看看性能如何？

In [1]:
# Run configuration for the rT1-TT-AAC voting ensemble.
prot_type = 1  # protein type handled by this notebook
job_name = "rT1-TT-AAC_Voting"  # stem of the score file written by the voting cell

# Directory holding the per-model "<model name>_score.pkl" inputs.
path_to_score_dir = "model/T1/model/"
# Directory that receives the ensemble's "<job_name>_score.pkl" output.
path_to_model_score_path = "model/T1/tmp/"

## 按模型顺序加载预测分数

In [2]:
import os
import sys
# Extend the import search path with the relative "src" directory
# (presumably where libfeatureselection lives — confirm against the repo layout).
sys.path.append("src")
# Set before the project import below; '-1' hides CUDA devices from libraries
# that honour this variable (presumably to force CPU-only execution — confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

from libfeatureselection import model_space

2023-08-02 13:44:38.236835: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-02 13:44:38.239744: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-02 13:44:38.239753: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-08-02 13:44:38.990266: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-08-02 13:44:38.990284: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to c

In [3]:
# Names of every entry in the model search space, in search-space order.
model_allname_list = [spec['name'] for spec in model_space.find_space]

# The same entries, addressable by model name.
model_list_dict = dict(
    (spec['name'], spec) for spec in model_space.find_space
)

In [4]:
import gzip
import pickle
import numpy as np

In [5]:
from sklearn.metrics import roc_curve
def get_optimal_threshold(target: np.ndarray, predict: np.ndarray):
    """Return the ROC cutoff that maximises Youden's J statistic (TPR - FPR).

    Args:
        target: Ground-truth labels, forwarded to ``roc_curve`` as the true labels.
        predict: Scores for the positive class. NaN entries are replaced by
            0.0 on a copy, so the caller's array is left untouched.

    Returns:
        The entry of ``roc_curve``'s ``thresholds`` at the first index where
        ``tpr - fpr`` reaches its maximum.
    """
    # roc_curve cannot rank NaN scores; map them to 0.0 first.
    predict = np.nan_to_num(predict, copy=True, nan=0.0)
    fpr, tpr, thresholds = roc_curve(target, predict)

    # np.argmax returns the first maximiser, i.e. the earliest ROC point on ties.
    best_one_optimal_idx = np.argmax(tpr - fpr)
    return thresholds[best_one_optimal_idx]

def get_threshold_for_dict(_score_dict: dict):
    """Attach Youden-optimal cutoffs and thresholded predictions to a score dict.

    Reads ``'best_predicted_pair'`` (indexed as ``[scores, labels]``) and
    ``'best_5C_predicted_pair'`` (a sequence of such pairs, one per fold).
    Cutoffs are derived from column 1 of each score array and then compared
    against the whole score array, so the binary output keeps the scores' shape.

    The dict is modified in place; four keys are added:
    ``'best_predicted_pair_pro_cutoff'``, ``'best_predicted_binary'``,
    ``'best_5C_predicted_pair_pro_cutoff'`` and ``'best_5C_predicted_binary'``.

    Returns:
        The same ``_score_dict`` object that was passed in.
    """
    # --- single train/test pair -------------------------------------------
    tt_pair = _score_dict['best_predicted_pair']
    tt_cutoff = get_optimal_threshold(target=tt_pair[1], predict=tt_pair[0][:, 1])
    _score_dict['best_predicted_pair_pro_cutoff'] = tt_cutoff
    _score_dict['best_predicted_binary'] = (tt_pair[0] >= tt_cutoff).astype(int)

    # --- cross-validation folds: one cutoff per fold ----------------------
    cv_pairs = _score_dict['best_5C_predicted_pair']
    cv_cutoffs = [
        get_optimal_threshold(target=pair[1], predict=pair[0][:, 1])
        for pair in cv_pairs
    ]
    _score_dict['best_5C_predicted_pair_pro_cutoff'] = cv_cutoffs
    _score_dict['best_5C_predicted_binary'] = [
        (pair[0] >= cutoff).astype(int)
        for pair, cutoff in zip(cv_pairs, cv_cutoffs)
    ]

    return _score_dict

In [6]:
def _load_score_dict(model_name: str) -> dict:
    """Read one model's gzip-compressed, pickled score dict from ``path_to_score_dir``.

    The file handle is closed as soon as the pickle has been read.
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising;
    # only point this at score files produced by this project.
    with gzip.open(f"{path_to_score_dir}/{model_name}_score.pkl", "rb") as score_file:
        return pickle.load(score_file)


# Per-model score dicts, each augmented with its cutoffs and binary predictions.
score_dict = {
    model_name: get_threshold_for_dict(_load_score_dict(model_name))
    for model_name in model_allname_list
}

Voting

In [7]:
# Models whose thresholded predictions take part in the ensemble vote (11 entries).
# NOTE(review): the notebook introduction says ERT is excluded, yet
# 'ExtraTreesClassifier' is listed here — confirm that ERT refers to a
# different entry of the model space and that this inclusion is intended.
voting_model_name_list = [
    'XGBClassifier',
    'LGBMClassifier',
    'SVC',
    'LogisticRegression',
    'SGDClassifier',
    'ExtraTreesClassifier',
    'GradientBoostingClassifier',
    'GaussianNB',
    'GaussianProcessClassifier',
    'KNeighborsClassifier',
    'RandomForestClassifier'
]

def _positive_vote_fraction(binary_predictions):
    """Average column 1 of each model's binary prediction array, per sample."""
    positive_votes = [binary[:, 1] for binary in binary_predictions]
    return np.stack(positive_votes, axis=1).mean(axis=1)


# Labels are read from whichever model comes first in score_dict.
# NOTE(review): this assumes every model was scored on the same samples in the
# same order — confirm, since that first model need not be in the voting list.
_label_source = next(iter(score_dict.values()))

# Train/test split: [fraction of models voting positive, labels].
tt_voting_score_pair = [
    _positive_vote_fraction(
        [score_dict[model_name]['best_predicted_binary'] for model_name in voting_model_name_list]
    ),
    _label_source['best_predicted_pair'][1],
]

# Same pair layout, one entry per cross-validation fold.
cv_voting_score_pair_list = [
    [
        _positive_vote_fraction(
            [
                score_dict[model_name]['best_5C_predicted_binary'][fold_id]
                for model_name in voting_model_name_list
            ]
        ),
        _label_source['best_5C_predicted_pair'][fold_id][1],
    ]
    for fold_id in range(len(_label_source['best_5C_predicted_pair']))
]

# Persist the ensemble scores under the same keys the per-model score files use.
voting_score_payload = {
    "best_predicted_pair": tt_voting_score_pair,
    "best_5C_predicted_pair": cv_voting_score_pair_list,
}

os.makedirs(path_to_model_score_path, exist_ok=True)
with gzip.open(f"{path_to_model_score_path}/{job_name}_score.pkl", "wb") as f:
    pickle.dump(voting_score_payload, f)

In [14]:
from sklearn.metrics import roc_curve
import pandas as pd

def get_threshold(target: np.ndarray, pred: np.ndarray, spe_rto:float):
    """Pick a score cutoff from the ROC curve of ``pred`` against ``target``.

    Args:
        target: Ground-truth labels passed to ``roc_curve``.
        pred: Prediction scores passed to ``roc_curve``.
        spe_rto: Desired specificity, or ``None``.

    Returns:
        With ``spe_rto=None``: the threshold maximising ``tpr - fpr``.
        Otherwise: the threshold of the ROC point whose FPR is closest to
        ``1 - spe_rto``; among equally close points the highest TPR wins.
        "Closest" is measured in absolute distance, so the selected point may
        sit on either side of the target and its specificity can be lower
        than ``spe_rto``.
    """
    fpr, tpr, thresholds = roc_curve(target, pred)

    # No target specificity given: fall back to Youden's J.
    if spe_rto is None:
        return thresholds[np.argmax(tpr - fpr)]

    roc_points = pd.DataFrame({
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
        # distance between each point's FPR and the target FPR (1 - spe_rto)
        "nfpr": np.abs(fpr - 1 + spe_rto)
    })
    ranked_points = roc_points.sort_values(
        ["nfpr", "tpr"], ascending=[True, False]
    )
    best_point = ranked_points.iloc[0, :]
    return best_point.at["thresholds"]

def get_threshold2(target: np.ndarray, pred: np.ndarray, spe_rto:float):
    """Return the ranked table of ROC operating points that ``get_threshold`` chooses from.

    Args:
        target: Ground-truth labels passed to ``roc_curve``.
        pred: Prediction scores passed to ``roc_curve``.
        spe_rto: Desired specificity, or ``None``.

    Returns:
        A DataFrame with columns ``fpr``, ``tpr`` and ``thresholds``, best
        candidate in the first row and the original ROC index preserved.

        * ``spe_rto`` given: an extra ``nfpr`` column holds
          ``|fpr - (1 - spe_rto)|``; rows are ordered by ``nfpr`` ascending,
          then ``tpr`` descending.
        * ``spe_rto=None``: an extra ``youden_j`` column holds ``tpr - fpr``;
          rows are ordered by it, descending.
    """
    fpr, tpr, thresholds = roc_curve(
        target, pred
    )
    rocdf = pd.DataFrame({
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
    })

    if spe_rto is None:
        # Without this branch the function reached `return rocdf` with the
        # name unbound and raised UnboundLocalError.
        rocdf["youden_j"] = tpr - fpr
        # mergesort is a stable algorithm, so the order among equal scores
        # does not depend on the sort implementation.
        return rocdf.sort_values("youden_j", ascending=False, kind="mergesort")

    rocdf["nfpr"] = np.abs(fpr - 1 + spe_rto)
    return rocdf.sort_values(
        ["nfpr", "tpr"], ascending=[True, False]
    )

In [13]:
# Target specificities to report. Each key doubles as the value handed to
# get_threshold, so a label can no longer drift from the spe_rto it was
# computed with (every entry past "0.99" used to be evaluated at 0.99).
specificity_levels = [
    "0.95",
    "0.99",
    "0.995",
    "0.999",
    "0.9999",
    "0.99999",
    "0.99999999999",
]

{
    "Prot_Type": prot_type,
    **{
        level: get_threshold(
            target=tt_voting_score_pair[1],
            pred=tt_voting_score_pair[0],
            spe_rto=float(level)
        )
        for level in specificity_levels
    },
}

{'Prot_Type': 1,
 '0.95': 0.18181818181818182,
 '0.99': 0.6363636363636364,
 '0.995': 0.6363636363636364,
 '0.999': 0.6363636363636364,
 '0.9999': 0.6363636363636364,
 '0.99999': 0.6363636363636364,
 '0.99999999999': 0.6363636363636364}

In [16]:
# Show the ranked ROC operating points for a 0.99 target specificity
# (arguments are target, pred, spe_rto).
get_threshold2(tt_voting_score_pair[1], tt_voting_score_pair[0], 0.99)

Unnamed: 0,fpr,tpr,thresholds,nfpr
4,0.0,0.95,0.636364,0.01
3,0.0,0.9,0.818182,0.01
2,0.0,0.8,0.909091,0.01
1,0.0,0.75,1.0,0.01
0,0.0,0.0,2.0,0.01
5,0.1,0.95,0.181818,0.09
6,0.2,0.95,0.090909,0.19
7,1.0,1.0,0.0,0.99
