# T6 Voting 15G AB

> 对于T6，我们同样采用两个模型：一是基于15种特征的模型，去掉LPA、GP、ERT和DT后，用剩余的11个算法集成；二是基于AAC特征的模型，去掉LPA、LGBM、MLP、DT和ERT后，用余下的10个算法集成。

In [1]:
# T6-AB-15G
# Number interpolated into the output directory name below ("T6").
prot_type = 6
# Base name of the result file written at the end: "<job_name>_score.pkl".
job_name = "T6-AB-15G_Voting"
# Directory holding the per-model "<model_name>_score.pkl" input files.
path_to_score_dir = "out/libfeatureselection/A_feature_research/model/1698039127349260/ab/"
# Directory the combined voting scores are written to.
path_to_model_score_path = f"out/libfeatureselection/voting/T{prot_type}/"

## 按模型顺序加载预测分数

In [2]:
import os
import sys
# Add "src" to the import path; presumably this is where the
# libfeatureselection package imported below lives (confirm).
sys.path.append("src")
# Must be set before the project import below. NOTE(review): '-1' presumably
# hides all CUDA devices so the notebook runs on CPU only; confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

from libfeatureselection import model_space

2023-07-23 15:38:57.688198: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-23 15:38:57.691282: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-23 15:38:57.691296: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-07-23 15:38:58.725082: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-23 15:38:58.725114: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to c

In [3]:
# Names of every entry in model_space.find_space, in iteration order.
model_allname_list = [spec['name'] for spec in model_space.find_space]

# Lookup table from model name to its full find_space entry.
model_list_dict = {
    spec['name']: spec
    for spec in model_space.find_space
}

In [4]:
import gzip
import pickle
import numpy as np

In [5]:
from sklearn.metrics import roc_curve
def get_optimal_threshold(target: np.ndarray, predict: np.ndarray) -> float:
    """Return the score cutoff that maximises ``tpr - fpr`` on the ROC curve.

    Args:
        target: Ground-truth labels, passed to ``roc_curve`` as the first
            argument.
        predict: Predicted scores, passed to ``roc_curve`` as the second
            argument. NaN entries are treated as 0.0; the caller's array is
            not modified.

    Returns:
        The entry of ``roc_curve``'s ``thresholds`` at the index where
        ``tpr - fpr`` (Youden's J statistic) is largest. If several indices
        tie, ``np.argmax`` picks the first one.
    """
    # Replace NaN scores with 0.0 on a copy so the input stays untouched.
    # (np.nan_to_num also maps +/-inf to large finite values by default.)
    scores = np.nan_to_num(predict, copy=True, nan=0.0)

    fpr, tpr, thresholds = roc_curve(target, scores)
    best_idx = np.argmax(tpr - fpr)

    # The original also built a binarised prediction list here but never used
    # it; that dead O(n) Python loop has been removed.
    return thresholds[best_idx]

def get_threshold_for_dict(_score_dict: dict):
    """Add ROC-derived cutoffs and binarised predictions to a score dict.

    ``_score_dict`` is modified in place and also returned. It must contain:

    * ``'best_predicted_pair'``: an indexable whose item 0 is a score array
      (column 1 is used to fit the cutoff) and whose item 1 is the target.
    * ``'best_5C_predicted_pair'``: a sequence of such pairs
      (presumably one per cross-validation fold; confirm).

    Keys written: ``'best_predicted_pair_pro_cutoff'``,
    ``'best_predicted_binary'``, ``'best_5C_predicted_pair_pro_cutoff'`` and
    ``'best_5C_predicted_binary'``.
    """
    # --- single (scores, target) pair ---
    single_pair = _score_dict['best_predicted_pair']
    single_cutoff = get_optimal_threshold(
        target=single_pair[1],
        predict=single_pair[0][:, 1],
    )
    _score_dict['best_predicted_pair_pro_cutoff'] = single_cutoff
    # The cutoff is fitted on column 1 only, but the comparison is applied to
    # the whole score array, so the binary array keeps the input's shape.
    _score_dict['best_predicted_binary'] = (
        single_pair[0] >= single_cutoff
    ).astype(int)

    # --- one (scores, target) pair per entry of best_5C_predicted_pair ---
    fold_pairs = _score_dict['best_5C_predicted_pair']
    fold_cutoffs = [
        get_optimal_threshold(target=pair[1], predict=pair[0][:, 1])
        for pair in fold_pairs
    ]
    _score_dict['best_5C_predicted_pair_pro_cutoff'] = fold_cutoffs
    _score_dict['best_5C_predicted_binary'] = [
        (pair[0] >= cutoff).astype(int)
        for pair, cutoff in zip(fold_pairs, fold_cutoffs)
    ]

    return _score_dict

In [6]:
def _load_model_scores(model_name):
    """Load one model's gzip-compressed score pickle and add cutoffs to it.

    The file is read from ``{path_to_score_dir}/{model_name}_score.pkl`` and
    the unpickled object is passed through ``get_threshold_for_dict``.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at score
    # files produced by this project.
    # The context manager closes the handle; the original left one open
    # gzip file per model.
    with gzip.open(f"{path_to_score_dir}/{model_name}_score.pkl", "rb") as score_file:
        raw_scores = pickle.load(score_file)
    return get_threshold_for_dict(raw_scores)


# Model name -> score dict with cutoffs and binarised predictions, for every
# model in model_allname_list.
score_dict = {
    model_name: _load_model_scores(model_name)
    for model_name in model_allname_list
}

## Voting

In [7]:
# The 11 classifiers whose binarised predictions are averaged below; each name
# must be a key of score_dict.
voting_model_name_list = [
    'XGBClassifier',
    'LGBMClassifier',
    'SVC',
    'LogisticRegression',
    'SGDClassifier',
    'ExtraTreesClassifier',
    'GradientBoostingClassifier',
    'GaussianNB',
    'MLPClassifier',
    'KNeighborsClassifier',
    'RandomForestClassifier'
]

def _positive_vote_fraction(binary_columns):
    """Stack per-model 0/1 vectors side by side and average across models."""
    return np.stack(binary_columns, axis=1).mean(axis=1)


# Targets are taken from the first entry of score_dict
# (presumably every model was scored against the same targets; confirm).
_label_source = next(iter(score_dict.values()))

# [voting score, target] for the single 'best_predicted_pair' split. The
# voting score is the share of voting models whose column-1 binary
# prediction is 1.
tt_voting_score_pair = [
    _positive_vote_fraction([
        score_dict[name]['best_predicted_binary'][:, 1]
        for name in voting_model_name_list
    ]),
    _label_source['best_predicted_pair'][1],
]

# The same [voting score, target] pair for each entry of
# 'best_5C_predicted_pair'.
cv_voting_score_pair_list = [
    [
        _positive_vote_fraction([
            score_dict[name]['best_5C_predicted_binary'][fold_id][:, 1]
            for name in voting_model_name_list
        ]),
        fold_pair[1],
    ]
    for fold_id, fold_pair in enumerate(_label_source['best_5C_predicted_pair'])
]

# Bundle the voting results under the same two keys the per-model score
# files use for their (scores, target) pairs.
voting_result = {
    "best_predicted_pair": tt_voting_score_pair,
    "best_5C_predicted_pair": cv_voting_score_pair_list,
}

# Create the output directory if needed, then write the gzip-compressed pickle.
os.makedirs(path_to_model_score_path, exist_ok=True)
with gzip.open(f"{path_to_model_score_path}/{job_name}_score.pkl", "wb") as score_file:
    pickle.dump(voting_result, score_file)