In [1]:
import sys
# Extend the import search path with the relative "src" directory before the
# project-local imports below — presumably that is where libpybiofeature,
# utils and libfeatureselection live; confirm against the repository layout.
sys.path.append("src")
import os
# Uncomment to force the worker count; the "n_jobs" environment variable is
# read later in this notebook when the parallelism level is computed.
# os.environ["n_jobs"] = "2"
import json

import libpybiofeature

import utils
# NOTE(review): presumably resolves (and possibly switches to) the project
# root a fixed number of levels (4) above the current directory — confirm in
# utils.workdir; the relative data/ and out/ paths used later depend on it.
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

from Bio import SeqIO

# Seed NumPy's global random number generator.
random_seed = 42
np.random.seed(random_seed)

from sklearn.model_selection import StratifiedKFold

In [2]:
def load_AAC_feature(TxSE_args: dict, n_splits: int = 5, random_state: int = 42):
    """Build the "DAC" feature tables for one dataset and split them.

    The positive ('p') and negative ('n') FASTA files named in
    ``TxSE_args['fasta']`` are each turned into a feature table with
    ``libpybiofeature.featurebuilder.build_dac_feature``. The two tables are
    stacked (positives first), labelled 1 for positives and 0 for negatives,
    and divided into a training and a testing part using the first fold of a
    shuffled ``StratifiedKFold``.

    Parameters
    ----------
    TxSE_args : dict
        Must provide ``TxSE_args['fasta']['p']`` and
        ``TxSE_args['fasta']['n']``, the paths of the positive and negative
        FASTA files.
    n_splits : int, default 5
        Number of stratified folds; the first fold's test indices become the
        testing part.
    random_state : int, default 42
        Seed passed to ``StratifiedKFold`` for the shuffle.

    Returns
    -------
    dict
        Keys, in insertion order: ``name``, ``p``, ``n``,
        ``pn_all_feature``, ``pn_all_label``, ``training_pn_all_feature``,
        ``training_pn_all_label``, ``testing_pn_all_feature``,
        ``testing_pn_all_label``.
    """
    fasta_paths = TxSE_args['fasta']

    def _build_dac_table(label: str):
        """Build the feature table for the FASTA file registered under `label`."""
        fasta_path = fasta_paths[label]
        return libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=fasta_path,
            # Use every record id found in the file, in file order.
            seq_id_list=[record.id for record in SeqIO.parse(fasta_path, "fasta")],
            desc=label
        )

    # The local name says "AAC" for historical reasons; the features are DAC.
    AAC_feature = {
        "name": "DAC",
        "p": _build_dac_table("p"),
        "n": _build_dac_table("n"),
    }

    # Report the number of negative and positive samples (in that order).
    print(AAC_feature['n'].shape[0], AAC_feature['p'].shape[0])

    # Disabled: positives are usually fewer than negatives, so the negatives
    # could be randomly subsampled down to the number of positives.
    # AAC_feature['n'] = AAC_feature['n'].iloc[np.random.choice(np.arange(AAC_feature['n'].shape[0]), size=AAC_feature['p'].shape[0], replace=False), :]

    n_positive = AAC_feature["p"].shape[0]
    n_negative = AAC_feature["n"].shape[0]

    # Positives are stacked first so the label vector below lines up row by row.
    AAC_feature["pn_all_feature"] = pd.concat([
        AAC_feature["p"],
        AAC_feature["n"],
    ])
    AAC_feature["pn_all_label"] = np.concatenate([
        np.ones(shape=(n_positive, )),
        np.zeros(shape=(n_negative, )),
    ])

    # Train/test split: only the first stratified fold is used.
    splitter = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=random_state
    )
    train_id, test_id = next(iter(splitter.split(
        AAC_feature["pn_all_feature"].values,
        AAC_feature["pn_all_label"]
    )))

    AAC_feature["training_pn_all_feature"] = AAC_feature["pn_all_feature"].iloc[train_id, :]
    AAC_feature["training_pn_all_label"] = AAC_feature["pn_all_label"][train_id]
    AAC_feature["testing_pn_all_feature"] = AAC_feature["pn_all_feature"].iloc[test_id, :]
    AAC_feature["testing_pn_all_label"] = AAC_feature["pn_all_label"][test_id]

    return AAC_feature

In [3]:
# Secretion-system type and the C-terminal flag stored with each dataset.
prot_type = 6
cter_bool = False

# Anti-bacterial effector dataset (positive / negative FASTA files).
rtx_Tx_arg = dict(
    type=f'T{prot_type}',
    fasta=dict(
        cter=cter_bool,
        p="data/T6SE/anti-bacterial-effector_p.fasta",
        n="data/T6SE/anti-bacterial-effector_n.fasta",
    ),
)

# Anti-eukaryotic effector dataset (positive / negative FASTA files).
nonrtx_Tx_arg = dict(
    type=f'T{prot_type}',
    fasta=dict(
        cter=cter_bool,
        p="data/T6SE/anti-eukaryotic-effector_p.fasta",
        n="data/T6SE/anti-eukaryotic-effector_n.fasta",
    ),
)

# Sheet title for the result workbook and the directory results are saved in.
title_table = "AB-model_Predict_AE"
model_save_dir = "out/libfeatureselection/Six_feature_research/dac/ab/model/"
os.makedirs(model_save_dir, exist_ok=True)

In [4]:
# Build the feature bundles for both datasets (anti-bacterial first).
rtx_aac_data = load_AAC_feature(TxSE_args=rtx_Tx_arg)
nonrtx_aac_data = load_AAC_feature(TxSE_args=nonrtx_Tx_arg)

# Feature (column) names, taken from the positive table of the first dataset.
aa_type = [*rtx_aac_data['p'].columns]
print(rtx_aac_data.keys(), ", ".join(aa_type))

p_DAC: 100%|██████████| 53/53 [00:00<00:00, 7178.78it/s]
n_DAC: 100%|██████████| 53/53 [00:00<00:00, 10583.10it/s]
p_DAC: 100%|██████████| 33/33 [00:00<00:00, 7038.85it/s]
n_DAC: 100%|██████████| 33/33 [00:00<00:00, 10892.58it/s]

53 53
33 33
dict_keys(['name', 'p', 'n', 'pn_all_feature', 'pn_all_label', 'training_pn_all_feature', 'training_pn_all_label', 'testing_pn_all_feature', 'testing_pn_all_label']) A,A, A,C, A,D, A,E, A,F, A,G, A,H, A,I, A,K, A,L, A,M, A,N, A,P, A,Q, A,R, A,S, A,T, A,V, A,W, A,Y, C,A, C,C, C,D, C,E, C,F, C,G, C,H, C,I, C,K, C,L, C,M, C,N, C,P, C,Q, C,R, C,S, C,T, C,V, C,W, C,Y, D,A, D,C, D,D, D,E, D,F, D,G, D,H, D,I, D,K, D,L, D,M, D,N, D,P, D,Q, D,R, D,S, D,T, D,V, D,W, D,Y, E,A, E,C, E,D, E,E, E,F, E,G, E,H, E,I, E,K, E,L, E,M, E,N, E,P, E,Q, E,R, E,S, E,T, E,V, E,W, E,Y, F,A, F,C, F,D, F,E, F,F, F,G, F,H, F,I, F,K, F,L, F,M, F,N, F,P, F,Q, F,R, F,S, F,T, F,V, F,W, F,Y, G,A, G,C, G,D, G,E, G,F, G,G, G,H, G,I, G,K, G,L, G,M, G,N, G,P, G,Q, G,R, G,S, G,T, G,V, G,W, G,Y, H,A, H,C, H,D, H,E, H,F, H,G, H,H, H,I, H,K, H,L, H,M, H,N, H,P, H,Q, H,R, H,S, H,T, H,V, H,W, H,Y, I,A, I,C, I,D, I,E, I,F, I,G, I,H, I,I, I,K, I,L, I,M, I,N, I,P, I,Q, I,R, I,S, I,T, I,V, I,W, I,Y, K,A, K,C, K,D, K,E, K,




In [5]:
from libfeatureselection import model, model_space
# Number of parallel workers for the model search.
# An explicit, non-empty "n_jobs" environment variable wins. Otherwise leave
# two CPUs free, but never drop below one worker: os.cpu_count() may return
# None, and on hosts with two or fewer CPUs "count - 2" would be zero or
# negative, which is not a valid positive worker count.
_env_n_jobs = os.environ.get("n_jobs", "")
n_jobs = (
    int(_env_n_jobs)
    if _env_n_jobs != "" else
    max(1, (os.cpu_count() or 1) - 2)
)
n_jobs

2023-07-12 11:48:56.190911: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-12 11:48:56.193994: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-12 11:48:56.194003: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-07-12 11:48:57.025448: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-12 11:48:57.025478: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to c

30

In [6]:
# Hyper-parameter search over every classifier configuration listed in
# model_space.find_space. Each model is searched on the anti-bacterial
# feature set (rtx_aac_data) and validated on the anti-eukaryotic feature set
# (nonrtx_aac_data); one summary row per model is accumulated in
# search_result_in_a_scheme_df and written to an Excel workbook.
search_result_in_a_scheme_df = pd.DataFrame()

# Loop-invariant location of the cached result workbook.
local_xlsx_path = f"{model_save_dir}/searched_result.xlsx"

for model_index in range(len(model_space.find_space)):
    model_config = model_space.find_space[model_index]

    # Bayesian search is the default; a configuration opts out by carrying a
    # "Bayes" entry that does not compare equal to True.
    search_method = (
        "BayesSearchCV"
        if "Bayes" not in model_config
        or model_config['Bayes'] == True
        else "GridSearchCV"
    )

    model_information_summary, searched_result_performance_summary, searched_result_5C_performance_summary = model.MyOptimitzer(
        classifier_name=model_config['name'],
        classifier_class=model_config['class'],
        classifier_param_dict=model_config['param'],
    ).find_best(
        X=rtx_aac_data["pn_all_feature"].values,
        y=rtx_aac_data["pn_all_label"],
        validation=(
            nonrtx_aac_data["pn_all_feature"].values,
            nonrtx_aac_data["pn_all_label"]
        ),
        search_method=search_method,
        n_jobs=n_jobs
    ).get_summary(
        path_to_dir=f"{model_save_dir}/model/"
    )

    # Combine the three summaries into one record with a two-level index
    # (summary group, field); it becomes one row of the result table.
    result_series = pd.concat([
        pd.Series(model_information_summary),
        pd.Series(searched_result_performance_summary),
        pd.Series(searched_result_5C_performance_summary),
    ], keys=[
        "Model_Information",
        "Best_Performance",
        "5FoldCV_Performance",
    ])

    # The row is indexed by the model's position in model_space.find_space.
    result_series.name = model_index

    # The table is extended every iteration (rather than built once after the
    # loop) because it is checkpointed to disk after each model below.
    search_result_in_a_scheme_df = pd.concat([
        search_result_in_a_scheme_df,
        result_series.to_frame().T
    ], axis=0, ignore_index=False)

    search_result_in_a_scheme_df.index = search_result_in_a_scheme_df.index.set_names(
        ["Model_Type",]
    )

    # Cache the results gathered so far, so a later failure does not lose
    # the finished searches. sheet_name is passed by keyword: positional
    # arguments other than the target path are deprecated in to_excel.
    search_result_in_a_scheme_df.to_excel(
        local_xlsx_path,
        sheet_name=title_table,
        freeze_panes=(2, 1)
    )

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>