In [4]:
# Notebook setup: extend the import path, load dependencies and seed NumPy.
# The order of these statements matters; do not regroup the imports.
import sys
# Add "src" to the module search path so modules located there can be imported
# by the statements below.
sys.path.append("src")
import os
# Uncomment to force a specific worker count via the "n_jobs" environment
# variable, which is read when n_jobs is computed further down.
# os.environ["n_jobs"] = "2"
import json

import libpybiofeature

import utils
# NOTE(review): presumably resolves (and possibly switches to) the project root
# 4 levels above the current directory so the relative "data/" and "out/" paths
# used below are valid -- confirm against utils.workdir.
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

from Bio import SeqIO

# Seed NumPy's global random generator so runs are repeatable.
random_seed = 42
np.random.seed(random_seed)

from sklearn.model_selection import StratifiedKFold

In [5]:
def _load_s_fpssm_table(TxSE_args: dict, possum_index_dict: dict, label: str):
    """Load the S-FPSSM feature table of one class, ordered like its FASTA file.

    Args:
        TxSE_args: Dataset description; reads ``['fasta'][label]``,
            ``['possum']['key']`` and ``['possum']['pssm_fdb_pattern']``.
        possum_index_dict: Parsed POSSUM index JSON; reads
            ``['data'][f'{key}_{label}']``.
        label: ``'p'`` (positive) or ``'n'`` (negative).

    Returns:
        The feature table returned by ``get_all_pssm_feature`` with its rows
        selected/re-ordered by the record ids of the FASTA file.
    """
    fasta_path = TxSE_args['fasta'][label]
    feature_table = libpybiofeature.libdataloader.pssm_tools.get_all_pssm_feature(
        possum_index_list=possum_index_dict['data'][f'{TxSE_args["possum"]["key"]}_{label}'],
        feature_name_list=['s_fpssm',],
        path_to_fasta=fasta_path,
        path_to_with_pattern=TxSE_args['possum']['pssm_fdb_pattern']
    )
    # Row labels are assumed to be the FASTA record ids (required by ``.loc``);
    # a missing id raises KeyError.
    fasta_ids = [seq.id for seq in SeqIO.parse(fasta_path, "fasta")]
    return feature_table.loc[fasta_ids, :]


def load_AAC_feature(TxSE_args: dict, n_splits: int = 5, random_state: int = 42):
    """Load S-FPSSM features for a dataset and derive one train/test split.

    Despite its historical name, this function loads the ``s_fpssm`` PSSM
    feature (not AAC) for the positive and negative FASTA sets.

    Args:
        TxSE_args: Dataset description with ``'fasta'`` (``'p'``/``'n'`` paths)
            and ``'possum'`` (``'index'``, ``'key'``, ``'pssm_fdb_pattern'``).
        n_splits: Number of stratified folds; the first fold is used as the
            test set. Defaults to 5, the previously hard-coded value.
        random_state: Seed for the fold shuffling. Defaults to 42, the
            previously hard-coded value.

    Returns:
        dict with keys ``name``, ``p``, ``n``, ``pn_all_feature``,
        ``pn_all_label`` (1 for positives, 0 for negatives) and the
        ``training_*`` / ``testing_*`` feature and label subsets.
    """
    with open(TxSE_args['possum']['index'], 'r', encoding='UTF-8') as f:
        possum_index_dict = json.load(f)

    feature_set = {
        "name": "S_FPSSM",
        "p": _load_s_fpssm_table(TxSE_args, possum_index_dict, 'p'),
        "n": _load_s_fpssm_table(TxSE_args, possum_index_dict, 'n'),
    }

    # Report the class sizes (negatives first, then positives).
    print(feature_set['n'].shape[0], feature_set['p'].shape[0])

    # Note: no class balancing is applied here. Positives are usually fewer
    # than negatives, so random down-sampling of the negatives would be the
    # way to balance them if that is ever needed.

    # Positives are stacked on top of negatives; labels follow the same order.
    feature_set["pn_all_feature"] = pd.concat([
        feature_set["p"],
        feature_set["n"],
    ])
    feature_set["pn_all_label"] = np.concatenate([
        np.ones(shape=(feature_set["p"].shape[0], )),
        np.zeros(shape=(feature_set["n"].shape[0], )),
    ])

    # Split into train/test using the first fold of a stratified K-fold.
    train_id, test_id = next(iter(StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=random_state
    ).split(
        feature_set["pn_all_feature"].values,
        feature_set["pn_all_label"]
    )))

    feature_set["training_pn_all_feature"] = feature_set["pn_all_feature"].iloc[train_id, :]
    feature_set["training_pn_all_label"] = feature_set["pn_all_label"][train_id]
    feature_set["testing_pn_all_feature"] = feature_set["pn_all_feature"].iloc[test_id, :]
    feature_set["testing_pn_all_label"] = feature_set["pn_all_label"][test_id]

    return feature_set

In [6]:
# Secretion-system type number and C-terminal flag shared by both configs.
prot_type = 6
cter_bool = False

# POSSUM feature database locations; identical for both datasets, only the
# index "key" differs between them.
_possum_paths = {
    'index': "out/libfeatureselection/A_feature_research/featuredb/possum/possum_index.json",
    'pssm_fdb_pattern': "out/libfeatureselection/A_feature_research/featuredb/possum/{zipid}_pssm_features.zip",
    'pssm_rdb_pattern': "out/libfeatureselection/A_feature_research/featuredb/possum/{zipid}_pssm_files.zip",
}

# Anti-bacterial effector dataset (POSSUM index key "ab").
rtx_Tx_arg = {
    "type": f'T{prot_type}',
    'fasta': {
        'cter': cter_bool,
        'p': "data/T6SE/anti-bacterial-effector_p.fasta",
        'n': "data/T6SE/anti-bacterial-effector_n.fasta"
    },
    'possum': {**_possum_paths, "key": "ab"},
}

# Anti-eukaryotic effector dataset (POSSUM index key "ae").
nonrtx_Tx_arg = {
    "type": f'T{prot_type}',
    'fasta': {
        'cter': cter_bool,
        'p': "data/T6SE/anti-eukaryotic-effector_p.fasta",
        'n': "data/T6SE/anti-eukaryotic-effector_n.fasta"
    },
    'possum': {**_possum_paths, "key": "ae"},
}

# Sheet title for the result workbook and the output directory for this run.
title_table = "AB-model_Predict_AE"
model_save_dir = "out/libfeatureselection/Six_feature_research/S_FPSSM/ab/model/"
os.makedirs(model_save_dir, exist_ok=True)

In [7]:
# Load both datasets with the same loader: anti-bacterial first, then
# anti-eukaryotic. Each call prints its negative/positive sample counts.
rtx_aac_data = load_AAC_feature(TxSE_args=rtx_Tx_arg)
nonrtx_aac_data = load_AAC_feature(TxSE_args=nonrtx_Tx_arg)

# Feature column names, taken from the positive table of the first dataset.
aa_type = [column for column in rtx_aac_data['p'].columns]
print(rtx_aac_data.keys(), ", ".join(aa_type))

53 53
33 33
dict_keys(['name', 'p', 'n', 'pn_all_feature', 'pn_all_label', 'training_pn_all_feature', 'training_pn_all_label', 'testing_pn_all_feature', 'testing_pn_all_label']) s_fpssm0, s_fpssm1, s_fpssm2, s_fpssm3, s_fpssm4, s_fpssm5, s_fpssm6, s_fpssm7, s_fpssm8, s_fpssm9, s_fpssm10, s_fpssm11, s_fpssm12, s_fpssm13, s_fpssm14, s_fpssm15, s_fpssm16, s_fpssm17, s_fpssm18, s_fpssm19, s_fpssm20, s_fpssm21, s_fpssm22, s_fpssm23, s_fpssm24, s_fpssm25, s_fpssm26, s_fpssm27, s_fpssm28, s_fpssm29, s_fpssm30, s_fpssm31, s_fpssm32, s_fpssm33, s_fpssm34, s_fpssm35, s_fpssm36, s_fpssm37, s_fpssm38, s_fpssm39, s_fpssm40, s_fpssm41, s_fpssm42, s_fpssm43, s_fpssm44, s_fpssm45, s_fpssm46, s_fpssm47, s_fpssm48, s_fpssm49, s_fpssm50, s_fpssm51, s_fpssm52, s_fpssm53, s_fpssm54, s_fpssm55, s_fpssm56, s_fpssm57, s_fpssm58, s_fpssm59, s_fpssm60, s_fpssm61, s_fpssm62, s_fpssm63, s_fpssm64, s_fpssm65, s_fpssm66, s_fpssm67, s_fpssm68, s_fpssm69, s_fpssm70, s_fpssm71, s_fpssm72, s_fpssm73, s_fpssm74, s_fpssm

In [8]:
from libfeatureselection import model, model_space
def resolve_n_jobs(env_value, cpu_total, reserved=2):
    """Pick the number of parallel workers.

    Args:
        env_value: Value of the "n_jobs" environment variable, or None when it
            is not set. A non-empty value is used verbatim via ``int()``.
        cpu_total: Result of ``os.cpu_count()``; may be None when the count
            cannot be determined.
        reserved: Number of CPUs to leave free when deriving the default.

    Returns:
        ``int(env_value)`` when an override is given, otherwise
        ``cpu_total - reserved`` clamped to at least 1.

    Raises:
        ValueError: If ``env_value`` is non-empty but not an integer string.
    """
    if env_value:
        return int(env_value)
    # Guard against cpu_count() returning None and against machines with
    # <= `reserved` cores, where the plain subtraction would give 0 or a
    # negative value.
    return max(1, (cpu_total or 1) - reserved)


n_jobs = resolve_n_jobs(os.environ.get("n_jobs"), os.cpu_count())
n_jobs

2023-07-12 22:31:34.899029: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-12 22:31:34.901490: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-12 22:31:34.901499: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-07-12 22:31:35.404675: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-12 22:31:35.404702: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to c

30

In [9]:
# Hyper-parameter search over every candidate in model_space.find_space.
# Each model is searched on the first dataset (rtx_aac_data) and validated on
# the second (nonrtx_aac_data); one summary row per model is appended to
# search_result_in_a_scheme_df and the workbook is rewritten after every model
# so partial results survive an interrupted run.
search_result_in_a_scheme_df = pd.DataFrame()
local_xlsx_path = f"{model_save_dir}/searched_result.xlsx"
searched_result_rows = []

for model_index in range(len(model_space.find_space)):
    model_spec = model_space.find_space[model_index]

    # Bayesian search is the default; a spec opts out with a "Bayes" entry
    # that does not equal True.
    search_method = (
        "BayesSearchCV"
        if "Bayes" not in model_spec or model_spec['Bayes'] == True
        else "GridSearchCV"
    )

    # NOTE(review): model_save_dir already ends with "model/", so this resolves
    # to ".../model//model/". Kept unchanged because other code may read
    # from that location -- confirm before simplifying.
    model_information_summary, searched_result_performance_summary, searched_result_5C_performance_summary = model.MyOptimitzer(
        classifier_name=model_spec['name'],
        classifier_class=model_spec['class'],
        classifier_param_dict=model_spec['param'],
    ).find_best(
        X=rtx_aac_data["pn_all_feature"].values,
        y=rtx_aac_data["pn_all_label"],
        validation=(
            nonrtx_aac_data["pn_all_feature"].values,
            nonrtx_aac_data["pn_all_label"]
        ),
        search_method=search_method,
        n_jobs=n_jobs
    ).get_summary(
        path_to_dir=f"{model_save_dir}/model/"
    )

    # Combine the three summaries into one row with two-level column labels.
    result_series = pd.concat([
        pd.Series(model_information_summary),
        pd.Series(searched_result_performance_summary),
        pd.Series(searched_result_5C_performance_summary),
    ], keys=[
        "Model_Information",
        "Best_Performance",
        "5FoldCV_Performance",
    ])
    result_series.name = model_index
    searched_result_rows.append(result_series.to_frame().T)

    # Rebuild the table from the collected rows instead of concatenating onto
    # an initially empty DataFrame.
    search_result_in_a_scheme_df = pd.concat(
        searched_result_rows, axis=0, ignore_index=False
    ).rename_axis("Model_Type")

    # Cache the results collected so far; sheet_name is passed by keyword
    # because positional use is deprecated in recent pandas.
    search_result_in_a_scheme_df.to_excel(
        local_xlsx_path,
        sheet_name=title_table,
        freeze_panes=(2, 1)
    )

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>