In [75]:
# Add the relative "src" directory to the module search path. This has to run
# before the project-local imports below (presumably libpybiofeature / utils
# live under src — verify against the repository layout).
import sys
sys.path.append("src")
import os
# Optional manual override of the worker count: the "n_jobs" environment
# variable is read further down in this notebook. Left disabled by default.
# os.environ["n_jobs"] = "2"
import json

import libpybiofeature

import utils
# NOTE(review): utils.workdir.workdir is a project helper that is not visible
# here; it is called with the current directory and the constant 4 — confirm
# what the 4 means and whether the call changes the working directory.
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

from Bio import SeqIO

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
random_seed = 42
np.random.seed(random_seed)

from sklearn.model_selection import StratifiedKFold

In [76]:
def load_AAC_feature(TxSE_args: dict):
    """Load the SCPseAAC feature tables of one dataset and split them.

    Args:
        TxSE_args: Configuration dict. The keys read here are
            ``TxSE_args['SCPseAAC']['p'|'n']`` (paths to header-less feature
            CSV files) and ``TxSE_args['fasta']['p'|'n']`` (paths to the
            FASTA files whose record ids become the row labels).

    Returns:
        dict with the keys, in this order:
        ``name`` (always ``"SCPseAAC"``), ``p`` / ``n`` (positive / negative
        feature DataFrames), ``pn_all_feature`` (positives stacked on top of
        negatives), ``pn_all_label`` (1.0 for positives, 0.0 for negatives),
        and ``training_pn_all_feature`` / ``training_pn_all_label`` /
        ``testing_pn_all_feature`` / ``testing_pn_all_label`` (the first fold
        of a shuffled 5-fold stratified split with a fixed seed of 42).

    Side effects:
        Prints the number of negative rows followed by the number of
        positive rows.
    """

    def read_data(
        path_to_csv: str,
        path_to_fasta: str,
    ):
        """Read one header-less feature CSV and label its rows and columns.

        Rows are labelled positionally with the FASTA record ids, so the CSV
        rows are assumed to be in the same order as the FASTA records
        (pandas raises if the counts differ). The first 20 columns are named
        after the amino-acid letters; any further columns are numbered "1",
        "2", ... as strings.
        """
        table = pd.read_csv(path_to_csv, index_col=None, header=None)
        table.index = [
            record.id for record in SeqIO.parse(path_to_fasta, "fasta")
        ]
        extra_column_count = table.shape[1] - 20
        numbered_labels = [
            str(position) for position in range(1, extra_column_count + 1)
        ]
        table.columns = list("ACDEFGHIKLMNPQRSTVWY") + numbered_labels
        return table

    # Read positives first, then negatives, so the dict keeps the
    # "name", "p", "n" key order.
    feature_set = {"name": "SCPseAAC"}
    for group in ("p", "n"):
        feature_set[group] = read_data(
            path_to_csv=TxSE_args['SCPseAAC'][group],
            path_to_fasta=TxSE_args['fasta'][group],
        )

    positives = feature_set["p"]
    negatives = feature_set["n"]
    print(len(negatives), len(positives))

    # Disabled step: positives are usually fewer than negatives, so the
    # negatives could simply be randomly down-sampled to the positive count.
    # AAC_feature['n'] = AAC_feature['n'].iloc[np.random.choice(np.arange(AAC_feature['n'].shape[0]), size=AAC_feature['p'].shape[0], replace=False), :]

    # Stack positives over negatives and build the matching 1 / 0 label vector.
    all_features = pd.concat([positives, negatives])
    all_labels = np.concatenate([
        np.ones(shape=(len(positives), )),
        np.zeros(shape=(len(negatives), )),
    ])
    feature_set["pn_all_feature"] = all_features
    feature_set["pn_all_label"] = all_labels

    # Train / test split: take only the first fold of the stratified 5-fold.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_id, test_id = next(iter(
        splitter.split(all_features.values, all_labels)
    ))

    feature_set["training_pn_all_feature"] = all_features.iloc[train_id, :]
    feature_set["training_pn_all_label"] = all_labels[train_id]
    feature_set["testing_pn_all_feature"] = all_features.iloc[test_id, :]
    feature_set["testing_pn_all_label"] = all_labels[test_id]

    return feature_set

In [77]:
# Secretion-system type number and the C-terminal flag shared by both datasets.
prot_type = 6
cter_bool = False

# Dataset the models are fitted on. Despite the "rtx" variable name, the paths
# point at the anti-bacterial (ab) T6SE effector data.
rtx_Tx_arg = {
    "type": f"T{prot_type}",
    "fasta": {
        "cter": cter_bool,
        "p": "data/T6SE/anti-bacterial-effector_p.fasta",
        "n": "data/T6SE/anti-bacterial-effector_n.fasta",
    },
    "SCPseAAC": {
        "p": "out/libfeatureselection/A_feature_research/featuredb/ab_p_SCPseAAC.csv",
        "n": "out/libfeatureselection/A_feature_research/featuredb/ab_n_SCPseAAC.csv",
    },
}

# Second dataset. Despite the "nonrtx" variable name, the paths point at the
# anti-eukaryotic (ae) T6SE effector data.
nonrtx_Tx_arg = {
    "type": f"T{prot_type}",
    "fasta": {
        "cter": cter_bool,
        "p": "data/T6SE/anti-eukaryotic-effector_p.fasta",
        "n": "data/T6SE/anti-eukaryotic-effector_n.fasta",
    },
    "SCPseAAC": {
        "p": "out/libfeatureselection/A_feature_research/featuredb/ae_p_SCPseAAC.csv",
        "n": "out/libfeatureselection/A_feature_research/featuredb/ae_n_SCPseAAC.csv",
    },
}

# Label for the result table, and the output directory (note that it already
# ends with "/model/"); the directory is created up front if it is missing.
title_table = "AB-model_Predict_AE"
model_save_dir = "out/libfeatureselection/Six_feature_research/SCPseAAC/ab/model/"
os.makedirs(model_save_dir, exist_ok=True)

In [78]:
# Load both datasets with the same loader; each call prints its
# negative / positive row counts.
rtx_aac_data = load_AAC_feature(TxSE_args=rtx_Tx_arg)
nonrtx_aac_data = load_AAC_feature(TxSE_args=nonrtx_Tx_arg)

# Column labels of the positive feature table, shown next to the keys of the
# loaded feature dict as a quick sanity check.
aa_type = [*rtx_aac_data['p'].columns]
print(rtx_aac_data.keys(), ", ".join(aa_type))

53 53
33 33
dict_keys(['name', 'p', 'n', 'pn_all_feature', 'pn_all_label', 'training_pn_all_feature', 'training_pn_all_label', 'testing_pn_all_feature', 'testing_pn_all_label']) A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, 1, 2, 3


In [79]:
from libfeatureselection import model, model_space
# Worker count for the model search.
# A non-empty "n_jobs" environment variable wins. Otherwise two CPUs are left
# free for the rest of the system. os.cpu_count() may return None, and on
# machines with two or fewer CPUs "count - 2" would be 0 or negative, which is
# not a usable worker count (joblib-style APIs reject 0 and treat negative
# values specially), so the fallback is clamped to at least one worker.
n_jobs = (
    int(os.environ["n_jobs"])
    if os.environ.get("n_jobs")
    else max(1, (os.cpu_count() or 1) - 2)
)
n_jobs

30

In [80]:
# Search every candidate model in model_space.find_space.
# For each entry the optimizer is fitted on the rtx_aac_data features/labels,
# receives the nonrtx_aac_data features/labels as the validation set, and its
# three summaries are stored as one row of search_result_in_a_scheme_df. The
# table is re-written to the Excel file after every model so that finished
# results survive an interrupted run.
search_result_in_a_scheme_df = pd.DataFrame()

# Loop-invariant path of the cached summary workbook.
local_xlsx_path = f"{model_save_dir}/searched_result.xlsx"

# One single-row frame per finished model. The table is rebuilt from this list
# instead of concatenating onto an initially empty DataFrame, which relies on
# deprecated pandas behaviour for empty entries.
result_rows = []

for model_index in range(len(model_space.find_space)):
    model_entry = model_space.find_space[model_index]

    # Entries without a "Bayes" key, or with it set to True, use the Bayesian
    # search; everything else falls back to grid search.
    search_method = (
        "BayesSearchCV"
        if "Bayes" not in model_entry
        or model_entry['Bayes'] == True
        else "GridSearchCV"
    )

    (
        model_information_summary,
        searched_result_performance_summary,
        searched_result_5C_performance_summary,
    ) = model.MyOptimitzer(
        classifier_name=model_entry['name'],
        classifier_class=model_entry['class'],
        classifier_param_dict=model_entry['param'],
    ).find_best(
        X=rtx_aac_data["pn_all_feature"].values,
        y=rtx_aac_data["pn_all_label"],
        validation=(
            nonrtx_aac_data["pn_all_feature"].values,
            nonrtx_aac_data["pn_all_label"]
        ),
        search_method=search_method,
        n_jobs=n_jobs
    ).get_summary(
        path_to_dir=f"{model_save_dir}/model/"
    )

    # Record the result: combine the three summaries into one row whose
    # columns are grouped under a top-level key per summary.
    result_series = pd.concat([
        pd.Series(model_information_summary),
        pd.Series(searched_result_performance_summary),
        pd.Series(searched_result_5C_performance_summary),
    ], keys=[
        "Model_Information",
        "Best_Performance",
        "5FoldCV_Performance",
    ])
    result_series.name = model_index
    result_rows.append(result_series.to_frame().T)

    search_result_in_a_scheme_df = pd.concat(
        result_rows, axis=0, ignore_index=False
    )
    search_result_in_a_scheme_df.index = search_result_in_a_scheme_df.index.set_names(
        ["Model_Type",]
    )

    # Cache the table collected so far. The sheet name is passed by keyword
    # because positional arguments after the writer are deprecated in pandas.
    search_result_in_a_scheme_df.to_excel(
        local_xlsx_path,
        sheet_name=title_table,
        freeze_panes=(2, 1)
    )

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/georgezhao/.pyvirtualenvs/TxSEml_Backend/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>

<Figure size 480x270 with 0 Axes>