In [None]:
from Bio import SeqIO
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, recall_score,auc,precision_recall_curve


# Template dict whose keys fix the column order of the results table.
data = {'stand':'','rocAUC': '', 'prAUC': '', 'MCC': '', 'F1': '',
        'Precision': '', 'Accuracy': '', 'Sensitivity': '', 'Specificity': '',
        'FPR': '', 'Recall': '','pro_cutoff':''}

# Run configuration: the trailing index picks one entry from each candidate list.
bac_name = ['Ralstonia_pseudosolanacearum_GMI1000','Salmonella_LT2','Coxiella_burnetii_RSA_331',
            'new_Pseudomonas_sp.MIS38','new_Burkholderia_mallei_ATCC_23344','val'][0]
bac_type = ['T3','T4','T1','T2','T5'][0]
stand_ = ['lossen','strict']
cd_hit = [30,70][1]  # not referenced anywhere else in this cell


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else float("nan")


def calculate_fpr(y_true, y_pred):
    """Return the false-positive rate FP / (FP + TN) for binary 0/1 labels.

    ``labels=[0, 1]`` forces a 2x2 confusion matrix even when only one class
    occurs in the inputs, so the four-way unpacking cannot fail.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return _safe_ratio(fp, fp + tn)


# The predictions depend on neither the labelling standard nor the cutoff,
# so the spreadsheet is read once instead of on every iteration.
val_df = pd.read_excel(f'T3/80.xlsx')
pred = val_df['Voting']
target_list = val_df['protein']

records = []
for stand in stand_:
    fasta_file =f"/mnt/md0/Public/T3_T4/data/new_{bac_type}/val_data/{stand}_{bac_name}.fasta"
    # A set gives O(1) membership tests when labelling every protein below.
    protein_ids = {seq_record.id for seq_record in SeqIO.parse(fasta_file, "fasta")}

    # 1 if the protein id appears in this standard's FASTA file, else 0.
    target = [1 if name in protein_ids else 0 for name in target_list]
    target_ = np.asarray(target)

    # Threshold-free curve metrics are identical for every cutoff: compute once per standard.
    fpr, tpr, thresholds = roc_curve(target_, pred)
    roc_auc = auc(fpr, tpr)
    precision, recall, _pr_thresholds = precision_recall_curve(target_, pred)
    pr_auc = auc(recall, precision)

    for pro_cutoff in [0.1,0.2,0.3,0.4,0.5,
                        0.6,0.7,0.8,0.9]:
        # Binarise the voting score at the current cutoff.
        pred_l = [1 if i >= pro_cutoff else 0 for i in pred]
        tn, fp, fn, tp = confusion_matrix(target_, pred_l, labels=[0, 1]).ravel()

        records.append({
            "stand":stand,
            "rocAUC": roc_auc,
            "prAUC": pr_auc,
            "MCC": matthews_corrcoef(target_, pred_l),
            "F1": f1_score(target_, pred_l),
            "Precision": precision_score(target_, pred_l,zero_division=1),
            "Accuracy": accuracy_score(target_, pred_l),
            "Sensitivity": _safe_ratio(tp, tp + fn),
            "Specificity": _safe_ratio(tn, tn + fp),
            "FPR": _safe_ratio(fp, fp + tn),
            "Recall": recall_score(target_, pred_l),
            'pro_cutoff': pro_cutoff
        })

# Build the table in one go instead of concatenating inside the loop.
df = pd.DataFrame(records, columns=list(data.keys()))

df.to_excel(f'80.xlsx', index=False)

In [28]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, recall_score,auc,precision_recall_curve
import os
feature_data_set = []
model_list = ["XGBClassifier", "GaussianNB", "GradientBoostingClassifier",
                                "SVC","KNeighborsClassifier",
                                "RandomForestClassifier"]
feature_list = ['18pp','AAC','BPBaac','CTDC','CTDT','CTriad','onehot',
            'PC-PseAAC','ppt25','QSO','SC-PseAAC','CTDD','DPC','ppt']
rate = '1_100'
data_dir = 'txseml_addon/out/libfeatureselection/T3/val_data'
bac_name = 'new_Ralstonia_pseudosolanacearum_GMI1000.fasta'
n_models = 5  # saved models per (feature, classifier) pair live in sub-directories 0..4

# Sequence ids, in file order; they become the first column of the output table.
seq_id_list = [
    record.id
    for record in SeqIO.parse('data/new_T3/val_tofeature/new_Ralstonia_pseudosolanacearum_GMI1000.fasta', "fasta")
]
allresult_dict = {}

# The threshold file does not change between iterations: read it once.
with open(f"{rate}_threshold.json", "r", encoding="UTF-8") as f:
    all_thresholds = json.load(f)

# Each feature table is the same for every classifier and every saved model,
# so parse each CSV once and reuse the resulting matrix.
feature_matrices = {}
for feature_name in feature_list:
    val_df = pd.read_csv(f'{data_dir}/{bac_name}_{feature_name}.csv')
    feature = val_df.iloc[0:, 1:]  # drop the first column
    if feature_name == 'CTriad':
        # SECURITY NOTE(review): eval() executes whatever text is stored in the
        # CSV. Only run this on trusted, locally generated files; consider
        # ast.literal_eval if the cells are plain Python literals.
        feature_matrices[feature_name] = np.array([eval(row) for row in feature['CTriad']])
    else:
        feature_matrices[feature_name] = feature.astype("float").values

for model_name in model_list:
    threshold_dict = all_thresholds[f'{model_name}']

    for feature_name in feature_list:
        feature_ = feature_matrices[feature_name]
        tmp_result_dict = {}
        for a in range(n_models):
            model_save_dir = f"/mnt/md0/Public/T3_T4/model/T3/70_model/{feature_name}/{rate}/{a}"

            # SECURITY NOTE(review): pickle.load can run arbitrary code; only
            # load model files from a trusted location. The with-block makes
            # sure the file handle is closed after loading.
            with open(f"{model_save_dir}/{model_name}.pkl", "br") as model_file:
                model = pickle.load(model_file)

            # 0/1 vote: positive-class probability (NaN treated as 0) at or
            # above the threshold stored for this classifier and feature.
            tmp_result_dict[a] = (
                np.nan_to_num(model.predict_proba(feature_), nan=0.0)[:, 1] >= threshold_dict[feature_name]
                ).astype(int)

        # Fraction of the saved models that voted positive for each sequence.
        tmp = np.stack([
                tmp_result_dict[a] for a in range(n_models)
            ], axis=1).mean(axis=1)
        pred = pd.DataFrame(tmp)
        feature_data_set.append({
            "name": f"{model_name}_{feature_name}",
            "submit": pred,
        })

# NOTE(review): "name" is never used for column labels, so every column of the
# concatenated table is labelled 0 and feature.csv gets the header
# "protein_id,0,0,...". Readers of feature.csv must account for that header row.
data_set_split = {
    datatype: pd.concat([
        item[datatype] for item in feature_data_set
    ], axis=1)
    for datatype in ["submit",]
}
data = pd.DataFrame(data_set_split["submit"])
data.insert(0, "protein_id", seq_id_list)
data.to_csv('feature.csv',index=False)

In [None]:
# feature.csv is written earlier in this notebook by DataFrame.to_csv, which
# emits a header row ("protein_id,0,0,..."). Skip that row so it is not parsed
# as a data sample; header=None keeps the integer column labels used below.
df = pd.read_csv('feature.csv', header=None, skiprows=1)

# Every column after the first (the protein id) is a feature.
feature = df.iloc[0:,1:]

protein = list(df[0])
fasta_file =f"blast_out/T3/Ralstonia_pseudosolanacearum_GMI1000/lossen/lossen_Ralstonia_pseudosolanacearum_GMI1000_blast.fasta"
protein_ids = [seq_record.id for seq_record in SeqIO.parse(fasta_file, "fasta")]

# Label a protein 1 when its id appears in the FASTA file, else 0.
# A set makes each membership test O(1); the loop variable no longer shadows
# the builtin `id`.
positive_ids = set(protein_ids)
target = [1 if pid in positive_ids else 0 for pid in protein]

feature_ = feature.astype("float").values
target_ = np.reshape(target, (len(target), 1))  # column vector, shape (n_samples, 1)

In [29]:
# feature.csv is written earlier in this notebook with a header row
# ("protein_id,0,0,..."); skip it so it is not read back as a data sample.
# header=None keeps integer column labels.
df = pd.read_csv('feature.csv', header=None, skiprows=1)

# Every column after the first (the protein id) is a feature.
feature = df.iloc[0:,1:]

SVR

In [13]:
import pandas as pd
# Load the table for the SVR experiment. This rebinds `df`, replacing any
# frame that an earlier cell stored under the same name.
df = pd.read_csv('11.csv')

In [55]:
# Locate the positional range of the predictor columns 'ABS1' .. 'ABS33'.
clo = list(df.columns)
fir = clo.index('ABS1')
las = clo.index('ABS33')
# Target: column 'ABS34' for index labels 0 through 6 (.loc slices are inclusive).
# NOTE(review): this matches the positional rows 0..6 used below only if `df`
# has the default 0..n-1 index — confirm.
label = df.loc[0:6,'ABS34']
# Predictors: first seven rows (by position), columns fir..las inclusive.
feature = df.iloc[0:7,fir:las+1]

In [56]:
# Convert predictors and target to integer NumPy arrays.
# NOTE(review): if the columns hold non-integer floats, astype("int") drops the
# fractional part — confirm that is intended.
feature_ = feature.astype("int").values
label_= label.astype("int").values

In [60]:
# Row at position 7 only, same predictor columns as `feature`; the 7:8 slice
# keeps the result a one-row DataFrame rather than a Series.
test = df.iloc[7:8,fir:las+1]

In [None]:
# Show the `test` frame as this cell's output.
test

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
 
# Generate simulated data (kept because X_test / y_test are referenced further down).
np.random.seed(42)
X = np.sort(5 * np.random.rand(100, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Split the simulated data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Plot the real training points.
# feature_ holds one column per predictor while label_ has one value per row;
# plt.scatter needs x and y of equal size, so repeat each row's label across
# that row's feature columns before plotting.
feature_2d = np.reshape(feature_, (len(label_), -1))
label_2d = np.broadcast_to(np.reshape(label_, (-1, 1)), feature_2d.shape)
plt.scatter(feature_2d, label_2d, color='darkorange', label='data')
plt.title('Data Points')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.show()

In [None]:
# Define a support vector regression model with an RBF kernel.
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
 
# Fit the model on the feature_ / label_ arrays.
svr.fit(feature_, label_)
 
# Leftover from the simulated-data version (prediction on X_test), disabled:
#y_pred = svr.predict(X_test)

In [None]:
# Apply the same preprocessing as the training features (integer cast, plain
# ndarray) so the model sees test inputs in the form it was fitted on; passing
# the raw DataFrame also triggers sklearn's feature-name mismatch warning.
y_pred = svr.predict(test.astype("int").values)
print(y_pred)

In [None]:
# Value of 'ABS34' at index label 7, shown as this cell's output.
# NOTE(review): presumably the actual value for the row in `test` — holds if
# `df` has the default 0..n-1 index.
df.loc[7,'ABS34']

In [None]:
# Compute mean squared error (MSE) and coefficient of determination (R²).
# y_pred comes from the real held-out row(s) in `test`, so it must be compared
# with the real 'ABS34' values of those rows — not with y_test from the
# simulated split, which has a different length. The integer cast mirrors how
# the training labels were prepared.
y_true = df.loc[test.index, 'ABS34'].astype("int").values
mse = mean_squared_error(y_true, y_pred)
# R² is undefined for fewer than two samples; report NaN instead of calling
# r2_score in that case.
r2 = r2_score(y_true, y_pred) if len(y_true) >= 2 else float("nan")
 
print("均方误差 (MSE):", mse)
print("决定系数 (R²):", r2)

In [None]:
# Plot predicted vs. actual values for the held-out row(s).
# y_pred holds one value per row of `test`; X_test / y_test belong to the
# simulated split and have a different length, so the real 'ABS34' values are
# plotted against the row index instead.
row_index = test.index.to_numpy()
y_actual = df.loc[test.index, 'ABS34'].astype("int").values
plt.scatter(row_index, y_actual, color='darkorange', label='Actual')
plt.scatter(row_index, y_pred, color='navy', label='Predicted')
plt.plot(row_index, y_pred, color='blue', linewidth=2, label='SVR Model')
plt.title('SVR: Actual vs Predicted')
plt.xlabel('Row index')
plt.ylabel('Target')
plt.legend()
plt.show()