# DAC分组问题

In [1]:
import sys
sys.path.append("src")
import os
import json

import libpybiofeature

import utils
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

In [2]:
def load_DAC_feature(TxSE_args: dict):

    # Extract Feature
    seq_id_dict = None
    with open(TxSE_args['seq_id'], 'r', encoding='UTF-8') as f:
        seq_id_dict = json.load(f)

    # DAC
    DAC_feature = {
        "name": "DAC",
        "t_p": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['t']['p'],
            seq_id_list=seq_id_dict['t']['p'],
            desc='t_p'
        ),
        "t_n": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['t']['n'],
            seq_id_list=seq_id_dict['t']['n'],
            desc='t_n'
        ),
        "v_p": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['v']['p'],
            seq_id_list=seq_id_dict['v']['p'],
            desc='v_p'
        ),
        "v_n": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['v']['n'],
            seq_id_list=seq_id_dict['v']['n'],
            desc='v_n'
        ),
    }

    DAC_feature['p'] = pd.concat([
        DAC_feature['t_p'],
        DAC_feature['v_p'],
    ])
    DAC_feature['n'] = pd.concat([
        DAC_feature['t_n'],
        DAC_feature['v_n'],
    ])

    return DAC_feature

In [3]:
prot_type = 6
cter_bool = False
Tx_arg = {
    "type": f'T{prot_type}',
    'seq_id': os.path.join(work_Dir, *['data', 'db', f'T{prot_type}', 'seq_id.json']),
    'fasta': {
        'cter': cter_bool,
        't': {
            'p': os.path.join(work_Dir, *['data', 'db', f'T{prot_type}', 't_p.fasta']),
            'n': os.path.join(work_Dir, *['data', 'db', f'T{prot_type}', 't_n.fasta'])
        },
        'v': {
            'p': os.path.join(work_Dir, *['data', 'db', f'T{prot_type}', 'v_p.fasta']),
            'n': os.path.join(work_Dir, *['data', 'db', f'T{prot_type}', 'v_n.fasta'])
        },
    },
}
save_dir = "out/libfeatureselection/Six_feature_research/dac_all/origin/"
os.makedirs(save_dir, exist_ok=True)

In [4]:
aac_data = load_DAC_feature(
    TxSE_args=Tx_arg
)
aa_type = list(aac_data['p'].columns)

t_p_DAC: 100%|██████████| 55/55 [00:00<00:00, 7492.26it/s]
t_n_DAC: 100%|██████████| 55/55 [00:00<00:00, 9805.19it/s]
v_p_DAC: 100%|██████████| 37/37 [00:00<00:00, 13005.05it/s]
v_n_DAC: 100%|██████████| 37/37 [00:00<00:00, 11419.37it/s]


In [5]:
import matplotlib as mpl

mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.use14corefonts'] = False
# mpl.rcParams['pdf.usecorefonts'] = True
mpl.rcParams['pdf.compression'] = 9

import matplotlib.pyplot as plt
import scienceplots

plt.style.use(['science', 'nature'])

from matplotlib.ticker import MaxNLocator

import seaborn as sns

In [6]:
aac_data_transformed = pd.concat([aac_data['p'], aac_data['n']], keys=['T1SP', 'non-T1SP']).reset_index(level=0).rename({
    "level_0": "Type"
}, axis=1, inplace=False).melt(id_vars="Type", value_vars=aa_type, ignore_index=False).rename({
    "variable": "Amino acid",
    "value": "DAC"
}, axis=1, inplace=False)

In [7]:
aac_data_transformed

Unnamed: 0,Type,Amino acid,DAC
AAG05028.1,T1SP,"A,A",0.010050
RKO36372.1,T1SP,"A,A",0.023952
AAY95397.1,T1SP,"A,A",0.039409
sp|Q9I069.1|TSE4_PSEAE,T1SP,"A,A",0.020725
WP_033975570.1,T1SP,"A,A",0.006993
...,...,...,...
lcl|NZ_HG322949.1_prot_WP_038496656.1_4371,non-T1SP,"Y,Y",0.000000
lcl|NC_007899.1_prot_WP_011457855.1_295,non-T1SP,"Y,Y",0.003610
lcl|CP000857.1_prot_ACN46596.1_2401,non-T1SP,"Y,Y",0.000000
lcl|AYSJ01000002.1_prot_ETS33381.1_517,non-T1SP,"Y,Y",0.002006


In [8]:
def get_star(p:float):
    if p <= 0.0001:
        return "****"
    elif p <= 0.001:
        return "***"
    elif p <= 0.01:
        return "**"
    elif p <= 0.05:
        return "*"
    else:
        return ""
from scipy.stats import wilcoxon
grouped = aac_data_transformed.groupby("Amino acid")
wilcoxon_result = {}
for name, group in grouped:
    x = group[group['Type'] == 'T1SP']['DAC']
    y = group[group['Type'] == 'non-T1SP']['DAC']
    stat, p = wilcoxon(x, y)
    wilcoxon_result[name] = {
        "statistic": stat,
        "p-value": p
    }
    # print(f'Type: {name}, Wilcoxon rank-sum statistic: {stat:.2f}, p-value: {p:.3f}')
with open(f"{save_dir}/wilcoxon_result.json", "w+", encoding="UTF-8") as f:
    json.dump(wilcoxon_result, f)
aac_data_transformed.to_csv(f"{save_dir}/aac_data_transformed.csv", index_label="Seq_ID")



处理一下，选择出显著的

In [9]:
p_values_series = pd.Series({
    k:v['p-value'] for k, v in wilcoxon_result.items()
}).to_frame().rename({0: "p-value"}, axis=1)
p_values_series[p_values_series['p-value'] <= 0.05].index.to_list()

['A,A',
 'A,C',
 'A,L',
 'A,R',
 'C,G',
 'C,I',
 'C,R',
 'D,K',
 'D,S',
 'D,Y',
 'E,E',
 'E,G',
 'E,I',
 'E,N',
 'E,T',
 'E,Y',
 'F,A',
 'F,N',
 'F,R',
 'G,E',
 'G,K',
 'G,N',
 'G,Y',
 'H,N',
 'H,T',
 'I,G',
 'I,H',
 'I,K',
 'I,T',
 'I,Y',
 'K,D',
 'K,G',
 'K,N',
 'K,T',
 'K,V',
 'L,C',
 'L,H',
 'L,M',
 'L,R',
 'M,L',
 'M,R',
 'M,Y',
 'N,G',
 'N,K',
 'N,Q',
 'N,S',
 'N,V',
 'P,K',
 'P,N',
 'P,R',
 'P,Y',
 'Q,E',
 'Q,G',
 'Q,H',
 'Q,Y',
 'R,A',
 'R,C',
 'R,D',
 'R,T',
 'S,C',
 'T,K',
 'T,N',
 'T,Y',
 'V,L',
 'W,I',
 'W,P',
 'Y,D',
 'Y,K',
 'Y,N',
 'Y,Q',
 'Y,S']