# DAC分组问题

In [1]:
import sys
sys.path.append("src")
import os
import json

import libpybiofeature

import utils
work_Dir = utils.workdir.workdir(os.getcwd(), 4)

import numpy as np
import pandas as pd

In [2]:
from Bio import SeqIO
def load_DAC_feature(TxSE_args: dict):

    # DAC
    DAC_feature = {
        "name": "DAC",
        "p": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['p'],
            seq_id_list=[ seq.id for seq in SeqIO.parse(TxSE_args['fasta']['p'], "fasta") ],
            desc='p'
        ),
        "n": libpybiofeature.featurebuilder.build_dac_feature(
            path_to_fasta=TxSE_args['fasta']['n'],
            seq_id_list=[ seq.id for seq in SeqIO.parse(TxSE_args['fasta']['n'], "fasta") ],
            desc='n'
        ),
    }

    print(DAC_feature['n'].shape[0], DAC_feature['p'].shape[0])

    return DAC_feature

In [3]:
prot_type = 6
cter_bool = False
Tx_arg = {
    "type": f'T{prot_type}',
    'fasta': {
        'cter': cter_bool,
        'p': "data/T6SE/anti-eukaryotic-effector_p.fasta",
        'n': "data/T6SE/anti-eukaryotic-effector_n.fasta"
    },
}
save_dir = "out/libfeatureselection/Six_feature_research/dac/ae"

In [4]:
aac_data = load_DAC_feature(
    TxSE_args=Tx_arg
)
aa_type = list(aac_data['p'].columns)

p_DAC: 100%|██████████| 33/33 [00:00<00:00, 5830.82it/s]
n_DAC: 100%|██████████| 33/33 [00:00<00:00, 11206.54it/s]

33 33





In [5]:
import matplotlib as mpl

mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.use14corefonts'] = False
# mpl.rcParams['pdf.usecorefonts'] = True
mpl.rcParams['pdf.compression'] = 9

import matplotlib.pyplot as plt
import scienceplots

plt.style.use(['science', 'nature'])

from matplotlib.ticker import MaxNLocator

import seaborn as sns

In [6]:
aac_data_transformed = pd.concat([aac_data['p'], aac_data['n']], keys=['T6SP-AE', 'non-T6SP']).reset_index(level=0).rename({
    "level_0": "Type"
}, axis=1, inplace=False).melt(id_vars="Type", value_vars=aa_type, ignore_index=False).rename({
    "variable": "Amino acid",
    "value": "DAC"
}, axis=1, inplace=False)

In [7]:
aac_data_transformed

Unnamed: 0,Type,Amino acid,DAC
PTC37822.1,T6SP-AE,"A,A",0.011885
NP_252177.1,T6SP-AE,"A,A",0.009116
VUY43860.1,T6SP-AE,"A,A",0.017026
GAD68163.1,T6SP-AE,"A,A",0.006656
GAD68164.1,T6SP-AE,"A,A",0.006791
...,...,...,...
lcl|NC_004463.1_prot_NP_770049.1_3409,non-T6SP,"Y,Y",0.000000
lcl|NC_012779.2_prot_WP_015871331.1_1773,non-T6SP,"Y,Y",0.003876
lcl|NZ_CP009322.1_prot_WP_036056218.1_1105,non-T6SP,"Y,Y",0.002551
lcl|NZ_CP011279.2_prot_WP_042115644.1_567,non-T6SP,"Y,Y",0.000000


In [8]:
def get_star(p:float):
    if p <= 0.0001:
        return "****"
    elif p <= 0.001:
        return "***"
    elif p <= 0.01:
        return "**"
    elif p <= 0.05:
        return "*"
    else:
        return ""
from scipy.stats import wilcoxon
grouped = aac_data_transformed.groupby("Amino acid")
wilcoxon_result = {}
for name, group in grouped:
    x = group[group['Type'] == 'T6SP-AE']['DAC']
    y = group[group['Type'] == 'non-T6SP']['DAC']
    stat, p = wilcoxon(x, y)
    wilcoxon_result[name] = {
        "statistic": stat,
        "p-value": p
    }
    # print(f'Type: {name}, Wilcoxon rank-sum statistic: {stat:.2f}, p-value: {p:.3f}')
with open(f"{save_dir}/wilcoxon_result.json", "w+", encoding="UTF-8") as f:
    json.dump(wilcoxon_result, f)
aac_data_transformed.to_csv(f"{save_dir}/aac_data_transformed.csv", index_label="Seq_ID")



处理一下，选择出显著的

In [9]:
p_values_series = pd.Series({
    k:v['p-value'] for k, v in wilcoxon_result.items()
}).to_frame().rename({0: "p-value"}, axis=1)
def swap_char(str_: str):
    str_ = list(str_)
    if str_[0] > str_[2]:
        tmp = str_[2]
        str_[2] = str_[0]
        str_[0] = tmp
    return "".join(str_)
list(set([ swap_char(item) for item in p_values_series[p_values_series['p-value'] <= 0.05].index.to_list()]))

['L,P',
 'A,L',
 'H,Q',
 'I,P',
 'K,Q',
 'D,Y',
 'F,I',
 'E,Q',
 'G,W',
 'I,L',
 'I,Y',
 'V,Y',
 'H,I',
 'N,N',
 'A,V',
 'K,Y',
 'E,N',
 'L,N',
 'D,M',
 'N,S',
 'K,L',
 'K,N',
 'T,Y',
 'D,N',
 'P,P',
 'G,K',
 'I,N',
 'D,I',
 'D,S',
 'F,K',
 'I,K',
 'L,R',
 'R,R',
 'E,K',
 'A,S',
 'Q,Y',
 'E,I',
 'S,S',
 'A,T',
 'N,Y',
 'A,R',
 'K,T',
 'D,D',
 'D,K',
 'F,N',
 'G,S',
 'H,N',
 'I,S',
 'E,Y',
 'G,N',
 'E,F']