# 要做的事情
1. 用TxSEml在26个沙门氏菌染色体上扫描所有的T1/2/6型分泌蛋白；
2. 尽可能保障并证明这些分泌蛋白是准确的；
3. 观察这些分泌蛋白在沙门氏菌中的分布和进化。

# 需要的图
## 图8A
I、II、VI型分泌蛋白直系同源家族的总体总结：一个柱状图，三个柱子，分别表示I、II、VI型分泌蛋白家族的数目
> 图8A主要反映细菌中I/II/VI型分泌蛋白可能的数量

## 图8B
也做柱状图，反映26个菌，每个菌中I、II、VI型分泌蛋白的数量

是画26×3个柱子，还是26个堆叠柱子，或者拆成三个小图（I/II/VI）？

> 图8B主要反映I/II/VI型分泌蛋白在沙门氏菌中的数量波动

## 图8C
聚类热图：行按26个菌的进化树顺序排列（聚类）；每个菌一行，每个T1/2/6基因（家族）一列；某个菌含有相应的T1/2/6基因（家族）就用颜色标出，没有则不着色。T1/2/6分别做成三个小图。

# 代码部分

## 定义阈值

In [1]:
class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes.

    The instance's attribute namespace is bound to the mapping itself, so
    ``d.key`` and ``d["key"]`` always refer to the same value.
    """

    def __init__(self, *positional, **named):
        super().__init__(*positional, **named)
        # Alias the attribute dict to the mapping so both views stay in sync.
        self.__dict__ = self

# Per-type score cutoffs. A family is counted as a predicted secreted protein
# of a given type when its score is >= the value listed here.
threshold = AttrDict(T1=1, T2=0.9, T6=1)

## 加载数据

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Workbook with one strain per column and one family UID per row.
family_df_path = "data/Salmonellosis/26-Salmonella-PG-ann.xlsx"
# Workbook of prediction scores; it is read one sheet per type (T1/T2/T6).
predict_result_path = (
    "out/libfeatureselection/Part6.Application/TxSEml_data/table.xlsx"
)

In [4]:
# Family table: first column is the index, first row is the header.
family_df = pd.read_excel(family_df_path, index_col=[0], header=[0])

# Prediction scores, one sheet per secretion type, all parsed the same way.
predict_result_df_T1, predict_result_df_T2, predict_result_df_T6 = (
    pd.read_excel(
        predict_result_path,
        sheet_name=sheet,
        index_col=[0],
        header=[0],
    )
    for sheet in ("T1", "T2", "T6")
)

  warn(msg)


In [5]:
# Take the first data column of each per-type sheet as that type's score.
# NOTE(review): the values are copied positionally and re-labelled with the
# T1 sheet's index, so this assumes all three sheets list the same rows in
# the same order -- confirm against the workbook.
t1_scores = predict_result_df_T1.iloc[:, 0].to_list()
t2_scores = predict_result_df_T2.iloc[:, 0].to_list()
t6_scores = predict_result_df_T6.iloc[:, 0].to_list()

predict_result_df = pd.DataFrame(
    {"T1": t1_scores, "T2": t2_scores, "T6": t6_scores},
    index=predict_result_df_T1.index,
)
predict_result_df.head(5)

Unnamed: 0,T1,T2,T6
26CG0001,0.0,0.1,0.545455
26CG0002,0.090909,0.0,0.0
26CG0003,0.0,0.0,0.0
26CG0004,0.0,0.0,0.909091
26CG0005,0.0,0.0,0.0


## FigA 用阈值去Cutoff

In [6]:
# One cutoff per column, in the same T1/T2/T6 order as predict_result_df.
cutoffs = np.array([threshold.T1, threshold.T2, threshold.T6])
# 1 where the score reaches its type's cutoff, 0 otherwise.
passes_cutoff = predict_result_df >= cutoffs
predict_result_binary_df = passes_cutoff.astype(int)
# Number of positive families per type.
predict_result_binary_df.sum()

T1    344
T2    285
T6    365
dtype: int64

In [7]:
def _label_column(col):
    """Replace a 0/1 column with 'T<n>SP' / 'non-T<n>SP' text labels.

    <n> is the second character of the column name.
    """
    type_digit = col.name[1]
    labels = {
        1: f"T{type_digit}SP",
        0: f"non-T{type_digit}SP",
    }
    return col.replace(labels)


# Text version of the binary call table, labelled column by column.
predict_result_text_df = predict_result_binary_df.apply(_label_column, axis=0)

In [8]:
# Write both the 0/1 table and its text-labelled twin, with the index column
# named "UID" in the CSV header.
for frame, csv_path in (
    (
        predict_result_binary_df,
        "out/libfeatureselection/Part6.Application/out/data/predict_result_binary_df.csv",
    ),
    (
        predict_result_text_df,
        "out/libfeatureselection/Part6.Application/out/data/predict_result_text_df.csv",
    ),
):
    frame.to_csv(csv_path, index_label="UID")

## FigB 用阈值去Cutoff，并在菌种中分别研究

In [9]:
# Count how many families carry each label within each type.
# melt() turns the wide table into (variable=type column, value=label) pairs;
# GroupBy.size() then gives the number of rows per pair directly, without a
# Python-level apply over every group.
predict_result_text_count_df = (
    pd.melt(predict_result_text_df)
    .groupby(by=["variable", "value"])
    .size()
    .to_frame("num")
)
predict_result_text_count_df.to_csv("out/libfeatureselection/Part6.Application/out/data/predict_result_text_count_df.csv")

In [10]:
# Preview the family table: rows are family UIDs, columns are strains.
family_df.head(3)

Unnamed: 0_level_0,1121,11_01853,11_01854,11_01855,14028S,2439-64,287_91,ATCC9120,ATCC9150,ATCC_BAA_1581,...,RKS2983,RKS2986,RKS3013,RKS3027,RKS3044,RKS3057,RKS4594,RSK2980,SPB7,ST114
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26CG0001,SEI61121_RS35145,UQ48_RS05315,UQ49_RS05315,UQ50_RS05310,STM14_RS01660,AJH72_RS07810,SG_RS01100,SEEP9120_RS05265,SPA_RS01115,SEHO0A_RS07950,...,N898_RS13290,RKS2986_RS02630,RKS3013_RS02285,RKS3027_RS02400,N643_RS01015,RKS3057_RS02295,SPC_RS01095,SARI_RS13030,SPAB_RS01115,LFZ92_RS18560
26CG0002,SEI61121_RS34855,UQ48_RS08685,UQ49_RS08685,UQ50_RS08675,STM14_RS05320,AJH72_RS12190,SG_RS04640,SEEP9120_RS13965,SPA_RS09160,SEHO0A_RS11055,...,N898_RS09575,RKS2986_RS22495,RKS3013_RS21185,RKS3027_RS21445,N643_RS04115,RKS3057_RS22465,SPC_RS04815,SARI_RS09285,SPAB_RS10335,LFZ92_RS14600
26CG0003,SEI61121_RS27365,UQ48_RS21295,UQ49_RS21295,UQ50_RS21260,STM14_RS18335,AJH72_RS04430,SG_RS20135,SEEP9120_RS01895,SPA_RS16835,SEHO0A_RS22495,...,N898_RS19370,RKS2986_RS09725,RKS3013_RS09240,RKS3027_RS08930,N643_RS15140,RKS3057_RS09200,SPC_RS17520,SARI_RS19510,SPAB_RS17505,LFZ92_RS02080


In [11]:
# Strain names, one per line, in the order used for the per-strain outputs.
# The context manager closes the file handle once the lines are read.
with open("out/libfeatureselection/Part6.Application/out/data/26Order.txt", "r", encoding="UTF-8") as order_file:
    ordered_col_name = order_file.read().splitlines()

In [12]:
# Map each strain (a column label of family_df) to the set of family UIDs it
# carries; a cell equal to "-" is treated as "family absent in this strain".
family_sp_dict = {}
for strain in family_df.columns:
    strain_column = family_df[strain]
    is_present = strain_column != "-"
    family_sp_dict[strain] = set(strain_column[is_present].index.to_list())

In [13]:
# Strain names read from the order file are strings, but family_df's column
# labels are not guaranteed to be (the strain "1121" is stored as an integer
# label). Look strains up through a string-keyed view instead of
# special-casing individual names.
families_by_strain = {
    str(strain): families for strain, families in family_sp_dict.items()
}

# One row per (type, strain): how many families present in that strain are
# predicted positive for that type.
result = []
for prottype_str in predict_result_binary_df.columns:
    is_positive = predict_result_binary_df[prottype_str] == 1
    pred_pos_uid = set(predict_result_binary_df[is_positive].index.to_list())

    for sp_name in ordered_col_name:
        shared_families = families_by_strain[sp_name] & pred_pos_uid
        result.append([prottype_str, f"{prottype_str}SP", sp_name, len(shared_families)])
        # Disabled: complementary count of non-positive families per strain.
        # result.append([prottype_str, f"non-{prottype_str}SP" ,sp_name, len(family_sp_dict[sp_name] - pred_pos_uid)])

In [14]:
# Tabulate the per-strain counts and save them without the row index.
strain_count_df = pd.DataFrame(
    result,
    columns=["ProtType", "Type", "Strain", "Count"],
)
strain_count_df.to_csv(
    "out/libfeatureselection/Part6.Application/out/data/Strain_sp_col.csv",
    index=False,
)

## FigC 热图绘制

In [31]:
# Strain names read from the order file are strings, but family_df's column
# labels are not guaranteed to be (the strain "1121" is stored as an integer
# label). Look strains up through a string-keyed view instead of
# special-casing individual names.
families_by_strain = {
    str(strain): families for strain, families in family_sp_dict.items()
}

# For each type, build a strain x family presence matrix restricted to the
# families predicted positive for that type (1 = present in the strain,
# 0 = absent) and write it to its own CSV.
pred_pos_uid_list_all = {}
for prottype_str in predict_result_binary_df.columns:
    is_positive = predict_result_binary_df[prottype_str] == 1
    pred_pos_uid_list = predict_result_binary_df[is_positive].index.to_list()
    # Built once per type; the strain loop below only intersects against it.
    pred_pos_uid_set = set(pred_pos_uid_list)

    presence_df = pd.DataFrame(
        np.zeros((len(ordered_col_name), len(pred_pos_uid_list))),
        index=ordered_col_name,
        columns=pred_pos_uid_list,
    )

    for sp_name in ordered_col_name:
        present_families = list(families_by_strain[sp_name] & pred_pos_uid_set)
        presence_df.loc[sp_name, present_families] = 1

    pred_pos_uid_list_all[prottype_str] = presence_df
    presence_df.to_csv(f"out/libfeatureselection/Part6.Application/out/data/FigC/{prottype_str}.csv")