In [1]:
import gseapy as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import warnings
# Blanket suppression keeps the saved notebook output clean, but it also hides
# deprecation warnings from seaborn/pandas; narrow or remove it when debugging.
warnings.filterwarnings('ignore')

# set_theme() applies context, style and font scale in one call. Its `context`
# argument defaults to "notebook", so a separate sns.set_context("paper") issued
# *before* it is overridden and never takes effect; pass the context here instead.
sns.set_theme(context="paper", style="whitegrid", font_scale=0.8)

# Resolution (dots per inch) used by plot_KEGG when saving figures.
dpi = 600

In [2]:
def _prepare_kegg_table(enr, top_nums):
    """Return the `top_nums` most significant terms plus the derived plotting columns.

    Adds 'Term_clean', '-log10(AdjP)' and 'Genes_in_set' to a private copy of the
    selected rows of `enr.results`.

    :raises ValueError: if `enr.results` has no rows.
    """
    results_df = (
        enr.results
        .sort_values(by='Adjusted P-value')
        .head(top_nums)
        .copy()  # own the rows so the assignments below never write into enr.results
    )
    if results_df.empty:
        raise ValueError("enr.results contains no enriched terms to plot")

    # Prettify pathway names: drop the "KEGG_" prefix, underscores -> spaces,
    # first letter upper-case and the rest lower-case.
    results_df['Term_clean'] = (
        results_df['Term']
        .str.replace("KEGG_", "", regex=False)
        .str.replace("_", " ", regex=False)
        .str.capitalize()
    )

    # Clip at the smallest positive float so a p-value that underflowed to 0
    # yields a large finite value instead of inf (which cannot be drawn).
    adjusted_p = results_df['Adjusted P-value'].clip(lower=np.finfo(float).tiny)
    results_df['-log10(AdjP)'] = -np.log10(adjusted_p)

    # 'Overlap' is formatted as "<hits>/<set size>"; keep the number of hits.
    results_df['Genes_in_set'] = results_df['Overlap'].str.split('/').str[0].astype(int)
    return results_df


def _plot_kegg_bar(results_df, show, file_name):
    """Draw the horizontal bar chart of -log10(adjusted p-value) per pathway."""
    fig, ax = plt.subplots(figsize=(7, 8))

    # `palette` without `hue` is deprecated in seaborn 0.13. Assigning the
    # category to `hue` with dodge=False gives the same one-colour-per-bar
    # result on both old and new seaborn versions.
    sns.barplot(
        data=results_df.sort_values(by='-log10(AdjP)', ascending=False),
        x='-log10(AdjP)',
        y='Term_clean',
        hue='Term_clean',
        dodge=False,
        palette='viridis',
        ax=ax,
    )
    # Depending on the seaborn version a (redundant) hue legend may be added.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()

    ax.set_xlabel(r'$-\log_{10}(\mathrm{Adjusted\ P-value})$', fontsize=12)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=11)
    sns.despine(ax=ax)
    fig.tight_layout()

    if file_name:
        fig.savefig(f"{file_name}_bar.png", dpi=dpi, bbox_inches='tight')
    if show:
        plt.show()
    plt.close(fig)  # release the figure; this function is called in loops


def _plot_kegg_bubble(results_df, show, file_name):
    """Draw the bubble chart (x: combined score, size: gene count, colour: -log10 adj. p)."""
    size_range = (50, 500)  # marker areas (points^2) for the smallest / largest gene count

    fig = plt.figure(figsize=(6, 7))
    gs = gridspec.GridSpec(1, 2, width_ratios=[8, 2], wspace=0.1)
    ax = fig.add_subplot(gs[0])

    sns.scatterplot(
        data=results_df,
        x='Combined Score',
        y='Term_clean',
        size='Genes_in_set',
        hue='-log10(AdjP)',
        palette='viridis_r',
        sizes=size_range,
        edgecolor='black',
        linewidth=0.5,
        ax=ax,
        legend=False  # legends are built by hand below so they can be aligned
    )

    ax.set_xlabel('Combined Score', fontsize=12)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=11)
    ax.grid(True, axis='x', linestyle='--', alpha=0.6)
    ax.margins(y=0.05)
    sns.despine(ax=ax)

    # --- Hand-placed legends, aligned on a common x position in figure coords ---
    x_pos = 0.82

    # Size legend (gene count): min, midpoint and max, without duplicates when
    # the range is narrow.
    count_min = int(results_df['Genes_in_set'].min())
    count_max = int(results_df['Genes_in_set'].max())
    legend_counts = sorted({count_min, (count_min + count_max) // 2, count_max})

    # Reproduce the linear count -> area mapping that `sizes=size_range` applies
    # in the scatterplot, so legend markers match the plotted bubbles.
    area_lo, area_hi = size_range
    count_span = count_max - count_min

    def marker_area(count):
        if count_span == 0:
            return area_lo
        return area_lo + (count - count_min) / count_span * (area_hi - area_lo)

    legend_handles = [
        ax.scatter([], [], s=marker_area(count), color='gray', edgecolor='black', linewidth=0.5)
        for count in legend_counts
    ]
    fig.legend(handles=legend_handles,
               labels=[str(count) for count in legend_counts],
               bbox_to_anchor=(x_pos, 0.88),
               loc='upper left',
               title='Gene Count',
               frameon=False,
               fontsize=11,
               title_fontsize=12,
               labelspacing=1.2)

    # Colour bar for -log10(adjusted p-value), same colormap as the scatter hue.
    cax = fig.add_axes([x_pos, 0.25, 0.03, 0.3])
    norm = plt.Normalize(results_df['-log10(AdjP)'].min(), results_df['-log10(AdjP)'].max())
    sm = plt.cm.ScalarMappable(cmap="viridis_r", norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, cax=cax, orientation='vertical')
    cbar.ax.tick_params(labelsize=10)
    cbar.set_label(r'$\log_{10}\left(\frac{1}{\mathrm{FDR}}\right)$', rotation=270, labelpad=20, fontsize=12)

    if file_name:
        fig.savefig(f"{file_name}_dot.png", dpi=dpi, bbox_inches='tight')
    if show:
        plt.show()
    plt.close(fig)


def plot_KEGG(enr=None, top_nums=15, show=False, file_name=None):
    """
    Plot KEGG pathway enrichment results as a bar chart and a bubble chart.

    The `top_nums` terms with the smallest 'Adjusted P-value' are plotted. Figures
    are saved at the module-level `dpi` resolution.

    :param enr: enrichment result object exposing a `results` DataFrame with the
        columns 'Term', 'Adjusted P-value', 'Overlap' and 'Combined Score'
    :param top_nums: number of pathways to display
    :param show: if True, display each figure with plt.show() before closing it
    :param file_name: path prefix for the saved figures; writes
        "<file_name>_bar.png" and "<file_name>_dot.png". Nothing is saved if None.
    :return: None
    :raises ValueError: if `enr.results` contains no rows
    """
    results_df = _prepare_kegg_table(enr, top_nums)
    _plot_kegg_bar(results_df, show, file_name)
    _plot_kegg_bubble(results_df, show, file_name)

In [5]:
# Gene symbols listed in the 'protein1' column of the PPI table; only SHAP
# features that also appear here are submitted for enrichment.
ppi_genes = set(pd.read_csv('../data/knowledge/PPI_data_min700.txt', sep='\t')['protein1'].tolist())
data_name = ['monaco_pbmc', 'sdy67', 'microarray', 'GSE107572', 'GSE120502', 'monaco2', 'sdy67_250', 'brain_human']
for name in data_name:
    print(f'Processing {name}...')
    # One feature name per line.
    with open(f'../results/plot/SHAP/{name}/top200_knowledge_features.txt', 'r') as f:
        features = {line.strip() for line in f}

    # Sorted so the submitted list is identical from run to run; iteration order
    # of a set of strings can differ between interpreter sessions.
    gene_list = sorted(features & ppi_genes)
    if not gene_list:
        # Nothing to test; skip this dataset rather than aborting the whole batch.
        print(f'  No top features overlap the PPI gene set for {name}; skipping.')
        continue

    enr = gp.enrichr(gene_list=gene_list,
                     gene_sets='../data/knowledge/c2.cp.kegg_legacy.v2024.1.Hs.symbols.gmt',
                     organism='Human',
                     cutoff=0.5
                     )

    plot_KEGG(enr, top_nums=15, file_name=f'../results/plot/SHAP/{name}/KEGG_enrichment')


Processing monaco_pbmc...
Processing sdy67...
Processing microarray...
Processing GSE107572...
Processing GSE120502...
Processing monaco2...
Processing sdy67_250...
Processing brain_human...


In [6]:
# Gene symbols listed in the 'protein1' column of the PPI table; only SHAP
# features that also appear here are submitted for enrichment.
ppi_genes = set(pd.read_csv('../data/knowledge/PPI_data_min700.txt', sep='\t')['protein1'].tolist())
data_name = ['CRC-sEV']
for name in data_name:
    # This dataset has one feature list per tissue type.
    for tissue in ['normal', 'cancer']:
        # One feature name per line.
        with open(f'../results/plot/SHAP/{name}/{tissue}/top200_knowledge_features.txt', 'r') as f:
            features = {line.strip() for line in f}

        # Sorted so the submitted list is identical from run to run; iteration
        # order of a set of strings can differ between interpreter sessions.
        gene_list = sorted(features & ppi_genes)
        if not gene_list:
            # Nothing to test; skip this tissue rather than aborting the loop.
            print(f'No top features overlap the PPI gene set for {name}/{tissue}; skipping.')
            continue

        enr = gp.enrichr(gene_list=gene_list,
                         gene_sets='../data/knowledge/c2.cp.kegg_legacy.v2024.1.Hs.symbols.gmt',
                         organism='Human',
                         cutoff=0.5
                         )

        plot_KEGG(enr, top_nums=15, file_name=f'../results/plot/SHAP/{name}/{tissue}/KEGG_enrichment')
