In [1]:
import pandas as pd
import os
import numpy as np
import shap
import matplotlib.pyplot as plt
import seaborn as sns
# Start from seaborn's "paper" plotting context.
# NOTE(review): the next cell calls sns.set_theme(...) without a `context`
# argument; per the seaborn docs that argument defaults to "notebook", which
# would replace the context chosen here -- confirm which one is intended.
sns.set_context("paper")
# Resolution (dots per inch) passed as `dpi=` to every plt.savefig call below.
dpi = 600

In [2]:
# Global seaborn theme for the figures in this notebook: "white" style with
# font sizes scaled by 0.8.
# NOTE(review): `context` is not passed, so seaborn's documented default
# ("notebook") applies and likely supersedes the sns.set_context("paper")
# call in the previous cell -- confirm this is intended.
sns.set_theme(style="white", font_scale=0.8)

## Per-gene SHAP importance: expression-only vs. knowledge-integrated

In [3]:
# For every dataset in `data_name` and every mode ('expression' / 'knowledge'):
#   * draw one SHAP summary plot per cell type (20 genes displayed) and save it as PNG;
#   * write that cell type's 100 highest-ranked genes (by mean |SHAP|) to a text file.
cell_type = ['B cell', 'CD4 T cell', 'CD8 T cell', 'Monocyte', 'NK cell', 'Unknown']
data_name = ['monaco_pbmc', 'sdy67', 'microarray', 'GSE107572', 'GSE120502', 'monaco2', 'sdy67_250']
for name in data_name:
    shap_dir = f'../results/SHAP/{name}'
    plot_dir = f'../results/plot/SHAP/{name}'

    # Gene names and the input matrix come from mode-independent paths, so they
    # are read once per dataset instead of once per (dataset, mode) pair.
    with open(f'{shap_dir}/gene_names.txt', 'r') as f:
        feature_names = [line.strip() for line in f]
    # NumPy copy of the names so they can be fancy-indexed with the sorted indices.
    feature_names_np = np.array(feature_names)
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only use it on .npy files produced by this project.
    data_tensor = np.load(f'{shap_dir}/X_tensor.npy', allow_pickle=True)

    # exist_ok=True replaces the separate os.path.exists() check.
    os.makedirs(plot_dir, exist_ok=True)

    for mode in ['expression', 'knowledge']:
        # Indexed below as [:, :, i] with i running over cell_type, and the
        # axis-0 mean is indexed against gene names -- presumably
        # (samples, genes, cell types); TODO confirm against the producer.
        shap_values = np.load(f'{shap_dir}/shap_{mode}_by_gene_values.npy', allow_pickle=True)

        if mode == 'expression':
            title_prefix = "Top 20 important genes without knowledge"
        else:
            title_prefix = "Top 20 important genes with knowledge integration"

        for i, label in enumerate(cell_type):
            shap.summary_plot(shap_values[:, :, i], data_tensor, feature_names=feature_names, show=False, max_display=20)

            # summary_plot draws on the current figure; grab its axes to restyle them.
            ax = plt.gca()
            # Drop the x-axis label and put a descriptive title on top instead.
            ax.set_xlabel('')
            ax.set_title(f"{title_prefix}: {label}", fontsize=15)

            # --- Top-100 gene export ---
            # Mean absolute SHAP value per gene for the current cell type.
            mean_abs_shap = np.mean(np.abs(shap_values[:, :, i]), axis=0)
            # np.argsort sorts ascending, so reverse it to rank genes from
            # most to least important.
            sorted_indices = np.argsort(mean_abs_shap)[::-1]
            top_100_indices = sorted_indices[:100]
            top_100_genes = feature_names_np[top_100_indices]

            # Spaces in the label are replaced by underscores for the text file name.
            safe_label = label.replace(' ', '_')
            output_path = f'{plot_dir}/top100_genes_{mode}_{safe_label}.txt'
            with open(output_path, 'w') as f:
                f.writelines(f"{gene}\n" for gene in top_100_genes)

            # NOTE(review): the PNG name keeps the raw label (spaces included),
            # unlike the gene-list file above; left unchanged so existing
            # output paths stay the same.
            plt.savefig(f'{plot_dir}/shap_{mode}_by_gene_{label}.png', dpi=dpi, bbox_inches='tight')
            # Close the figure so figures do not accumulate across iterations.
            plt.close()

In [4]:
# Same per-gene SHAP workflow as the previous cell, applied to the
# 'brain_human' dataset and its own list of cell types: one summary plot per
# (mode, cell type) plus a text file with the 100 highest-ranked genes.
cell_type = ['Astrocytes', 'Endothelial', 'Microglia', 'Neurons', 'Oligodendrocytes']
data_name = ['brain_human']
for name in data_name:
    shap_dir = f'../results/SHAP/{name}'
    plot_dir = f'../results/plot/SHAP/{name}'

    # Gene names and the input matrix come from mode-independent paths, so they
    # are read once per dataset instead of once per (dataset, mode) pair.
    with open(f'{shap_dir}/gene_names.txt', 'r') as f:
        feature_names = [line.strip() for line in f]
    # NumPy copy of the names so they can be fancy-indexed with the sorted indices.
    feature_names_np = np.array(feature_names)
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only use it on .npy files produced by this project.
    data_tensor = np.load(f'{shap_dir}/X_tensor.npy', allow_pickle=True)

    # exist_ok=True replaces the separate os.path.exists() check.
    os.makedirs(plot_dir, exist_ok=True)

    for mode in ['expression', 'knowledge']:
        # Indexed below as [:, :, i] with i running over cell_type --
        # presumably (samples, genes, cell types); TODO confirm.
        shap_values = np.load(f'{shap_dir}/shap_{mode}_by_gene_values.npy', allow_pickle=True)

        if mode == 'expression':
            title_prefix = "Top 20 important genes without knowledge"
        else:
            title_prefix = "Top 20 important genes with knowledge integration"

        for i, label in enumerate(cell_type):
            shap.summary_plot(shap_values[:, :, i], data_tensor, feature_names=feature_names, show=False, max_display=20)

            # summary_plot draws on the current figure; grab its axes to restyle them.
            ax = plt.gca()
            # Drop the x-axis label and put a descriptive title on top instead.
            ax.set_xlabel('')
            ax.set_title(f"{title_prefix}: {label}", fontsize=15)

            # --- Top-100 gene export ---
            # Mean absolute SHAP value per gene for the current cell type.
            mean_abs_shap = np.mean(np.abs(shap_values[:, :, i]), axis=0)
            # np.argsort sorts ascending, so reverse it to rank genes from
            # most to least important.
            sorted_indices = np.argsort(mean_abs_shap)[::-1]
            top_100_indices = sorted_indices[:100]
            top_100_genes = feature_names_np[top_100_indices]

            # Spaces in the label are replaced by underscores for the text file name.
            safe_label = label.replace(' ', '_')
            output_path = f'{plot_dir}/top100_genes_{mode}_{safe_label}.txt'
            with open(output_path, 'w') as f:
                f.writelines(f"{gene}\n" for gene in top_100_genes)

            # The PNG name uses the raw label rather than safe_label; left
            # unchanged so existing output paths stay the same.
            plt.savefig(f'{plot_dir}/shap_{mode}_by_gene_{label}.png', dpi=dpi, bbox_inches='tight')
            # Close the figure so figures do not accumulate across iterations.
            plt.close()

In [5]:
# Per-gene SHAP workflow for the 'CRC-sEV' dataset, whose SHAP values and
# input matrices are stored separately per tissue ('normal' / 'cancer').
# For each (mode, tissue, cell type) a summary plot is saved and the 100
# highest-ranked genes are written to a text file under the tissue folder.
cell_type = ['sEV', 'Others']
data_name = ['CRC-sEV']
for name in data_name:
    shap_root = f'../results/SHAP/{name}'

    # gene_names.txt lives at the dataset level (not per tissue or mode), so
    # it is read once per dataset rather than once per (mode, tissue) pair.
    with open(f'{shap_root}/gene_names.txt', 'r') as f:
        feature_names = [line.strip() for line in f]
    # NumPy copy of the names so they can be fancy-indexed with the sorted indices.
    feature_names_np = np.array(feature_names)

    for mode in ['expression', 'knowledge']:
        if mode == 'expression':
            title_prefix = "Top 20 important genes without knowledge"
        else:
            title_prefix = "Top 20 important genes with knowledge integration"

        for tissue in ['normal', 'cancer']:
            # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
            # objects; only use it on .npy files produced by this project.
            # shap_values is indexed below as [:, :, i] with i over cell_type --
            # presumably (samples, genes, cell types); TODO confirm.
            shap_values = np.load(f'{shap_root}/{tissue}/shap_{mode}_by_gene_values.npy', allow_pickle=True)
            data_tensor = np.load(f'{shap_root}/{tissue}/X_tensor.npy', allow_pickle=True)

            # os.makedirs creates missing parent directories too, so a single
            # call with exist_ok=True covers both the dataset and tissue folders.
            plot_dir = f'../results/plot/SHAP/{name}/{tissue}'
            os.makedirs(plot_dir, exist_ok=True)

            for i, label in enumerate(cell_type):
                shap.summary_plot(shap_values[:, :, i], data_tensor, feature_names=feature_names, show=False, max_display=20)

                # summary_plot draws on the current figure; grab its axes to restyle them.
                ax = plt.gca()
                # The x-axis label shows which tissue the plot belongs to
                # ("Normal" / "Cancer"); the title carries mode and cell type.
                ax.set_xlabel(tissue.capitalize())
                ax.set_title(f"{title_prefix}: {label}", fontsize=15)

                # --- Top-100 gene export ---
                # Mean absolute SHAP value per gene for the current cell type.
                mean_abs_shap = np.mean(np.abs(shap_values[:, :, i]), axis=0)
                # np.argsort sorts ascending, so reverse it to rank genes from
                # most to least important.
                sorted_indices = np.argsort(mean_abs_shap)[::-1]
                top_100_indices = sorted_indices[:100]
                top_100_genes = feature_names_np[top_100_indices]

                # Spaces in the label are replaced by underscores for the text file name.
                safe_label = label.replace(' ', '_')
                output_path = f'{plot_dir}/top100_genes_{mode}_{safe_label}.txt'
                with open(output_path, 'w') as f:
                    f.writelines(f"{gene}\n" for gene in top_100_genes)

                # The PNG name uses the raw label rather than safe_label; left
                # unchanged so existing output paths stay the same.
                plt.savefig(f'{plot_dir}/shap_{mode}_by_gene_{label}.png', dpi=dpi, bbox_inches='tight')
                # Close the figure so figures do not accumulate across iterations.
                plt.close()

## Knowledge-feature SHAP importance

In [6]:
# For every dataset: rank the knowledge features by SHAP importance, write the
# 200 highest-ranked feature names to a text file, and save a summary plot
# that displays the first 20.
data_name = ['monaco_pbmc', 'sdy67', 'microarray', 'GSE107572', 'GSE120502', 'monaco2', 'sdy67_250', 'brain_human']
for name in data_name:
    shap_dir = f'../results/SHAP/{name}'
    plot_dir = f'../results/plot/SHAP/{name}'

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only use it on .npy files produced by this project.
    shap_values = np.load(f'{shap_dir}/shap_knowledge_values.npy', allow_pickle=True)
    knowledge_tensor = np.load(f'{shap_dir}/knowledge_tensor.npy', allow_pickle=True)

    # Collapse axis 2 by averaging absolute SHAP values over it.
    # Presumably axis 2 indexes the model outputs (cell types) -- TODO confirm.
    shap_vals_mean = np.mean(np.abs(shap_values), axis=2)

    with open(f'{shap_dir}/knowledge_names.txt', 'r') as f:
        feature_names = [line.strip() for line in f]
    # NumPy copy of the names so they can be fancy-indexed with the sorted indices.
    feature_names_np = np.array(feature_names)

    # --- Top-200 feature export ---
    # Per-feature importance = mean over axis 0 of the values computed above.
    # They are already non-negative, so no further np.abs is needed.
    # (Original author's note: this is the quantity summary_plot ranks by.)
    importance_scores = np.mean(shap_vals_mean, axis=0)
    # np.argsort sorts ascending, so reverse it to get descending importance.
    sorted_indices = np.argsort(importance_scores)[::-1]
    top_200_indices = sorted_indices[:200]
    top_200_features = feature_names_np[top_200_indices]

    # exist_ok=True replaces the separate os.path.exists() check.
    os.makedirs(plot_dir, exist_ok=True)

    with open(f'{plot_dir}/top200_knowledge_features.txt', 'w') as f:
        f.writelines(f"{feature}\n" for feature in top_200_features)

    shap.summary_plot(shap_vals_mean, knowledge_tensor, feature_names=feature_names, show=False, max_display=20)
    # summary_plot draws on the current figure; grab its axes to restyle them.
    ax = plt.gca()

    # Widen the figure, then re-pack the layout to reduce overlap.
    # NOTE(review): tight_layout() runs before the label/title are set and
    # savefig already uses bbox_inches='tight'; kept in place so the rendered
    # figure does not change -- consider whether it is still needed.
    plt.gcf().set_size_inches(10, 8)
    plt.tight_layout()

    # Drop the x-axis label and add a title.
    ax.set_xlabel('')
    ax.set_title("Top 20 important knowledge features by SHAP", fontsize=15)

    plt.savefig(f'{plot_dir}/shap_knowledge_features.png', dpi=dpi, bbox_inches='tight')
    # Close the figure so figures do not accumulate across iterations.
    plt.close()

  plt.tight_layout()
  plt.tight_layout()  # 布局紧凑排布，减少重叠


In [7]:
# Knowledge-feature SHAP workflow for the 'CRC-sEV' dataset, run separately
# for each tissue ('normal' / 'cancer'): write the 200 highest-ranked feature
# names to a text file and save a summary plot that displays the first 20.
data_name = ['CRC-sEV']
for name in data_name:
    shap_root = f'../results/SHAP/{name}'

    # knowledge_names.txt lives at the dataset level (not per tissue), so it
    # is read once per dataset rather than once per tissue.
    with open(f'{shap_root}/knowledge_names.txt', 'r') as f:
        feature_names = [line.strip() for line in f]
    # NumPy copy of the names so they can be fancy-indexed with the sorted indices.
    feature_names_np = np.array(feature_names)

    for tissue in ['normal', 'cancer']:
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only use it on .npy files produced by this project.
        shap_values = np.load(f'{shap_root}/{tissue}/shap_knowledge_values.npy', allow_pickle=True)
        knowledge_tensor = np.load(f'{shap_root}/{tissue}/knowledge_tensor.npy', allow_pickle=True)

        # Collapse axis 2 by averaging absolute SHAP values over it.
        # Presumably axis 2 indexes the model outputs -- TODO confirm.
        shap_vals_mean = np.mean(np.abs(shap_values), axis=2)

        # --- Top-200 feature export ---
        # Per-feature importance = mean over axis 0 of the values computed
        # above. They are already non-negative, so no further np.abs is needed.
        # (Original author's note: this is the quantity summary_plot ranks by.)
        importance_scores = np.mean(shap_vals_mean, axis=0)
        # np.argsort sorts ascending, so reverse it to get descending importance.
        sorted_indices = np.argsort(importance_scores)[::-1]
        top_200_indices = sorted_indices[:200]
        top_200_features = feature_names_np[top_200_indices]

        # os.makedirs creates missing parent directories too, so a single call
        # with exist_ok=True covers both the dataset and tissue folders.
        plot_dir = f'../results/plot/SHAP/{name}/{tissue}'
        os.makedirs(plot_dir, exist_ok=True)

        with open(f'{plot_dir}/top200_knowledge_features.txt', 'w') as f:
            f.writelines(f"{feature}\n" for feature in top_200_features)

        shap.summary_plot(shap_vals_mean, knowledge_tensor, feature_names=feature_names, show=False, max_display=20)
        # summary_plot draws on the current figure; grab its axes to restyle them.
        ax = plt.gca()

        # Widen the figure, then re-pack the layout to reduce overlap.
        # NOTE(review): tight_layout() runs before the label/title are set and
        # savefig already uses bbox_inches='tight'; kept in place so the
        # rendered figure does not change -- consider whether it is still needed.
        plt.gcf().set_size_inches(10, 8)
        plt.tight_layout()

        # The x-axis label shows which tissue the plot belongs to
        # ("Normal" / "Cancer").
        ax.set_xlabel(tissue.capitalize())
        ax.set_title("Top 20 important knowledge features by SHAP", fontsize=15)

        plt.savefig(f'{plot_dir}/shap_knowledge_features.png', dpi=dpi, bbox_inches='tight')
        # Close the figure so figures do not accumulate across iterations.
        plt.close()

  plt.tight_layout()  # 布局紧凑排布，减少重叠
