CD4/Monocyte Analysis Updates for Revision

deto · Dec 17, 2020 · fe9775c · fe9775c
1 parent 98c6b16
commit fe9775c
Show file tree

Hide file tree

Showing 44 changed files with 3,855 additions and 343 deletions.
diff --git a/Transcriptomics/CD4_w_protein/Snakefile b/Transcriptomics/CD4_w_protein/Snakefile
@@ -3,13 +3,51 @@ subworkflow data:
         "../../data/10x_PBMC_w_proteins"
 
 data_file = "cd4/data.loom"
+data_file_train = "cd4/data_train.loom"
+data_file_test = "cd4/data_test.loom"
 ab_file = "cd4/ab.txt.gz"
 
 include: "../latentTestsSnake/Snakefile"
+include: "../Snakefile_splits"
 
 rule computeGRScores:
     input:
         signatures="../../data/Signatures/TCells_noCD8.gmt",
     output:
         out="evaluation/geneRelevanceScores.txt"
     script: "scripts/computeGeneRelevanceScores.py"
+
+rule runWGCNA:
+    input:
+        loom=data(data_file),
+        genes="genes/threshold.txt",
+    params:
+        power="auto",
+        min_module_size=15,
+    output:
+        cluster_output="wgcna/modules.txt",
+        scores="wgcna/module_scores.txt",
+    script: "../../pipelineScripts/wgcna/wgcna.R"
+
+rule runICA10:
+    input:
+        loom=data(data_file),
+        genes="genes/threshold.txt",
+    params:
+        n_components=10,
+    output:
+        components="ica10/components.txt",
+        cell_loadings="ica10/cell_loadings.txt"
+    script: "../../pipelineScripts/ICA/run_ica.py"
+
+rule runICA10_fdr:
+    input:
+        components=rules.runICA10.output.components,
+    params:
+        qvalcutoff=1e-3,
+    output:
+        components_fdr="ica10/components_fdr.txt",
+        modules="ica10/modules.txt",
+    script: "../../pipelineScripts/ICA/run_ica_fdr.R"
+
+include: "Snakefile_k_sensitivity"
diff --git a/Transcriptomics/CD4_w_protein/Snakefile_k_sensitivity b/Transcriptomics/CD4_w_protein/Snakefile_k_sensitivity
@@ -0,0 +1,17 @@
+rule runHotspot_k_sensitivity:
+    input:
+        loom=data(data_file),
+        latent=rules.SCVI_hvg.output.latent,
+    params:
+        model='danb',
+        n_neighbors='{k}',
+        n_cells_min=10,
+    output:
+        results="k_sensitivity/k_{k}/hotspot.txt"
+    script: "../../pipelineScripts/hotspot/runHotspot.py"
+
+k_values = [5, 10, 30, 50, 100, 300, 500, 1000]
+
+rule runHotspot_k_sensitivity_all:
+    input:
+        expand(rules.runHotspot_k_sensitivity.output.results, k=k_values)
diff --git a/Transcriptomics/CD4_w_protein_downsampling/Snakefile b/Transcriptomics/CD4_w_protein_downsampling/Snakefile
@@ -0,0 +1,8 @@
+subworkflow data:
+    workdir:
+        "../../data/10x_PBMC_w_proteins/"
+
+data_file = "cd4/downsampled_{rate}/data.loom"
+data_file_test = "cd4/data_test.loom"
+
+include: "../Snakefile_downsampling"
diff --git a/Transcriptomics/CD4_w_protein_downsampling/pipelineScripts b/Transcriptomics/CD4_w_protein_downsampling/pipelineScripts
@@ -0,0 +1 @@
+../../pipelineScripts/
diff --git a/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py b/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py
@@ -0,0 +1,185 @@
+import os
+import pandas as pd
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+plt.rcParams['svg.fonttype'] = 'none'
+
+# Pick which set of results to plot - CD4 or Monocytes
+# base_dir = '../../CD4_w_protein'
+base_dir = '../../Mono_w_protein'
+
+datasets = [
+    {
+        'Name': 'Hotspot',
+        'Train': os.path.join(base_dir, 'train/hotspot_hs/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/hotspot_hs/modules.txt'),
+    },
+    {
+        'Name': 'WGCNA',
+        'Train': os.path.join(base_dir, 'train/wgcna_hs/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/wgcna_hs/modules.txt'),
+    },
+    {
+        'Name': 'ICA5',
+        'Train': os.path.join(base_dir, 'train/ica5/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/ica5/modules.txt'),
+    },
+    # # Don't need so many ICA versions as this one isn't that different from ICA10
+    # {
+    #     'Name': 'ICA8',
+    #     'Train': os.path.join(base_dir, 'train/ica8/modules.txt'),
+    #     'Test': os.path.join(base_dir, 'test/ica8/modules.txt'),
+    # },
+    {
+        'Name': 'ICA10',
+        'Train': os.path.join(base_dir, 'train/ica10/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/ica10/modules.txt'),
+    },
+    {
+        'Name': 'Grnboost',
+        'Train': os.path.join(base_dir, 'train/grnboost/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/grnboost/modules.txt'),
+    },
+]
+
+for data in datasets:
+    train = pd.read_table(data['Train'], index_col=0)
+    test = pd.read_table(data['Test'], index_col=0)
+
+    train = train.Cluster.to_dict()
+    test = test.Cluster.to_dict()
+    data['TrainDict'] = train
+    data['TestDict'] = test
+
+
+def eval_module_consistency(data):
+    consistency = (
+        eval_module_consistency_inner(data['TrainDict'], data['TestDict']) +
+        eval_module_consistency_inner(data['TestDict'], data['TrainDict'])
+    ) / 2
+
+    data['Consistency'] = consistency
+
+
+def eval_module_consistency_inner(dict_a, dict_b):
+
+    all_genes = set(dict_a.keys()) & set(dict_b.keys())
+
+    # For each pairs of genes that are in the same module in 'A', how many are in the same module in 'B'?
+
+    denom = 0
+    num = 0
+    for ga in all_genes:
+        for gb in all_genes:
+
+            if ga == gb: continue
+
+            if dict_a[ga] == dict_a[gb] and dict_a[ga] != -1 and dict_a[gb] != -1:  # Same module in A
+                denom += 1
+
+                if dict_b[ga] == dict_b[gb] and dict_b[ga] != -1 and dict_b[gb] != -1:  # Same module in B
+                    num += 1
+
+    num = num/2
+    denom = denom/2
+
+    consistent_rate = num/denom
+
+    return consistent_rate
+
+
+def eval_num_modules(data):
+    num_modules = (
+        pd.Series(data['TrainDict']).unique().size - 1 +
+        pd.Series(data['TestDict']).unique().size - 1
+    ) / 2
+
+    data['NumModules'] = num_modules
+
+
+def eval_num_assigned(data):
+    assigned = (
+        (pd.Series(data['TrainDict']) != -1).sum()/2 +
+        (pd.Series(data['TestDict']) != -1).sum()/2
+    )
+
+    data['NumAssigned'] = assigned
+
+
+for data in tqdm(datasets):
+    train = pd.read_table(data['Train'], index_col=0)
+    test = pd.read_table(data['Test'], index_col=0)
+
+    train = train.Cluster.to_dict()
+    test = test.Cluster.to_dict()
+    data['TrainDict'] = train
+    data['TestDict'] = test
+
+    eval_module_consistency(data)
+    eval_num_modules(data)
+    eval_num_assigned(data)
+
+
+# %% Consolidate into a nice dataframe
+columns = [
+    'Name',
+    'Consistency',
+    'NumModules',
+    'NumAssigned'
+]
+
+results = []
+for data in datasets:
+    results.append(
+        [data[x] for x in columns]
+    )
+
+results = pd.DataFrame(results, columns=columns)
+
+
+# %% Plot it
+order = ['ICA5', 'ICA10', 'Grnboost', 'WGCNA', 'Hotspot']
+colors = sns.color_palette('deep')[:len(order)]
+plot_data = results.set_index('Name').loc[order]
+
+fig, axs = plt.subplots(1, 3, figsize=(12, 4))
+
+plt.sca(axs[0])
+
+plt.bar(
+    plot_data.index, plot_data.Consistency, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Proportion of Gene Pairs Which\nReplicate Across Data Split')
+plt.title('Reproducibility')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.sca(axs[1])
+
+plt.bar(
+    plot_data.index, plot_data.NumModules, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Modules')
+plt.title('# Modules')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.sca(axs[2])
+
+plt.bar(
+    plot_data.index, plot_data.NumAssigned, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Genes')
+plt.title('# Genes Assigned')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.subplots_adjust(bottom=.25, wspace=0.4, left=0.1, right=0.9)
+# plt.savefig('CD4_Module_TrainTest.svg')
+plt.savefig('Monocyte_Module_TrainTest.svg')
+# plt.show()
diff --git a/Transcriptomics/Figures/CD4_Correlation/plot.py b/Transcriptomics/Figures/CD4_Correlation/plot.py
@@ -112,41 +112,99 @@
 # %% Plot Module Scores
 # %% Load data
 
-fig, axs = plt.subplots(
-    3, 4, figsize=(9, 7),
-    gridspec_kw=dict(
-        left=.05, right=.95, top=.95, bottom=.05
-    ),
-)
+from itertools import zip_longest
 
-cbar_ax = fig.add_axes(
-    [.95, .05, .01, .1]
-)
+def plot_scores_umap(scores, proj, vmin='auto', vmax='auto'):
+
+    fig, axs = plt.subplots(
+        2, 6, figsize=(15, 6),
+        gridspec_kw=dict(
+            left=.05, right=.95, top=.95, bottom=.05
+        ),
+    )
 
-for ax, module in zip(axs.ravel(), scores.columns):
-    plt.sca(ax)
+    cbar_ax = fig.add_axes(
+        [.95, .05, .01, .1]
+    )
 
-    vals = scores[module]
+    for ax, module in zip_longest(axs.ravel(), scores.columns):
+        plt.sca(ax)
 
-    vmin = np.percentile(vals, 5)
-    vmax = np.percentile(vals, 95)
-    vmin = -2
-    vmax = 2
+        if module is None:
+            ax.remove()
+            continue
 
-    sc = plt.scatter(
-        x=proj.iloc[:, 0],
-        y=proj.iloc[:, 1],
-        c=vals,
-        vmin=vmin, vmax=vmax,
-        s=2, rasterized=True
-    )
-    plt.xticks([])
-    plt.yticks([])
-    for sp in ax.spines.values():
-        sp.set_visible(False)
-    plt.title(module)
+        vals = scores[module].values.copy()
 
-plt.colorbar(sc, cax=cbar_ax)
+        if vmin == 'auto':
+            _vmin = np.percentile(vals, 5)
+        else:
+            _vmin = vmin
+
+        if vmax == 'auto':
+            _vmax = np.percentile(vals, 95)
+        else:
+            _vmax = vmax
+
+        sc = plt.scatter(
+            x=proj.iloc[:, 0],
+            y=proj.iloc[:, 1],
+            c=vals,
+            vmin=_vmin, vmax=_vmax,
+            s=2, rasterized=True
+        )
+        plt.xticks([])
+        plt.yticks([])
+        for sp in ax.spines.values():
+            sp.set_visible(False)
+        plt.title(module)
+
+    plt.colorbar(sc, cax=cbar_ax)
+    return fig
+
+# %%
 
 # plt.show()
+scores.columns = [
+    'Hotspot-{}'.format(i+1) for i in range(len(scores.columns))
+]
+fig = plot_scores_umap(scores, proj, vmin=-2, vmax=2)
 plt.savefig('CD4_ModuleScore_UMAPs.svg', dpi=300)
+plt.close()
+
+# %% Do the same for WGCNA
+
+scores_wgcna = pd.read_table(
+    "../../CD4_w_protein/wgcna/module_scores.txt",
+    index_col=0
+)
+scores_wgcna.columns = [
+    'WGCNA-{}'.format(i+1) for i in range(len(scores_wgcna.columns))
+]
+
+fig = plot_scores_umap(scores_wgcna, proj)
+plt.savefig('CD4_ModuleScore_UMAPs_wgcna.svg', dpi=300)
+plt.close()
+
+# %% Do the same for ICA
+
+scores_ica = pd.read_table(
+    "../../CD4_w_protein/ica10/cell_loadings.txt",
+    index_col=0
+)
+
+fdr_ica = pd.read_table(
+    "../../CD4_w_protein/ica10/components_fdr.txt",
+    index_col=0
+)
+
+scores_ica = scores_ica.loc[
+    :, fdr_ica.min(axis=0) <= .05
+]
+
+scores_ica.columns = [
+    'ICA-{}'.format(i+1) for i in range(len(scores_ica.columns))
+]
+fig = plot_scores_umap(scores_ica, proj)
+plt.savefig('CD4_ModuleScore_UMAPs_ica.svg', dpi=300)
+plt.close()