Skip to content

Commit

Permalink
CD4/Monocyte Analysis Updates for Revision
Browse files Browse the repository at this point in the history
  • Loading branch information
deto committed Dec 17, 2020
1 parent 98c6b16 commit fe9775c
Show file tree
Hide file tree
Showing 44 changed files with 3,855 additions and 343 deletions.
38 changes: 38 additions & 0 deletions Transcriptomics/CD4_w_protein/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,51 @@ subworkflow data:
"../../data/10x_PBMC_w_proteins"

data_file = "cd4/data.loom"
data_file_train = "cd4/data_train.loom"
data_file_test = "cd4/data_test.loom"
ab_file = "cd4/ab.txt.gz"

include: "../latentTestsSnake/Snakefile"
include: "../Snakefile_splits"

rule computeGRScores:
input:
signatures="../../data/Signatures/TCells_noCD8.gmt",
output:
out="evaluation/geneRelevanceScores.txt"
script: "scripts/computeGeneRelevanceScores.py"

rule runWGCNA:
input:
loom=data(data_file),
genes="genes/threshold.txt",
params:
power="auto",
min_module_size=15,
output:
cluster_output="wgcna/modules.txt",
scores="wgcna/module_scores.txt",
script: "../../pipelineScripts/wgcna/wgcna.R"

rule runICA10:
input:
loom=data(data_file),
genes="genes/threshold.txt",
params:
n_components=10,
output:
components="ica10/components.txt",
cell_loadings="ica10/cell_loadings.txt"
script: "../../pipelineScripts/ICA/run_ica.py"

rule runICA10_fdr:
input:
components=rules.runICA10.output.components,
params:
qvalcutoff=1e-3,
output:
components_fdr="ica10/components_fdr.txt",
modules="ica10/modules.txt",
script: "../../pipelineScripts/ICA/run_ica_fdr.R"

include: "Snakefile_k_sensitivity"
17 changes: 17 additions & 0 deletions Transcriptomics/CD4_w_protein/Snakefile_k_sensitivity
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
rule runHotspot_k_sensitivity:
input:
loom=data(data_file),
latent=rules.SCVI_hvg.output.latent,
params:
model='danb',
n_neighbors='{k}',
n_cells_min=10,
output:
results="k_sensitivity/k_{k}/hotspot.txt"
script: "../../pipelineScripts/hotspot/runHotspot.py"

k_values = [5, 10, 30, 50, 100, 300, 500, 1000]

rule runHotspot_k_sensitivity_all:
input:
expand(rules.runHotspot_k_sensitivity.output.results, k=k_values)
8 changes: 8 additions & 0 deletions Transcriptomics/CD4_w_protein_downsampling/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
subworkflow data:
workdir:
"../../data/10x_PBMC_w_proteins/"

data_file = "cd4/downsampled_{rate}/data.loom"
data_file_test = "cd4/data_test.loom"

include: "../Snakefile_downsampling"
1 change: 1 addition & 0 deletions Transcriptomics/CD4_w_protein_downsampling/pipelineScripts
185 changes: 185 additions & 0 deletions Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['svg.fonttype'] = 'none'

# Pick which set of results to plot - CD4 or Monocytes
# base_dir = '../../CD4_w_protein'
base_dir = '../../Mono_w_protein'

datasets = [
{
'Name': 'Hotspot',
'Train': os.path.join(base_dir, 'train/hotspot_hs/modules.txt'),
'Test': os.path.join(base_dir, 'test/hotspot_hs/modules.txt'),
},
{
'Name': 'WGCNA',
'Train': os.path.join(base_dir, 'train/wgcna_hs/modules.txt'),
'Test': os.path.join(base_dir, 'test/wgcna_hs/modules.txt'),
},
{
'Name': 'ICA5',
'Train': os.path.join(base_dir, 'train/ica5/modules.txt'),
'Test': os.path.join(base_dir, 'test/ica5/modules.txt'),
},
# # Don't need so many ICA versions as this one isn't that different from ICA10
# {
# 'Name': 'ICA8',
# 'Train': os.path.join(base_dir, 'train/ica8/modules.txt'),
# 'Test': os.path.join(base_dir, 'test/ica8/modules.txt'),
# },
{
'Name': 'ICA10',
'Train': os.path.join(base_dir, 'train/ica10/modules.txt'),
'Test': os.path.join(base_dir, 'test/ica10/modules.txt'),
},
{
'Name': 'Grnboost',
'Train': os.path.join(base_dir, 'train/grnboost/modules.txt'),
'Test': os.path.join(base_dir, 'test/grnboost/modules.txt'),
},
]

for data in datasets:
train = pd.read_table(data['Train'], index_col=0)
test = pd.read_table(data['Test'], index_col=0)

train = train.Cluster.to_dict()
test = test.Cluster.to_dict()
data['TrainDict'] = train
data['TestDict'] = test


def eval_module_consistency(data):
consistency = (
eval_module_consistency_inner(data['TrainDict'], data['TestDict']) +
eval_module_consistency_inner(data['TestDict'], data['TrainDict'])
) / 2

data['Consistency'] = consistency


def eval_module_consistency_inner(dict_a, dict_b):

all_genes = set(dict_a.keys()) & set(dict_b.keys())

# For each pairs of genes that are in the same module in 'A', how many are in the same module in 'B'?

denom = 0
num = 0
for ga in all_genes:
for gb in all_genes:

if ga == gb: continue

if dict_a[ga] == dict_a[gb] and dict_a[ga] != -1 and dict_a[gb] != -1: # Same module in A
denom += 1

if dict_b[ga] == dict_b[gb] and dict_b[ga] != -1 and dict_b[gb] != -1: # Same module in B
num += 1

num = num/2
denom = denom/2

consistent_rate = num/denom

return consistent_rate


def eval_num_modules(data):
num_modules = (
pd.Series(data['TrainDict']).unique().size - 1 +
pd.Series(data['TestDict']).unique().size - 1
) / 2

data['NumModules'] = num_modules


def eval_num_assigned(data):
assigned = (
(pd.Series(data['TrainDict']) != -1).sum()/2 +
(pd.Series(data['TestDict']) != -1).sum()/2
)

data['NumAssigned'] = assigned


for data in tqdm(datasets):
train = pd.read_table(data['Train'], index_col=0)
test = pd.read_table(data['Test'], index_col=0)

train = train.Cluster.to_dict()
test = test.Cluster.to_dict()
data['TrainDict'] = train
data['TestDict'] = test

eval_module_consistency(data)
eval_num_modules(data)
eval_num_assigned(data)


# %% Consolidate into a nice dataframe
columns = [
'Name',
'Consistency',
'NumModules',
'NumAssigned'
]

results = []
for data in datasets:
results.append(
[data[x] for x in columns]
)

results = pd.DataFrame(results, columns=columns)


# %% Plot it
order = ['ICA5', 'ICA10', 'Grnboost', 'WGCNA', 'Hotspot']
colors = sns.color_palette('deep')[:len(order)]
plot_data = results.set_index('Name').loc[order]

fig, axs = plt.subplots(1, 3, figsize=(12, 4))

plt.sca(axs[0])

plt.bar(
plot_data.index, plot_data.Consistency, alpha=0.9, color=colors
)
plt.xticks(rotation=45)
plt.ylabel('Proportion of Gene Pairs Which\nReplicate Across Data Split')
plt.title('Reproducibility')
plt.gca().set_axisbelow(True)
plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))

plt.sca(axs[1])

plt.bar(
plot_data.index, plot_data.NumModules, alpha=0.9, color=colors
)
plt.xticks(rotation=45)
plt.ylabel('Modules')
plt.title('# Modules')
plt.gca().set_axisbelow(True)
plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))

plt.sca(axs[2])

plt.bar(
plot_data.index, plot_data.NumAssigned, alpha=0.9, color=colors
)
plt.xticks(rotation=45)
plt.ylabel('Genes')
plt.title('# Genes Assigned')
plt.gca().set_axisbelow(True)
plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))

plt.subplots_adjust(bottom=.25, wspace=0.4, left=0.1, right=0.9)
# plt.savefig('CD4_Module_TrainTest.svg')
plt.savefig('Monocyte_Module_TrainTest.svg')
# plt.show()
116 changes: 87 additions & 29 deletions Transcriptomics/Figures/CD4_Correlation/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,41 +112,99 @@
# %% Plot Module Scores
# %% Load data

fig, axs = plt.subplots(
3, 4, figsize=(9, 7),
gridspec_kw=dict(
left=.05, right=.95, top=.95, bottom=.05
),
)
from itertools import zip_longest

cbar_ax = fig.add_axes(
[.95, .05, .01, .1]
)
def plot_scores_umap(scores, proj, vmin='auto', vmax='auto'):

fig, axs = plt.subplots(
2, 6, figsize=(15, 6),
gridspec_kw=dict(
left=.05, right=.95, top=.95, bottom=.05
),
)

for ax, module in zip(axs.ravel(), scores.columns):
plt.sca(ax)
cbar_ax = fig.add_axes(
[.95, .05, .01, .1]
)

vals = scores[module]
for ax, module in zip_longest(axs.ravel(), scores.columns):
plt.sca(ax)

vmin = np.percentile(vals, 5)
vmax = np.percentile(vals, 95)
vmin = -2
vmax = 2
if module is None:
ax.remove()
continue

sc = plt.scatter(
x=proj.iloc[:, 0],
y=proj.iloc[:, 1],
c=vals,
vmin=vmin, vmax=vmax,
s=2, rasterized=True
)
plt.xticks([])
plt.yticks([])
for sp in ax.spines.values():
sp.set_visible(False)
plt.title(module)
vals = scores[module].values.copy()

plt.colorbar(sc, cax=cbar_ax)
if vmin == 'auto':
_vmin = np.percentile(vals, 5)
else:
_vmin = vmin

if vmax == 'auto':
_vmax = np.percentile(vals, 95)
else:
_vmax = vmax

sc = plt.scatter(
x=proj.iloc[:, 0],
y=proj.iloc[:, 1],
c=vals,
vmin=_vmin, vmax=_vmax,
s=2, rasterized=True
)
plt.xticks([])
plt.yticks([])
for sp in ax.spines.values():
sp.set_visible(False)
plt.title(module)

plt.colorbar(sc, cax=cbar_ax)
return fig

# %%

# plt.show()
scores.columns = [
'Hotspot-{}'.format(i+1) for i in range(len(scores.columns))
]
fig = plot_scores_umap(scores, proj, vmin=-2, vmax=2)
plt.savefig('CD4_ModuleScore_UMAPs.svg', dpi=300)
plt.close()

# %% Do the same for WGCNA

scores_wgcna = pd.read_table(
"../../CD4_w_protein/wgcna/module_scores.txt",
index_col=0
)
scores_wgcna.columns = [
'WGCNA-{}'.format(i+1) for i in range(len(scores_wgcna.columns))
]

fig = plot_scores_umap(scores_wgcna, proj)
plt.savefig('CD4_ModuleScore_UMAPs_wgcna.svg', dpi=300)
plt.close()

# %% Do the same for ICA

scores_ica = pd.read_table(
"../../CD4_w_protein/ica10/cell_loadings.txt",
index_col=0
)

fdr_ica = pd.read_table(
"../../CD4_w_protein/ica10/components_fdr.txt",
index_col=0
)

scores_ica = scores_ica.loc[
:, fdr_ica.min(axis=0) <= .05
]

scores_ica.columns = [
'ICA-{}'.format(i+1) for i in range(len(scores_ica.columns))
]
fig = plot_scores_umap(scores_ica, proj)
plt.savefig('CD4_ModuleScore_UMAPs_ica.svg', dpi=300)
plt.close()

0 comments on commit fe9775c

Please sign in to comment.