SlideSeq Panel Updates for revision

deto · Dec 17, 2020 · 98c6b16 · 98c6b16
1 parent 53376b9
commit 98c6b16
Show file tree

Hide file tree

Showing 14 changed files with 1,181 additions and 56 deletions.
diff --git a/SlideSeq/Figures/Supp1/IDR_Curves.R b/SlideSeq/Figures/Supp1/IDR_Curves.R
@@ -43,10 +43,10 @@ pair_fun <- function(file_pair, stat_col, stat_colb=stat_col){
 # %% Run for Hotspot pairs
 
 files <- c(
-    "../Puck_180819_9/hotspot/hotspot_300.txt",
-    "../Puck_180819_10/hotspot/hotspot_300.txt",
-    "../Puck_180819_11/hotspot/hotspot_300.txt",
-    "../Puck_180819_12/hotspot/hotspot_300.txt"
+    "../Puck_180819_9/hotspot/hotspot.txt",
+    "../Puck_180819_10/hotspot/hotspot.txt",
+    "../Puck_180819_11/hotspot/hotspot.txt",
+    "../Puck_180819_12/hotspot/hotspot.txt"
 )
 
 stat_col <- "Z"
@@ -76,10 +76,10 @@ jsonlite::write_json(
 
 
 combos <- list(
-    c("../Puck_180819_9/hotspot/hotspot_300.txt",  "../Puck_180819_9/spatialDE/spatialDE_fixed.txt"),
-    c("../Puck_180819_10/hotspot/hotspot_300.txt", "../Puck_180819_10/spatialDE/spatialDE_fixed.txt"),
-    c("../Puck_180819_11/hotspot/hotspot_300.txt", "../Puck_180819_11/spatialDE/spatialDE_fixed.txt"),
-    c("../Puck_180819_12/hotspot/hotspot_300.txt", "../Puck_180819_12/spatialDE/spatialDE_fixed.txt")
+    c("../Puck_180819_9/hotspot/hotspot.txt",  "../Puck_180819_9/spatialDE/spatialDE_fixed.txt"),
+    c("../Puck_180819_10/hotspot/hotspot.txt", "../Puck_180819_10/spatialDE/spatialDE_fixed.txt"),
+    c("../Puck_180819_11/hotspot/hotspot.txt", "../Puck_180819_11/spatialDE/spatialDE_fixed.txt"),
+    c("../Puck_180819_12/hotspot/hotspot.txt", "../Puck_180819_12/spatialDE/spatialDE_fixed.txt")
 )
 
 stat_colA <- "Z"
@@ -121,10 +121,10 @@ idr_group <- function(files, stat_col) {
 }
 
 files <- c(
-    "../Puck_180819_9/hotspot/hotspot_300.txt",
-    "../Puck_180819_10/hotspot/hotspot_300.txt",
-    "../Puck_180819_11/hotspot/hotspot_300.txt",
-    "../Puck_180819_12/hotspot/hotspot_300.txt"
+    "../Puck_180819_9/hotspot/hotspot.txt",
+    "../Puck_180819_10/hotspot/hotspot.txt",
+    "../Puck_180819_11/hotspot/hotspot.txt",
+    "../Puck_180819_12/hotspot/hotspot.txt"
 )
 
 stat_col <- "Z"

diff --git a/SlideSeq/Figures/Supp1/plotMeanVar.py b/SlideSeq/Figures/Supp1/plotMeanVar.py
@@ -4,13 +4,15 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 
+plt.rcParams['svg.fonttype'] = 'none'
+
 
 ds = loompy.connect("../../../data/SlideSeq/Puck_180819_12/data.loom", mode="r")
 
 scaled = ds.layer['scaled'][:, :]
 counts = ds.layer[''][:, :]
 symbol = ds.row_attrs['Symbol'][:]
-scaled = scaled / 45 * 10000
+scaled = scaled / 45 * 10000  # Scaled is counts/45, but we want counts/10k here
 
 ds.close()
 
@@ -28,17 +30,6 @@
 disp_c = (var_c - mu_c) / (mu_c**2)
 fano_c = var_c / mu_c
 
-
-# %% Plot
-
-plt.figure()
-plt.plot(mu, fano, 'o', ms=1)
-plt.xscale('log')
-plt.yscale('log')
-plt.xlabel(r'Mean ($\frac{Counts}{10k}$)')
-plt.ylabel('Variance / Mean')
-plt.show()
-
 # %% Load hotspot results
 
 hs_file = "../../Puck_180819_12/hotspot/hotspot.txt"
@@ -56,15 +47,16 @@
 color_too_low = '#DDDDDD'
 color_not_sig = '#CCCCCC'
 
-plt.figure(figsize=(4, 4))
+plt.figure(figsize=(6, 2.5))
 plt.plot(mu[hs_xx], fano[hs_xx], 'o', ms=1, color=color_too_low, rasterized=True)
 plt.plot(mu[~hs_xx], fano[~hs_xx], 'o', ms=1, color=color_not_sig, rasterized=True)
 ll = plt.plot(mu[hs_ii], fano[hs_ii], 'o', ms=1, color=color_hs, rasterized=True)[0]
 plt.xscale('log')
 plt.yscale('log')
 plt.xlabel(r'Mean ($\frac{Counts}{10k}$)')
 plt.ylabel('Variance / Mean')
-plt.legend([ll], ['HS-Selected'], markerscale=3, loc='lower right')
+plt.legend([ll], ['Hotspot-Selected'], markerscale=3, loc='lower right')
 plt.subplots_adjust(left=.2, bottom=.2)
 # plt.show()
 plt.savefig('slideseq_meanvar.svg', dpi=300)
+plt.close()
diff --git a/SlideSeq/Figures/Supp1/plotPR.py b/SlideSeq/Figures/Supp1/plotPR.py
@@ -98,31 +98,6 @@ def getMarkers(df, N):
 plt.ylim(0, 1)
 plt.show()
 
-# %% Ok - this is very promising.  What if we just looked at average expression tho...
-pucks = [
-    "Puck_180819_9",
-    "Puck_180819_10",
-    "Puck_180819_11",
-    "Puck_180819_12",
-]
-
-puck_means = {}
-for puck in pucks:
-    ls = loompy.connect("../../../data/SlideSeq/{}/data.loom".format(puck), mode="r")
-    counts = ls.layers['scaled'][:, :]
-    gene_info = ls.ra['EnsID', 'Symbol']
-    ls.close()
-
-    # Have to do this because data_slideseq makes it a numpy array
-    gene_info = pd.DataFrame(
-        gene_info, columns=['EnsID', 'Symbol']).set_index('EnsID')
-    counts = pd.DataFrame(counts, index=gene_info.index)
-
-    # need counts, latent, and num_umi
-    valid_genes = (counts > 0).sum(axis=1) > 50
-    gene_means = counts.loc[valid_genes].mean(axis=1)
-    puck_means[puck] = gene_means
-    del counts
 
 # %%
 

diff --git a/SlideSeq/Figures/Supp1/plotPR_k_sensitivity.py b/SlideSeq/Figures/Supp1/plotPR_k_sensitivity.py
@@ -0,0 +1,121 @@
+from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc
+from functools import reduce
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import seaborn as sns
+import loompy
+
+plt.rcParams['svg.fonttype'] = 'none'
+
+# %% First get the markers from DropViz data
+
+marker_de = pd.read_table("../../DropViz/ranksums_markers_1vAll.txt")
+
+groups = {
+    l: marker_de.loc[marker_de.level == l].set_index('gene')
+    for l in marker_de.level.unique()
+}
+
+
+def getMarkers(df, N):
+
+    markers = df.loc[df.FDR < .1].sort_values('AUC').index.tolist()
+    if len(markers) > N:
+        markers = markers[:N]
+
+    return markers
+
+
+N = 100
+markers = {l: getMarkers(v, N) for l, v in groups.items()}
+
+all_markers = reduce(
+    lambda x, y: x | set(y), markers.values(), set()
+)
+
+all_markers_norm = set([x.upper() for x in all_markers])
+
+print(len(all_markers))
+print(len(all_markers_norm))
+
+
+# %% Define some lists...
+
+k_values = [5, 10, 30, 50, 100, 300, 500, 1000]
+
+hs_files = {
+    k: "../../Puck_180819_12/k_sensitivity/k_{k}/hotspot.txt".format(k=k) for k in k_values
+}
+
+
+# %% Parse data for AUC and ROC curves
+
+aucs = []
+
+for k in hs_files.keys():
+    ff = hs_files[k]
+    res = pd.read_table(ff, index_col=0)
+
+    y_score = res.Z.values
+    y_true = np.array(
+        [x.upper() in all_markers_norm for x in res.index]
+    ).astype('int')
+
+    auc_i = roc_auc_score(y_true, y_score)
+
+    aucs.append([k, auc_i])
+
+aucs = pd.DataFrame(aucs, columns=['K', 'AUROC'])
+
+# %%
+
+plt.figure()
+ax = plt.gca()
+plt.plot(aucs['K'], aucs['AUROC'], 'o-')
+plt.xscale('log')
+plt.xticks(aucs['K'].unique())
+ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+ax.xaxis.set_ticks([], minor=True)
+plt.xlabel('# of Neighbors')
+plt.ylabel('Area Under ROC Curve')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', ls=(0, (5, 5)), lw=.5)
+plt.savefig('AU_ROC_k_sensitivity.svg', dpi=300)
+
+# %% What about the precision recall curve?
+
+auprs = []
+
+for k in hs_files.keys():
+    ff = hs_files[k]
+    res = pd.read_table(ff, index_col=0)
+
+    y_score = res.Z.values
+    y_true = np.array(
+        [x.upper() in all_markers_norm for x in res.index]
+    ).astype('int')
+
+    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
+
+    aupr = auc(recall, precision)
+    auprs.append([k, aupr])
+
+
+auprs = pd.DataFrame(auprs, columns=['K', 'AUPRC'])
+
+# %%
+
+plt.figure()
+ax = plt.gca()
+plt.plot(auprs['K'], auprs['AUPRC'], 'o-')
+plt.xscale('log')
+plt.xticks(auprs['K'].unique())
+ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+ax.xaxis.set_ticks([], minor=True)
+plt.ylabel('Area Under\nPrecision-Recall Curve')
+plt.xlabel('# of Neighbors')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', ls=(0, (5, 5)), lw=.5)
+plt.savefig('AU_PR_k_sensitivity.svg', dpi=300)
diff --git a/SlideSeq/Figures/Supp5_HVG_vs_HS/hvg_vs_hs.py b/SlideSeq/Figures/Supp5_HVG_vs_HS/hvg_vs_hs.py
@@ -0,0 +1,146 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import loompy
+from bio_utils.plots import hover_plot
+from matplotlib.colors import LinearSegmentedColormap
+plt.rcParams['svg.fonttype'] = 'none'
+
+# %% Load results
+
+hs = pd.read_table("../../Puck_180819_12/hotspot/hotspot.txt", index_col=0)
+hvg = pd.read_table("../../Puck_180819_12/genes/hvg_info.txt", index_col=0)
+hvg = hvg.loc[hs.index]
+
+
+# %% Plot one vs other
+
+plot_data = hs[['Z']].join(hvg[['gene.dispersion.scaled']])
+
+plt.figure(figsize=(5, 5))
+
+plt.plot(
+    plot_data['gene.dispersion.scaled'],
+    plot_data['Z'],
+    'o', ms=2, rasterized=True
+)
+
+plt.gca().set_axisbelow(True)
+plt.xlabel('Scaled Dispersion')
+plt.ylabel('Autocorrelation Z')
+plt.grid(color='#BBBBBB', ls=(0, (5, 5)), lw=.5)
+plt.subplots_adjust(left=0.15, right=1-0.15, bottom=0.15, top=1-0.15)
+
+plt.ylim(-10, 260)  # Zoom in to see the points better
+plt.xlim(-4, 8)
+plt.savefig('autocorrelation_vs_hvg.svg', dpi=300)
+
+# %%
+
+# hover_plot(
+#     plot_data['gene.dispersion.scaled'],
+#     plot_data['Z'],
+#     plot_data.index,
+#     'o', ms=2
+# )
+# plt.show()
+
+# %% Plot rank vs rank
+
+
+# %%
+
+loom_file = "../../../data/SlideSeq/Puck_180819_12/data.loom"
+with loompy.connect(loom_file, 'r') as ds:
+    barcodes = ds.ca['Barcode'][:]
+    scaled = ds.layers['scaled'][:, :]
+    gene_info = ds.ra['EnsID', 'Symbol']
+    num_umi = ds.ca['NumUmi'][:]
+
+# Have to do this because data_slideseq makes it a numpy array
+gene_info = pd.DataFrame(
+    gene_info, columns=['EnsID', 'Symbol']).set_index('EnsID')
+scaled = pd.DataFrame(scaled, index=gene_info.index, columns=barcodes)
+
+positions = pd.read_table(
+    "../../Puck_180819_12/positions/positions.txt", index_col=0
+)
+
+positions = positions.loc[scaled.columns]
+
+# %%
+gene_cmap = LinearSegmentedColormap.from_list(
+    "grays", ['#dddddd', '#000000']
+)
+
+
+def plot_gene(gene, positions, scaled, ax=None):
+
+    if ax is None:
+        ax = plt.gca()
+    else:
+        plt.sca(ax)
+
+    vals = np.log2(scaled.loc[gene]+1)
+
+    vmin = 0
+    vmax = np.percentile(vals[vals > 0], 80)
+
+    plot_data = pd.DataFrame({
+        'X': positions.Comp1,
+        'Y': positions.Comp2,
+        'Expression': vals,
+    }).sort_values('Expression')
+
+    plot_data['S'] = 0.5
+    plot_data.loc[plot_data.Expression > 0, 'S'] = 1
+
+    plt.scatter(
+        x=plot_data.X,
+        y=plot_data.Y,
+        c=plot_data.Expression,
+        s=plot_data.S,
+        vmin=vmin, vmax=vmax, cmap=gene_cmap,
+        alpha=0.7, rasterized=True, edgecolors='none',
+    )
+    for sp in ax.spines.values():
+        sp.set_visible(False)
+
+    plt.xticks([])
+    plt.yticks([])
+    plt.title(gene)
+
+
+# %%
+
+# top_genes_hs = hs.Z.sort_values(ascending=False).index[0:6]
+
+# top_genes_hvg = hvg['gene.dispersion.scaled'].sort_values(ascending=False).index[0:6]
+# top_genes_hvg = hvg.loc[hvg['gene.mean'] > 1] \
+#     .sort_values('gene.dispersion.scaled', ascending=False).index[0:6]
+
+
+top_genes_hs = ['Plp1', 'Enpp2', 'Cartpt', 'Pcp2', 'Car8', 'Calb1']
+top_genes_hvg = ['Ube2q1', 'Rexo4', 'Letm1', 'Cnksr2', 'Xrn2', 'Prmt5']
+
+
+fig, axs = plt.subplots(2, 3, figsize=(6.5, 4))
+
+
+for gene, ax in zip(top_genes_hs, axs.ravel()):
+
+    plot_gene(gene, positions, scaled, ax=ax)
+
+# plt.show()
+plt.savefig('top_genes_hs.svg', dpi=600)
+
+# %%
+
+fig, axs = plt.subplots(2, 3, figsize=(6.5, 4))
+
+
+for gene, ax in zip(top_genes_hvg, axs.ravel()):
+
+    plot_gene(gene, positions, scaled, ax=ax)
+
+plt.savefig('top_genes_hvg.svg', dpi=600)