# Validate array vs RNA-seq modules

The goal of this notebook is to compare the modules found using array data vs RNA-seq data. We would expect the modules to be similar.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN, AgglomerativeClustering, AffinityPropagation
from core_acc_modules import utils, paths

np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# User params
# Number of leading columns of U (left-singular vectors) kept when computing the
# gene-gene correlation of the SVD-transformed data
num_singular_values_log = 100

# Clustering method
# Choices: {"dbscan", "hierarchal", "affinity"}
# NOTE: the clustering cells compare against these exact strings, including the
# spelling "hierarchal", so use one of them verbatim
cluster_method = "affinity"

# DBSCAN params
# Passed to DBSCAN as `eps`
density_threshold = 8

# Hierarchical clustering params
# Passed to AgglomerativeClustering as `distance_threshold` and `linkage`, respectively
hier_threshold = 8
link_dist = "average"

# Affinity params
# Passed to AffinityPropagation as `damping`; in this notebook only the RNA-seq fit uses it
affinity_damping = 0.6

In [3]:
# Paths of the PAO1 array and RNA-seq compendia to compare, as defined in
# `core_acc_modules.paths` (the files themselves are read in the next cell)
pao1_array_compendium_filename = paths.ARRAY_COMPENDIUM_TO_COMPARE
pao1_rnaseq_compendium_filename = paths.RNASEQ_COMPENDIUM_TO_COMPARE

In [4]:
# Read both tab-separated compendia with identical parsing options:
# the first row holds the column names and the first column the row labels
pao1_array_compendium, pao1_rnaseq_compendium = (
    pd.read_csv(compendium_path, sep="\t", header=0, index_col=0)
    for compendium_path in (
        pao1_array_compendium_filename,
        pao1_rnaseq_compendium_filename,
    )
)

In [5]:
# Dimensions and preview of the array compendium (rows: samples, columns: genes)
print(pao1_array_compendium.shape)
pao1_array_compendium.head()

(524, 5543)


Unnamed: 0,PA3454,PA2957,PA3863,PA1587,PA2875,PA4894,PA0833,PA1542,PA2116,PA2387,...,PA1467,PA2956,PA1182,PA5194,PA4244,PA2741,PA4474,PA1612,PA1852,PA2394
GSM1146022_pJN105_1.CEL,6.408613,10.309321,6.370142,10.009081,7.76793,7.615713,9.924233,5.539717,7.56899,6.148954,...,8.239182,6.739507,7.385588,9.429937,12.036582,11.266657,7.6665,6.741294,10.425292,5.674284
GSM1146023_pJN105_2.CEL,6.990266,9.214348,7.253147,9.907029,6.438852,7.694704,9.377877,6.210088,7.697756,6.848244,...,8.350595,7.041017,7.473732,9.299,11.748049,9.870242,7.520434,7.253668,8.883771,6.560394
GSM1146024_phaF_1.CEL,6.868354,10.108974,6.444074,10.08934,8.388457,7.697112,8.991802,5.768433,8.060447,6.719979,...,8.817465,7.335355,7.090449,9.270996,9.757212,10.720674,7.001546,6.748612,10.158589,6.231881
GSM1146025_phaF_2.CEL,6.75135,9.663363,6.337432,9.86922,7.726482,7.356633,9.587064,5.555644,7.813412,6.838972,...,8.024882,7.04538,7.27678,9.411877,12.060076,11.401624,7.799123,6.868192,10.13398,5.565795
GSM1244967_PAO1-22-replicate-01.CEL,7.552383,7.23946,6.383347,9.269304,7.43589,7.400998,8.886594,6.777863,9.473574,6.547137,...,7.648199,7.629626,6.704446,8.019752,10.768162,9.970635,10.044534,6.068311,10.562088,7.417833


In [6]:
# Dimensions and preview of the RNA-seq compendium (rows: samples, columns: genes)
print(pao1_rnaseq_compendium.shape)
pao1_rnaseq_compendium.head()

(956, 5543)


Unnamed: 0,PA3454,PA2957,PA3863,PA1587,PA2875,PA4894,PA0833,PA1542,PA2116,PA2387,...,PA1467,PA2956,PA1182,PA5194,PA4244,PA2741,PA4474,PA1612,PA1852,PA2394
ERX541571,2.610107,86.456892,7.226329,168.699127,13.192091,0.344707,18.173201,4.314739,3.163717,19.026344,...,7.787712,13.481757,8.108854,103.071129,224.831548,2221.438212,10.747415,7.459259,359.621921,2.425557
ERX541579,1.999048,96.94265,4.031264,478.746855,29.221653,0.0,186.619262,8.75194,6.652959,46.246755,...,22.306312,24.087714,14.772566,27.751564,1088.599448,2808.853017,58.897402,28.078862,770.311575,3.159865
ERX541580,9.361704,71.869195,5.968307,499.822637,24.958124,0.0,190.582148,3.446196,6.108587,44.793062,...,13.079799,16.282879,18.289502,24.517293,1487.173938,2348.995185,67.839781,25.617239,813.046259,2.305751
ERX541591,9.376476,244.273862,12.346102,359.46446,78.900806,1.474163,69.744163,9.466618,69.795026,67.576803,...,26.344252,34.825424,45.604931,37.074546,470.009691,1043.755632,203.209457,87.896274,655.548448,4.076711
ERX541592,17.896018,172.557961,11.536978,334.064662,44.784611,0.0,63.13262,10.089901,89.742498,37.69155,...,31.632905,29.554337,56.313395,38.926468,538.000709,1145.46166,227.475682,91.914805,454.492375,7.541219


## Get correlation matrices

In [7]:
# Correlation
# Pairwise correlation between the columns (genes) of the untransformed array
# compendium, using `DataFrame.corr()` with its default method.
# In the cells shown here, this matrix is only referenced by the disabled heatmap
# code in the next cell.
pao1_array_corr_original = pao1_array_compendium.corr()

Note: Below we plotted the heatmap of the array data to confirm that it has the same issue as the RNA-seq data - there is one large cluster

In [8]:
%%time
# Plot heatmap
# NOTE: the plotting code below is wrapped in a string literal, so it is NOT executed;
# the cell only evaluates (and echoes) the string. Remove the surrounding triple quotes
# to actually render the clustermap.
"""o1 = sns.clustermap(pao1_array_corr_original.abs(), cmap="viridis", figsize=(20, 20))
o1.fig.suptitle("Correlation of raw PAO1 genes (array compendium)", y=1.05)"""

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


'o1 = sns.clustermap(pao1_array_corr_original.abs(), cmap="viridis", figsize=(20, 20))\no1.fig.suptitle("Correlation of raw PAO1 genes (array compendium)", y=1.05)'

In [9]:
# Flip both compendia from sample x gene to gene x sample:
# genes are the objects being clustered, so they need to be the rows
pao1_array_compendium_T = pao1_array_compendium.transpose()
pao1_rnaseq_compendium_T = pao1_rnaseq_compendium.transpose()

In [10]:
# Shift every entry by 1, then take log10 (the shift keeps zero entries finite)
# NOTE(review): the array values previewed above fall roughly in the 5-12 range, which
# looks like they may already be log-scaled -- confirm that applying a second log
# transform to the array compendium is intended.
pao1_array_compendium_log10 = np.log10(pao1_array_compendium_T + 1)
pao1_rnaseq_compendium_log10 = np.log10(pao1_rnaseq_compendium_T + 1)

In [11]:
# Apply SVD
# The inputs are gene x sample, and full_matrices=False requests the reduced
# factorization, so each U has one row per gene and min(n_genes, n_samples) columns.
# Only the U factors are used by the later cells shown in this notebook.
array_U, array_s, array_Vh = np.linalg.svd(
    pao1_array_compendium_log10, full_matrices=False
)
rnaseq_U, rnaseq_s, rnaseq_Vh = np.linalg.svd(
    pao1_rnaseq_compendium_log10, full_matrices=False
)

In [12]:
# Wrap each U matrix in a DataFrame labelled by gene id so that `corr()` can be used
# and the gene labels carry through to the correlation matrices
array_U_df = pd.DataFrame(array_U, index=pao1_array_compendium_T.index)
rnaseq_U_df = pd.DataFrame(rnaseq_U, index=pao1_rnaseq_compendium_T.index)

In [13]:
# Gene-gene correlation over the first `num_singular_values_log` columns of U
# `corr()` computes pairwise correlation of columns, so the truncated U is transposed
# to put the genes on the columns
pao1_array_corr_log_spell = (
    array_U_df.iloc[:, :num_singular_values_log].transpose().corr()
)
pao1_rnaseq_corr_log_spell = (
    rnaseq_U_df.iloc[:, :num_singular_values_log].transpose().corr()
)

Note: Here we plot the heatmaps to verify that the correlation of log + SPELL transformed data looks as expected (i.e. there is not a single large cluster)

In [14]:
# Plot heatmap
# NOTE: the plotting code below is wrapped in a string literal, so it is NOT executed;
# the cell only evaluates (and echoes) the string. Remove the surrounding triple quotes
# to actually render the clustermap.
"""h1a = sns.clustermap(pao1_array_corr_log_spell.abs(), cmap="viridis", figsize=(20, 20))
h1a.fig.suptitle(
    f"log transform + SPELL corrected using {num_singular_values_log} vectors (PAO1 array)",
    y=1.05,
)"""

'h1a = sns.clustermap(pao1_array_corr_log_spell.abs(), cmap="viridis", figsize=(20, 20))\nh1a.fig.suptitle(\n    f"log transform + SPELL corrected using {num_singular_values_log} vectors (PAO1 array)",\n    y=1.05,\n)'

In [15]:
# Plot heatmap
# NOTE: the plotting code below is wrapped in a string literal, so it is NOT executed;
# the cell only evaluates (and echoes) the string. Remove the surrounding triple quotes
# to actually render the clustermap.
"""h1b = sns.clustermap(pao1_rnaseq_corr_log_spell.abs(), cmap="viridis", figsize=(20, 20))
h1b.fig.suptitle(
    f"log transform + SPELL corrected using {num_singular_values_log} vectors (PAO1 rnaseq)",
    y=1.05,
)"""

'h1b = sns.clustermap(pao1_rnaseq_corr_log_spell.abs(), cmap="viridis", figsize=(20, 20))\nh1b.fig.suptitle(\n    f"log transform + SPELL corrected using {num_singular_values_log} vectors (PAO1 rnaseq)",\n    y=1.05,\n)'

## Clustering and get module membership

In [16]:
# DBSCAN clustering: fit one estimator per compendium (array first, then RNA-seq),
# both with eps=density_threshold, on the gene-gene correlation matrices
if cluster_method == "dbscan":
    pao1_array_clustering, pao1_rnaseq_clustering = (
        DBSCAN(eps=density_threshold).fit(corr_matrix)
        for corr_matrix in (pao1_array_corr_log_spell, pao1_rnaseq_corr_log_spell)
    )

In [17]:
# Clustering using hierarchical (agglomerative) clustering
# NOTE: the method key is spelled "hierarchal" to match the choices listed for
# `cluster_method` in the user params cell.
# With n_clusters=None the number of clusters is determined by cutting the tree at
# distance_threshold=hier_threshold, using the `link_dist` linkage.
if cluster_method == "hierarchal":
    # Fit on the array correlation matrix (the original referenced the undefined name
    # `ppao1_array_corr_log_spell`, which raised a NameError on this branch)
    pao1_array_clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=hier_threshold, linkage=link_dist
    ).fit(pao1_array_corr_log_spell)
    pao1_rnaseq_clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=hier_threshold, linkage=link_dist
    ).fit(pao1_rnaseq_corr_log_spell)

In [18]:
# Clustering using affinity propagation
# NOTE(review): the two fits are configured differently -- the array fit uses
# random_state=0 and the estimator's default damping, while the RNA-seq fit uses
# random_state=1 and damping=affinity_damping. Confirm this asymmetry is intended,
# since the resulting modules are compared against each other later in the notebook.
if cluster_method == "affinity":
    pao1_array_clustering = AffinityPropagation(random_state=0).fit(
        pao1_array_corr_log_spell
    )
    pao1_rnaseq_clustering = AffinityPropagation(
        random_state=1, damping=affinity_damping
    ).fit(pao1_rnaseq_corr_log_spell)

In [19]:
# Module membership for the array compendium
# Build a table indexed by gene id with a single "module id" column holding the
# cluster label assigned to each gene (nothing is written to disk here)
pao1_array_membership_df = pd.DataFrame(
    {"module id": pao1_array_clustering.labels_},
    index=pao1_array_corr_log_spell.index,
)

# Number of genes per array module, largest modules first
pao1_array_membership_df.loc[:, "module id"].value_counts()

99     33
453    33
230    32
407    29
176    25
480    24
519    24
330    24
49     24
198    23
102    22
523    22
295    22
103    22
454    22
134    20
469    20
109    20
339    20
321    19
159    19
261    19
43     19
41     19
170    19
74     18
363    18
374    18
551    18
502    18
       ..
359     5
228     5
461     5
337     5
232     5
375     5
307     5
315     4
455     4
450     4
75      4
36      4
532     4
484     4
17      4
524     4
389     4
379     4
381     4
149     4
362     4
251     4
252     4
306     4
258     4
108     4
265     4
183     3
352     3
218     3
Name: module id, Length: 556, dtype: int64

In [20]:
# Preview the gene -> module assignments for the array compendium
pao1_array_membership_df.head()

Unnamed: 0,module id
PA3454,502
PA2957,504
PA3863,57
PA1587,219
PA2875,224


In [21]:
# Module membership for the RNA-seq compendium
# Build a table indexed by gene id with a single "module id" column holding the
# cluster label assigned to each gene
pao1_rnaseq_membership_df = pd.DataFrame(
    {"module id": pao1_rnaseq_clustering.labels_},
    index=pao1_rnaseq_corr_log_spell.index,
)

# Number of genes per RNA-seq module, largest modules first
pao1_rnaseq_membership_df.loc[:, "module id"].value_counts()

306    42
384    32
225    32
333    32
57     31
179    29
524    27
331    27
128    26
473    25
56     24
264    24
341    23
416    22
47     22
506    22
14     22
201    21
226    21
522    20
189    20
63     20
19     20
178    20
563    20
110    19
52     19
312    19
355    19
439    19
       ..
259     4
39      4
99      4
527     4
425     4
294     4
292     4
555     4
497     4
548     4
368     4
46      4
411     4
319     4
402     4
525     4
11      4
336     4
168     4
144     4
107     3
238     3
298     3
169     3
421     3
406     3
211     3
224     3
213     3
133     3
Name: module id, Length: 565, dtype: int64

In [22]:
# Preview the gene -> module assignments for the RNA-seq compendium
pao1_rnaseq_membership_df.head()

Unnamed: 0,module id
PA3454,503
PA2957,128
PA3863,58
PA1587,396
PA2875,79


## Compare composition of modules

For a given array module, are the genes within 1 module in the RNA-seq compendium?

For RNA-seq data, `grp_id` is the cluster id and `grp` contains the genes in that cluster. We can then look up which array clusters these genes belong to: `pao1_array_membership_df.loc[grp.index]` returns the array cluster id for each gene in the RNA-seq cluster. Finally, `rnaseq_module_mapping[grp_id]` contains the unique array cluster ids that the genes of RNA-seq cluster `grp_id` map to. `array_module_mapping` is built the same way, with the roles of the two compendia swapped.

Here we are looking to see if modules are consistent (i.e. are the genes found in an array module also found together in a single RNA-seq module, and vice versa?).

In [26]:
# For every array module, collect the distinct RNA-seq module ids that its member
# genes were assigned to (array module id -> list of RNA-seq module ids)
array_module_mapping = {
    array_id: list(
        pao1_rnaseq_membership_df.loc[member_genes.index, "module id"].unique()
    )
    for array_id, member_genes in pao1_array_membership_df.groupby("module id")
}

array_module_mapping

{0: [47, 14, 7, 272, 374, 549],
 1: [0, 148, 164, 205],
 2: [229, 524, 359, 557],
 3: [425, 260, 264, 475, 32, 37, 357],
 4: [317, 493, 163, 109, 277, 86, 150, 74, 345, 499, 52, 287],
 5: [5, 271, 89, 128, 15, 420],
 6: [181, 320, 286],
 7: [30, 149, 480, 554, 522, 84],
 8: [12, 418, 102, 237, 495, 20],
 9: [75, 271, 445, 380, 403, 5, 407],
 10: [349, 215, 218, 59, 48, 275],
 11: [493, 98, 113, 402, 20],
 12: [18, 207, 453],
 13: [24, 454, 439, 379, 275],
 14: [181, 87, 464, 538, 119, 563, 235],
 15: [63, 130],
 16: [317, 263, 394, 106, 227, 253, 402, 231, 146, 412],
 17: [53, 220, 333],
 18: [29, 118, 393, 128, 126, 522],
 19: [400, 52, 139, 518, 148, 87],
 20: [154, 239, 139, 69, 76, 528, 320, 267],
 21: [510, 362, 226, 519, 272, 2, 225, 384, 420],
 22: [475, 365, 225, 285, 395, 193, 482, 512, 411, 396, 534],
 23: [532, 67, 247, 377, 291, 179, 393, 290, 2],
 24: [481, 148, 155, 286, 277, 306, 67, 249],
 25: [2, 268, 186, 358, 394, 163, 322, 465, 558, 291, 457],
 26: [307, 306, 558, 1

In [31]:
# Are there any array modules that map to a single rnaseq module?
# An array module counts as consistent when all of its genes land in exactly one
# RNA-seq module
consistent_array_modules = [
    array_id
    for array_id, mapped_rnaseq_ids in array_module_mapping.items()
    if len(mapped_rnaseq_ids) == 1
]
for array_id in consistent_array_modules:
    print(array_id, array_module_mapping[array_id])

# Fraction of array modules that are consistent
print(len(consistent_array_modules) / len(array_module_mapping))

149 [142]
154 [91]
183 [521]
218 [364]
228 [232]
233 [261]
252 [547]
256 [255]
337 [344]
359 [513]
362 [404]
366 [422]
376 [376]
379 [171]
398 [541]
503 [504]
508 [324]
532 [239]
0.03237410071942446


In [29]:
# For every RNA-seq module, collect the distinct array module ids that its member
# genes were assigned to (RNA-seq module id -> list of array module ids)
rnaseq_module_mapping = {
    rnaseq_id: list(
        pao1_array_membership_df.loc[member_genes.index, "module id"].unique()
    )
    for rnaseq_id, member_genes in pao1_rnaseq_membership_df.groupby("module id")
}

rnaseq_module_mapping

{0: [1, 344],
 1: [498, 292, 244, 291],
 2: [25, 54, 162, 548, 21, 23],
 3: [153, 253, 373, 368, 387, 44, 385, 128, 522],
 4: [432, 327, 135, 79, 210, 341, 174],
 5: [5, 323, 462, 38, 122, 225, 9, 316],
 6: [207],
 7: [340, 142, 312, 461, 38, 336, 410, 113, 485, 372, 0, 430],
 8: [336, 349, 156, 49],
 9: [360, 203, 68, 266, 470, 143, 474, 444],
 10: [151, 367, 143, 405, 439, 347, 53, 325, 378, 435, 472],
 11: [282],
 12: [8, 315, 81],
 13: [164, 104, 397, 107],
 14: [454, 0, 348, 463, 152],
 15: [83, 128, 152, 150, 397, 5, 528, 79],
 16: [281, 433, 57, 292, 156],
 17: [85, 190, 230, 87, 176],
 18: [12, 342, 380],
 19: [357, 136, 369, 170, 70, 234, 281, 297, 510, 460, 235, 94, 293],
 20: [314, 43, 203, 54, 245, 128, 11, 497, 8],
 21: [549, 411, 354, 515, 274],
 22: [456, 101, 439, 248, 407, 65, 469, 551],
 23: [298, 66, 536, 199, 150, 401, 195, 447, 144],
 24: [13, 102, 442],
 25: [277, 132, 175, 72, 30, 439, 310, 206],
 26: [98, 67, 214, 59, 87, 104],
 27: [549, 112],
 28: [176, 190, 2

In [34]:
# Are there any rnaseq modules that map to a single array module?
# An RNA-seq module counts as consistent when all of its genes land in exactly one
# array module
consistent_rnaseq_modules = [
    rnaseq_id
    for rnaseq_id, mapped_array_ids in rnaseq_module_mapping.items()
    if len(mapped_array_ids) == 1
]
for rnaseq_id in consistent_rnaseq_modules:
    print(rnaseq_id, rnaseq_module_mapping[rnaseq_id])

# Fraction of RNA-seq modules that are consistent
print(len(consistent_rnaseq_modules) / len(rnaseq_module_mapping))

6 [207]
11 [282]
39 [91]
51 [93]
94 [88]
97 [93]
108 [40]
144 [274]
180 [472]
209 [424]
211 [403]
213 [389]
219 [364]
312 [523]
334 [334]
336 [328]
381 [367]
421 [484]
459 [166]
503 [502]
504 [503]
513 [359]
0.03893805309734513


**Observation:**

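* Only ~3-4% of modules are consistent between RNA-seq and array data (about 3.2% of array modules and 3.9% of RNA-seq modules map to a single module in the other compendium). We would have expected more of an overlap. What is causing this?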