**Processing gene sets and evaluating Kernighan-Lin (KL) clustering**

In [None]:
import numpy as np
import gseapy
from scipy.sparse import csr_matrix
from tqdm.contrib.concurrent import process_map
from functools import partial
from scipy.sparse.csgraph import shortest_path
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import networkx

from ABCA7lof2.geneclusters import get_scores, get_kernighan_lin_clusters, get_gene_pathway_matrix, get_full_matrix_from_bipartite, plot_component, plot_edges, plot_nodes, group, compute_groupped_matrix, get_scores, find_similar_clusters, get_representative_name_per_cluster, get_kernighan_lin_clusters, get_gene_pathway_matrix, compute_groupped_matrix, get_full_matrix_from_bipartite

In [None]:
# Save the gene sets.
# The .npy file holds a pickled Python mapping wrapped in a 0-d object array,
# hence allow_pickle=True followed by .item() to unwrap it.
# NOTE: allow_pickle=True can execute arbitrary code on load; only use it on
# trusted files.
p1 = np.load('./raw_data/genesets/WikiPathways_2019_Human.npy', allow_pickle=True).item()
# Shallow-copy the loaded library into the combined dictionary. This line used
# to unpack `p6`, which is never defined in this notebook (NameError); `p1` is
# the only library loaded here.
res = {**p1}
np.save('./processed_data/genesets/all_paths.npy', res)
# Also export as CSV: one row per dictionary key (orient='index').
pd.DataFrame.from_dict(res, orient='index').to_csv('./processed_data/genesets/all_paths.csv')

#### Plot Jaccard indices

In [None]:
# Build the pathway/gene matrix from the gene sets saved above.
# NOTE(review): presumably a binary membership matrix with one row per
# pathway (rows are compared pairwise below) - confirm against
# get_gene_pathway_matrix.
mat = get_gene_pathway_matrix('./processed_data/genesets/all_paths.npy')

import numba as nb
@nb.njit()
def compute_jaccard(arr1, arr2):
    """Return the Jaccard index between two membership vectors.

    For 0/1 (or boolean) vectors of equal length this evaluates
    |A & B| / |A | B|, where A and B are the index sets marked in
    ``arr1`` and ``arr2``.

    The result is only meaningful for binary inputs. When both vectors
    are empty the union is zero and the division is undefined.
    """
    diff = arr1 - arr2
    # Entries set in arr1 but not in arr2.
    only_in_first = np.sum(diff > 0)
    # Entries set in exactly one of the two vectors (symmetric difference).
    in_exactly_one = np.sum(np.abs(diff))
    # Whatever is set in arr1 and not exclusive to it is shared with arr2.
    intersection = np.sum(arr1) - only_in_first
    union = in_exactly_one + intersection
    return intersection / union

@nb.njit(parallel=True)
def compute_all_jaccard(mat_array):
    """Return the full matrix of pairwise Jaccard indices between rows.

    ``mat_array`` is a 2-D array whose rows are membership vectors.
    The result is a square float array with one row and one column per
    input row; entry ``[r, c]`` is ``compute_jaccard`` of rows ``r`` and ``c``.
    """
    n_rows = mat_array.shape[0]
    result = np.empty((n_rows, n_rows))
    # Rows are distributed across threads. Numba only parallelises the
    # outermost prange of a nest, so the inner loop is a plain range.
    for r in nb.prange(n_rows):
        row_r = mat_array[r]
        for c in range(n_rows):
            result[r, c] = compute_jaccard(row_r, mat_array[c])
    return result

# Pairwise Jaccard similarity between all rows of the pathway/gene matrix.
mat_array = np.array(mat)
out = compute_all_jaccard(mat_array)

from sklearn.manifold import SpectralEmbedding

# One-dimensional spectral embedding of the similarity matrix; it is used
# only to derive an ordering that places similar rows next to each other.
embedding = SpectralEmbedding(n_components=1, affinity='precomputed')
embedding = embedding.fit_transform(out)

o = np.argsort(embedding.ravel())
# Draw each heatmap in its own figure. Without this, both imshow/colorbar
# pairs in this cell land in the same implicit figure, so the reference
# plot below overdraws this one and the colorbars stack up.
plt.figure()
# Log scale; the 1e-100 offset keeps log() finite for zero overlap, and
# vmin=-230 is approximately log(1e-100).
plt.imshow((np.log(out+1e-100)[o][:,o]), cmap='viridis', vmax=0, vmin=-230)
plt.colorbar()
plt.savefig('./pdf_figures/jaccard_all_paths.png')

# Reference picture: identity matrix (no overlap between distinct rows),
# rendered with the same colour scale. This figure is displayed, not saved.
x = np.zeros_like(out)
np.fill_diagonal(x, 1)
plt.figure()
plt.imshow((np.log(x+1e-100)), cmap='viridis', vmax=0, vmin=-230)
plt.colorbar()

#### Assess consistency across runs

In [None]:
# Run the Kernighan-Lin clustering N times with a different seed each time
# and record, for every run, the cluster label of each node and the loss.
from tqdm import tqdm

# Settings forwarded positionally to get_kernighan_lin_clusters.
C = 0
KL_modified = True
random_labels = True
unweighted = True

N = 1000
loss = np.empty(N)
# One row per run, one column per node (sum of both matrix dimensions).
# NOTE(review): `mat_sub` is not defined anywhere in the visible notebook -
# confirm where it comes from before running this cell.
clusters = np.empty((N, np.sum(mat_sub.shape)))

for i in tqdm(range(N)):
    frame, loss_temp = get_kernighan_lin_clusters(
        None, 50, C, KL_modified, random_labels, unweighted,
        seed=i, no_progress=True, mat=mat_sub,
    )
    frame.columns = ['cluster', 'description', 'is_gene']
    clusters[i] = np.array(frame['cluster'])
    loss[i] = loss_temp
    
@nb.njit(parallel=True)
def compute_jaccard_all_clust(arr1, arr2):
    """Return Jaccard indices between every cluster of two labelings.

    ``arr1`` and ``arr2`` are label vectors over the same nodes. With
    ``N`` the number of distinct labels in ``arr1``, the result is an
    ``N x N`` float array whose entry ``[i, j]`` is the Jaccard index
    between the nodes labelled ``i`` in ``arr1`` and the nodes labelled
    ``j`` in ``arr2``.

    Labels are looked up as the integers ``0 .. N-1``. The code therefore
    assumes both labelings use exactly that label range. A pair of labels
    that is absent from both vectors has an empty union, for which the
    Jaccard index is undefined.
    """
    N = len(np.unique(arr1))
    out = np.empty(shape=(N,N))
    for i in nb.prange(N):
        # The membership mask of cluster i does not depend on j, so it is
        # built once per row and not once per (i, j) pair.
        i_0 = (arr1==i)
        # Numba only parallelises the outermost prange of a nest, so the
        # inner loop is a plain range.
        for j in range(N):
            j_0 = (arr2==j)
            out[i,j] = compute_jaccard(i_0, j_0)
    return out

# Cluster-overlap heatmaps for two example pairs of runs, each followed
# further down by a shuffled-label baseline.
#
# Every plot starts with plt.figure(). Without it, all four imshow/colorbar
# pairs in this cell draw into the same implicit figure, so each later
# savefig would also contain the earlier images and their colorbars.
#
# NOTE(review): the embedding is fit on 1-T (a dissimilarity) although
# affinity='precomputed' treats its input as an affinity, and T compares
# two different runs so it need not be symmetric - confirm this ordering
# is what is intended.

# Example 1: run 5 vs run 1.
temp = compute_jaccard_all_clust(clusters[5], clusters[1])
T = temp
embedding = SpectralEmbedding(n_components=1, affinity='precomputed')
embedding = embedding.fit_transform(1-T)

o = np.argsort(embedding.ravel())
plt.figure()
plt.imshow(T[o][:,o], cmap='viridis')
plt.colorbar()
plt.savefig('./pdf_figures/example1.pdf')

# Example 2: run 10 vs run 11.
temp = compute_jaccard_all_clust(clusters[10], clusters[11])
T = temp
embedding = SpectralEmbedding(n_components=1, affinity='precomputed')
embedding = embedding.fit_transform(1-T)

o = np.argsort(embedding.ravel())
plt.figure()
plt.imshow(T[o][:,o], cmap='viridis')
plt.colorbar()
plt.savefig('./pdf_figures/example2.pdf')

# Baseline for example 1: the labels of run 5 are randomly permuted across
# nodes before the comparison. The permutation is unseeded, so this figure
# differs from execution to execution.
temp = compute_jaccard_all_clust(np.random.permutation(clusters[5]), clusters[1])

T = temp
embedding = SpectralEmbedding(n_components=1, affinity='precomputed')
embedding = embedding.fit_transform(1-T)

o = np.argsort(embedding.ravel())
plt.figure()
# Fixed 0..1 colour scale for the baseline plots.
plt.imshow(T[o][:,o], cmap='viridis', vmax=1, vmin=0)
plt.colorbar()
plt.savefig('./pdf_figures/example1_baseline.pdf')

# Baseline for example 2: same procedure with runs 10 and 11.
temp = compute_jaccard_all_clust(np.random.permutation(clusters[10]), clusters[11])

T = temp
embedding = SpectralEmbedding(n_components=1, affinity='precomputed')
embedding = embedding.fit_transform(1-T)

o = np.argsort(embedding.ravel())
plt.figure()
plt.imshow(T[o][:,o], cmap='viridis', vmax=1, vmin=0)
plt.colorbar()
plt.savefig('./pdf_figures/example2_baseline.pdf')

In [None]:
# quantify using existing cluster robustness method

#### show loss by method

In [None]:
# Histogram of the loss recorded for each Kernighan-Lin run.
plt.hist(loss, label='kernighan-lin')
plt.xlabel('loss')
plt.ylabel('frequency')
plt.legend()
plt.savefig('./pdf_figures/loss.pdf')