In [None]:
import pandas as pd
import numpy as np
import sys, os
import seaborn as sns
from scipy.stats import hypergeom
from sklearn.metrics import normalized_mutual_info_score
from topicpy.hsbmpy import get_max_available_L, get_scores, get_scores_shuffled, normalise_score, add_score_lines
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import multiprocessing as mp
from time import time

In [None]:
import logging

# Notebook-wide logger. The handler is looked up before being created so that
# re-running this cell does not stack a second StreamHandler on the same
# logger (which would print every message twice).
log = logging.getLogger("selections")
hdl = next((h for h in log.handlers if isinstance(h, logging.StreamHandler)), None)
if hdl is None:
    hdl = logging.StreamHandler()
    log.addHandler(hdl)
hdl.setLevel(logging.DEBUG)
log.setLevel(logging.DEBUG)

In [None]:
# Gene list read from a header-less text file and flattened to a 1-D array;
# its length is the population size in get_pval and it is the enrichr background.
population=pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None).values.ravel()

In [None]:
# Sample annotation table reduced to the SMTS and SMTSD columns and indexed by SAMPID.
df_files = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
# Diverging colormap passed as `cmap` to the NMI heatmaps further down.
palette = sns.diverging_palette(240, 10, sep=20, as_cmap=True)

In [None]:
# Root directory containing one sub-directory per experiment.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
work_dir = "/home/jovyan/work/phd/topics/datasets/"

In [None]:
# Algorithm names; each is used as a sub-directory and file prefix under an experiment directory.
algorithms = ["topsbm", "topsbm-log", "lda", "tm", "wgcna"]
# Experiment directories, relative to work_dir.
experiments = ["gtex10", "gtexhk", "random/random00", "random/random11", "random/random22", "random/random33", "random/random44", "random/random55", "random/random66", "random/random77", "random/random88", "random/random99"]
#experiments = ["gtex10","gtexall","gtexhk","random/random11"]

In [None]:
# Row index of each experiment's mainTable.csv, in the same order as `experiments`.
genes = [
    pd.read_csv(f"{work_dir}/{directory}/mainTable.csv", index_col=0).index
    for directory in experiments
]

In [None]:
def get_pval(setA, setB, population_size=None):
    """Hypergeometric p-value for the overlap between two gene sets.

    Computes the probability of drawing at least as many elements of ``setB``
    as are observed in ``setA`` when ``len(setA)`` elements are sampled
    without replacement from the population.

    Parameters
    ----------
    setA : pandas.Index (or any object exposing ``isin`` and ``len``)
        The sample.
    setB : sized collection accepted by ``setA.isin``
        The "success" set.
    population_size : int, optional
        Total number of elements in the population. Defaults to the length of
        the notebook-level ``population`` array, which is what the original
        implementation always used.

    Returns
    -------
    float
        ``P(X >= overlap)`` under the hypergeometric distribution.
    """
    if population_size is None:
        population_size = len(population)
    overlap = setA.isin(setB).sum()  # number of successes in the sample
    # sf(x - 1) = P(X > x - 1) = P(X >= x)
    return hypergeom.sf(overlap - 1, population_size, len(setB), len(setA))
#-np.log10(get_pval(setA, setB)+1)

In [None]:
# overlaps[i][j]: fraction of the entries of genes[i] that are also in genes[j].
overlaps = [
    [setA.isin(setB).sum() / float(len(setA)) for setB in genes]
    for setA in genes
]

In [None]:
# Cap the colour scale at the largest overlap strictly below 1, so that entries
# equal to 1 (every set compared with itself) do not flatten the rest of the
# map. Collecting the values first and using `default` avoids the ValueError
# that max() raises on an empty sequence when a row has no entry below 1.
partial_overlaps = [value for row in overlaps for value in row if value < 1]

cm = sns.clustermap(
    overlaps,
    row_cluster=False,
    col_cluster=False,
    vmin=0,
    vmax=max(partial_overlaps, default=1)
)

ax = cm.ax_heatmap
ax.yaxis.set_label_position("left")
ax.yaxis.tick_left()
ax.set_ylabel("Sets", fontsize=35)
ax.set_xlabel("Sets", fontsize=35)
ax.set_xticklabels(experiments, rotation=45)
ax.set_yticklabels(experiments, rotation=0)

ax.tick_params(labelsize=30)


bax = cm.ax_cbar
bax.tick_params(labelsize=25)
bax.set_title("% overlap", fontsize=35)

plt.show()
cm.savefig("selection_overlap.pdf")

In [None]:
# Algorithm read (as a module-level name) by get_scores_mp and get_exp_labels below.
algorithm = "topsbm"

In [None]:
def get_scores_mp(directory):
    """Pool worker: call get_scores for one experiment directory.

    Uses the notebook-level ``work_dir``, ``algorithm`` and ``df_files``.
    Returns ``(directory, scores)`` where ``scores`` is the "SMTS" entry of
    the get_scores result, so the callback can tell which experiment the
    result belongs to.
    """
    smts_scores = get_scores(work_dir+directory, ["SMTS"], algorithm=algorithm, df_files=df_files)["SMTS"]
    return directory, smts_scores

def get_scores_clbck(packed_score):
    """Pool callback: store a ``(directory, scores)`` pair in the global ``scores`` dict."""
    global scores
    key = packed_score[0]
    value = packed_score[1]
    scores[key] = value

In [None]:
scores = {}

start = time()

pool = mp.Pool(12)

# One asynchronous task per experiment; get_scores_clbck fills `scores`
# as results arrive and worker exceptions are printed.
work = [
    pool.apply_async(
        get_scores_mp,
        args=(directory,),
        callback=get_scores_clbck,
        error_callback=lambda err: print(err),
    )
    for directory in experiments
]

# No more tasks will be submitted; wait for all of them to finish.
pool.close()
pool.join()

print(time()-start)

In [None]:
fig, ax = plt.subplots(figsize=(18,15))
# Draw the collected scores on `ax`, labelled with the experiment names.
add_score_lines(ax, scores, labels = experiments)
ax.set_title(algorithm, fontsize=30)
ax.set_xlim(1,800)
# The output file name carries the algorithm, so each algorithm gets its own PDF.
fig.savefig(f"scores_selections_{algorithm}.pdf")

In [None]:
def get_exp_labels(experiment):
    """Pool worker: cluster label of every known sample for one experiment.

    Reads the notebook-level ``files``, ``work_dir`` and ``algorithm``.
    The hierarchy level comes from a hand-written table when the experiment
    is listed there, otherwise it is ``get_max_available_L(...) - 1``.

    Returns a Series named after the experiment and indexed by the entries of
    ``files.index`` that occur anywhere in the clusters table. Each value is
    the second space-separated token of the header of the first column
    containing that sample.
    """
    global files
    levels = {
        "gtex10":{
            "topsbm": 2,
            "topsbm-log": 2,
            "lda": 3,
            "tm":0,
            "wgcna":0
        },
        "gtexhk":{
            "topsbm": 1,
            "lda": 1,
            "tm":0,
            "wgcna":0
        }
    }
    if experiment in levels:
        level = levels[experiment][algorithm]
    else:
        level = get_max_available_L(work_dir+experiment, algorithm)-1
    clusters_table = pd.read_csv(f"{work_dir}/{experiment}/{algorithm}/{algorithm}_level_{level}_clusters.csv")
    present_samples = files.index[files.index.isin(clusters_table.values.ravel())]
    sample_labels = [
        clusters_table.columns[clusters_table[clusters_table==sample].any(0)].values[0].split(" ")[1]
        for sample in present_samples
    ]
    return pd.Series(name=experiment, data=sample_labels, index = present_samples)

def exp_clbck(exp_series):
    """Pool callback: outer-join one experiment's label Series onto the global ``files`` frame."""
    global files
    joined = files.join(exp_series, how="outer")
    files = joined

In [None]:
# Experiments compared in the cluster-overlap analysis below
# (same values as the definition near the top of the notebook).
experiments = ["gtex10", "gtexhk", "random/random00", "random/random11", "random/random22", "random/random33", "random/random44", "random/random55", "random/random66", "random/random77", "random/random88", "random/random99"]
#experiments = ["gtex10", "gtexhk", "random/random11", "random/random22", "random/random33"]

In [None]:
# Start from an empty frame indexed by sample id; exp_clbck adds one column per experiment.
files = pd.DataFrame(index=df_files.index)
scores = {}

start = time()

pool = mp.Pool(12)

work = [
    pool.apply_async(
        get_exp_labels,
        args=(directory,),
        callback=exp_clbck,
        error_callback=lambda err: print(err),
    )
    for directory in experiments
]

# No more tasks will be submitted; wait for all of them to finish.
pool.close()
pool.join()

print(time()-start)

In [None]:
# Largest label per experiment column, ignoring NaN (samples without a label in that experiment).
files.astype(float).apply(np.nanmax,0).astype(int)

In [None]:
# One row per column of `files` (one per experiment), one entry per sample.
partitions = files.transpose().values

In [None]:
def score(A, B):
    """Normalized mutual information between two label arrays.

    Only positions where neither array is NaN (after casting to float) are
    compared.
    """
    a_missing = np.isnan(A.astype(float))
    b_missing = np.isnan(B.astype(float))
    both_present = ~a_missing & ~b_missing
    return normalized_mutual_info_score(A[both_present], B[both_present])

# Pairwise NMI between every pair of experiment partitions.
partition_overlap = [
    [score(partitionA, partitionB) for partitionB in partitions]
    for partitionA in partitions
]

In [None]:
# Heatmap of the pairwise NMI matrix; clustering is disabled on both axes so
# rows and columns keep the order of `experiments`.
cm = sns.clustermap(
    partition_overlap, 
    row_cluster=False,
    col_cluster=False,
    vmin=0.0,
    cmap=palette
)

ax = cm.ax_heatmap
ax.set_title(algorithm, fontsize=35)
ax.yaxis.set_label_position("left")
ax.yaxis.tick_left()
ax.set_ylabel("Clusters sets", fontsize=35)
ax.set_xlabel("Clusters sets", fontsize=35)
ax.tick_params(labelsize=30)
ax.set_xticklabels(experiments, rotation=45)
ax.set_yticklabels(experiments, rotation=0)

# Colour-bar axis of the cluster grid.
bax = cm.ax_cbar
bax.tick_params(labelsize=25)
bax.set_title("NMI", fontsize=35)

plt.show()
cm.savefig(f"cluster_overlap_{algorithm}.pdf")

# Compare algorithms

## topics

In [None]:
def get_exp_topic_labels(algorithm, experiment):
    """Pool worker: topic code of every known gene for one algorithm.

    Reads the notebook-level ``files_topic`` and ``work_dir``. The hierarchy
    level is taken from a hand-written table; an experiment/algorithm pair
    missing from it raises ``KeyError``.

    Returns a Series named after the algorithm and indexed by the entries of
    ``files_topic.index`` that occur anywhere in the topics table. Each value
    is the integer code of the first column containing that gene.
    """
    global files_topic
    levels = {
        "gtex10":{
            "topsbm": 3,
            "topsbm-log": 2,
            "lda": 2,
            "tm":0,
            "wgcna":0
        },
        "random/random11":{
            "topsbm": 1,
            "topsbm-log": 1,
            "lda": 2,
            "tm":0,
            "wgcna":0
        },
        "random/random22":{
            "topsbm": 2,
            "topsbm-log": 1,
            "lda": 1,
            "tm":0,
            "wgcna":0
        },
        "gtexhk":{
            "topsbm": 2,
            "lda": 1,
            "tm":0,
            "wgcna":0
        }
    }
    level = levels[experiment][algorithm]
    topics_table = pd.read_csv(f"{work_dir}/{experiment}/{algorithm}/{algorithm}_level_{level}_topics.csv")
    present_genes = files_topic.index[files_topic.index.isin(topics_table.values.ravel())]
    # Replace each header by its position among the sorted unique headers.
    topics_table.columns = np.unique(topics_table.columns, return_inverse=True)[1]
    gene_labels = [
        topics_table.columns[topics_table[topics_table==gene].any(0)].values[0]
        for gene in present_genes
    ]
    return pd.Series(name=algorithm, data=gene_labels, index = present_genes)

def exp_topic_clbck(exp_series):
    """Pool callback: outer-join one algorithm's topic Series onto the global ``files_topic`` frame."""
    global files_topic
    joined = files_topic.join(exp_series, how="outer")
    files_topic = joined

In [None]:
# Experiment used for the algorithm comparison; with the list defined above, index 4 is "random/random22".
experiment = experiments[4]

In [None]:
# Start from an empty frame indexed by gene; exp_topic_clbck adds one column per algorithm.
files_topic = pd.DataFrame(index=population)
scores = {}

start = time()

pool = mp.Pool(6)

work = [
    pool.apply_async(
        get_exp_topic_labels,
        args=(alg, experiment),
        callback=exp_topic_clbck,
        error_callback=lambda err: print(err),
    )
    for alg in algorithms
]

# No more tasks will be submitted; wait for all of them to finish.
pool.close()
pool.join()

print(time()-start)

In [None]:
# Largest topic code per algorithm column.
files_topic.max(axis=0).astype(int)

In [None]:
# One row per column of `files_topic` (one per algorithm), one entry per gene.
partitions_topics = files_topic.transpose().values

In [None]:
def score(A, B):
    """Normalized mutual information over the positions where both arrays are non-NaN.

    NOTE(review): this repeats the ``score`` definition from the cluster-set
    comparison earlier in the notebook; the later definition wins.
    """
    comparable = np.logical_not(np.isnan(A.astype(float)) | np.isnan(B.astype(float)))
    labels_a = A[comparable]
    labels_b = B[comparable]
    return normalized_mutual_info_score(labels_a, labels_b)

# Pairwise NMI between the topic partitions of every pair of algorithms.
partition_topics_overlap = [
    [score(partitionA, partitionB) for partitionB in partitions_topics]
    for partitionA in partitions_topics
]

In [None]:
# Heatmap of the pairwise NMI matrix between algorithms; clustering is disabled
# on both axes so rows and columns keep the column order of `files_topic`.
cm = sns.clustermap(
    partition_topics_overlap, 
    row_cluster=False,
    col_cluster=False,
    vmin=0.0,
    cmap = palette
)

ax = cm.ax_heatmap
ax.set_title(experiment, fontsize=35)
ax.yaxis.set_label_position("left")
ax.yaxis.tick_left()
ax.set_ylabel("Topics", fontsize=35)
ax.set_xlabel("Topics", fontsize=35)
ax.tick_params(labelsize=30)
# Tick labels are the algorithm names, i.e. the columns joined by the callback.
ax.set_xticklabels(files_topic.columns, rotation=45)
ax.set_yticklabels(files_topic.columns, rotation=0)

# Colour-bar axis of the cluster grid.
bax = cm.ax_cbar
bax.tick_params(labelsize=25)
bax.set_title("NMI", fontsize=35)

plt.show()
#cm.savefig(f"topic_overlap_{experiment}.pdf")

## clusters

In [None]:
def get_exp_cluster_labels(algorithm, experiment):
    """Pool worker: cluster code of every known sample for one algorithm.

    Reads the notebook-level ``files_cluster`` and ``work_dir``. The hierarchy
    level is fixed per algorithm by a hand-written table; an algorithm missing
    from it raises ``KeyError``.

    Returns a Series named after the algorithm and indexed by the entries of
    ``files_cluster.index`` that occur anywhere in the clusters table. Each
    value is the integer code of the first column containing that sample.
    """
    # The frame actually read here is files_cluster; the original declared
    # `global files_topic`, which named the wrong frame.
    global files_cluster
    levels = {
        "topsbm": 1,
        "topsbm-log":1,
        "lda": 1,
        "tm":0,
        "wgcna":1
    }
    level = levels[algorithm]
    df_clusters = pd.read_csv(f"{work_dir}/{experiment}/{algorithm}/{algorithm}_level_{level}_clusters.csv")
    new_idx = files_cluster.index[files_cluster.index.isin(df_clusters.values.ravel())]
    # Replace each header by its position among the sorted unique headers.
    df_clusters.columns = np.unique(df_clusters.columns, return_inverse=True)[1]
    exp_data = [
        df_clusters.columns[df_clusters[df_clusters==file].any(0)].values[0]
        for file in new_idx
    ]
    return pd.Series(name=algorithm, data=exp_data, index = new_idx)

def exp_topic_clbck(exp_series):
    """Pool callback: outer-join one algorithm's cluster Series onto the global ``files_cluster`` frame.

    NOTE(review): this reuses the name of the topic callback defined earlier
    in the notebook and replaces it; re-running the topic pool cell after this
    one would join into ``files_cluster`` instead of ``files_topic``.
    """
    global files_cluster
    joined = files_cluster.join(exp_series, how="outer")
    files_cluster = joined

In [None]:
# Start from an empty frame indexed by sample id; the callback adds one column per algorithm.
files_cluster = pd.DataFrame(index=df_files.index)
scores = {}

start = time()

pool = mp.Pool(4)

work = [
    pool.apply_async(
        get_exp_cluster_labels,
        args=(alg, experiment),
        callback=exp_topic_clbck,
        error_callback=lambda err: print(err),
    )
    for alg in algorithms
]

# No more tasks will be submitted; wait for all of them to finish.
pool.close()
pool.join()

print(time()-start)

In [None]:
# Largest cluster code per algorithm column.
files_cluster.max(axis=0).astype(int)

In [None]:
# One row per column of `files_cluster` (one per algorithm), one entry per sample.
partitions_clusters = files_cluster.transpose().values

In [None]:
# Pairwise score between the cluster partitions of every pair of algorithms.
partition_clusters_overlap = [
    [score(partitionA, partitionB) for partitionB in partitions_clusters]
    for partitionA in partitions_clusters
]

In [None]:
# Heatmap of the pairwise score matrix between algorithms; clustering is disabled
# on both axes so rows and columns keep the column order of `files_cluster`.
cm = sns.clustermap(
    partition_clusters_overlap, 
    row_cluster=False,
    col_cluster=False,
    vmin=0.0,
    cmap = palette
)

ax = cm.ax_heatmap
ax.set_title(experiment, fontsize=35)
ax.yaxis.set_label_position("left")
ax.yaxis.tick_left()
ax.set_ylabel("Clusters", fontsize=35)
ax.set_xlabel("Clusters", fontsize=35)
ax.tick_params(labelsize=30)
# Tick labels are the algorithm names, i.e. the columns joined by the callback.
ax.set_xticklabels(files_cluster.columns, rotation=45)
ax.set_yticklabels(files_cluster.columns, rotation=0)

# Colour-bar axis of the cluster grid.
bax = cm.ax_cbar
bax.tick_params(labelsize=25)
bax.set_title("NMI", fontsize=35)

plt.show()
#cm.savefig(f"cluster_overlap_{experiment}.pdf")

# Rank and scores

In [None]:
def get_max_allscores_mp(experiment, algorithm):
    """Pool worker: best normalised "V" score of one algorithm on one experiment.

    Calls ``get_scores`` for the "SMTS" label and ``get_scores_shuffled`` for
    the baseline. The baseline "V" entry is replaced by the average over five
    further shuffled runs before ``normalise_score`` is applied with the
    shuffle as base.

    Returns
    -------
    tuple or None
        ``(algorithm, experiment, max_V)`` on success. ``None`` if any step
        raises; the exception is logged so that one failing pair does not
        abort the rest of the pool.
    """
    log.info(f"{experiment},{algorithm}")
    try:
        scores = {}
        scores["data"] = get_scores(work_dir+experiment, ["SMTS"], algorithm=algorithm, df_files=df_files)["SMTS"]
        scores["shuffle"] = get_scores_shuffled(work_dir+experiment, label="SMTS", algorithm=algorithm, df_files=df_files)

        # Average several shuffles to smooth the baseline.
        shuffled_v = [
            get_scores_shuffled(work_dir+experiment, label="SMTS", algorithm=algorithm, df_files=df_files)["V"]
            for _ in range(5)
        ]
        scores["shuffle"]["V"] = np.average(shuffled_v, 0)

        normalise_score(scores, base_algorithm="shuffle")
        return algorithm, experiment, max(scores["data"]["V"])
    except Exception as err:
        # Deliberately best-effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare `except:`.
        log.error(err)
        return None
    
def get_allscores_clbck(packed_score):
    """Pool callback: store one ``(algorithm, experiment, score)`` result in ``all_scores``.

    ``None`` results (failed workers) are ignored.
    """
    global all_scores
    if packed_score is None:
        return
    algorithm_key = packed_score[0]
    experiment_key = packed_score[1]
    all_scores[algorithm_key][experiment_key] = packed_score[2]

In [None]:
all_scores = {}

start = time()
pool = mp.Pool(12)

# Loop variables are deliberately not called `algorithm` / `experiment`: those
# names hold notebook-level settings read by other cells and workers, and a
# plain `for algorithm in ...` would silently overwrite them.
for alg_name in algorithms:
    log.debug(alg_name)
    all_scores[alg_name] = {}
    for exp_name in experiments:
        pool.apply_async(
            get_max_allscores_mp,
            args=(exp_name, alg_name),
            callback=get_allscores_clbck,
            error_callback=lambda err: log.error(err),
        )

pool.close()

log.info("close")

pool.join()

print(time()-start)

In [None]:
fig = go.Figure()

# One bar trace per algorithm. Each bar gets an explicit x position equal to
# the experiment's index in `experiments`, so that when a score is missing
# (failed worker) the remaining bars stay under their own tick labels instead
# of shifting left.
fig.add_traces(
    [
        go.Bar(
            x=[position for position, exp_name in enumerate(experiments) if exp_name in alg_scores],
            y=[alg_scores[exp_name] for exp_name in experiments if exp_name in alg_scores],
            name=alg_name,
        )
        for alg_name, alg_scores in all_scores.items()
    ]
)



layout={
    "xaxis":{
        "tickmode":"array",
        "tickvals": list(range(len(experiments))),
        "ticktext": experiments
    },
    "yaxis":{
        "title":"NMI"
    }
}

fig.update_layout(layout)
fig.write_image("scores_per_selection.pdf", engine="kaleido")

In [None]:
# Per algorithm, the scores of the experiments whose name contains "random".
randoms_data = {
    alg_name: [value for exp_name, value in exp_scores.items() if "random" in exp_name]
    for alg_name, exp_scores in all_scores.items()
}

fig = go.Figure()

# One box per algorithm.
fig.add_traces(
    [go.Box(y=values, name=alg_name) for alg_name, values in randoms_data.items()]
)



layout={
    "title": "random_selections",
    "xaxis":{
        "tickmode":"array",
        "tickvals": list(range(len(randoms_data))),
        "ticktext": [algo for algo, values in randoms_data.items() if len(values)>0]
    },
    "yaxis":{
        "title":"NMI",
        "range":[0.5,1]
    }
}

fig.update_layout(layout)
fig.write_image("random_selections.pdf", engine="kaleido")

In [None]:
# Point plotly's orca export backend at a specific executable and persist the setting.
# NOTE(review): the write_image calls in this notebook pass engine="kaleido",
# so this orca configuration looks unused; confirm before keeping it.
import plotly
plotly.io.orca.config.executable="/opt/conda/bin/orca"
plotly.io.orca.config.save()

In [None]:
# Enable the use_xvfb option of the orca configuration and persist it.
import plotly.io as pio
pio.orca.config.use_xvfb = True
pio.orca.config.save()

## Stable genes

In [None]:
def get_topic(algorithm, experiment):
    """Generate the topics of one algorithm on one experiment.

    The hierarchy level is taken from a hand-written table; an
    experiment/algorithm pair missing from it raises ``KeyError`` when the
    generator is first advanced.

    Yields
    ------
    numpy.ndarray
        The non-null entries of one column of the topics table, one array per
        column, in column order.
    """
    levels = {
        "gtex10":{
            "topsbm": 3,
            "topsbm-log": 2,
            "lda": 2,
            "tm":0,
            "wgcna":0
        },
        "random/random11":{
            "topsbm": 1,
            "topsbm-log": 1,
            "lda": 2,
            "tm":0,
            "wgcna":0
        },
        "random/random22":{
            "topsbm": 2,
            "topsbm-log": 1,
            "lda": 1,
            "tm":0,
            "wgcna":0
        },
        "gtexhk":{
            "topsbm": 2,
            "lda": 1,
            "tm":0,
            "wgcna":0
        }
    }
    level = levels[experiment][algorithm]
    topics_table = pd.read_csv(f"{work_dir}/{experiment}/{algorithm}/{algorithm}_level_{level}_topics.csv")
    yield from (topics_table[column].dropna().values for column in topics_table.columns)

In [None]:
# Rebinds `genes` (earlier in the notebook: a list of per-experiment gene indexes)
# to a frame indexed by gene id, with the running row position in column "idx".
genes = pd.read_csv("http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None, index_col=0)
genes["idx"]=list(range(genes.shape[0]))
# Spot check: position assigned to one gene id.
genes.at["ENSG00000000419","idx"]

In [None]:
import networkx as nx
import graph_tool.all as gt

In [None]:
# Empty graph with a string vertex property "name", later filled with the gene id of each vertex.
G = gt.Graph()
name = G.vp["name"] = G.new_vertex_property("string")

In [None]:
# One vertex per gene, created in "idx" order; the assertion checks that each
# new vertex compares equal to the gene's "idx" value.
for gene_id, gene_idx in genes["idx"].items():
    v = G.add_vertex()
    assert(v==gene_idx)
    name[v]=gene_id

In [None]:
# For every algorithm, connect each pair of genes that share a topic. Parallel
# edges accumulate, so the edge multiplicity counts how many algorithms put two
# genes in the same topic.
# The loop variable is not called `algorithm` so the notebook-level
# `algorithm` setting is not overwritten.
# NOTE(review): the slice starts at the gene itself (`[position:]`), so every
# gene also gets a self-loop per topic. Confirm this is intended; use
# `[position + 1:]` if not.
for alg_name in ["topsbm", "lda", "tm", "wgcna"]:
    print(alg_name)
    for topic_genes in get_topic(alg_name, "gtex10"):
        # Resolve each gene id to its vertex index once per topic instead of
        # once per pair.
        topic_vertices = [genes.at[gene_id, "idx"] for gene_id in topic_genes]
        for position, source in enumerate(topic_vertices):
            G.add_edge_list((source, target) for target in topic_vertices[position:])

In [None]:
# Keep only vertices for which get_all_neighbors returns more than one entry.
filter_degree = G.new_vertex_property("bool")
for v in G.vertices():
    filter_degree[v] = len(G.get_all_neighbors(v)) > 1
    
# Order matters: the filter must be active when purge_vertices() is called,
# and is cleared afterwards.
G.set_vertex_filter(filter_degree)
G.purge_vertices()
G.clear_filters()
# NOTE(review): after the purge, vertex indices presumably no longer match
# genes["idx"]; the "name" vertex property is the reliable way back to gene ids.
G

In [None]:
# Dense adjacency matrix of G; rows and columns are vertex indices of G.
adjacency = gt.adjacency(G).toarray()

In [None]:
# Largest entry of the adjacency matrix.
adjacency.max()

In [None]:
# Weighted networkx graph keeping only adjacency entries greater than 5;
# node ids are the row/column positions in `adjacency`.
g = nx.Graph()
for irow, row in enumerate(adjacency):
    for icol, w in enumerate(row):
        if w > 5:
            g.add_edge(irow, icol, weight=int(w))

In [None]:
# Quick look at the thresholded graph using networkx's random layout.
nx.draw_random(g)

Community detection with the Louvain method, using [python-louvain](https://python-louvain.readthedocs.io/en/latest/api.html).

In [None]:
import community as community_louvain

In [None]:
# Louvain partition of the thresholded graph.
# NOTE(review): no random state is passed, so the result may vary between runs; confirm whether that matters.
partition = community_louvain.best_partition(g)

In [None]:
# Add a local fastconsensus directory to the import path so fast_consensus can be imported.
# NOTE(review): the path is relative to the notebook's working directory.
import sys
sys.path.append("../../Developer/fastconsensus/")

In [None]:
%load_ext autoreload
%autoreload 2
from fast_consensus import fast_consensus, group_to_partition
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [None]:
# Result of fast_consensus on the thresholded graph; the cells below index it
# like a sequence whose elements expose .items() (node -> partition label).
output = fast_consensus(g)

In [None]:
# The sub-graph that gets drawn does not depend on the partition, so it is
# built once. Only nodes with degree above 9 are kept (a set gives O(1)
# membership tests), together with the positive-weight edges between them.
nodes = {node for node, degree in g.degree() if degree > 9}
edges = [
    edge for edge in g.edges(data=True)
    if edge[2]["weight"] > 0 and edge[0] in nodes and edge[1] in nodes
]
# Edge widths are scaled to at most 5; `default` covers an empty edge list.
max_weight = max((edge[2]["weight"] for edge in edges), default=1)
weights = [5*float(edge[2]["weight"])/max_weight for edge in edges]

g_plot = nx.Graph()
g_plot.add_edges_from(edges)

for partition in output:
    # Draw exactly the nodes that have a layout position (those in g_plot) and
    # a label, and build the colours in that same order so each node is
    # coloured by its own partition.
    plot_nodes = [node for node in g_plot.nodes() if node in partition]
    if not plot_nodes:
        continue
    node_colors = [partition[node] for node in plot_nodes]

    pos = nx.spring_layout(g_plot, k=0.1)

    fig, ax = plt.subplots(figsize=(9,8))
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which recent matplotlib
    # releases no longer provide.
    cmap = plt.get_cmap('viridis', max(node_colors) + 1)
    nx.draw_networkx_nodes(g_plot,
                           pos,
                           plot_nodes,
                           node_size=250,
                           cmap=cmap,
                           node_color=node_colors,
                           ax=ax)

    nx.draw_networkx_edges(g_plot,
                           pos,
                           edges,
                           width=weights,
                           alpha=0.5,
                           ax=ax)
    plt.show()

In [None]:
# Last element of `output` as a two-column table. Despite its name, the "ensg"
# column holds the partition's keys (graph node ids), not gene id strings.
df_partition = pd.DataFrame(data=output[-1].items(), columns=["ensg", "partition"])
# Partition sizes, largest first.
df_partition.groupby("partition").count().sort_values("ensg", ascending=False)

In [None]:
# Print the gene ids of partition 52.
# Node ids are vertex indices of G as it was when the adjacency matrix was
# built, i.e. after purge_vertices() re-indexed the remaining vertices. They
# therefore no longer match genes["idx"]; the "name" vertex property travels
# with each vertex and gives the correct gene id.
for node in df_partition[df_partition["partition"]==52]["ensg"].values:
    print(G.vp["name"][G.vertex(int(node))])

In [None]:
from gseapy import enrichr

In [None]:
# Enrichment analysis of the genes in partition 9.
# Node ids are vertex indices of the purged graph G, so gene ids are resolved
# through the "name" vertex property rather than through genes["idx"], which
# refers to the numbering before purge_vertices().
enrichr([G.vp["name"][G.vertex(int(node))] for node in df_partition[df_partition["partition"]==9]["ensg"].values],
       gene_sets = ['GO_Molecular_Function_2018',
             'GO_Biological_Process_2018',
             'GO_Cellular_Component_2018',
             'Human_Phenotype_Ontology',
             'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
             'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
             'Tissue_Protein_Expression_from_Human_Proteome_Map',
             'KEGG_2019_Human',
             'NCI-60_Cancer_Cell_Lines',
             '../MSigDB/c1.all.v7.1.symbols.gmt',
            '../MSigDB/c2.all.v7.1.symbols.gmt',
            '../MSigDB/c3.all.v7.1.symbols.gmt',
            '../MSigDB/c4.all.v7.1.symbols.gmt',
            '../MSigDB/c5.all.v7.1.symbols.gmt',
            '../MSigDB/c6.all.v7.1.symbols.gmt',
            '../MSigDB/c7.all.v7.1.symbols.gmt',
            '../MSigDB/c8.all.v7.1.symbols.gmt',
            '../MSigDB/h.all.v7.1.symbols.gmt',
            ],
       background=population,
       cutoff=0.05).results

In [None]:
# Write the node/partition table to disk.
# NOTE(review): this runs before the cell that adds the "gene" column, so on a
# top-to-bottom run the file does not contain gene ids.
df_partition.to_csv("partitions.csv")

In [None]:
# Gene id of every node. Node ids index the vertices of the purged graph G, so
# they are resolved through the "name" vertex property rather than through
# genes["idx"], which refers to the numbering before purge_vertices().
df_partition["gene"]=[G.vp["name"][G.vertex(int(node))] for node in df_partition["ensg"]]