# Generating Mutual Rank correlation networks (lossy)

This notebook produces Mutual Rank networks where some of the original correlation information is lost.\
In order to reduce the size of the all-vs-all correlation table (often a 40k x 40k table), the results are\
rounded and converted to int8 and then saved in a compressed datatable JAY format. Because int8 can store\
up to 256 unique integers, all MR values above 250 are discarded (although they are rarely used anyway).\
The improved compression had only minor effects on the final observed networks but it reduced the size\
of the MR correlation file from >20gb to about 2gb, which was helpful when working with multiple gene\
expression datasets.\
\
The code in this notebook can run a full analysis of an MR correlation network as described in (Wisecaver, 2017)\
DOI: 10.1105/tpc.17.00009. It requires enough RAM to load 2 ranked all-vs-all correlation tables at the same time,\
and a workstation with ~96gb RAM is recommended for large datasets. It is fairly fast, taking about 1-2 hours per\
dataset, but I have not benchmarked it against other methods.\
\
This code has only been tested on linux and might be hard to get to work in other environments.\
For now you need to install **csvformat** (part of csvkit): `sudo apt install csvkit`\
(I think datatable added an option to save as TSV, so csvkit might not be needed.)

In [None]:
import os
import math
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import datatable as dt
import networkx as nx
import numpy as np
from nancorrmp.nancorrmp import NaNCorrMp
from multiprocessing import  Pool

# Functions for generating the MR correlation networks

### Main MR network function

In [None]:
def _run_shell(command):
    """Run *command* through the shell and raise if it exits with a non-zero status."""
    status = os.system(command)
    if status != 0:
        raise RuntimeError("Command failed with status "+str(status)+": "+command)


def mr_network(name, e_val):
    """Build the ClusterONE-filtered MR coexpression network for one dataset.

    Reads the compressed Mutual Rank table "mrs/<name>.mrs.jay", converts it to
    exponential-decay edge weights, clusters the resulting edge list with
    ClusterONE, keeps only edges whose two nodes share a significant cluster
    (P-value < 0.1) and writes them, with their MR value restored, to
    "clusterONE/<name>_<e_val>.tsv" (tab separated, no header).

    Parameters
    ----------
    name : str
        Dataset key; selects the input MR table and prefixes the output file.
    e_val : int or float
        Exponential decay constant: weight = exp(-(MR - 1) / e_val).

    Raises
    ------
    RuntimeError
        If csvformat or ClusterONE exits with a non-zero status.

    Notes
    -----
    Requires `csvformat` on the PATH and `cluster_one-1.0.jar` in the working
    directory. Fixed temporary file names are used in the working directory, so
    two runs must not share a directory at the same time.
    """
    # Must be global: is_in_cluster reads it inside the pool workers (this
    # relies on workers inheriting module globals, i.e. a fork start method).
    global dfone

    # The table stores (MR - 125) as int8; read as int16 so adding 125 cannot overflow.
    corr = dt.fread("mrs/"+name+".mrs.jay").to_pandas().astype("int16")
    corr.index = corr.columns
    corr = corr + 125
    # A restored value of 0 marks a discarded pair (MR > 250). Map it to +inf so
    # its decay weight is exactly 0 for every e_val. A finite sentinel such as
    # 251 yields exp(-250/e_val) >= 0.01 once e_val exceeds ~54, which would
    # turn every discarded pair into an edge.
    corr = corr.mask(corr == 0, np.inf)

    print("3. Converting MR matrix using exponential decay (ED) value of: "+str(e_val))
    corr = np.exp(-(corr-1.0)/e_val)
    corr = corr.mask(corr < 0.01, 0)
    # A row/column whose weights sum to exactly 1 holds only its own diagonal
    # entry (MR = 1 -> weight 1), i.e. the node has no edges; drop it.
    corr = corr.loc[(corr.sum(axis=1) != 1), (corr.sum(axis=0) != 1)]

    print("4. Processing ED matrix in order produce an edge list file")
    # Blank out the lower triangle and the diagonal so each pair is listed once.
    n_nodes = len(corr)
    corr = corr.mask(np.arange(n_nodes)[:,None] >= np.arange(n_nodes))
    # Make sure the file doesn't exist because the chunks below append to it.
    if os.path.exists("temp_edge_list.csv"): os.remove("temp_edge_list.csv")
    # Unstack 1000 rows at a time to bound the size of the long-format frame.
    # ClusterONE doesn't work with a header, so none is written.
    for start in range(0, n_nodes, 1000):
        edges = corr.iloc[start:start+1000].unstack().reset_index()
        # NaN (masked triangle) compares False, so this also drops masked cells.
        edges = edges[edges[0] > 0]
        dt.Frame(edges).to_csv("temp_edge_list.csv", append=True, header=False)
    # ClusterONE only reads tsv edge lists and dt only saves csv, so convert.
    _run_shell("csvformat -T temp_edge_list.csv > temp_edge_list.tsv")
    # Once converted to tsv, the temp csv edge list can be deleted.
    if os.path.exists("temp_edge_list.csv"): os.remove("temp_edge_list.csv")

    print("5. Using ClusterONE to generate a list of clusters from the edge list")
    _run_shell('java -jar cluster_one-1.0.jar temp_edge_list.tsv -f "edge_list" -F "csv" > temp_clusterONE.csv')

    print("6. Starting processing of ClusterONE clusters")
    file_name = name+'_'+str(e_val)
    # Edge list has 3 columns and no header: node1, node2, weight.
    edge_df = pd.read_csv('temp_edge_list.tsv', sep="\t", header=None)
    # ClusterONE results; keep significant clusters only. The copy avoids
    # assigning into a slice of the unfiltered frame.
    dfone = pd.read_csv('temp_clusterONE.csv')
    dfone = dfone[dfone["P-value"]<0.1].copy()
    # Convert the space-separated member string of each cluster to a list.
    dfone["Members"] = dfone["Members"].apply(lambda x: x.split(" "))
    if os.path.exists("temp_edge_list.tsv"): os.remove("temp_edge_list.tsv")
    if os.path.exists("temp_clusterONE.csv"): os.remove("temp_clusterONE.csv")

    edges_in_clusters_df = parallel_is_in_cluster(edge_df, is_in_cluster)
    # Invert the decay transform to recover the (rounded) MR value.
    edges_in_clusters_df[2] = edges_in_clusters_df[2].apply(lambda x: 1 - math.log(x) * e_val)
    edges_in_clusters_df.columns = ["edge1", "edge2", "MR"]
    os.makedirs("clusterONE", exist_ok=True)
    edges_in_clusters_df.to_csv("clusterONE/"+file_name+".tsv", sep="\t", header=False, index=False)

    print("7. MR based coexpression network processing is complete ")

### Parallelize search for MR values for MR network edges
This is not really necessary, but I like having the MR values in the final network edge file.\
I will try to add an option to skip this step, which would save a couple of minutes of analysis for each network.

In [None]:
def is_in_cluster(splt):
    """Keep only the edges whose two nodes occur together in a ClusterONE cluster.

    Parameters
    ----------
    splt : pandas.DataFrame
        Chunk of the edge list; column 0 and column 1 hold the two node names.

    Returns
    -------
    pandas.DataFrame
        The rows of `splt` (original index and columns) for which at least one
        entry of the global `dfone["Members"]` contains both nodes.
    """
    # Index the clusters once per chunk: node -> ids of the clusters containing it.
    # An edge is inside a cluster exactly when its two nodes share a cluster id,
    # which avoids scanning every cluster for every edge.
    clusters_of = {}
    for cluster_id, members in enumerate(dfone["Members"]):
        for node in members:
            clusters_of.setdefault(node, set()).add(cluster_id)

    no_clusters = frozenset()
    keep = [
        not clusters_of.get(node_a, no_clusters).isdisjoint(clusters_of.get(node_b, no_clusters))
        for node_a, node_b in zip(splt[0], splt[1])
    ]
    # Positional boolean mask; an explicit bool array also behaves for an empty chunk.
    return splt.loc[np.asarray(keep, dtype=bool)]

def parallel_is_in_cluster(df, func, n_cores=32):
    """Apply `func` to row-wise chunks of `df` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split into consecutive row blocks.
    func : callable
        Picklable function taking a DataFrame chunk and returning a DataFrame.
    n_cores : int, default 32
        Maximum number of worker processes (and chunks).

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original chunk order.
    """
    n_rows = len(df)
    # Never start more workers than there are rows; always keep one chunk so an
    # empty frame still goes through `func` and yields a frame of the right shape.
    n_chunks = max(1, min(n_cores, n_rows))
    # Slice with iloc rather than np.array_split, which relies on the deprecated
    # DataFrame.swapaxes when handed a DataFrame.
    bounds = np.linspace(0, n_rows, n_chunks + 1).astype(int)
    chunks = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]
    # The context manager tears the pool down even when `func` raises.
    with Pool(n_chunks) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

# Step 1: Generate the all-vs-all MR correlation table

In [None]:
# Create a dictionary of all the data files, such as gene expression data,
# that will be used to generate the Mutual Rank correlation networks.
# Keys name the datasets (and the output "mrs/<key>.mrs.jay"); values are file
# names inside the "data/" directory.
file_dict = {
    "name_1": "file_1.csv",
    "name_2": "file_2.csv",
}

# The compressed MR tables are written here; create the directory if missing.
os.makedirs("mrs", exist_ok=True)

for key, val in file_dict.items():
    print(key)
    # Comma separated when the file name contains ".csv", tab separated otherwise
    if ".csv" in val:
        sep=","
    else:
        sep="\t"
    df = pd.read_csv("data/"+val, index_col=0, sep=sep)
    
    # For the fvert index (C_123_full_name) keep only the compound ID
    # ClusterONE doesn't work well with complex names
    df.index = ["C_"+str(ix.split("_")[1]) for ix in df.index.tolist()]
    df = df.loc[~(df==0).all(axis=1)] # remove all-zero rows
    #df = np.log2(df+1)
    
    # Generate the complete coexpression matrix (rows of df are correlated against each other)
    df = NaNCorrMp.calculate(df.T, n_jobs=32) 
    
    # Calculate the Mutual Rank table: MR = sqrt(column-wise rank) * sqrt(row-wise rank),
    # with rank 1 given to the highest correlation
    df = np.sqrt(df.rank(ascending=False)) * np.sqrt(df.rank(axis=1, ascending=False))
    
    # Round and convert MR values to int8 to get better compression on results
    df = df.round(decimals=0)
    # 0 marks a discarded pair: MR above 250, or NaN (an undefined correlation;
    # NaN fails the `<= 250` test). Without this the integer cast below raises
    # on any NaN in the table.
    df = df.where(df <= 250, 0)
    # Shift 0..250 to -125..125 so every value fits in int8
    df = df - 125
    df = df.astype("int8") 
    dt.Frame(df).to_jay("mrs/"+key+".mrs.jay")

# Step 2: Run the ClusterONE analysis on the MR correlation table
As described in the Wisecaver, 2017 paper. Use different e-values to increase the MR value threshold.

In [None]:
# Exponential decay constants to try; larger values keep edges with higher MR.
e_vals = [5,10,15,20,25,30,35,40,45,50,60,70,80,90,100]

# Build one network per (dataset, decay constant) combination.
for key in file_dict:
    for e_val in e_vals:
        mr_network(key, e_val)