In [None]:
import itertools
import os
import subprocess
import warnings

import datatable as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nancorrmp.nancorrmp import NaNCorrMp
from scipy.sparse.csgraph import connected_components
from scipy.stats import pearsonr
from sklearn import preprocessing

warnings.filterwarnings('ignore')

# MR Network
The code provided here is meant to facilitate Mutual Rank-based coexpression network analysis. \
For more information about Mutual Rank and how it can be used please check the Wisecaver manuscript and our Mutual Rank shiny app:
* A Global Coexpression Network Approach for Connecting Genes to Specialized Metabolic Pathways in Plants (https://doi.org/10.1105/tpc.17.00009)
* MutRank: an R shiny web-application for exploratory targeted mutual rank-based coexpression analyses integrated with user-provided supporting information (https://doi.org/10.7717/peerj.10264)
* MutRank on GitHub: https://github.com/eporetsky/MutRank

This code uses useful python packages to speed up the analysis, primarily: **nancorrmp** and **datatable**. \
It has not been thoroughly tested yet but I'll be happy to address any concerns by email: **eporetsky at ucsd.edu**. \
This code has been generated as part of my work in the **Huffaker lab** at University of California, San Diego.

In [None]:
# Load the input table and log2-transform everything from the third column on;
# the first two columns are left untouched.
# NOTE(review): zeros or negative values become -inf/NaN here, and warnings are
# silenced at import time, so this would go unnoticed — confirm the input is
# strictly positive.
value_cols = slice(2, None)
df = pd.read_csv("input.csv")
df.iloc[:, value_cols] = np.log2(df.iloc[:, value_cols])
df.shape

In [None]:
# Remove the non-expression columns here; only expression data may remain.
# NOTE: `df` is overwritten, so running this cell a second time raises a
# KeyError because "id" and "rep" are already gone — re-run the loading cell first.
metadata_columns = ["id", "rep"]
df = df.drop(columns=metadata_columns)

# Build the complete coexpression matrix. This step needs a lot of memory.
corr = NaNCorrMp.calculate(df)

In [None]:
# Faster save output. Produces a large file
# name = "analysis_name"
# dt.Frame(corr).to_csv(name+'_corr.csv')

# Complete Mutual Rank to ClusterONE network analysis

In [None]:
name = "analysis_name"

# Step 1 can be skipped only while `corr` still holds the raw correlation
# matrix: every later step overwrites `corr` in place to save memory, so after
# one run of this cell `corr` is no longer a correlation matrix.
print("1. Starting analysis on correlation matrix named: "+name)
corr = dt.fread(name+'_corr.csv').to_pandas()
# The CSV carries no row labels; reuse the column names (requires a square matrix)
corr.index = corr.columns

print("2. Calculating the MR matrix")
# MR = sqrt(descending rank within the column) * sqrt(descending rank within the row)
corr = np.sqrt(corr.rank(ascending=False)) * np.sqrt(corr.rank(axis=1, ascending=False))

# Convert the MR matrix to an exponential decay matrix
e_val = 50
print("3. Converting MR matrix using exponential decay (ED) value of: "+str(e_val))
corr = np.exp(-(corr-1.1)/e_val)
# Weights below 0.01 are treated as "no edge"
corr = corr.mask(corr < 0.01, 0)
# NOTE(review): this looks intended to drop genes whose only non-zero entry is
# the diagonal, but with the 1.1 offset a diagonal MR of 1 maps to
# exp(0.1/e_val), not 1, so the `!= 1` test presumably never removes anything.
# Left unchanged because such genes contribute no edges below either way — confirm.
corr = corr.loc[(corr.sum(axis=1) != 1), (corr.sum(axis=0) != 1)]

print("4. Processing ED matrix in order produce an edge list file")
# Changes lower triangle and diagonal to NaN so every pair is written once
corr = corr.mask(np.arange(len(corr))[:,None] >= np.arange(len(corr)))

temp_csv = "temp_edge_list.csv"
edge_list_path = name+"_"+str(e_val)+"_edgelist.tsv"
cluster_path = name+'_'+str(e_val)+'_clusterONE.csv'

# Make sure file doesn't exist because data appends to it
if os.path.exists(temp_csv): os.remove(temp_csv)
try:
    # Write the edge list in chunks of 1000 rows to bound memory use.
    # ClusterONE doesn't work with header. iloc clamps the final slice, so no
    # special handling of the last chunk (or of an empty matrix) is needed.
    for start in range(0, len(corr), 1000):
        chunk_edges = corr.iloc[start:start+1000].unstack().reset_index()
        # `> 0` is False for NaN, so this also drops the masked lower triangle
        chunk_edges = chunk_edges[chunk_edges[0] > 0]
        dt.Frame(chunk_edges).to_csv(temp_csv, append=True, header=False)
    # Since ClusterONE uses only tsv format for the edge list, and dt only saves csv need to convert.
    # check=True makes a missing/failed csvformat raise instead of silently
    # leaving an empty edge list for the next step.
    with open(edge_list_path, "w") as tsv_file:
        subprocess.run(["csvformat", "-T", temp_csv], stdout=tsv_file, check=True)
finally:
    # Once converted to tsv (or on failure), the temp csv edge list can be deleted
    if os.path.exists(temp_csv): os.remove(temp_csv)

print("5. Using ClusterONE to generate a list of clusters from the edge list")
# Argument list instead of a shell string: no quoting issues with `name`, and
# a failing Java/ClusterONE run raises CalledProcessError.
with open(cluster_path, "w") as cluster_file:
    subprocess.run(
        ["java", "-jar", "cluster_one-1.0.jar", edge_list_path, "-f", "edge_list", "-F", "csv"],
        stdout=cluster_file,
        check=True,
    )

print("6. Starting processing of ClusterONE clusters")
dfone = pd.read_csv(name+'_'+str(e_val)+'_clusterONE.csv')
# Keep only clusters that ClusterONE reports with a P-value below 0.1
dfone = dfone[dfone["P-value"]<0.1]
print("Number of clusters after p.value filtering:", dfone.shape[0])

# Expand every cluster into all pairwise member combinations, keeping each
# unordered pair once (clusters may overlap).
# The rows are collected in a list and the DataFrame is built once: appending
# with `output_df.loc[len(output_df)]` after de-duplicating inside the loop
# could overwrite an existing row, because dropped duplicates leave gaps in the
# index and len(output_df) can then equal a label that is still in use.
seen_pairs = set()
edges = []
for members in dfone["Members"].to_list():
    for node1, node2 in itertools.combinations(members.split(" "), 2):
        pair = frozenset((node1, node2))
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        edges.append((node1, node2, 1))
output_df = pd.DataFrame(edges, columns=["node1", "node2", "attribute"])

print("Number of unique nodes in the final network: " + str(len(set(output_df["node1"]).union(set(output_df["node2"])))))
output_df.to_csv(name+'_'+str(e_val)+'_cytoscape.csv')
print("7. MR based coexpression network processing is complete ")