In [None]:
import warnings

import datatable as dt
import networkx as nx
import numpy as np
import pandas as pd
from nancorrmp.nancorrmp import NaNCorrMp

warnings.filterwarnings('ignore')

# Mutual Rank (MR) Network with Genes of Interest (GOIs)

In [None]:
# Load the expression table; the first CSV column is used as the row index.
# Note: the commented-out to_csv / fread lines in the coexpression cell can be used to
# save or reload the intermediate matrix. Computing a 40k x 40k coexpression matrix
# takes about an hour with 32 threads.
df = pd.read_csv("expression_data.csv", index_col=0)

# Drop rows that are zero in every column (a row is kept if at least one entry is not 0)
has_signal = df.ne(0).any(axis=1)
df = df.loc[has_signal]

# Optional log2(x + 1) transform; comment this line out to work on the raw values
df = np.log2(df + 1)

# Preview the top-left 5 x 5 corner
df.head().iloc[:, :5]

In [None]:
# Coexpression step: generate the complete coexpression matrix from the transposed table.
# This is the slow step of the notebook; uncomment the to_csv line to save the result,
# or the fread lines further down to reload a previously saved matrix.
analysis_name = "experiment_name"
corr = NaNCorrMp.calculate(df.T)
#dt.Frame(corr).to_csv(analysis_name+'.corr.csv')

# Mutual rank (MR) step: combine the descending rank of each correlation taken down its
# column (axis=0) with the descending rank taken along its row (axis=1).
#corr = dt.fread(analysis_name+'.corr.csv').to_pandas()
#corr.index = corr.columns
mr = np.sqrt(corr.rank(axis=0, ascending=False)).mul(
    np.sqrt(corr.rank(axis=1, ascending=False))
)

In [None]:
# Build one MR network per gene-of-interest list and per MR threshold, and write each
# network to a tab-separated edge list.
# keep_dict maps a label (used in the output file name) to a CSV file that has a
# "GeneID" column. You can loop over the dictionary or list just one file; the
# dictionary is kept for convenience.
keep_dict = {"file1": "filter_set1.csv",
             "file2": "filter_set2.csv"}

for keep_name, keep_path in keep_dict.items():
    # Keep only the listed genes that actually exist in the MR table
    listed_genes = pd.read_csv(keep_path)["GeneID"].unique()
    keep_set = mr.index.intersection(listed_genes)

    # Subset once per gene list instead of copying the full MR matrix for every threshold.
    # Label-based .loc selection returns a new frame, so `mr` itself is never modified.
    mr_sub = mr.loc[keep_set, keep_set]

    for mr_threshold in [10, 100]:
        # Zero means "no edge". DataFrame.where keeps entries where the condition is True,
        # so values above the threshold and missing values (NaN <= x is False) are zeroed.
        # A NaN left in the matrix would otherwise become an edge, because every non-zero
        # entry of the adjacency matrix is turned into one.
        adj_values = mr_sub.where(mr_sub <= mr_threshold, 0).to_numpy(copy=True)

        # Clear the diagonal by position rather than by testing for MR == 1.0.
        # On a few occasions barely expressed genes have pcc=1 with multiple genes, which
        # makes their diagonal MR differ from 1 (arbitrary ranking value); a value test
        # misses those and lets them through as self-loops. Rows and columns share the same
        # labels in the same order, so the positional diagonal is exactly the self pairs.
        np.fill_diagonal(adj_values, 0)
        mr_adj = pd.DataFrame(adj_values, index=mr_sub.index, columns=mr_sub.columns)

        # Convert MR table, in the form of adj matrix, to networkx Graph
        G = nx.from_pandas_adjacency(mr_adj)

        # Store the MR value of every edge under the "mr" attribute.
        # .at is the scalar accessor and avoids the overhead of .loc inside the loop;
        # the list is materialized first so G is not modified while its edges are iterated.
        weighted_edges = [(u, v, mr_adj.at[u, v]) for u, v in G.edges]
        G.add_weighted_edges_from(weighted_edges, weight="mr")

        # No self-loop removal is needed: the diagonal is zero, so none are created.

        # Write network to file
        nx.write_edgelist(
            G,
            f"{analysis_name}_{keep_name}_mr{mr_threshold}.edgelist",
            comments='#',
            delimiter='\t',
            data=["mr"],
            encoding='utf-8',
        )