# MicroRNA Disease Network (MRDN) Construction

In [1]:
import re
import math
import numpy as np
import pandas as pd

## Curation of the human miRNA-disease association dataset

1. Select associations with tissue expression information

In [2]:
# Load the full HMDD association table and keep only the records whose
# category is exactly tissue_expression_up or tissue_expression_down.
hmdd_file = "./data/HMDD_alldata.txt"
all_data = pd.read_table(hmdd_file)
# na=False: a record with a missing category is excluded instead of leaving
# NaN in the boolean mask, which would make the row selection below raise.
is_tissue_record = all_data["category"].str.match("^tissue_expression_(up|down)$", na=False)
# .copy() so that columns added to tissue_data later do not write into a
# slice of all_data.
tissue_data = all_data[is_tissue_record].copy()

2. Rename miRNAs and code regulation information

In [3]:
def adjustMirName(name):
    '''
        Return pri-miRNA name: the first three "-"-separated fields of
        `name`, joined back with "-" (e.g. "hsa-mir-21-5p" -> "hsa-mir-21").
        Names with three or fewer fields are returned unchanged.
    '''
    fields = name.split("-")
    leading_fields = fields[:3]
    return "-".join(leading_fields)

def adjustRegName(name):
    '''
        Code the text after the last "_" of `name`:
        1 for up regulation and -1 for down regulation.
        Any other suffix raises KeyError.
    '''
    direction = name.rpartition("_")[2]
    return {"up": 1, "down": -1}[direction]

# Add the derived columns to tissue_data: (new column, source column, converter).
derived_columns = (
    ("pri-miRNA", "mir", adjustMirName),
    ("regulation", "category", adjustRegName),
)
for new_column, source_column, converter in derived_columns:
    tissue_data[new_column] = tissue_data[source_column].map(converter)
# Keep only the three columns used by the rest of the notebook.
tissue_data_reg = tissue_data[["pri-miRNA", "disease", "regulation"]]

In [4]:
# Save the (pri-miRNA, disease, regulation) records as a tab-separated file
# without the row index.
tissue_data_reg.to_csv(
    "./results/HMDD_tissue.txt",
    sep="\t",
    index=False,
)

## Calculation of miRNA-based disease similarity

1. Calculate miRNA-based disease vector  

In [6]:
# Net regulation per (pri-miRNA, disease) pair: the regulation codes of all
# records of a pair are summed.
md_pairs = (
    tissue_data_reg
    .groupby(["pri-miRNA", "disease"])
    .sum()
    .reset_index()
)
# Pairs whose summed regulation is exactly 0 are dropped.
nonzero_mask = md_pairs["regulation"] != 0
md_pairs_nonzero = md_pairs[nonzero_mask]
# Disease x pri-miRNA table; pairs without a record are NaN at this point.
weight_matrix = md_pairs_nonzero.pivot_table(
    index=["disease"],
    columns=["pri-miRNA"],
    values=["regulation"],
)
# dsw[column]: fraction of disease rows that have a value in that column.
n_diseases = float(len(weight_matrix.index))
dsw = {
    mir: sum(weight_matrix[mir].notna()) / n_diseases
    for mir in weight_matrix.columns
}
weight_matrix = weight_matrix.fillna(0).astype(float)
# Scale every column by log(1 / dsw[column]).
for m in weight_matrix.columns:
    scale = np.log(1/dsw[m])
    weight_matrix[m] = weight_matrix[m].map(lambda x: x*scale)

In [14]:
# Drop the outer column level (left by pivot_table with a list of values) and
# clear the remaining axis name, then save the matrix.
flat_columns = weight_matrix.columns.droplevel(0)
flat_columns.name = None
weight_matrix.columns = flat_columns
weight_matrix.to_csv(
    "./results/HMDD_weight_matrix.txt",
    sep="\t",
)

2. Calculate disease similarity

In [8]:
def tonimoto(vi, vj):
    '''
        Return Tanimoto coefficient of two numeric vectors:
            <vi, vj> / (|vi|^2 + |vj|^2 - <vi, vj>)
        Both inputs are converted to float arrays first.
    '''
    a = np.array(vi).astype(float)
    b = np.array(vj).astype(float)
    dot_ab = sum(a * b)
    squared_norms = sum(a**2) + sum(b**2)
    return dot_ab / (squared_norms - dot_ab)

# Pairwise Tanimoto coefficient between the weight vectors (rows of
# weight_matrix) of every pair of diseases.
diseases = weight_matrix.index.values
similarity_rows = []
for y in diseases:
    row_values = []
    for x in diseases:
        row_values.append(tonimoto(weight_matrix.loc[x, :].values,
                                   weight_matrix.loc[y, :].values))
    similarity_rows.append(row_values)
similarity_matrix = pd.DataFrame(similarity_rows, index=diseases, columns=diseases)

In [18]:
# Name the row index "disease" so the saved file has a header for its first
# column, then write the matrix as a tab-separated file.
similarity_matrix.index.name = "disease"
similarity_matrix.to_csv(
    "./results/HMDD_similarity_matrix.txt",
    sep="\t",
)

## Construction of miRNA-based disease network (MRDN)

1. Select nodes and edges

In [10]:
# Zero the self-similarity entries so that no disease is paired with itself
# by the thresholding done on this table.
for d in diseases:
    similarity_matrix.loc[d, d] = 0.0
# Long format: one row per ordered (Disease1, Disease2) pair.
# The index is given an explicit name before reset_index(). The original code
# passed id_vars="index", which only exists while the index is unnamed; once
# the index has been named "disease", melt() raises KeyError.
similarity = (
    similarity_matrix
    .rename_axis(index="Disease1")
    .reset_index()
    .melt(id_vars="Disease1", var_name="Disease2", value_name="Similarity")
)

In [21]:
def drt(x):
    '''
        Return the sign label of x: "+" when x > 0, "-" when x < 0,
        and np.nan otherwise (x is 0 or NaN).
    '''
    if x > 0:
        return "+"
    return "-" if x < 0 else np.nan

# Magnitude and sign label ("+", "-" or NaN) of every pairwise similarity.
signed_similarity = similarity["Similarity"]
similarity["Similarity_abs"] = signed_similarity.abs()
similarity["Direction"] = signed_similarity.apply(drt)

In [22]:
# An edge is kept when its absolute similarity exceeds 0.05; requiring
# Disease1 < Disease2 keeps a single row per unordered pair.
is_strong = similarity["Similarity_abs"] > 0.05
is_ordered = similarity["Disease1"] < similarity["Disease2"]
mrdn_edges = similarity[is_strong & is_ordered]
# Nodes are the distinct disease names that appear at either end of an edge.
edge_endpoints = mrdn_edges[["Disease1", "Disease2"]].values.ravel()
mrdn_nodes = pd.DataFrame({"Nodes": np.unique(edge_endpoints)})

In [31]:
# Reload the saved similarity matrix and rebuild the node table from all of
# its column labels (this replaces any previous mrdn_nodes), then save it.
similarity_matrix = pd.read_csv(
    "./results/HMDD_similarity_matrix.txt",
    sep="\t",
    index_col=0,
)
all_disease_names = np.unique(similarity_matrix.columns.values)
mrdn_nodes = pd.DataFrame({"Nodes": all_disease_names})
mrdn_nodes.to_csv("./results/MRDN_nodes.txt", sep="\t", index=False)

In [25]:
# Save the edge and node tables as tab-separated files without the row index.
for table, path in (
    (mrdn_edges, "./results/MRDN_edges.txt"),
    (mrdn_nodes, "./results/MRDN_nodes.txt"),
):
    table.to_csv(path, sep="\t", index=False)

# Network Analysis

Create a network using networkx.

For figures and tables, see Figures.ipynb.

In [34]:
import networkx as nx

# Read the edge list and the node table from disk.
mrdn_edges = pd.read_csv("./results/MRDN_edges.txt", sep="\t")
# Disease category was manually curated from MeSH.
mrdn_nodes = pd.read_csv("./results/MRDN_nodes_category.txt", sep="\t")

# Edge rows are read by position: fields 0 and 1 are the endpoints,
# field 3 becomes the "weight" and field 4 the "direction" attribute.
edges = []
for row in mrdn_edges.values:
    attributes = {"weight": row[3], "direction": row[4]}
    edges.append((row[0], row[1], attributes))

# Node rows: field 0 is the node name, field 1 is stored as its "Class".
nodes = []
for row in mrdn_nodes.values:
    nodes.append((row[0], {"Class": row[1]}))

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

## Curation of the human miRNA-disease association dataset

In [4]:
# Reload the tab-separated tissue-expression records.
exp_data = pd.read_table("./results/HMDD_tissue.txt")

In [12]:
# Summary counts: number of records, and number of (pri-miRNA, disease) pairs
# whose summed regulation is non-zero, positive, and negative.
n_record = len(exp_data)
md_pair = (
    exp_data
    .groupby(["pri-miRNA", "disease"])
    .sum()
    .reset_index()
)
net_regulation = md_pair["regulation"]
n_unique = int((net_regulation != 0).sum())
n_up = int((net_regulation > 0).sum())
n_down = int((net_regulation < 0).sum())
print(n_record, n_unique, n_up, n_down)

5103 3357 1788 1569


## Construction of miRNA-based disease network (MRDN)

In [66]:
# Split the edge table by the sign in its last field; each kept edge is
# (endpoint, endpoint, weight) with the weight read from field 3.
positive_edges = []
negative_edges = []
for row in mrdn_edges.values:
    sign = row[-1]
    if sign == "+":
        positive_edges.append((row[0], row[1], float(row[3])))
    elif sign == "-":
        negative_edges.append((row[0], row[1], float(row[3])))

# Degree of every node in the full network.
all_degree = dict(G.degree())

# Degree within the positive-edge subgraph; nodes of G that have no positive
# edge get degree 0.
subG = nx.Graph()
subG.add_weighted_edges_from(positive_edges, direction="+")
pos_degree = dict(subG.degree())
for node in all_degree:
    pos_degree.setdefault(node, 0)

# Same for the negative-edge subgraph.
subG = nx.Graph()
subG.add_weighted_edges_from(negative_edges, direction="-")
neg_degree = dict(subG.degree())
for node in all_degree:
    neg_degree.setdefault(node, 0)

# Clustering coefficient of every node in the full network.
cc = nx.clustering(G)

print(len(positive_edges), len(negative_edges))

1521 919


In [67]:
# Attach the per-node network measures to the node/category table.
# A node name missing from any of the lookups raises KeyError.
node_property = pd.read_csv("./results/MRDN_nodes_category.txt", sep="\t")
node_names = node_property["Nodes"]
node_measures = (
    ("Degree", all_degree),
    ("Positive_Edge", pos_degree),
    ("Negative_Edge", neg_degree),
    ("Cluster_Coefficient", cc),
)
for column, lookup in node_measures:
    node_property[column] = [lookup[name] for name in node_names]

In [68]:
# Save the node property table as a tab-separated file without the row index.
node_property.to_csv(
    "./results/MRDN_nodes_property.txt",
    sep="\t",
    index=False,
)

## Mining disease patterns in MRDN

In [109]:
# Shortest-path distance for every unordered pair of diseases that are
# connected in G, labelled "intra" when both share a Category and "inter"
# otherwise. Unreachable pairs are left out.
disease_info = pd.read_csv("./results/MRDN_nodes_property.txt", sep="\t", index_col=0)
# node -> {reachable node -> path length}
asp = dict(nx.all_pairs_shortest_path_length(G))
resd = {}
for d1 in disease_info.index:
    for d2 in disease_info.index:
        if d2 == d1:
            continue
        # Sorted key so that (a, b) and (b, a) are recorded only once.
        key = tuple(sorted([d1, d2]))
        if key in resd:
            continue
        if d2 not in asp[d1]:
            continue
        distance = asp[d1][d2]
        same_category = disease_info.loc[d1, "Category"] == disease_info.loc[d2, "Category"]
        resd[key] = {
            "Distance": distance,
            "Type": "intra" if same_category else "inter",
        }
header = ["From", "To", "Distance", "Type"]
res = [header]
for pair, info in resd.items():
    res.append([pair[0], pair[1], info["Distance"], info["Type"]])
# One tab-separated line per row, header first.
with open("./results/MRDN_distance.txt", 'w') as fo:
    for row in res:
        fo.write("\t".join(map(str, row)) + "\n")

In [90]:
# Build a long table of all ordered pairs of distinct diseases with their
# similarity and disease classes.
similarity_matrix = pd.read_csv("./results/HMDD_similarity_matrix.txt", sep="\t", index_col=0)
# Lookup from the first to the second tab-separated field of every line of the
# node category file, skipping its header line.
with open("./results/MRDN_nodes_category.txt", "r") as fi:
    catedic = [row.strip().split("\t") for row in fi.readlines()[1:]]
    catedic = {row[0]:row[1] for row in catedic}
similarity = similarity_matrix.reset_index().melt(id_vars="disease")
similarity.columns = ["d1", "d2", "similarity"]
# .copy(): new columns are added to the filtered frame below. Without the
# copy, pandas treats those assignments as writes into a slice of the
# unfiltered frame and emits SettingWithCopyWarning.
similarity = similarity[similarity["d1"] != similarity["d2"]].copy()
# A disease missing from catedic raises KeyError here.
similarity["d1_class"] = [catedic[i] for i in similarity["d1"]]
similarity["d2_class"] = [catedic[i] for i in similarity["d2"]]
similarity["same_class"] = similarity["d1_class"] == similarity["d2_class"]

In [95]:
# Save the disease-pair table as a tab-separated file without the row index.
similarity.to_csv(
    "./results/MRDN_diseases.txt",
    sep="\t",
    index=False,
)

Coherent loops

In [107]:
# Generating input file for mfinder
df = pd.read_csv("./results/MRDN_edges.txt", sep="\t")
nds = pd.read_csv("./results/MRDN_nodes_category.txt", sep="\t")
# Node name (first field of each node row) -> 1-based integer id in file order.
map2int = {row[0]: node_id for node_id, row in enumerate(nds.values, start=1)}
weightd = {"+": 1, "-": -1}
# Two integer edge lists with the same endpoints (second field of the edge
# row first): `output` carries the sign from the last field,
# `mfinder_input` carries a constant 1.
output = []
mfinder_input = []
for row in df.values:
    first_id = map2int[row[1]]
    second_id = map2int[row[0]]
    output.append([first_id, second_id, weightd[row[-1]]])
    mfinder_input.append([first_id, second_id, 1])
with open("./results/coherent/MRDN_IntNet.txt", "w") as fo:
    for row in output:
        fo.write("\t".join(map(str, row)) + "\n")
with open("./results/coherent/MRDN_mfinder_input.txt", "w") as fo:
    for row in mfinder_input:
        fo.write("\t".join(map(str, row)) + "\n")

In [96]:
def iscoherent(triplet, edged):
    '''
        Return True when the product of the three edge values of the
        triangle formed by `triplet` equals 1, False otherwise.
        `edged` maps a sorted node pair (tuple) to the value of that edge;
        a missing pair raises KeyError.
    '''
    pairs = (
        (triplet[0], triplet[1]),
        (triplet[1], triplet[2]),
        (triplet[2], triplet[0]),
    )
    keys = [tuple(sorted(pair)) for pair in pairs]
    product = 1
    for key in keys:
        product = product * edged[key]
    return True if product == 1 else False

In [None]:
# Count the number of coherent loops.
# NOTE(review): `readTable` is not defined or imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. TODO: restore the
# helper; presumably it returns a list of rows of string fields, confirm.
# NOTE(review): the mfinder input files are written to ./results/coherent/
# while this cell reads from data/. Confirm the intended location.
df = readTable("data/MRDN_IntNet.txt")
# Edge value lookup keyed by the sorted pair made of the first two fields of
# each row; the third field is converted to int.
edged = {tuple(sorted(row[:2])):int(row[2]) for row in df}
# presumably the MEMBERS file is produced by mfinder; verify
df = readTable("data/MRDN_mfinder_input_MEMBERS.txt")
# Drop the last row.
df = df[:-1]
# The total count is parsed from the text after " : " in the first field of
# the second row.
cnt = int(df[1][0].split(" : ")[1])
# Rows from the sixth onward are used as the node triplets.
motifs = df[5:]
coherent = [triplet for triplet in motifs if iscoherent(triplet, edged)]
# Ratio of coherent loops to the remaining loops.
# NOTE(review): divides by zero when cnt equals the number of coherent loops.
ratio = float(len(coherent))/(cnt-len(coherent))
print(cnt)
print(len(coherent))
print(ratio)