In [3]:
# %load generation.py
# generation
import numpy as np
import scipy.sparse as sparse
import sys

from hyperg import HyperG

def gen_hg(X,with_feature,tad,hyperedge,hyperedge_flat):
    """
    :param X: numpy array, shape = (n_samples, n_features)
    :param with_feature: bool, optional(default=False)
    :return: instance of HyperG
    """
    
    n_nodes = tad.shape[0]
    n_edges = hyperedge.shape[0]

    node_idx = hyperedge_flat[:,0]
    edge_idx = hyperedge_flat[:,1]

    values = np.ones(node_idx.shape[0])

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))
    w = np.ones(n_edges)

    if with_feature:
        return HyperG(H, w = w, X=X)

    return HyperG(H,w = w)

In [13]:
# %load learning.py
from sklearn.cluster import k_means
from sklearn.cluster import SpectralClustering
from sklearn.utils import check_symmetric, check_random_state
from scipy.linalg import eigh

from hyperg import HyperG

feature = np.loadtxt('epi/output/GM12878/chr19_TAD.txt')
print(feature.shape)

def spectral_hg_partitioning(hg, n_clusters, assign_labels='kmeans', n_components=None, random_state=None, n_init=10):
    """
    :param hg: instance of HyperG
    :param n_clusters: int,
    :param assign_labels: str, {'kmeans', 'discretize'}, default: 'kmeans'
    :param n_components: int, number of eigen vectors to use for the spectral embedding
    :param random_state: int or None (default)
    :param n_init: int, number of time the k-means algorithm will be run
    with different centroid seeds.
    :return: numpy array, shape = (n_samples,), labels of each point
    """

    assert isinstance(hg, HyperG)
    assert n_clusters <= hg.num_nodes()

    random_state = check_random_state(random_state)

    if n_components is None:
        n_components = n_clusters

    L = hg.laplacian().toarray()
    L = check_symmetric(L)

    eigenval, eigenvec = eigh(L)
    embeddings = eigenvec[:, :n_components]
    embeddings = np.concatenate((embeddings, feature), axis=1)
    print(embeddings.shape)
    if assign_labels == 'kmeans':
        _, labels, _ = k_means(embeddings, n_clusters = n_clusters, random_state=random_state,
                               n_init=n_init)
    else:
        labels = SpectralClustering(n_clusters, random_state=random_state).fit(embeddings).labels_

    return labels

(111, 3)


In [14]:
from scipy.stats import pearsonr,spearmanr

TAD = np.loadtxt("output/GM12878/50kb_TAD/chr19.txt")
hyperedge = np.loadtxt('output/GM12878/hypergraph/hyperedge_all.txt')
hyperedge_flat = np.loadtxt('output/GM12878/hypergraph/hyperedge_flat.txt')

hg = gen_hg(None,False,TAD,hyperedge,hyperedge_flat)

clusters = spectral_hg_partitioning(hg, n_clusters = 3, assign_labels='kmeans', n_components=None, random_state=None, n_init=10)
print(clusters)

(111, 6)
[2 0 0 0 0 0 0 0 1 0 2 2 0 2 2 1 0 0 0 0 1 1 2 1 0 1 0 0 0 0 1 1 0 2 0 0 2
 1 2 0 0 0 0 0 0 2 0 0 2 1 2 2 2 2 1 2 2 2 1 1 2 1 2 2 1 0 1 0 0 1 1 1 2 0
 1 2 1 0 0 2 1 2 0 1 0 0 1 1 1 1 0 0 2 1 0 2 1 1 0 0 2 1 0 0 2 1 1 2 1 1 2]


  dv2 = np.power(self._DV.data.reshape(-1), -0.5)


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
A = np.loadtxt('output/GM12878/AB/A_KR.txt')
B = np.loadtxt('output/GM12878/AB/B_KR.txt')
# TAD = np.loadtxt('/home/zsc/study/biye/output/chr19_50kb/optics_new.txt')

def getClass_AB():
    res = np.ones(len(TAD))
    for i in range(len(TAD)):
        start = TAD[i][0]
        end = TAD[i][1]
        for j in range(len(A)):
            s = A[j][0]
            e = A[j][1]
            if not (start>e or end<s):
                res[i] = 0
    return res
cos_sim = cosine_similarity(np.vstack((clusters, getClass_AB())))[0,1]
print(clusters,getClass_AB())
pccs = pearsonr(clusters, getClass_AB())
print(cos_sim)

[2 0 0 0 0 0 0 0 1 0 2 2 0 2 2 1 0 0 0 0 1 1 2 1 0 1 0 0 0 0 1 1 0 2 0 0 2
 1 2 0 0 0 0 0 0 2 0 0 2 1 2 2 2 2 1 2 2 2 1 1 2 1 2 2 1 0 1 0 0 1 1 1 2 0
 1 2 1 0 0 2 1 2 0 1 0 0 1 1 1 1 0 0 2 1 0 2 1 1 0 0 2 1 0 0 2 1 1 2 1 1 2] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
0.15861031714362883


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
A1 = np.loadtxt('output/IMR90/sub-compartment/A1.txt')
B1 = np.loadtxt('output/IMR90/sub-compartment/B1.txt')
A2 = np.loadtxt('output/IMR90/sub-compartment/A2.txt')
B2 = np.loadtxt('output/IMR90/sub-compartment/B2.txt')

length=[0]*len(TAD)
def getlen(fir,sec):
    dist = 0
    start = fir[0]
    end = fir[1]
    s = sec[0]
    e = sec[1]
    if end>s and start<s:
        dist = end-s+1
    if end>e and start<e:
        dist = end-e+1
    return dist
def getClass_sub():
    res = [-1]*len(TAD)
    for i in range(len(TAD)):
        start = TAD[i][0]
        end = TAD[i][1]

        for j in range(len(A1)):
            s = A1[j][0]
            e = A1[j][1]
            if not(start>=e or end<=s):
                res[i] = 0
                if start>s and end<e:
                    length[i] = end-start+1
                else:
                    length[i] = getlen(TAD[i],A1[j])

        for j in range(len(A2)):
            s = A2[j][0]
            e = A2[j][1]
            if not(start>=e or end<=s):
                if start>=s and end<=e:
                        res[i] = 1
                        length[i] = end-start+1
                else:
                    if res[i] ==-1:
                        res[i] = 1
                        length[i]=getlen(TAD[i],A2[j])
                    else:
                        if getlen(TAD[i],B1[j])>length[i]:
                            length[i] = getlen(TAD[i],A2[j])
                            res[i] = 1
        for j in range(len(B1)):
            s = B1[j][0]
            e = B1[j][1]
            if not(start>=e or end<=s):
                if start>=s and end<=e:
                        res[i] = 2
                        length[i] = end-start+1
                else:
                    if res[i] ==-1:
                        res[i] = 2
                        length[i]=getlen(TAD[i],B1[j])
                    else:
                        if getlen(TAD[i],B1[j])>length[i]:
                            length[i] = getlen(TAD[i],B1[j])
                            res[i] = 2
        for j in range(len(B2)):
            s = B2[j][0]
            e = B2[j][1]
            if not(start>=e or end<=s):
                if start>=s and end<=e:
                        res[i] = 3
                        length[i] = end-start+1
                else:
                    if res[i] ==-1:
                        res[i] = 3
                        length[i]=getlen(TAD[i],B2[j])
                    else:
                        if getlen(TAD[i],B1[j])>length[i]:
                            length[i] = getlen(TAD[i],B2[j])
                            res[i] = 3
    return res
pccs = pearsonr(clusters, getClass_sub())
cos_sim = cosine_similarity(np.vstack((clusters, getClass_sub())))[0,1]
print(cos_sim)

0.5791062810152757


In [6]:
one_outpath = 'output/IMR90/hypergraph/four/first.txt'
two_outpath = 'output/IMR90/hypergraph/four/second.txt'
three_outpath = 'output/IMR90/hypergraph/four/three.txt'
four_outpath = 'output/IMR90/hypergraph/four/four.txt'
with open(one_outpath, "w") as out:
    for i in range(len(clusters)):
        if clusters[i] == 0:
            out.write("\t".join((str(int(TAD[i,0])),str(int(TAD[i][1])))) + "\n")
with open(two_outpath, "w") as out:
    for i in range(len(clusters)):
        if clusters[i] == 1:
            out.write("\t".join((str(int(TAD[i,0])),str(int(TAD[i][1])))) + "\n")
with open(three_outpath, "w") as out:
    for i in range(len(clusters)):
        if clusters[i] == 2:
            out.write("\t".join((str(int(TAD[i,0])),str(int(TAD[i][1])))) + "\n")
with open(four_outpath, "w") as out:
    for i in range(len(clusters)):
        if clusters[i] == 3:
            out.write("\t".join((str(int(TAD[i,0])),str(int(TAD[i][1])))) + "\n")

In [7]:
from sklearn.metrics import jaccard_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats import pearsonr,spearmanr

first_class = []
second_class = []
third_class=[]
fourth_class=[]
def getCluster(i,j):
    for i in range(len(clusters)):
        if clusters[i]==i:
            first_class.append(i)
        if clusters[i]==j:
            second_class.append(i)
    return first_class,second_class
res = getClass_AB()
pccs,pvalue = pearsonr(clusters, res)
def getScore(i,j):
    first_class,second_class = getCluster(i,j)
    res1 = list(res[first_class])+list(res[second_class])
    classes = list(clusters[first_class])+list(clusters[second_class])

    jaccard_sim = jaccard_score(classes, res1,average='micro')
    cos_sim = cosine_similarity(np.vstack((classes, res1)))[0,1]
    euclidean_dist = euclidean_distances(np.vstack((classes, res1)))[0,1]
    manhattan_dist = manhattan_distances(np.vstack((classes, res1)))[0,1]
    per,pvalue = pearsonr(classes, res1)
    return jaccard_sim,cos_sim,euclidean_dist,manhattan_dist,per

a = getScore(0,1)
b = getScore(0,2)
c = getScore(1,2)
jaccard_sim = max(a[0],b[0],c[0])
cos_sim = max(a[1],b[1],c[1])
euclidean_dist = min(a[2],b[2],c[2])
manhattan_dist = min(a[3],b[3],c[3])
per = c[4]
print("Jaccard 相似系数: %0.3f"%(jaccard_sim))
print("余弦相似度:", cos_sim)
print("欧几里得距离:", euclidean_dist)
print("曼哈顿距离:", manhattan_dist)
print("pearson:",per)


Jaccard 相似系数: 0.027
余弦相似度: 0.0
欧几里得距离: 5.385164807134504
曼哈顿距离: 29.0
pearson: nan




In [8]:
from sklearn.metrics import jaccard_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats import pearsonr,spearmanr

res = getClass_sub()
pccs,pvalue = pearsonr(clusters, res)

jaccard_sim = jaccard_score(clusters, res,average='micro')
cos_sim = cosine_similarity(np.vstack((clusters, res)))[0,1]
euclidean_dist = euclidean_distances(np.vstack((clusters, res)))[0,1]
manhattan_dist = manhattan_distances(np.vstack((clusters, res)))[0,1]
per,pvalue = pearsonr(clusters, res)
print("Jaccard 相似系数: %0.3f"%(jaccard_sim))
print("余弦相似度:", cos_sim)
print("欧几里得距离:", euclidean_dist)
print("曼哈顿距离:", manhattan_dist)
print("pearson:",per)


Jaccard 相似系数: 0.055
余弦相似度: 0.5791062810152757
欧几里得距离: 16.1245154965971
曼哈顿距离: 156.0
pearson: 0.1423555839674722
