In [None]:
import numpy as np
import pandas as pd

import scipy.cluster.hierarchy as hcluster
import scipy.spatial.distance as scidist
from scipy.stats import spearmanr

import networkx as nx

from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Select matplotlib's non-interactive 'agg' backend for every figure below.
# NOTE(review): with this backend the heatmap cell further down may not render
# inline in the notebook -- confirm that file-only output is intended.
plt.switch_backend('agg')



In [None]:
# Load the input table; the 'Accession' column becomes the row index.
df = pd.read_csv('forCorrelation.csv', index_col='Accession')
df

In [None]:
# Pairwise correlation between the rows of `df`: DataFrame.corr() works
# column-wise, so the table is transposed first.
matrix = df.T.corr()
matrix

In [None]:
def correlation_cluster(corr, abs_corr=False):
    """Reorder a correlation matrix by average-linkage hierarchical clustering.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix. Only ``corr.values`` and ``corr.columns``
        are read; the input is not modified.
    abs_corr : bool, default False
        If True the clustering distance is ``1 - |r|``, otherwise ``1 - r``.

    Returns
    -------
    pandas.DataFrame
        The values of ``corr`` with rows and columns permuted into dendrogram
        leaf order, labelled on both axes with the reordered column labels.
        The linkage matrix, with leaf ids renumbered to the new positions, is
        attached as the plain instance attribute ``linkage``.
    """
    # checks=False: take the upper triangle as a condensed vector without
    # validating symmetry or a zero diagonal (a correlation diagonal is 1).
    condensed = scidist.squareform(corr.values, checks=False)
    condensed = 1 - np.abs(condensed) if abs_corr else 1 - condensed

    gene_clusters = hcluster.linkage(condensed, method="average")
    gene_leaves = hcluster.leaves_list(gene_clusters)

    clustered_genes = [corr.columns[i] for i in gene_leaves]
    rv = pd.DataFrame(corr.values[np.ix_(gene_leaves, gene_leaves)],
                      columns=clustered_genes, index=clustered_genes)

    # Transform the linkage matrix for the reordered columns: every original
    # leaf id is replaced by its position in the new ordering; ids of merged
    # clusters (>= number of leaves) are left unchanged.
    n_leaves = len(gene_leaves)
    new_indices = np.argsort(gene_leaves)
    new_linkage = gene_clusters.copy()
    for col in (0, 1):
        is_leaf = new_linkage[:, col] < n_leaves
        # Builtin int instead of np.int: that alias was removed in NumPy 1.24.
        old_ids = new_linkage[is_leaf, col].astype(int)
        new_linkage[is_leaf, col] = new_indices[old_ids]

    # Set the attribute directly on the instance so pandas neither warns about
    # nor interprets the assignment as creating/overwriting a column.
    object.__setattr__(rv, "linkage", new_linkage)

    return rv

In [None]:
import scipy.cluster.hierarchy as hcluster
import scipy.spatial.distance as scidist
# Reorder the correlation matrix by hierarchical clustering, using the signed
# (1 - r) distance rather than 1 - |r|.
reshaped_matrix = correlation_cluster(matrix, abs_corr=False)

In [None]:
# Average of the per-column means of the reordered correlation matrix
# (diagonal entries are part of each column mean).
reshaped_matrix.mean().mean()

In [None]:
import seaborn as sns
# Default size for figures created from here on.
plt.rcParams['figure.figsize'] = [10, 10]
# Heatmap of the clustered correlations: missing values are drawn as 0 and
# the lower end of the colour scale is pinned at -1.
sns.heatmap(
    data=reshaped_matrix.fillna(0),
    cmap='bwr',
    vmin=-1,
)

In [None]:
def boundaries2(cor, percentile, threshold, frozen=None):
    """Find cluster boundaries by walking the dendrogram attached to ``cor``.

    Two sibling subtrees are merged into one cluster when enough of the
    entries of ``cor.values`` linking them are strong; otherwise the boundary
    between them (and any boundaries found inside them) is kept.

    Leaf ids are used directly as row/column positions in ``cor.values``, so
    the linkage must be numbered in the same order as the matrix (as the
    relabelled linkage produced by ``correlation_cluster`` is).

    Parameters
    ----------
    cor : object with ``values``, ``shape`` and ``linkage`` attributes
        Clustered matrix plus its linkage matrix.
    percentile : float
        Minimum fraction of the cross-block entries that must satisfy
        ``|value| >= threshold`` for the two subtrees to be merged.
    threshold : float
        Absolute-value cut-off applied to each cross-block entry.
    frozen : sequence of bool, optional
        Per-leaf flags. A subtree containing a frozen leaf is never merged
        with its sibling. Defaults to no frozen leaves.

    Returns
    -------
    list of int
        Boundary positions; consecutive pairs ``(b[i], b[i + 1])`` are the
        half-open index ranges of the clusters.
    """
    # `is None`, not `== None`: the latter is elementwise on NumPy arrays and
    # makes the `if` raise "truth value of an array is ambiguous".
    if frozen is None:
        frozen = [False] * cor.shape[1]

    def find_clusters(tree):
        """Return (boundaries, contains_frozen_leaf) for the given subtree."""
        if tree.is_leaf():
            return ([tree.id, tree.id + 1], bool(frozen[tree.id]))

        (left_clusters, left_frozen) = find_clusters(tree.left)
        (right_clusters, right_frozen) = find_clusters(tree.right)

        # Boundaries when the two children stay separate: the left child's
        # final boundary is dropped before the right child's are appended.
        kept_apart = left_clusters[:-1] + right_clusters

        if left_frozen or right_frozen:
            return (kept_apart, True)

        # Entries linking the left child's rows to the right child's columns.
        joint_values = cor.values[left_clusters[0]:left_clusters[-1],
                                  right_clusters[0]:right_clusters[-1]]
        n_strong = np.sum(np.abs(joint_values) >= threshold)
        if n_strong >= percentile * joint_values.size:
            return ([left_clusters[0], right_clusters[-1]], False)
        return (kept_apart, False)

    tree = hcluster.to_tree(cor.linkage)
    (clusters, _) = find_clusters(tree)

    return clusters

In [None]:
# Cluster boundaries: sibling subtrees are merged when at least 80% of the
# entries linking them have an absolute value of 0.3 or more.
Uclusters = boundaries2(reshaped_matrix, 0.8, 0.3)
# Materialise the (start, end) pairs: a bare zip() is a one-shot iterator, so
# it would be empty once the comprehension below has consumed it.
Lbdry = list(zip(Uclusters[:-1], Uclusters[1:]))
# Keep only clusters spanning at least 5 positions.
Lbdry_large = [(a, b) for (a, b) in Lbdry if b - a >= 5]
Lbdry_large

In [None]:
# Write the reordered correlation matrix with its row and column labels.
reshaped_matrix.to_csv(
    'E:/ManuscirptI/script/ReorderCorrelation.csv',
    header=True,
    index=True,
)
type(Lbdry_large)  # bare expression mid-cell: evaluated, nothing is displayed
import numpy
# Write the (start, end) pairs of the large clusters as a text table.
numpy.savetxt(
    'E:/ManuscirptI/script/CorrelationGroup.txt',
    Lbdry_large,
)