In [None]:
import numpy as np
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from numpy import linalg as LA
from scipy.sparse.linalg import svds, eigs
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
import pickle

def convertNansToZeros(ma):
    """Replace every NaN among the stored values of ``ma`` with 0, in place.

    ``ma`` must expose its stored values as a writable array through the
    ``.data`` attribute (the callers in this notebook pass a scipy
    ``coo_matrix``). The same object is returned so calls can be chained.
    """
    # Flat positions of the NaN entries inside the stored-values array.
    nan_positions = np.nonzero(np.ravel(np.isnan(ma.data)))[0]
    if nan_positions.size:
        ma.data[nan_positions] = 0
    return ma


def convertInfsToZeros(ma):
    """Replace every infinite stored value of ``ma`` (+inf or -inf) with 0, in place.

    ``ma`` must expose its stored values as a writable array through the
    ``.data`` attribute (the callers in this notebook pass a scipy
    ``coo_matrix``). The same object is returned so calls can be chained.
    """
    # Flat positions of the infinite entries inside the stored-values array.
    inf_positions = np.nonzero(np.ravel(np.isinf(ma.data)))[0]
    if inf_positions.size:
        ma.data[inf_positions] = 0
    return ma

In [8]:
# Load a given dataset; in this cell we do not consider centrality, we only care about HiC data.
# For every chromosome pair (chr_i, chr_j) with chr_j at or after chr_i in `karyotype`,
# one contact file is read and turned into a dense block. Each block is published in the
# notebook namespace as 'chrom<i>chrom<j>', and its transpose as 'chrom<j>chrom<i>'.
run = '48'
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 
for position, chr_i in enumerate(karyotype):
    for chr_j in karyotype[position:]:
        filename = '/home/garner1/Work/dataset/gpseq+hic/bc'+run+'/chr'+str(chr_i)+'-chr'+str(chr_j)+'.inter.observed.none.txt.bc'+run+'.tsv'
        # ndmin=2 keeps the column slicing below valid when a file holds a single row.
        data = np.loadtxt(filename,usecols=(0,1,2),ndmin=2)

        # loadtxt yields floats; the bin labels are used as matrix indices, so cast them.
        i = data[:,0].astype(int) # bin labels as rows
        j = data[:,1].astype(int) # bin labels as cols
        ij = data[:,2] # number of contact between i and j bins

        rows = int(i.max())
        cols = int(j.max())
        if chr_i == chr_j:
            # The intra-chromosomal block is added to its own transpose below, which
            # requires a square matrix even when the largest row and column labels differ.
            rows = cols = max(rows, cols)
        mat = coo_matrix((ij, (i, j)), shape=(rows+1, cols+1)).todense()
        if chr_i == chr_j : 
            # NOTE(review): if the file also lists diagonal (i == j) entries, this sum
            # counts them twice - confirm against the file layout.
            mat = mat + mat.transpose()
        # At cell level globals() is the very namespace the original locals() call wrote to.
        globals()['chrom'+str(chr_i)+'chrom'+str(chr_j)] = mat
        globals()['chrom'+str(chr_j)+'chrom'+str(chr_i)] = mat.transpose()

In [9]:
# Combine the HiC blocks in order to generate a genome-wide HiC map.
# The blocks are looked up in the notebook namespace under the names 'chrom<i>chrom<j>'.
# globals() is used for the lookup because, on Python 3, locals() evaluated inside a
# comprehension may refer to the comprehension's own scope instead of the cell's.
genome = np.block([[globals()['chrom'+str(row_chr)+'chrom'+str(col_chr)] for col_chr in karyotype] for row_chr in karyotype])

# %matplotlib

# Log-transform the contact counts. log(0) gives -inf and log of a negative value gives
# NaN; both are expected here, so the warnings are silenced and those cells are set to 0.
with np.errstate(divide='ignore', invalid='ignore'):
    mat = np.log(np.asarray(genome, dtype=float))
mat[~np.isfinite(mat)] = 0

plt.figure(0)

# Diverging palette centred on the mean of the log-transformed map.
cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
ax = sns.heatmap(mat,center=mat.mean(),cmap=cmap)
# ax = sns.heatmap(mat)


Using matplotlib backend: Qt5Agg


  """


Now consider the centralities as well:

In [None]:
# Store the HiC data into pickle files after filtering by centrality.
# For every run, every (centrality_i, centrality_j) pair in 1..4 and every chromosome pair,
# the dense contact block is masked so that only rows whose bin has centrality_i (input
# column 3) and columns whose bin has centrality_j (input column 5) keep their value.
# The masked block and its transpose are published in the notebook namespace and pickled.
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 
pickle_dir = '/home/garner1/Work/dataset/gpseq+hic/pickle_files.observed.none/'
for run in ['48','52','58']:
    # The input of a chromosome pair does not depend on the centralities, so it is parsed
    # once per run and reused for all 16 centrality combinations.
    parsed_blocks = {}
    for centrality_i in range(1,5):
        for centrality_j in range(1,5):
            # Single formatted string: prints the same text under Python 2 and Python 3.
            print('%s %d %d' % (run, centrality_i, centrality_j))
            for position, chr_i in enumerate(karyotype):
                for chr_j in karyotype[position:]:
                    pair = (chr_i, chr_j)
                    if pair not in parsed_blocks:
                        filename = '/home/garner1/Work/dataset/gpseq+hic/bc'+run+'/chr'+str(chr_i)+'-chr'+str(chr_j)+'.inter.observed.none.txt.bc'+run+'.tsv'
                        # ndmin=2 keeps the column slicing valid for single-row files.
                        data = np.loadtxt(filename,usecols=(0,1,2,3,4,5),ndmin=2)

                        # Bin labels are used as matrix indices, so cast the floats to int.
                        i = data[:,0].astype(int) # bin labels as rows
                        j = data[:,1].astype(int) # bin labels as cols
                        ij = data[:,2] # number of contact between i and j bins

                        rows = int(i.max())
                        cols = int(j.max())
                        if chr_i == chr_j:
                            # The sum with the transpose below needs a square block.
                            rows = cols = max(rows, cols)
                        mat = coo_matrix((ij, (i, j)), shape=(rows+1, cols+1)).todense()
                        if chr_i == chr_j : 
                            # NOTE(review): diagonal entries, if present in the file, are
                            # counted twice by this sum - confirm against the file layout.
                            mat = mat + mat.transpose()
                        parsed_blocks[pair] = (np.asarray(mat), i, j, data[:,3], data[:,5])
                    mat, i, j, row_centrality, col_centrality = parsed_blocks[pair]

                    row_index = np.unique(i[row_centrality == centrality_i]) # filter mat by centralities
                    col_index = np.unique(j[col_centrality == centrality_j]) # filter mat by centralities
                    # Copy only the selected rows x columns; everything else stays 0.
                    newmat = np.zeros(mat.shape)
                    selected = np.ix_(row_index, col_index)
                    newmat[selected] = mat[selected]

                    name_ij = 'run'+run+'chrom'+str(chr_i)+'chrom'+str(chr_j)+'centrality'+str(centrality_i)+str(centrality_j)
                    name_ji = 'run'+run+'chrom'+str(chr_j)+'chrom'+str(chr_i)+'centrality'+str(centrality_j)+str(centrality_i)
                    # When both names coincide (same chromosome, same centrality) the
                    # transpose is written last, for the variable and for the file alike.
                    for name, block in ((name_ij, newmat), (name_ji, newmat.transpose())):
                        globals()[name] = block
                        with open(pickle_dir + name + '.p', "wb") as handle:
                            pickle.dump(block, handle) # save data

# # Load data
# karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 
# for run in ['48','52','58']:
#     for centrality_i in range(1,5):
#         for centrality_j in range(1,5):
#             print run, centrality_i, centrality_j
#             for chr_i in karyotype:
#                 for chr_j in karyotype:
#                     filename = '/home/garner1/Work/dataset/gpseq+hic/pickle_files.observed.none/'+'run'+str(run)+'chrom'+str(chr_i)+'chrom'+str(chr_j)+'centrality'+str(centrality_j)+str(centrality_i)+'.p'
#                     locals()['run'+str(run)+'chrom'+str(chr_i)+'chrom'+str(chr_j)+'centrality'+str(centrality_i)+str(centrality_j)] = pickle.load( open( filename, "rb" ) )
                    

In [None]:
# Combine all blocks into a single matrix filtered by centralities
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 
run = 48


def assemble_filtered_genome(run, first, second):
    """Stack the per-chromosome-pair blocks of one centrality pair into one matrix.

    The blocks are read from the notebook namespace under the names
    'run<run>chrom<i>chrom<j>centrality<first><second>' for every i, j in
    ``karyotype``; a missing block raises KeyError. globals() is used because,
    on Python 3, locals() inside a comprehension may not see cell-level names.
    """
    suffix = 'centrality'+str(first)+str(second)
    return np.block([[globals()['run'+str(run)+'chrom'+str(i)+'chrom'+str(j)+suffix] for j in karyotype] for i in karyotype])


for centrality_i in range(1,5):
    for centrality_j in range(1,5):
        genome_r1r2 = assemble_filtered_genome(run, centrality_i, centrality_j)
        genome_r2r1 = assemble_filtered_genome(run, centrality_j, centrality_i)
        # NOTE(review): when centrality_i == centrality_j both terms are the same matrix,
        # so the sum is twice that matrix - confirm this is intended for the comparison.
        genome = genome_r1r2 + genome_r2r1
        # Single formatted string: prints the same text under Python 2 and Python 3.
        print('%s %s' % (abs(centrality_i-centrality_j), genome.mean()))

# # %matplotlib
# mat = np.log(genome)
# mat = convertNansToZeros(coo_matrix(mat)).todense()
# mat = convertInfsToZeros(coo_matrix(mat)).todense()

# plt.figure(0)

# cmap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
# ax = sns.heatmap(mat,center=mat.mean(),cmap=cmap)

In [None]:
# Scatter plot of the logarithm of four hard-coded values against their position 0..3.
# NOTE(review): the values look hand-copied from an earlier output and the x positions
# presumably stand for |centrality_i - centrality_j| - confirm and label the x axis.
values = [1877929240,446412792,178265904,55193388]
# One figure only: a separate plt.figure(0) call here would open or re-activate
# a figure that nothing is drawn on.
fig, ax = plt.subplots()
y = np.log(values)
ax.scatter(x=list(range(len(values))), y=y)
ax.set_ylabel('log(value)')
plt.show()
# fig.savefig('eigvec_on.png')  # if enabled, call it before plt.show()
