Load basic libraries:

In [1]:
import numpy as np
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from numpy import linalg as LA
from scipy.sparse.linalg import svds, eigs
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
import pickle

def convertNansToZeros(ma):
    """Replace every NaN stored in ``ma.data`` with 0, in place.

    ``ma`` is any object exposing a numpy ``data`` array; in this notebook it
    is called with a scipy ``coo_matrix``. The same object is returned so the
    call can be chained.
    """
    stored = ma.data
    nan_positions = np.flatnonzero(np.isnan(stored))
    if nan_positions.size:
        # `stored` is the matrix's own buffer, so this edits `ma` itself.
        stored[nan_positions] = 0
    return ma


def convertInfsToZeros(ma):
    """Replace every +inf / -inf stored in ``ma.data`` with 0, in place.

    ``ma`` is any object exposing a numpy ``data`` array; in this notebook it
    is called with a scipy ``coo_matrix``. The same object is returned so the
    call can be chained.
    """
    stored = ma.data
    inf_positions = np.flatnonzero(np.isinf(stored))
    if inf_positions.size:
        # `stored` is the matrix's own buffer, so this edits `ma` itself.
        stored[inf_positions] = 0
    return ma

Load a given dataset. In this tab we do not consider centrality; we only care about HiC data:

In [2]:
# Load one contact file per chromosome pair and publish each block as a
# notebook-level variable named 'chrom<i>chrom<j>' (plus its transpose as
# 'chrom<j>chrom<i>'). Per chromosome it also publishes 'chrom<i>layer' and,
# when no layer filter is active, 'chrom<i>chromC' / 'chromCchrom<i>'.
#
# Columns read from each file, as used below:
#   0: bin i (row), 1: bin j (col), 2: number of contacts,
#   3: layer of bin i, 4: centrality of bin i, 5: layer of bin j,
#   6: loaded but not used in this cell.
filter_layer = False # if you want to filter contacts by centrality layers
layer = 4
layer_i = layer # from 1 to 4
layer_j = layer # from 1 to 4

run = '48'
# NOTE(review): absolute, user-specific path; adjust for your machine.
data_dir = '/home/garner1/Work/dataset/gpseq+hic/bc'+run
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 
# karyotype = [1]
for pos, chr_i in enumerate(karyotype):
    # Only pairs (chr_i, chr_j) with chr_j at or after chr_i in the karyotype
    # are read; the mirrored block is obtained by transposition.
    for chr_j in karyotype[pos:]:
        filename = data_dir+'/chr'+str(chr_i)+'-chr'+str(chr_j)+'.inter.observed.none.txt.bc'+run+'.tsv'
        # ndmin=2 keeps `data` two-dimensional even for a single-record file.
        data = np.loadtxt(filename, usecols=(0,1,2,3,4,5,6), ndmin=2)

        i = data[:,0].astype(int) # bin labels as rows
        j = data[:,1].astype(int) # bin labels as cols
        ij = data[:,2].astype(int) # number of contacts between i and j bins

        rows = int(i.max())
        cols = int(j.max())
        if filter_layer:
            # Each row of `data` is one contact record, so the layer test has
            # to be applied per record (not per matrix cell): keep a contact
            # only if bin i is in layer_i and bin j is in layer_j. The block
            # keeps its full shape; dropped contacts are simply zero.
            keep = (data[:,3] == layer_i) & (data[:,5] == layer_j)
            contacts = coo_matrix((ij[keep], (i[keep], j[keep])), shape=(rows+1, cols+1))
        else:
            contacts = coo_matrix((ij, (i, j)), shape=(rows+1, cols+1))
        mat = contacts.todense()
        # Symmetrization of the intra-chromosomal block is intentionally left
        # disabled, as in the original cell:
        #     if chr_i == chr_j: mat = mat + mat.transpose()
        # At notebook top level globals() is the namespace later cells read.
        globals()['chrom'+str(chr_i)+'chrom'+str(chr_j)] = mat
        # For chr_i == chr_j this second assignment overwrites the first, so
        # the diagonal block ends up stored transposed.
        globals()['chrom'+str(chr_j)+'chrom'+str(chr_i)] = mat.transpose()
    # From here on `rows`, `i` and `data` are those of the LAST file read for
    # chr_i (the pair with the final karyotype entry).
    # NOTE(review): this assumes that file covers the same bin range for chr_i
    # as the other files of chr_i -- confirm.
    if not filter_layer:
        centrality_vec = np.zeros((rows+1,1))
        centrality_vec[i,0] = data[:,4]
        globals()['chrom'+str(chr_i)+'chrom'+'C'] = centrality_vec.ravel()
        globals()['chrom'+'C'+'chrom'+str(chr_i)] = centrality_vec.transpose()
    layer_vec = np.zeros((rows+1,1))
    layer_vec[i,0] = data[:,3]
    globals()['chrom'+str(chr_i)+'layer'] = layer_vec.ravel()

if not filter_layer:
    globals()['chrom'+'C'+'chrom'+'C'] = np.ones((1,1))


Combine the HiC blocks in order to generate a genome-wide HiC map. 

The normalization of the genome and centrality values is done in such a way that 2*max_centrality = Log(max_interaction).

In [3]:
# Assemble the genome-wide matrix from the per-pair blocks, take the log of
# the contact counts, and append the centrality values as one extra column
# and one extra row (with a 0 in the corner).
# NOTE(review): no rescaling of genome or centrality values is applied in
# this cell.
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing and C is added

# globals() instead of locals(): inside a comprehension locals() is only the
# notebook namespace under Python 2 scoping rules, while globals() always is.
namespace = globals()
genome = np.block([[namespace['chrom'+str(row_chr)+'chrom'+str(col_chr)] for col_chr in karyotype] for row_chr in karyotype])

# log(0) = -inf for empty cells is expected and zeroed just below, so the
# divide-by-zero warning is silenced for this one call only.
with np.errstate(divide='ignore'):
    genome = np.log(genome)
genome = convertNansToZeros(coo_matrix(genome)).todense()
genome = convertInfsToZeros(coo_matrix(genome)).todense()

# Each per-chromosome centrality vector is forced to a column of shape (n, 1)
# before stacking. np.block would promote a 1-D vector to a row (1, n), and
# rows of different lengths cannot be stacked into one genome-wide column.
centrality_col = np.block([[np.reshape(namespace['chrom'+str(row_chr)+'chrom'+'C'], (-1, 1))] for row_chr in karyotype])
centrality_row = centrality_col.transpose()
genome = np.block([[genome, centrality_col], [centrality_row, np.zeros((1, 1))]])


Use MDS to embed in low dimension. MDS takes as input the Euclidean distance between the contact profiles of the bins, including centrality:

In [13]:
from sklearn.metrics import pairwise_distances
from sklearn.manifold import MDS

# Embed the bins in `dim` dimensions with metric MDS on a precomputed
# distance matrix.
dim = 2
# The slice [:-1, :-1] drops the last row and column of `genome` (the
# centrality row/column appended when the matrix was assembled), so the
# distances are computed from the contact part only.
# np.asarray: `genome` may be an np.matrix (it goes through .todense()),
# which recent scikit-learn versions refuse; a plain ndarray is always fine.
D = pairwise_distances(np.asarray(genome[:-1,:-1]))
model = MDS(n_components=dim, dissimilarity='precomputed', random_state=1, n_jobs=-1)
out = model.fit_transform(D)

In [14]:
# Sanity check of the shapes. Written as a function call with an explicit
# format string so it prints the same text under Python 2 and Python 3.
print('{} {} {}'.format(out.shape, genome.shape, centrality_col.shape))

(2814, 2) (2815, 2815) (2814, 1)


Generate 21 colors:

In [7]:
# Fixed palette: one matplotlib color name per karyotype entry (21 in total).
# It was originally drawn at random from the named matplotlib colors with the
# snippet below, then frozen here so that figures are reproducible:
#
#   from matplotlib import colors as mcolors
#   from random import randint,sample
#   colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
#   # Sort colors by hue, saturation, value and name.
#   by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
#                   for name, color in colors.items())
#   sorted_names = [name for hsv, name in by_hsv]
#   color_list = sample(sorted_names,21)
color_list = [
    u'b',
    u'lightgoldenrodyellow',
    u'navajowhite',
    u'paleturquoise',
    u'salmon',
    u'dimgray',
    u'chartreuse',
    u'aliceblue',
    u'royalblue',
    u'darkgreen',
    u'mediumturquoise',
    u'forestgreen',
    u'thistle',
    u'orchid',
    u'gold',
    u'maroon',
    u'wheat',
    u'navy',
    u'khaki',
    u'palevioletred',
    u'cornflowerblue',
]

Create the chrom#coordinates, chrom#color and chrom#chromosome data for each chromosome, to be merged into a single pandas DataFrame:

In [30]:
import pandas as pd

# Slice the MDS output `out` into one chunk per chromosome (in karyotype
# order) and build one DataFrame per chromosome with its coordinates, color,
# label, layer and centrality; then concatenate them into `genomeDF`.
# The per-chromosome objects are still published as notebook variables
# 'chrom<n>coordinates', 'chrom<n>color', 'chrom<n>chromosome', 'chrom<n>df'.
layer = ''
karyotype = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,'X'] # chr9 and chr22 are missing 

# globals() instead of locals(): inside a comprehension (see the concat
# below) locals() is only the notebook namespace under Python 2 scoping.
namespace = globals()
offset = 0
for chrom in karyotype:
    prefix = 'chrom'+str(chrom)
    # Number of bins of this chromosome = number of rows of its block against chr1.
    length = namespace[prefix+'chrom'+'1'].shape[0]
    coordinates = out[offset:offset+length,:dim] # its embedding coordinates
    namespace[prefix+'coordinates'] = coordinates
    namespace[prefix+'color'] = [color_list[karyotype.index(chrom)]]*length # its color
    namespace[prefix+'chromosome'] = [str(chrom)]*length # its label
    # color / chromosome are passed as scalars and broadcast over the rows.
    # NOTE(review): assumes the layer and centrality vectors have exactly
    # `length` entries -- confirm against the loading cell.
    namespace[prefix+'df'] = pd.DataFrame({
        'x': coordinates[:,0],
        'y': coordinates[:,1],
        'color': namespace[prefix+'color'][0],
        'chromosome': namespace[prefix+'chromosome'][0],
        'layer': namespace[prefix+'layer'],
        'centrality': namespace[prefix+'chromC'].ravel(),
    })
    offset = length+offset
# Row labels restart at 0 for every chromosome, as pd.concat keeps each
# frame's own index.
genomeDF = pd.concat([namespace['chrom'+str(chrom)+'df'] for chrom in karyotype])

In [38]:
# Divide both embedding coordinates by the bin's centrality and store the
# result in new columns 'newX' / 'newY'.
# NOTE(review): a centrality of 0 would produce inf/NaN here -- confirm
# whether such bins exist and should be dropped.
genomeDF['newX'] = genomeDF['x'].mul(1.0).div(genomeDF['centrality'])
genomeDF['newY'] = genomeDF['y'].mul(1.0).div(genomeDF['centrality'])

In [43]:
# Scatter plot of the centrality-rescaled coordinates, colored with the
# per-row 'color' column, written to 'with_centrality.png'.
ax1 = genomeDF.plot.scatter(x='newX', y='newY', color=genomeDF['color'])
plt.savefig('with_centrality.png')

FigureCanvasNbAgg()

In [44]:
# Scatter plot of the raw MDS coordinates, colored with the per-row 'color'
# column. The file name has no extension, so matplotlib chooses the format.
ax2 = genomeDF.plot.scatter(x='x', y='y', color=genomeDF['color'])
plt.savefig('without_centrality')

FigureCanvasNbAgg()

Visualizations:

In [463]:
# All layers: scatter plot of the embedding, in 2-D or 3-D depending on `dim`.
# NOTE(review): `genome_layered` is not defined in the cells shown here --
# confirm where it comes from before a Restart & Run All.
if dim == 2:
    fig, ax = plt.subplots()
    data = genome_layered
    ax.scatter(data['x'], data['y'], c=data['color'])
elif dim == 3:
    # Importing Axes3D makes the '3d' projection available on older matplotlib.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    data = genome_layered
    ax.scatter(data['x'], data['y'], data['z'], c=data['color'])
    plt.show()

FigureCanvasNbAgg()

In [42]:
%matplotlib ipympl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

fig = plt.figure(figsize=(2, 5))
patches = [
    mpatches.Patch(color=color, label=label)
    for label, color in zip(karyotype, color_list)]
fig.legend(patches, karyotype, loc='center', frameon=False)
plt.savefig('legend.png')

FigureCanvasNbAgg()

In [465]:
# Layer 1: scatter plot of the embedding, in 2-D or 3-D depending on `dim`.
# NOTE(review): `genome_layer_11` and `genomeDF_layer1` are not defined in the
# cells shown here, and the two branches read from different objects --
# confirm where they come from before a Restart & Run All.
if dim == 2:
    fig, ax = plt.subplots()
    data = genome_layer_11
    ax.scatter(data['x'], data['y'], c=data['color'])
elif dim == 3:
    # Importing Axes3D makes the '3d' projection available on older matplotlib.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    in_layer = genomeDF_layer1['layer'] == 1.0
    data = genomeDF_layer1[in_layer]
    ax.scatter(data['x'], data['y'], data['z'], c=data['color'])
    plt.show()

FigureCanvasNbAgg()

In [466]:
# Layer 2: scatter plot of the embedding, in 2-D or 3-D depending on `dim`.
# NOTE(review): `genome_layer_22` and `genomeDF_layer2` are not defined in the
# cells shown here, and the two branches read from different objects --
# confirm where they come from before a Restart & Run All.
if dim == 2:
    fig, ax = plt.subplots()
    data = genome_layer_22
    ax.scatter(data['x'], data['y'], c=data['color'])
elif dim == 3:
    # Importing Axes3D makes the '3d' projection available on older matplotlib.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    in_layer = genomeDF_layer2['layer'] == 2.0
    data = genomeDF_layer2[in_layer]
    ax.scatter(data['x'], data['y'], data['z'], c=data['color'])
    plt.show()

FigureCanvasNbAgg()

In [467]:
# Layer 3: scatter plot of the embedding, in 2-D or 3-D depending on `dim`.
# NOTE(review): `genome_layer_33` and `genomeDF_layer3` are not defined in the
# cells shown here, and the two branches read from different objects --
# confirm where they come from before a Restart & Run All.
if dim == 2:
    fig, ax = plt.subplots()
    data = genome_layer_33
    ax.scatter(data['x'], data['y'], c=data['color'])
elif dim == 3:
    # Importing Axes3D makes the '3d' projection available on older matplotlib.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    in_layer = genomeDF_layer3['layer'] == 3.0
    data = genomeDF_layer3[in_layer]
    ax.scatter(data['x'], data['y'], data['z'], c=data['color'])
    plt.show()

FigureCanvasNbAgg()

In [468]:
# Layer 4: scatter plot of the embedding, in 2-D or 3-D depending on `dim`.
# NOTE(review): `genome_layer_44` and `genomeDF_layer4` are not defined in the
# cells shown here, and the two branches read from different objects --
# confirm where they come from before a Restart & Run All.
if dim == 2:
    fig, ax = plt.subplots()
    data = genome_layer_44
    ax.scatter(data['x'], data['y'], c=data['color'])
elif dim == 3:
    # Importing Axes3D makes the '3d' projection available on older matplotlib.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    in_layer = genomeDF_layer4['layer'] == 4.0
    data = genomeDF_layer4[in_layer]
    ax.scatter(data['x'], data['y'], data['z'], c=data['color'])
    plt.show()

FigureCanvasNbAgg()