# Expanse Notebook for Computational Analysis of Chromatin During Heart Development

This notebook will be used exclusively to perform expensive operations on Expanse. 

### Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from time import time

# TODO: Move functions to separate modules
import functions as f

import scripts.clique_finding as cf

In [3]:
# Seed bin index used for every TTN-centred analysis in this notebook.
# NOTE(review): presumably the 100 kb bin that contains the TTN gene — confirm against bin_map.
TTN_BIN = 4275
# Bin map loaded through the project helper; its return structure is not visible from this notebook.
bin_map = f.load_bin_map('mappings/bin_map_human_100000.bed')
# Contact matrix stored as a NumPy .npy file; it is the matrix handed to the walk/clique routines below.
contact_matrix_zero = np.load('samples/contact_matrix_100kb_balanced_zeroed.npy')

In [4]:
def build_walk_index(contact_matrix, show_progress=True):
    """
    Precompute, for every node, the tables needed to sample a weighted neighbor.

    Row ``i`` of ``contact_matrix`` is read as the weights from node ``i`` to
    every other node. Its nonzero entries are normalised to probabilities and
    accumulated into a CDF (suitable for inverse-CDF sampling, e.g. with
    ``np.searchsorted``).

    Parameters
    ----------
    contact_matrix : 2D numpy array
        Weight matrix; only ``shape[0]`` rows are indexed.
    show_progress : bool, default True
        When True the row loop is wrapped in ``tqdm`` (the original behavior).
        Pass False to run silently and without needing tqdm.

    Returns
    -------
    neighbors : list of 1D int arrays
        ``neighbors[i]`` holds the column indices of the nonzero entries of row ``i``.
    cdfs : list of 1D float arrays
        ``cdfs[i]`` holds the cumulative probabilities aligned with ``neighbors[i]``;
        non-empty CDFs end at exactly 1.0. Rows without nonzero entries get
        empty arrays in both lists.
    """
    N = contact_matrix.shape[0]
    neighbors = [None] * N
    cdfs = [None] * N

    rows = tqdm(range(N)) if show_progress else range(N)
    for i in rows:
        w = contact_matrix[i]
        idx = np.nonzero(w)[0]
        if idx.size == 0:
            # Isolated node: nothing to sample from.
            neighbors[i] = np.empty(0, dtype=int)
            cdfs[i] = np.empty(0, dtype=float)
            continue

        weights = w[idx]  # slice once instead of twice
        cdf = np.cumsum(weights / weights.sum())
        # Floating-point accumulation can leave the tail a few ulps away from 1.
        # Clamp and pin the last entry so a uniform draw in [0, 1) always maps
        # to a valid neighbor position.
        cdf = np.minimum(cdf, 1.0)
        cdf[-1] = 1.0

        neighbors[i] = idx
        cdfs[i] = cdf
    return neighbors, cdfs

# Build the per-node neighbor/CDF tables once; later cells pass them to the random-walk routines.
neighbors, cdfs = build_walk_index(contact_matrix_zero) 



100%|█████████████████████████████████████████████████████████████████████████████████████| 30894/30894 [00:10<00:00, 2901.51it/s]


### Random Walking Time Test


In [5]:
# Time a single random-walk run seeded at TTN_BIN, reusing the precomputed walk index.
# `f` and `time` are already imported in the setup cell, so they are not imported again here.
start = time()
f.random_walk_fast(
    contact_matrix_zero, TTN_BIN, 5,
    neighbors=neighbors, cdfs=cdfs,
    num_molecules=10000, alpha=0.05
)
end = time()

print(f"Time taken: {end - start} seconds")

Time taken: 0.8913464546203613 seconds


### Analytical Diffusion Test

In [6]:
# Create a sample matrix. The size is taken from `num_sample_bins` so the
# generated matrix and the plotting guard below cannot drift apart.
num_sample_bins = 10000
sample_matrix = f.generate_sample_matrix_bins(num_sample_bins)

# Only draw the heatmap when the matrix has fewer than 10000 bins.
if num_sample_bins < 10000:
    plt.imshow(sample_matrix, cmap='hot', interpolation='nearest')
    plt.show()


In [7]:
# Wall-clock timing of one analytical-diffusion clique search on the sample matrix.
# `start`/`end` bracket only the search call; element [0] of its return value is kept as the clique.
start = time()
sample_clique = cf.analytical_diffusion_clique(sample_matrix, start_node=4, n=6)[0]
end = time()

print("Sample Clique:", sample_clique)
print(f"Time taken: {end - start} seconds")

Sample Clique: [   4 2000 1000  995  122  369]
Time taken: 21.13571786880493 seconds


In [17]:
# Repeat of the sample-matrix timing run.
start = time()
sample_clique = cf.analytical_diffusion_clique(sample_matrix, start_node=4, n=6)[0]
# Stop the clock in this cell; without it the printed duration would use an
# `end` value left over from whichever cell last assigned it.
end = time()

print("Sample Clique:", sample_clique)
print(f"Time taken: {end - start} seconds")

Sample Clique: [   4 2000 1000  995  122  369]
Time taken: 26.47953486442566 seconds


### Get 40-Clique of TTN and Corresponding Genes

In [9]:
# Random-walk clique search seeded at TTN_BIN, reusing the precomputed walk index.
top_40_clique_rw = cf.random_walk(
    contact_matrix_zero,
    TTN_BIN,
    40,
    num_molecules=100000,
    alpha=0.05,
    neighbors=neighbors,
    cdfs=cdfs,
)

In [10]:
# Greedy clique search on the same matrix and seed bin, for comparison with the random-walk result.
top_40_clique_greedy = cf.find_greedy_clique(contact_matrix_zero, 40, TTN_BIN)

In [11]:
# Bins present in both the random-walk and the greedy clique
# (np.intersect1d returns the sorted, unique common values).
overlapping_nodes = np.intersect1d(top_40_clique_rw, top_40_clique_greedy)
print("Overlapping nodes between the two cliques:", overlapping_nodes, sep="\n")


Overlapping nodes between the two cliques:
[ 4275 30357 30369 30478]


In [12]:
# Time the analytical-diffusion clique search seeded at TTN_BIN on the full contact matrix.
start = time()
top_40_diffusion = cf.analytical_diffusion_clique(contact_matrix_zero, TTN_BIN, 40, alpha=0.05)[0]
end = time()

print("TTN Clique:", top_40_diffusion)
print(f"Time taken: {end - start} seconds")

# Persist the clique, one bin index per line. The loop variable is deliberately
# not named `bin`: at notebook scope that would shadow the builtin bin() for
# every later cell.
with open("TTN_BINS_ANALYTICAL_DIFFUSION.txt", 'w') as out:
    for bin_idx in top_40_diffusion:
        out.write(f"{bin_idx}\n")


TTN Clique: [ 4275 30478 28240 28248 30543 27873 30350 30378 30387 30386 30369 30432
 30586 30357 28243 30434 28236 21918 27880 30423 24025 27887 28358 27881
 28361 28377 28366 27139 20604 27886 28247 28364 27889 11886 30359 19160
 27835  8796 28212 21915]
Time taken: 417.7960503101349 seconds


In [13]:
# Bins present in both the random-walk and the analytical-diffusion clique
# (np.intersect1d returns the sorted, unique common values).
overlapping_nodes_diffusion = np.intersect1d(top_40_clique_rw, top_40_diffusion)
print("Overlapping nodes between the two cliques:", overlapping_nodes_diffusion, sep="\n")

Overlapping nodes between the two cliques:
[ 4275 24025 27873 27881 27887 28236 28248 28364 30357 30369 30378 30478
 30543 30586]


In [15]:
# Collect the genes reported for each bin of the analytical-diffusion clique.
# `tqdm` is already imported in the setup cell, so it is not imported again here.
# NOTE(review): the helper receives the BED/GTF paths on every call and the recorded
# run took ~10 s per bin; presumably the files are re-read each time — consider
# caching inside the helper.
TTN_GENES = set()
for bin_idx in tqdm(top_40_diffusion):  # not named `bin`, to avoid shadowing the builtin
    genes = f.find_gene_from_bin(bin_idx, 'mappings/bin_map_human_100000.bed', 'mappings/gencode.v38.annotation.gtf')
    TTN_GENES.update(genes)


    

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [06:43<00:00, 10.08s/it]


In [16]:
# Write the collected genes, one per line. Set iteration order is not guaranteed
# to be stable between runs, so the entries are written in sorted order
# (keyed by their string form, which is also what gets written) to make the
# output file reproducible.
with open("TTN_GENES_ANALYTICAL_DIFFUSION.txt", 'w') as out:
    for gene in sorted(TTN_GENES, key=str):
        out.write(f"{gene}\n")
