In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
from tqdm import tqdm
from time import time

# TODO: Move functions to separate modules
import functions as f

import core.background_model as background_model
import core.clique_finding as cf
import core.stats 

# GLOBALS

# Bin index used as the clique seed (passed as ``seed_bin`` further down).
TTN_BIN = 4275
BIN_MAP_PATH = 'mappings/bin_map_human_100000.bed'
GTF_PATH = 'mappings/gencode.v38.annotation.gtf'
GENE_BIN_PATH = 'mappings/gene_bins.txt'
NON_GENE_BIN_PATH = 'mappings/non_gene_bins.txt'
CONTACT_MATRIX_PATH = 'data/hic/wt_100kb_zeroed.npy'


def _read_bin_ids(path):
    """Return the integer bin indices stored one per line in ``path``.

    Surrounding whitespace is stripped from every line and empty lines are
    skipped, so a trailing blank line does not abort the load.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    ValueError
        If a non-empty line is not a valid integer literal.
    """
    with open(path, 'r') as handle:
        return [int(entry) for entry in map(str.strip, handle) if entry]


# Both lists are read through the path constants declared above rather than
# through repeated string literals, so changing a path only needs one edit.
gene_bins = _read_bin_ids(GENE_BIN_PATH)
non_gene_bins = _read_bin_ids(NON_GENE_BIN_PATH)


# LOAD CONTACT MATRICES
contact_matrix_zero = np.load(CONTACT_MATRIX_PATH)

In [None]:
def optimize_clique_size(contact_matrix, max_clique_size, seed_bin, num_samples=1000,
                         rng=None, verbose=True):
    """Compare the seed clique with random-seed cliques at every size up to a maximum.

    One greedy clique of ``max_clique_size`` is built from ``seed_bin`` and one
    from each of ``num_samples`` randomly drawn bins (all via
    ``cf.find_greedy_clique``).  For every size ``k = 1 .. max_clique_size`` each
    clique is truncated to its first ``k`` entries and scored with
    ``core.stats.calculate_avg_interaction_strength``.  The seed score is then
    compared with the background scores of the same size.

    Parameters
    ----------
    contact_matrix : object exposing ``shape``
        Forwarded unchanged to the clique finder and the scorer;
        ``contact_matrix.shape[0]`` is the exclusive upper bound for the random
        background bins.
    max_clique_size : int
        Largest clique size requested from the finder and the last size scored.
    seed_bin : int
        Bin the clique of interest is grown from (labelled "TTN" in the log
        messages).
    num_samples : int, default 1000
        Number of background cliques to draw.
    rng : None, int, or numpy.random.Generator, optional
        Source of randomness for the background bins.  ``None`` (default) keeps
        the previous behaviour and draws from NumPy's global legacy state, so
        ``np.random.seed`` still governs the result.  Any other value is passed
        to ``np.random.default_rng``, which makes a single call reproducible
        without touching global state.
    verbose : bool, default True
        When False the progress ``print`` messages are suppressed (the tqdm
        bars are still shown).

    Returns
    -------
    sizes : list of int
        ``[1, 2, ..., max_clique_size]``.
    ttn_scores : list
        Seed-clique score for each size.
    p_values : list of float
        Empirical one-sided p-value per size,
        ``(#{background >= seed} + 1) / (num_samples + 1)``; the add-one terms
        keep the estimate strictly positive.
    fold_changes : list of float
        Seed score divided by the background median, or NaN when that median
        is exactly zero.
    bg_dists : dict of int -> list
        Background scores for each size, in sampling order.

    Notes
    -----
    * Background bins are drawn uniformly from all rows, so a draw may coincide
      with ``seed_bin``.
    * Truncating with ``clique[:k]`` presumably relies on the finder returning
      members in the order they were added -- confirm against
      ``core.clique_finding``.
    * A clique shorter than ``k`` is scored as-is, because slicing past the end
      simply returns the whole clique.
    """
    log = print if verbose else (lambda *args, **kwargs: None)
    log(f"Starting optimize_clique_size: max_clique_size={max_clique_size}, seed_bin={seed_bin}, num_samples={num_samples}")

    # 1) Compute the full-size TTN clique once
    ttn_full = cf.find_greedy_clique(contact_matrix, max_clique_size, seed_bin)
    log(f"Computed TTN full clique of size {len(ttn_full)}")

    # 2) Sample background cliques at full size
    n_bins = contact_matrix.shape[0]
    if rng is None:
        # Legacy path: identical draws to the original implementation.
        def draw_bin():
            return np.random.randint(n_bins)
    else:
        generator = np.random.default_rng(rng)

        def draw_bin():
            return int(generator.integers(n_bins))

    bg_full = [
        cf.find_greedy_clique(contact_matrix, max_clique_size, draw_bin())
        for _ in tqdm(range(num_samples), desc="Sampling background cliques")
    ]
    log("Background sampling complete.")

    sizes = list(range(1, max_clique_size + 1))
    ttn_scores, p_values, fold_changes = [], [], []
    bg_dists = {}

    # 3) For each target size, trim and score
    for size in tqdm(sizes, desc="Processing clique sizes"):
        log(f"\nSize {size}/{max_clique_size}")

        # TTN subclique and score
        ttn_score = core.stats.calculate_avg_interaction_strength(contact_matrix, ttn_full[:size])
        log(f"  TTN score: {ttn_score:.4f}")

        # Background subcliques and scores
        bg_scores = [
            core.stats.calculate_avg_interaction_strength(contact_matrix, clique[:size])
            for clique in bg_full
        ]
        bg_dists[size] = bg_scores

        # Compute statistics
        median_bg = np.median(bg_scores)
        # Add-one correction: the observed clique counts as one extra draw.
        pval = (np.sum(np.asarray(bg_scores) >= ttn_score) + 1) / (num_samples + 1)
        fold = ttn_score / median_bg if median_bg != 0 else float('nan')

        log(f"  Median background: {median_bg:.4f}")
        log(f"  p-value: {pval:.4f}")
        log(f"  Fold change: {fold:.4f}")

        # Store
        ttn_scores.append(ttn_score)
        p_values.append(pval)
        fold_changes.append(fold)

    log("Completed optimize_clique_size")
    return sizes, ttn_scores, p_values, fold_changes, bg_dists

In [None]:
# Scan parameters, named so they can be tuned in one place.
MAX_CLIQUE_SIZE = 20
NUM_BACKGROUND_SAMPLES = 5000

# Run the size scan seeded at TTN_BIN on the loaded contact matrix.
sizes, ttn_scores, pvals, folds, bg = optimize_clique_size(
    contact_matrix=contact_matrix_zero,
    max_clique_size=MAX_CLIQUE_SIZE,
    seed_bin=TTN_BIN,
    num_samples=NUM_BACKGROUND_SAMPLES,
)

# The per-size results can now be plotted, e.g. fold change or p-value against clique size.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 1) Empirical p-value as a function of clique size, with the 0.05 level marked
fig, ax = plt.subplots()
ax.plot(sizes, pvals, marker='o', linestyle='-')
ax.axhline(0.05, color='red', linestyle='--', label='α=0.05')
ax.set_xlabel('Clique size')
ax.set_ylabel('Empirical p-value')
ax.set_title('P-value vs Clique Size')
ax.set_xticks(sizes)
ax.legend()
fig.tight_layout()
plt.show()

# 2) Pick the clique size whose empirical p-value is smallest.
#    np.argmin reports the first minimum, so ties resolve to the earliest
#    entry of `sizes`.
opt_idx = int(np.asarray(pvals).argmin())
opt_size, opt_pval = sizes[opt_idx], pvals[opt_idx]
print(f'Optimal clique size = {opt_size}, p-value = {opt_pval:.4f}')