In [1]:
!pip install numpy matplotlib
!pip install scipy
import os
print(os.getcwd())


/home/hzhou53/2024 Fall DNA and GIN model


In [5]:
# Without filtering out short pairwise distances (<1500)
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
from scipy.signal import fftconvolve

def read_data(file_path, bin_size=500):
    """Count, per chromosome, how many fragments overlap each fixed-width bin.

    Only lines containing the substring "PASS" are considered. Each such line
    is searched for three ``chrN:start-end`` fragments separated by ``;``;
    lines in which that pattern is not found are skipped.

    Args:
        file_path: Path of the text file to read.
        bin_size: Positive integer bin width, in the same units as the
            fragment coordinates. Defaults to 500, the width that used to be
            hard-coded here.

    Returns:
        dict mapping chromosome name -> {bin number: count}. Bin numbers are
        1-based (``coordinate // bin_size + 1``). A fragment adds one to every
        bin from the bin of its start to the bin of its end, inclusive.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size}")

    fragment_pattern = re.compile(
        r'(chr\w+):(\d+-\d+);(chr\w+):(\d+-\d+);(chr\w+):(\d+-\d+)'
    )
    data = {}
    with open(file_path, 'r') as file:
        for line in file:
            if "PASS" not in line:
                continue
            match = fragment_pattern.search(line)
            if not match:
                continue
            chr1, coords1, chr2, coords2, chr3, coords3 = match.groups()
            for chr_, coords in [(chr1, coords1), (chr2, coords2), (chr3, coords3)]:
                # The chromosome is registered even if the fragment ends up
                # contributing no bins (end before start).
                chrom_bins = data.setdefault(chr_, {})
                start, end = map(int, coords.split('-'))
                bin_start = start // bin_size + 1
                bin_end = end // bin_size + 1
                for bin_num in range(bin_start, bin_end + 1):
                    chrom_bins[bin_num] = chrom_bins.get(bin_num, 0) + 1
    return data

def convert_to_bin_counts(fragments_dict):
    """Expand a sparse {bin number: count} mapping into a dense count array.

    Args:
        fragments_dict: Mapping of integer bin number -> count. Bin numbers
            are used directly as array indices, so they are assumed to be
            non-negative.

    Returns:
        1-D integer array of length ``max(bin number) + 1`` holding each
        listed bin's count at its index and 0 elsewhere. An empty mapping
        yields an empty array.
    """
    if not fragments_dict:
        # max() of an empty mapping raises ValueError; an empty array lets the
        # caller treat "no bins" like any other all-zero histogram.
        return np.zeros(0, dtype=int)

    max_bin = max(fragments_dict)
    bin_counts = np.zeros(max_bin + 1, dtype=int)
    for bin_num, count in fragments_dict.items():
        bin_counts[bin_num] = count
    return bin_counts

def calculate_distance_histogram(bin_counts, max_distance=None):
    """Histogram of pairwise bin separations, computed as an FFT autocorrelation.

    Entry ``d`` of the result is ``sum_i bin_counts[i] * bin_counts[i + d]``,
    so entry 0 includes every bin paired with itself.

    Args:
        bin_counts: 1-D sequence of per-bin counts.
        max_distance: If given, keep only separations ``0..max_distance``
            (measured in bins); otherwise keep every separation.

    Returns:
        1-D float array indexed by separation in bins.
    """
    bin_counts = np.asarray(bin_counts)
    # Convolving with the reversed signal is the autocorrelation; the centre
    # of the 'full' output is lag 0.
    autocorr = fftconvolve(bin_counts, bin_counts[::-1], mode='full')
    if bin_counts.dtype.kind in 'iu':
        # The exact result for integer counts is an integer. The FFT leaves
        # floating-point round-off (including tiny negative values), so snap
        # back to the exact pair counts.
        autocorr = np.rint(autocorr)
    mid = len(autocorr) // 2
    distance_hist = autocorr[mid:]
    if max_distance is not None:
        distance_hist = distance_hist[:max_distance + 1]
    return distance_hist

def normalize_histogram_to_pmf(hist, bins=50):
    """Group a per-distance histogram into ``bins`` equal-width bins and normalise it.

    Args:
        hist: 1-D array whose index is the distance and whose value is the
            weight at that distance.
        bins: Number of equal-width groups spanning ``[0, len(hist)]``.

    Returns:
        Array of length ``bins``: the grouped weights divided by their sum,
        or all zeros when ``hist`` sums to 0.
    """
    if np.sum(hist) == 0:
        return np.zeros(bins)

    n_distances = len(hist)
    edges = np.linspace(0, n_distances, bins + 1)
    # Each distance index is one sample weighted by its histogram value.
    grouped, _ = np.histogram(
        np.arange(n_distances), bins=edges, weights=hist, density=False
    )
    return grouped / np.sum(grouped)

def calculate_kl_divergence(pmf1, pmf2):
    """Return ``scipy.stats.entropy`` of ``pmf1`` relative to ``pmf2``.

    A constant 1e-10 is added to every entry of both arrays first, so that
    zero-probability bins do not produce an infinite result.
    """
    smoothing = 1e-10
    return entropy(pmf1 + smoothing, pmf2 + smoothing)

def main(file_path, bin_size=500, distance_bins=50, max_distance=None):
    """Print the KL divergence between the distance PMFs of every chromosome pair.

    Args:
        file_path: Input file handed to ``read_data``.
        bin_size: Accepted but not used by this function; ``read_data`` is
            called with the file path only and bins at 500.
        distance_bins: Number of bins in each chromosome's PMF.
        max_distance: Forwarded to ``calculate_distance_histogram``
            (largest separation kept, in bins; ``None`` keeps all).

    Chromosomes whose distance histogram does not sum to a positive value are
    left out. Each unordered pair is printed once, in the order the
    chromosomes were first encountered.
    """
    per_chrom_bins = read_data(file_path)

    chromosome_pmf = {}
    for chrom, fragments_dict in per_chrom_bins.items():
        hist = calculate_distance_histogram(
            convert_to_bin_counts(fragments_dict), max_distance
        )
        if not np.sum(hist) > 0:
            continue
        chromosome_pmf[chrom] = normalize_histogram_to_pmf(hist, bins=distance_bins)

    names = list(chromosome_pmf)
    for idx, chr1 in enumerate(names):
        for chr2 in names[idx + 1:]:
            kl_divergence = calculate_kl_divergence(
                chromosome_pmf[chr1], chromosome_pmf[chr2]
            )
            print(f"KL divergence between {chr1} and {chr2}: {kl_divergence}")

if __name__ == "__main__":
    # Input table to analyse: directory first, then file name.
    file_path = (
        '/home/hzhou53/2024 Fall DNA and GIN model/'
        'GSM3347525NR_FDR_0.1_pseudoGEM_10000_enrichTest_master.txt'
    )

    # Run with the defaults: every separation is kept.
    print("Running KL Divergence Calculation between Chromosomes:\n")
    main(file_path)


Running KL Divergence Calculation between Chromosomes:

KL divergence between chr2L and chr2R: 0.028994135043890885
KL divergence between chr2L and chr3L: 0.02811511347599341
KL divergence between chr2L and chr3R: 0.00986534947996091
KL divergence between chr2L and chr4: 0.06706297452263746
KL divergence between chr2L and chrX: 0.058642504880012374
KL divergence between chr2R and chr3L: 0.024899818535587988
KL divergence between chr2R and chr3R: 0.015222378613282576
KL divergence between chr2R and chr4: 0.0253064121871615
KL divergence between chr2R and chrX: 0.023050997872651233
KL divergence between chr3L and chr3R: 0.025152740580591745
KL divergence between chr3L and chr4: 0.035049230751195516
KL divergence between chr3L and chrX: 0.05492728660194429
KL divergence between chr3R and chr4: 0.059785507191629304
KL divergence between chr3R and chrX: 0.05621758725883139
KL divergence between chr4 and chrX: 0.02667469127725024


In [3]:
# Filtering out short pairwise distances (<1500)
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
from scipy.signal import fftconvolve

def read_data(file_path, bin_size=500):
    """Count, per chromosome, how many fragments overlap each fixed-width bin.

    Only lines containing the substring "PASS" are considered. Each such line
    is searched for three ``chrN:start-end`` fragments separated by ``;``;
    lines in which that pattern is not found are skipped.

    Args:
        file_path: Path of the text file to read.
        bin_size: Positive integer bin width, in the same units as the
            fragment coordinates. Defaults to 500, the width that used to be
            hard-coded here.

    Returns:
        dict mapping chromosome name -> {bin number: count}. Bin numbers are
        1-based (``coordinate // bin_size + 1``). A fragment adds one to every
        bin from the bin of its start to the bin of its end, inclusive.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size}")

    fragment_pattern = re.compile(
        r'(chr\w+):(\d+-\d+);(chr\w+):(\d+-\d+);(chr\w+):(\d+-\d+)'
    )
    data = {}
    with open(file_path, 'r') as file:
        for line in file:
            if "PASS" not in line:
                continue
            match = fragment_pattern.search(line)
            if not match:
                continue
            chr1, coords1, chr2, coords2, chr3, coords3 = match.groups()
            for chr_, coords in [(chr1, coords1), (chr2, coords2), (chr3, coords3)]:
                # The chromosome is registered even if the fragment ends up
                # contributing no bins (end before start).
                chrom_bins = data.setdefault(chr_, {})
                start, end = map(int, coords.split('-'))
                bin_start = start // bin_size + 1
                bin_end = end // bin_size + 1
                for bin_num in range(bin_start, bin_end + 1):
                    chrom_bins[bin_num] = chrom_bins.get(bin_num, 0) + 1
    return data

def convert_to_bin_counts(fragments_dict):
    """Expand a sparse {bin number: count} mapping into a dense count array.

    Args:
        fragments_dict: Mapping of integer bin number -> count. Bin numbers
            are used directly as array indices, so they are assumed to be
            non-negative.

    Returns:
        1-D integer array of length ``max(bin number) + 1`` holding each
        listed bin's count at its index and 0 elsewhere. An empty mapping
        yields an empty array.
    """
    if not fragments_dict:
        # max() of an empty mapping raises ValueError; an empty array lets the
        # caller treat "no bins" like any other all-zero histogram.
        return np.zeros(0, dtype=int)

    max_bin = max(fragments_dict)
    bin_counts = np.zeros(max_bin + 1, dtype=int)
    for bin_num, count in fragments_dict.items():
        bin_counts[bin_num] = count
    return bin_counts

def calculate_distance_histogram(bin_counts, max_distance=None):
    """Histogram of pairwise bin separations, computed as an FFT autocorrelation.

    Entry ``d`` of the result is ``sum_i bin_counts[i] * bin_counts[i + d]``,
    so entry 0 includes every bin paired with itself.

    Args:
        bin_counts: 1-D sequence of per-bin counts.
        max_distance: If given, keep only separations ``0..max_distance``
            (measured in bins); otherwise keep every separation.

    Returns:
        1-D float array indexed by separation in bins.
    """
    bin_counts = np.asarray(bin_counts)
    # Convolving with the reversed signal is the autocorrelation; the centre
    # of the 'full' output is lag 0.
    autocorr = fftconvolve(bin_counts, bin_counts[::-1], mode='full')
    if bin_counts.dtype.kind in 'iu':
        # The exact result for integer counts is an integer. The FFT leaves
        # floating-point round-off (including tiny negative values), so snap
        # back to the exact pair counts.
        autocorr = np.rint(autocorr)
    mid = len(autocorr) // 2
    distance_hist = autocorr[mid:]
    if max_distance is not None:
        distance_hist = distance_hist[:max_distance + 1]
    return distance_hist

def normalize_histogram_to_pmf(hist, bins=50):
    """Group a per-distance histogram into ``bins`` equal-width bins and normalise it.

    Args:
        hist: 1-D array whose index is the distance and whose value is the
            weight at that distance.
        bins: Number of equal-width groups spanning ``[0, len(hist)]``.

    Returns:
        Array of length ``bins``: the grouped weights divided by their sum,
        or all zeros when ``hist`` sums to 0.
    """
    if np.sum(hist) == 0:
        return np.zeros(bins)

    n_distances = len(hist)
    edges = np.linspace(0, n_distances, bins + 1)
    # Each distance index is one sample weighted by its histogram value.
    grouped, _ = np.histogram(
        np.arange(n_distances), bins=edges, weights=hist, density=False
    )
    return grouped / np.sum(grouped)

def calculate_kl_divergence(pmf1, pmf2):
    """Return ``scipy.stats.entropy`` of ``pmf1`` relative to ``pmf2``.

    A constant 1e-10 is added to every entry of both arrays first, so that
    zero-probability bins do not produce an infinite result.
    """
    smoothing = 1e-10
    return entropy(pmf1 + smoothing, pmf2 + smoothing)

def main(file_path, bin_size=500, distance_bins=50, threshold_distance=1500):
    """Print pairwise chromosome KL divergences after dropping short distances.

    For every chromosome the pairwise-distance histogram is computed, its
    first ``int(threshold_distance / bin_size)`` entries are discarded, and
    the remainder is turned into a PMF. The KL divergence is then printed
    once for each unordered chromosome pair.

    Args:
        file_path: Input file handed to ``read_data``.
        bin_size: Bin width used to convert ``threshold_distance`` into a
            number of bins. ``read_data`` is called with the file path only
            and bins at 500, so other values make the threshold inconsistent
            with the binning.
        distance_bins: Number of bins in each chromosome's PMF.
        threshold_distance: Separations shorter than this (same units as
            ``bin_size``) are removed from the histogram.

    Raises:
        ValueError: If ``bin_size`` is not positive or ``threshold_distance``
            is negative. Either would otherwise give a negative slice start,
            which silently keeps only the tail of the histogram.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size}")
    if threshold_distance < 0:
        raise ValueError(
            f"threshold_distance must be non-negative, got {threshold_distance}"
        )

    # The same for every chromosome, so compute it once up front.
    threshold_bin = int(threshold_distance / bin_size)

    data = read_data(file_path)
    chromosome_pmf = {}
    for chrom, fragments_dict in data.items():
        bin_counts = convert_to_bin_counts(fragments_dict)
        distance_hist = calculate_distance_histogram(bin_counts)
        distance_hist_filtered = distance_hist[threshold_bin:]
        # Skip chromosomes with nothing left beyond the threshold.
        if np.sum(distance_hist_filtered) > 0:
            chromosome_pmf[chrom] = normalize_histogram_to_pmf(
                distance_hist_filtered, bins=distance_bins
            )

    chromosomes = list(chromosome_pmf)
    for i, chr1 in enumerate(chromosomes):
        for chr2 in chromosomes[i + 1:]:
            kl_divergence = calculate_kl_divergence(
                chromosome_pmf[chr1], chromosome_pmf[chr2]
            )
            print(f"KL divergence between {chr1} and {chr2}: {kl_divergence}")

if __name__ == "__main__":
    # Input table to analyse: directory first, then file name.
    file_path = (
        '/home/hzhou53/2024 Fall DNA and GIN model/'
        'GSM3347525NR_FDR_0.1_pseudoGEM_10000_enrichTest_master.txt'
    )

    # Run with the defaults: threshold_distance=1500 at bin_size=500.
    print("Running KL Divergence Calculation after removing short distances:\n")
    main(file_path)


Running KL Divergence Calculation after removing short distances:

KL divergence between chr2L and chr2R: 0.029022050705499236
KL divergence between chr2L and chr3L: 0.028069091093875953
KL divergence between chr2L and chr3R: 0.009880368915250544
KL divergence between chr2L and chr4: 0.06662611126413559
KL divergence between chr2L and chrX: 0.05868077976698265
KL divergence between chr2R and chr3L: 0.02489512107442574
KL divergence between chr2R and chr3R: 0.015185199763381804
KL divergence between chr2R and chr4: 0.025000213503130076
KL divergence between chr2R and chrX: 0.023058894413462407
KL divergence between chr3L and chr3R: 0.024989878915888976
KL divergence between chr3L and chr4: 0.03521623238524013
KL divergence between chr3L and chrX: 0.05502815247862682
KL divergence between chr3R and chr4: 0.05906050616836078
KL divergence between chr3R and chrX: 0.05621783858195895
KL divergence between chr4 and chrX: 0.026671426616494905
