#### Import dependencies ####

In [1]:
import os
import sys
import re
import time
import warnings
import math
import csv
import h5py
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import matplotlib.gridspec as gridspec
from matplotlib.colors import ListedColormap
import matplotlib.ticker as ticker
from matplotlib.ticker import LogFormatterMathtext
from mpl_scatter_density import ScatterDensityArtist
import pyideogram
import pybigtools
import scipy.optimize
import scipy.io
import scipy.stats as stats
from scipy.stats import pearsonr, spearmanr, kendalltau, ttest_ind, mannwhitneyu
from scipy.stats import zscore
from scipy.ndimage import gaussian_filter
from scipy.ndimage import gaussian_filter1d
from scipy.optimize import curve_fit, fsolve
from numpy import genfromtxt
from typing import Any
from time import monotonic
import cProfile
from random import random
from itertools import accumulate
from math import floor
import gzip
from Bio import SeqIO
from scipy.signal import argrelextrema

# General options
# Print NumPy arrays in full (no "..." summarisation), whatever their size.
np.set_printoptions(threshold=sys.maxsize)

# Suppress specific warnings (each filter matches on the warning's message text)
# NOTE(review): the first message presumably comes from scipy's fsolve — confirm.
warnings.filterwarnings("ignore", message="The iteration is not making good progress")
warnings.filterwarnings("ignore", message=".*Creating legend with loc=\"best\" can be slow with large amounts of data.*")
# Only RuntimeWarning instances carrying this message are ignored.
warnings.filterwarnings("ignore", message=".*All-NaN slice encountered.*", category=RuntimeWarning)

#### General variables ####

In [2]:
# Names of the cell lines referred to by the data files.
cell_lines = ["HeLa-S3","BJ1","IMR90","HUVEC","K562","GM12878","HepG2","MCF-7","H1","H9","HCT"]
# Length of each chromosome, indexed as chr_lengths[chr - 1] (22 entries).
# Other blocks in this file treat these values as kilobases.
chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535, 135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

# list1 holds internal data keys, list2 the matching human-readable labels;
# the two lists are paired by position to build title_map (key -> label).
list1 = ["time_data", "time_sim", "error", "fire_rates", "forkd","telomeres","rna_seq","gro_seq","DNaseIHS","chip_seq","prom","coding", "speed_data", "speed_sim"]
list2 = ["Replication time (min)", "Replication time (min)", "Error", "Firing rate", "Fork directionality", "Telomeres", "RNA-Seq","GRO-Seq","DNaseI HS","ChIP-Seq","Promoter","Coding genes","Replication rate (data)", "Replication rate (sim)"]
title_map = {key: value for key, value in zip(list1, list2)}

plt.rcParams['text.usetex'] = False # Disable external LaTeX text rendering (the setting is False)

#### Model plots ####

In [3]:
def plot_theory(nmax, v, fmin, fmax, saveQ=False):
    """Plot the model curve labelled E[T_j; n] against the firing rate f.

    One curve is drawn per n in 1..nmax (coloured with the 'autumn' colormap),
    together with the reference curve (1/2) * sqrt(pi / (f * v)) in blue.

    Parameters
    ----------
    nmax : int
        Largest n for which a curve is drawn.
    v : float
        Parameter v of the formula (divides f in every exponent).
    fmin, fmax : float
        Range of f values evaluated (400 evenly spaced points). fmin must be
        non-zero because the formula divides by f.
    saveQ : bool, optional
        If True, the figure is written to 'figures/theoryplot.pdf'.

    Notes
    -----
    NOTE(review): the axis limits are hard-coded (x: 0 to 0.04, y: 10**0.55 to
    10**2) and do not follow fmin/fmax — confirm this is intended.
    """

    def given_function(f, n, v):
        # (1/f) * [ sum_{k=0}^{(n-3)//2} (e^{-f k^2 / v} - e^{-f (k+1)^2 / v}) / (2k + 1)
        #           + e^{-f ((n-1)/2)^2 / v} / n ]
        sum_part = 0
        for k in range((n-3)//2 + 1):
            sum_part += (np.exp(-f*k**2/v) - np.exp(-f*(k+1)**2/v)) / (2*k + 1)
        
        additional_term = np.exp(-f*((n-1)/2)**2/v) / n
        
        result = (1/f) * (sum_part + additional_term)
        return result
    
    # Parameters
    f_values = np.linspace(fmin, fmax, 400)  # 400 evenly spaced f values in [fmin, fmax]
    n_values = range(1, nmax + 1)  # n values from 1 to nmax
    
    # Setting up the color map (n is mapped linearly onto the colormap)
    norm = mcolors.Normalize(vmin=min(n_values), vmax=max(n_values))
    cmap = plt.colormaps.get_cmap('autumn')
    
    # Plotting
    scl = 0.5  # overall figure scale factor
    fig, ax = plt.subplots(figsize=(scl * 14, scl * 10))
    for n in n_values:
        y_values = [given_function(f, n, v) for f in f_values]
        ax.plot(f_values, y_values, color=cmap(norm(n)), label=f'n={n}')
    
    # Adding the additional function plot
    additional_y_values = (1/2) * np.sqrt(np.pi / (f_values * v))
    blue_plot, = ax.plot(f_values, additional_y_values, color='blue', label=r'$\frac{1}{2}\sqrt{\frac{\pi}{fv}}$')
    
    ax.set_xlabel(r'Firing rate ($f$)')
    ax.set_ylabel(r'$E[T_j; n]$')
    ax.set_yscale('log')
    # Fixed axis limits, independent of fmin/fmax (see NOTE in the docstring)
    ax.set_ylim(10**0.55, 10**2)
    ax.set_xlim(0, 0.04)
    
    # Adding color bar inside the plot
    sm = cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, orientation='horizontal', pad=0.05, fraction=0.05, ax=ax)
    cbar.ax.tick_params(size=0)  # Remove ticks
    cbar.outline.set_visible(False)  # Remove the frame
    cbar.ax.set_position([0.68, 0.81, 0.18, 0.05])  # Adjust position [left, bottom, width, height]
    
    # Adding the label to the right of the color bar
    cbar.ax.text(1.05, 0.5, r'$n$', transform=cbar.ax.transAxes, va='center')
    
    # Remove colorbar ticks
    cbar.set_ticks([])
    
    # Adding the legend for the blue plot below the color bar
    # (only the blue reference curve is listed; the per-n curves are described by the color bar)
    legend = ax.legend(handles=[blue_plot], loc='upper right', bbox_to_anchor=(1, 0.97), fontsize=15)
    legend.get_frame().set_linewidth(0)  # Remove legend frame

    # Save plot
    if saveQ:
        plt.savefig('figures/theoryplot.pdf', bbox_inches='tight', transparent=True)
    
    plt.show()

#### Data generation ####

##### Replication timing #####

In [4]:
### BigWig data ###
# From: https://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeUwRepliSeq

def sigmoid(values, k):
    """Apply a tanh-shaped contrast transform centred on 50.

    The transform maps 0 -> 0, 50 -> 50 and 100 -> 100; larger |k| makes the
    curve steeper around 50.

    Parameters
    ----------
    values : array_like
        Input values.
    k : float
        Steepness. ``k == 0`` is the identity. The formula is even in ``k``
        (tanh is odd, so the signs cancel), hence ``k`` and ``-k`` give the
        same result.

    Returns
    -------
    numpy.ndarray
        Transformed values, same shape as ``values``.
    """
    values = np.array(values)
    if k == 0:
        return values  # Identity function when k=0 (the formula would divide by tanh(0) = 0)
    # Previously k < 0 fell through both branches and returned None.
    return 50 * (1 + np.tanh((k / 100) * (values - 50)) / np.tanh(0.5 * k))

def datagenBigWig(cell_line, chr, minp, maxp, resolution, alld, dtscale, saveQ, info, sigscale=0):
    """Build a replication-timing profile from a bigWig file.

    Reads ``data/bigwig_files/{cell_line}.bigWig``, keeps every
    ``resolution``-th value, inverts the 0-100 scale, multiplies by
    ``dtscale`` and linearly rescales the result onto ``[30, max]``.
    Positions that were NaN or <= 0 in the raw signal are set to the largest
    value of the final profile.

    Parameters
    ----------
    cell_line : str
        Used to build the bigWig file name.
    chr : int or str
        Chromosome identifier; the track ``chr{chr}`` is read.
    minp, maxp : int
        Region bounds in units of ``resolution`` bases; used only when
        ``alld`` is False.
    resolution : int
        Sampling stride applied to the per-base values.
    alld : bool
        If True, read the whole chromosome; otherwise only [minp, maxp).
    dtscale : float
        Multiplicative scale applied after inverting the signal.
    saveQ : bool
        If True, write the profile and the invalid positions to text files.
    info : str
        Suffix used in the output file names.
    sigscale : float, optional
        Currently unused (the sigmoid step below is commented out).

    Returns
    -------
    numpy.ndarray
        The rescaled timing profile.
    """
    file_path = f'data/bigwig_files/{cell_line}.bigWig'

    # Read only what is needed and always release the reader and the file handle.
    # (Previously the whole chromosome was read even when only a region was used,
    # and neither the reader nor the file was ever closed.)
    with open(file_path, 'rb') as handle:
        bw = pybigtools.open(handle)  # Keep the original file opening method (file object) as requested
        try:
            if alld:
                time_data_all = bw.values(f'chr{chr}')
            else:
                time_data_all = bw.values(f'chr{chr}', minp * resolution, maxp * resolution)
        finally:
            bw.close()

    # Sample equally spaced values from `time_data_all` with the given resolution
    time_data = np.array(time_data_all[::resolution])

    # Identify invalid positions (missing or non-positive signal)
    invalid_positions = np.where(np.isnan(time_data) | (time_data <= 0))[0]

    # Filter the time_data
    time_data = np.nan_to_num(time_data, nan=0.0001)  # Map 'nan' to 0.0001
    time_data[time_data <= 0] = 0.0001  # Map values less than or equal to 0 to 0.0001
    time_data[time_data > 100] = 100  # Cap values greater than 100 to 100
    time_data = 100 - time_data  # Data is given in inversed scale
    # Optional: Apply sigmoid transformation (0 for no transform)
    #time_data = sigmoid(time_data, sigscale) # Use k = (0,2,5,10,50)
    # Scaling
    time_data = dtscale * time_data
    interval_min = 30 # The start of the S-phase is often detected within a range of 0 to 30 minutes into the S-phase.
    interval_max = np.max(time_data)
    lowest = np.min(time_data)
    # NOTE(review): a constant profile makes (interval_max - lowest) zero and the
    # result NaN, exactly as before — decide whether that case needs a guard.
    time_data = (time_data - lowest) / (interval_max - lowest) * (interval_max - interval_min) + interval_min
    time_data[invalid_positions] = np.max(time_data)

    if saveQ:
        np.savetxt(f"data/whole-genome_timing_data/time_data_{info}.txt", time_data, fmt='%.30f')
        np.savetxt(f"data/whole-genome_missing_data/missing_data_{info}.txt", invalid_positions, fmt='%d')

    return time_data

In [5]:
### High-resolution data ###
# From: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE137764

def logistic(x, L, k, x0):
    """Evaluate the logistic curve with plateau L, slope k and midpoint x0 at x."""
    denominator = 1 + np.exp(-k * (x - x0))
    return L / denominator

def calculate_medians(rtil2):
    """Return, for each series of observations, the fitted logistic midpoint.

    Each series is accumulated, normalised to end at 1 and fitted with
    ``logistic`` over time points evenly spread between 0 and 10. The fitted
    ``x0`` is reported as the median time. Series that sum to zero, or whose
    fit raises ``RuntimeError``, yield NaN.
    """

    def _fitted_midpoint(observations):
        # Nothing to fit when the series is all zeros / empty
        if np.sum(observations) == 0:
            return np.nan

        running_total = np.cumsum(observations)
        fraction_done = running_total / running_total[-1]

        # Time axis: evenly distributed over 10 hours
        hours = np.linspace(0, 10, len(observations))

        try:
            fitted, _ = curve_fit(logistic, hours, fraction_done, p0=[1, 1, 5])
        except RuntimeError:
            return np.nan  # the optimiser did not converge
        return fitted[2]  # x0, the midpoint of the logistic

    return [_fitted_midpoint(observations) for observations in rtil2]

def datagenHighRes(cell_line, chr, minp, maxp, resolution, alld, dtscale, saveQ, info):
    """Build a timing profile for one chromosome from the high-resolution data file.

    For every column belonging to ``chr{chr}``, rows 2-17 are reduced to a
    median time with ``calculate_medians``. The medians are repeated to cover
    the chromosome length, Gaussian-smoothed (sigma = 20) and multiplied by 60.
    Positions that are NaN or <= 0 after smoothing are set to the maximum of
    the final profile.

    Parameters
    ----------
    cell_line : str
        Used to build the input file name.
    chr : int
        1-based chromosome number (also used to index ``chr_lengths``).
    minp, maxp, resolution, alld, dtscale
        Not used by this function.
    saveQ : bool
        If True, write the profile and the invalid positions to text files.
    info : str
        Suffix used in the output file names.

    Returns
    -------
    numpy.ndarray
        The timing profile; it is also stored in the module-level ``time_data``.
    """

    # The result is published as a module-level global as well as returned.
    global time_data
    
    # Lengths of each chromosome in kilobases
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535, 135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    # Path to the data file (read below as tab-separated text despite the .mat extension)
    matfile = f'data/high_res_files/GSE137764_{cell_line}_GaussiansGSE137764_mooth_scaled_autosome.mat'
    
    # Read the data
    data = pd.read_csv(matfile, delimiter="\t", low_memory=False)
    
    # Extract relevant columns for the chromosome: the column named exactly
    # 'chr{chr}' plus any column whose name contains 'chr{chr}.'
    # NOTE(review): presumably the '.N' suffixes come from pandas de-duplicating repeated headers — confirm.
    selected_columns = [col for col in data.columns if (f'chr{str(chr)}' == col or f'chr{str(chr)}.' in col)]
    
    rtil1 = []
    for col in selected_columns:
        lcol = np.array(data[col][2:18])  # rows 2..17 of the column (16 values)
        lcol[np.isnan(lcol)] = 0.  # Ensures no NaNs are processed
        rtil1.append(lcol)

    rtil2 = calculate_medians(rtil1)  # One fitted median time per column (NaN where no fit is possible)
    
    # Calculate repeat factor (ceiling division) and ensure the length is exactly the chromosome length
    original_length = len(rtil2)
    repeat_factor = chr_lengths[chr - 1] // original_length + (chr_lengths[chr - 1] % original_length > 0)
    print(repeat_factor)
    
    # Create the repeated array, then slice to the exact chromosome length
    extended_data = np.repeat(rtil2, repeat_factor)[:chr_lengths[chr - 1]]
    
    # Apply Gaussian smoothing
    # NOTE(review): NaN medians are still present here; gaussian_filter spreads a NaN to
    # every output position whose kernel window touches it — confirm this is acceptable.
    sigma = 20  # Standard deviation for Gaussian smoothing
    time_data = gaussian_filter(extended_data, sigma=sigma)

    invalid_positions = np.where(np.isnan(time_data) | (time_data <= 0))[0]
    
    # Ensure there are no NaN values in the final output
    time_data = np.nan_to_num(time_data)
    time_data = 60 * time_data  # presumably hours -> minutes; TODO confirm
    time_data[invalid_positions] = max(time_data)

    if saveQ:
        np.savetxt(f"data/whole-genome_timing_data/time_data_{info}.txt", time_data, fmt='%.30f')
        np.savetxt(f"data/whole-genome_missing_data/missing_data_{info}.txt", invalid_positions, fmt='%d')
    
    return time_data

In [6]:
### Simple data generation (load pre-computed timing data) ###
# To be used in fitting
def datagenfs(cell_line, chr_number, chrpos_min, chrpos_max, resolution, alld, dtscale, saveQ, info, sigscale=0):
    """Load a previously saved timing profile, optionally restricted to a region.

    With ``alld`` true the whole chromosome profile is returned. Otherwise the
    slice ``[chrpos_min:chrpos_max]`` is returned and also written to its own
    text file. The remaining parameters are accepted but not used here.
    """
    source_path = f'data/whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}].txt'
    profile = np.loadtxt(source_path, dtype=float)
    if alld:
        return profile

    region = profile[chrpos_min:chrpos_max]
    np.savetxt(f"data/whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt", region, fmt='%.30f')
    return region

##### RNA-Seq files #####

In [7]:
### RNA-Seq transcription data ###
# From: http://hgdownload.cse.ucsc.edu/gbdb/hg19/bbi/wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bw

def datagenBigWig_RNA(cell_line, chr, minp, maxp, resolution, alld, saveQ, info):
    """Build an RNA-Seq signal track for one chromosome from a bigWig file.

    Every ``resolution``-th value of the signal is kept, missing (NaN) values
    are replaced by the largest valid value, and the track is truncated to
    ``chr_lengths[chr - 1]`` entries.

    Parameters
    ----------
    cell_line : str
        Not used; the input file name is fixed below.
    chr : int
        1-based chromosome number; the track ``chr{chr}`` is read.
    minp, maxp : int
        Region bounds in units of ``resolution`` bases; used only when
        ``alld`` is False.
    resolution : int
        Sampling stride applied to the per-base values.
    alld : bool
        If True, read the whole chromosome; otherwise only [minp, maxp).
    saveQ : bool
        If True, write the track to ``data/rna-seq_files/rna_seq_{info}.txt``.
    info : str
        Suffix used in the output file name.

    Returns
    -------
    numpy.ndarray
        The track; it is also stored in the module-level ``rna_seq_data``.
    """

    # The result is published as a module-level global as well as returned.
    global rna_seq_data

    file_path = 'data/rna-seq_files/wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bw'

    # Read only what is needed and always release the reader and the file handle.
    with open(file_path, 'rb') as handle:
        bw = pybigtools.open(handle)  # Keep the original file opening method (file object) as requested
        try:
            if alld:
                rna_seq_data_all = bw.values(f'chr{chr}')
            else:
                rna_seq_data_all = bw.values(f'chr{chr}', minp * resolution, maxp * resolution)
        finally:
            bw.close()

    # Sample equally spaced values from `rna_seq_data_all` with the given resolution
    rna_seq_data = np.array(rna_seq_data_all[::resolution])

    # Identify invalid positions
    invalid_positions = np.where(np.isnan(rna_seq_data))[0]

    # Filter: fill missing values with the largest valid value.
    # The builtin max() returns NaN whenever the first element is NaN, which left
    # the gaps unfilled; np.nanmax ignores NaNs.
    if invalid_positions.size:
        rna_seq_data[invalid_positions] = np.nanmax(rna_seq_data)

    rna_seq_data = rna_seq_data[0:chr_lengths[chr-1]]

    if saveQ:
        np.savetxt(f"data/rna-seq_files/rna_seq_{info}.txt", rna_seq_data, fmt='%.30f')

    return rna_seq_data

##### GRO-Seq files #####

In [8]:
def datagenBedgraph_GRO(cell_line, chr_number, saveQ=False):
    """Count bedGraph intervals per kilobase for one chromosome.

    For every interval on ``chr{chr_number}``, each kilobase bin from
    ``start // 1000`` to ``end // 1000`` (inclusive) is incremented by one.
    The counts are zero-padded at the end up to the chromosome length and,
    if ``saveQ`` is True, written to a text file. Nothing is returned.

    Parameters
    ----------
    cell_line : str
        Used only in the output file name (the input file is fixed below).
    chr_number : int
        1-based chromosome number.
    saveQ : bool, optional
        If True, save the counts to
        ``data/gro-seq_files/gro_seq_{cell_line}_chr[{chr_number}].txt``.

    Notes
    -----
    NOTE(review): index 0 of the output corresponds to the kilobase of the
    first interval on the chromosome, not to kilobase 0, and the padding is
    appended at the end. This matches the previous behaviour; confirm whether
    the track should instead be aligned to chromosome coordinates.
    """
    # Define the path to the compressed BEDGraph file
    bedgraph_file = 'data/gro-seq_files/GSM2486801_HUVEC_GROseq_normoxia_rep1.bedGraph.gz'  # Replace with other cell lines
    output_file = f'data/gro-seq_files/gro_seq_{cell_line}_chr[{chr_number}].txt'
    chromosome = f'chr{chr_number}'

    # Read the compressed BEDGraph file into a DataFrame
    with gzip.open(bedgraph_file, 'rt') as f:
        columns = ['chrom', 'start', 'end', 'value']
        bedgraph_df = pd.read_csv(f, sep='\t', names=columns, comment='#')

    # Filter the DataFrame for the given chromosome
    filtered_df = bedgraph_df[bedgraph_df['chrom'] == chromosome]

    chrom_length_kb = chr_lengths[chr_number - 1]

    if filtered_df.empty:
        # No intervals on this chromosome: previously this crashed on int(nan).
        values_array = np.zeros(chrom_length_kb, dtype=int)
    else:
        # Kilobase bin of every interval start / end
        start_bins = filtered_df['start'].to_numpy() // 1000
        end_bins = filtered_df['end'].to_numpy() // 1000

        # Covered range in kilobases
        start_kb = start_bins.min()
        end_kb = end_bins.max()
        range_kb = int(end_kb - start_kb + 1)

        first_idx = (start_bins - start_kb).astype(np.int64)
        last_idx = (end_bins - start_kb).astype(np.int64)

        # Intervals whose end bin precedes their start bin contributed nothing before
        keep = last_idx >= first_idx

        # Difference array: +1 where an interval begins, -1 just past where it ends;
        # the running sum is the number of intervals covering each bin.
        delta = np.zeros(range_kb + 1, dtype=int)
        np.add.at(delta, first_idx[keep], 1)
        np.add.at(delta, last_idx[keep] + 1, -1)
        values_array = np.cumsum(delta[:-1])

    # Extend values_array to match the length of the chromosome
    if len(values_array) < chrom_length_kb:
        values_array = np.pad(values_array, (0, chrom_length_kb - len(values_array)), 'constant')

    # Save the results to a text file if saveQ is True
    if saveQ:
        np.savetxt(output_file, values_array, fmt='%d')


In [9]:
def datagenBigWig_GRO(cell_line, saveQ=False):
    """Average a GRO-seq bigWig signal into 1 kb bins for chr1..chr22.

    For each chromosome, every 1 kb bin receives the NaN-ignoring mean of the
    signal in that bin (0 when the bin is entirely NaN or cannot be read).
    Bins starting beyond the chromosome length reported by the file stay 0.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/gro-seq_files/gro_seq_{cell_line}_chr[{j}].txt``.
    """
    # Path to the bigWig file. Forward slashes work on POSIX and Windows alike
    # (the previous backslash path only resolved on Windows) and match the output paths.
    bigwig_file = "data/gro-seq_files/GSM1480325_K562_GROseq_plus.bigWig"

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    bin_size = 1000  # bin width in bp (1 kb)

    # Open the bigWig file using pybigtools; reader and file are released even on error
    with open(bigwig_file, 'rb') as handle:
        bw = pybigtools.open(handle)
        try:
            # Process each chromosome
            for j, size in enumerate(chr_lengths, start=1):
                chrom_name = f"chr{j}"
                chrom_bp = bw.chroms(chrom_name)  # chromosome length according to the file, queried once

                # One score per 1 kb bin
                bin_scores = np.zeros(size)

                for idx in range(size):
                    start = idx * bin_size
                    if start >= chrom_bp:
                        break  # No more data to process

                    end = min(start + bin_size, chrom_bp)  # Do not exceed the chromosome length

                    # Best effort: a bin that cannot be read is reported and scored 0
                    try:
                        signal_values = bw.values(chrom_name, start, end)
                        if np.isnan(signal_values).all():
                            bin_scores[idx] = 0
                        else:
                            bin_scores[idx] = np.nanmean(signal_values)
                    except Exception as e:
                        print(f"Error processing region {start}-{end} on {j}: {e}")
                        bin_scores[idx] = 0  # Default to 0 if an error occurs

                # Write the result for this chromosome to a text file
                if saveQ:
                    output_file = f"data/gro-seq_files/gro_seq_{cell_line}_chr[{j}].txt"
                    np.savetxt(output_file, bin_scores, fmt='%.6f')
        finally:
            # Close the bigWig file
            bw.close()

##### DNase-I HS files #####

In [10]:
def datagenBigWig_DHS(cell_line, saveQ=False):
    """Average a DNase-I HS bigWig signal into 1 kb bins for chr1..chr22.

    For each chromosome, every 1 kb bin receives the NaN-ignoring mean of the
    signal in that bin (0 when the bin is entirely NaN or cannot be read).
    Bins starting beyond the chromosome length reported by the file stay 0.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/DNaseIHS_files/DNaseIHS_{cell_line}_chr[{j}].txt``.
    """
    # Path to the bigWig file. Forward slashes work on POSIX and Windows alike
    # (the previous backslash path only resolved on Windows) and match the output paths.
    bigwig_file = "data/DNaseIHS_files/wgEncodeUwDnaseK562RawRep1.bigWig" # Replace with other cell lines

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    bin_size = 1000  # bin width in bp (1 kb)

    # Open the bigWig file using pybigtools; reader and file are released even on error
    with open(bigwig_file, 'rb') as handle:
        bw = pybigtools.open(handle)
        try:
            # Process each chromosome
            for j, size in enumerate(chr_lengths, start=1):
                chrom_name = f"chr{j}"
                chrom_bp = bw.chroms(chrom_name)  # chromosome length according to the file, queried once

                # One score per 1 kb bin
                bin_scores = np.zeros(size)

                for idx in range(size):
                    start = idx * bin_size
                    if start >= chrom_bp:
                        break  # No more data to process

                    end = min(start + bin_size, chrom_bp)  # Do not exceed the chromosome length

                    # Best effort: a bin that cannot be read is reported and scored 0
                    try:
                        signal_values = bw.values(chrom_name, start, end)
                        if np.isnan(signal_values).all():
                            bin_scores[idx] = 0
                        else:
                            bin_scores[idx] = np.nanmean(signal_values)
                    except Exception as e:
                        print(f"Error processing region {start}-{end} on {j}: {e}")
                        bin_scores[idx] = 0  # Default to 0 if an error occurs

                # Write the result for this chromosome to a text file
                if saveQ:
                    output_file = f"data/DNaseIHS_files/DNaseIHS_{cell_line}_chr[{j}].txt"
                    np.savetxt(output_file, bin_scores, fmt='%.6f')
        finally:
            # Close the bigWig file
            bw.close()

In [11]:
def datagennarrowPeak_DHS(cell_line, saveQ=False):
    """Accumulate narrowPeak signal values into 1 kb bins for chr1..chr22.

    Every peak adds its signal value (7th column of the file) to each 1 kb bin
    from ``start // 1000`` to ``end // 1000`` inclusive; bins beyond the
    chromosome length are ignored. Values of overlapping peaks are summed.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/DNaseIHS_files/DNaseIHS_{cell_line}_chr[{j}].txt``.
    """
    # Path to the gzipped narrowPeak file. Forward slashes work on POSIX and
    # Windows alike (the previous backslash path only resolved on Windows).
    narrowpeak_file = "data/DNaseIHS_files/wgEncodeOpenChromDnaseHelas3Pk.narrowPeak.gz"

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    # Read and parse the narrowPeak.gz file, grouping peaks by chromosome name
    # (file order is preserved within each chromosome)
    peaks_by_chrom = {}
    with gzip.open(narrowpeak_file, 'rt') as f:  # Open in text mode
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank lines used to raise IndexError
            peak = (int(fields[1]), int(fields[2]), float(fields[6]))
            peaks_by_chrom.setdefault(fields[0], []).append(peak)

    # Process each chromosome
    for j, size in enumerate(chr_lengths, start=1):
        # One score per 1 kb bin
        bin_scores = np.zeros(size)

        # A chromosome without peaks (or an empty file) simply yields all zeros
        for peak_start, peak_end, peak_value in peaks_by_chrom.get(f"chr{j}", ()):
            first_bin = peak_start // 1000
            last_bin = peak_end // 1000
            # Slicing clips at the end of the array, so bins past the chromosome are skipped
            bin_scores[first_bin:last_bin + 1] += peak_value  # Summing the values; you can choose another method

        # Write the result for this chromosome to a text file
        if saveQ:
            output_file = f"data/DNaseIHS_files/DNaseIHS_{cell_line}_chr[{j}].txt"
            np.savetxt(output_file, bin_scores, fmt='%.6f')


##### ChIP-seq files #####

In [12]:
def datagenBigWig_chip(cell_line, saveQ=False):
    """Average a ChIP-seq bigWig signal into 1 kb bins for chr1..chr22.

    For each chromosome, every 1 kb bin receives the NaN-ignoring mean of the
    signal in that bin (0 when the bin is entirely NaN or cannot be read).
    Bins starting beyond the chromosome length reported by the file stay 0.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/chip-seq_files/chip_seq_{cell_line}_chr[{j}].txt``.
    """
    # Path to the bigWig file. Forward slashes work on POSIX and Windows alike
    # (the previous backslash path only resolved on Windows) and match the output paths.
    bigwig_file = "data/chip-seq_files/wgEncodeUwHistoneK562H3k4me3StdRawRep1.bigWig"

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    bin_size = 1000  # bin width in bp (1 kb)

    # Open the bigWig file using pybigtools; reader and file are released even on error
    with open(bigwig_file, 'rb') as handle:
        bw = pybigtools.open(handle)
        try:
            # Process each chromosome
            for j, size in enumerate(chr_lengths, start=1):
                chrom_name = f"chr{j}"
                chrom_bp = bw.chroms(chrom_name)  # chromosome length according to the file, queried once

                # One score per 1 kb bin
                bin_scores = np.zeros(size)

                for idx in range(size):
                    start = idx * bin_size
                    if start >= chrom_bp:
                        break  # No more data to process

                    end = min(start + bin_size, chrom_bp)  # Do not exceed the chromosome length

                    # Best effort: a bin that cannot be read is reported and scored 0
                    try:
                        signal_values = bw.values(chrom_name, start, end)
                        if np.isnan(signal_values).all():
                            bin_scores[idx] = 0
                        else:
                            bin_scores[idx] = np.nanmean(signal_values)
                    except Exception as e:
                        print(f"Error processing region {start}-{end} on {j}: {e}")
                        bin_scores[idx] = 0  # Default to 0 if an error occurs

                # Write the result for this chromosome to a text file
                if saveQ:
                    output_file = f"data/chip-seq_files/chip_seq_{cell_line}_chr[{j}].txt"
                    np.savetxt(output_file, bin_scores, fmt='%.6f')
        finally:
            # Close the bigWig file
            bw.close()

##### Promoter files #####

In [13]:
def datagenBigWig_prom(cell_line, saveQ=False):
    """Average an H3K4me3 (promoter) bigWig signal into 1 kb bins for chr1..chr22.

    For each chromosome, every 1 kb bin receives the NaN-ignoring mean of the
    signal in that bin (0 when the bin is entirely NaN or cannot be read).
    Bins starting beyond the chromosome length reported by the file stay 0.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/promoter_files/prom_{cell_line}_chr[{j}].txt``.
    """
    # Path to the bigWig file. Forward slashes work on POSIX and Windows alike
    # (the previous backslash path only resolved on Windows) and match the output paths.
    bigwig_file = "data/promoter_files/GSM733682_hg19_wgEncodeBroadHistoneHelas3H3k4me3StdSig.bigWig"

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    bin_size = 1000  # bin width in bp (1 kb)

    # Open the bigWig file using pybigtools; reader and file are released even on error
    with open(bigwig_file, 'rb') as handle:
        bw = pybigtools.open(handle)
        try:
            # Process each chromosome
            for j, size in enumerate(chr_lengths, start=1):
                chrom_name = f"chr{j}"
                chrom_bp = bw.chroms(chrom_name)  # chromosome length according to the file, queried once

                # One score per 1 kb bin
                bin_scores = np.zeros(size)

                for idx in range(size):
                    start = idx * bin_size
                    if start >= chrom_bp:
                        break  # No more data to process

                    end = min(start + bin_size, chrom_bp)  # Do not exceed the chromosome length

                    # Best effort: a bin that cannot be read is reported and scored 0
                    try:
                        signal_values = bw.values(chrom_name, start, end)
                        if np.isnan(signal_values).all():
                            bin_scores[idx] = 0
                        else:
                            bin_scores[idx] = np.nanmean(signal_values)
                    except Exception as e:
                        print(f"Error processing region {start}-{end} on {j}: {e}")
                        bin_scores[idx] = 0  # Default to 0 if an error occurs

                # Write the result for this chromosome to a text file
                if saveQ:
                    output_file = f"data/promoter_files/prom_{cell_line}_chr[{j}].txt"
                    np.savetxt(output_file, bin_scores, fmt='%.6f')
        finally:
            # Close the bigWig file
            bw.close()

In [14]:
def datagenBigWig_prom_smooth(cell_line, saveQ=False):
    """Gaussian-smooth an H3K4me3 bigWig signal and average it into 1 kb bins.

    The per-base signal of each listed chromosome is smoothed with a 1-D
    Gaussian (sigma = 50) and then averaged, ignoring NaNs, over consecutive
    1 kb windows; a final shorter window covers any remainder.

    Parameters
    ----------
    cell_line : str
        Used only in the output file names (the input file is fixed below).
    saveQ : bool, optional
        If True, write one file per chromosome to
        ``data/promoter_files/prom_{cell_line}_chr[{j}]_smooth.txt``.

    Notes
    -----
    Only the chromosomes listed in ``chr_lengths`` are processed (currently
    just chr1); the listed lengths themselves are not used, the length comes
    from the bigWig file.
    NOTE(review): the filter pads with NaN (``cval=np.nan``), so positions
    near the chromosome ends and near NaN gaps become NaN before averaging —
    confirm this is intended.
    """
    # Path to the bigWig file
    bigwig_file = r"data/promoter_files/wgEncodeBroadHistoneHuvecH3k4me3StdSig.bigWig"

    # Chromosome lengths provided (in kb)
    chr_lengths = [249251]#, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535,
                   #135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    # Define Gaussian smoothing parameters
    sigma = 50  # Adjust the sigma for the Gaussian smoothing

    # Open the bigWig file using pybigtools; reader and file are released even on error
    with open(bigwig_file, 'rb') as handle:
        bw = pybigtools.open(handle)
        try:
            # Process each chromosome
            for j in range(1, len(chr_lengths) + 1):
                chrom = f"chr{j}"

                # Chromosome length in bp according to the file
                chrom_length = bw.chroms(chrom)

                # Get the signal values for the entire chromosome as a numpy array
                signal_values = np.array(bw.values(chrom, 0, chrom_length))

                # Apply Gaussian smoothing to the entire chromosome
                smoothed_values = gaussian_filter1d(signal_values, sigma=sigma, mode='constant', cval=np.nan)

                # Create 1kb bins for this chromosome, and handle the remainder at the end
                num_bins = chrom_length // 1000
                remainder = chrom_length % 1000

                # Initialize an array to store the score for each 1kb bin plus the remainder
                bin_scores = np.zeros(num_bins + (1 if remainder > 0 else 0))

                # Average the smoothed values over each 1 kb window
                for i in range(num_bins):
                    start = i * 1000
                    end = start + 1000
                    bin_scores[i] = np.nanmean(smoothed_values[start:end])

                # Handle the remainder, if there is one
                if remainder > 0:
                    bin_scores[-1] = np.nanmean(smoothed_values[num_bins * 1000:])

                # Write the result for this chromosome to a text file
                if saveQ:
                    output_file = f"data/promoter_files/prom_{cell_line}_chr[{j}]_smooth.txt"
                    np.savetxt(output_file, bin_scores, fmt='%.6f')
        finally:
            # Close the bigWig file
            bw.close()


##### Fork speed files #####

In [15]:
def datagen_speed(cell_line, chr_number, timedata, timesim):
    """Write the inverse absolute slope of the data and simulated timing profiles.

    For each profile the absolute difference between consecutive entries is
    taken (the first difference is 0), and its reciprocal is saved one value
    per line. Flat stretches (zero slope) are written as 0.
    """

    def _inverse_slope(profile):
        # Discrete derivative; prepending the first value makes the first slope 0
        slope = np.abs(np.diff(profile, prepend=profile[0]))
        # A zero slope becomes infinity so that its reciprocal is 0 instead of a division by zero
        slope[slope == 0] = np.inf
        return 1 / slope

    # Both tracks are computed before anything is written
    inverse_data = _inverse_slope(timedata)
    inverse_sim = _inverse_slope(timesim)

    # Save each track to its own text file, one value per line
    np.savetxt(f'data/fork_speed/speed_data_{cell_line}_chr[{chr_number}].txt', inverse_data, fmt='%f')
    np.savetxt(f'data/fork_speed/speed_sim_{cell_line}_chr[{chr_number}].txt', inverse_sim, fmt='%f')

#### Fitting ####

In [16]:
def fitfunction(list, v0, st0, fit_step, maxiter, err_threshold, saveQ, info):
    """Iteratively fit per-site firing rates to a timing profile.

    Starting from ``pi / (4 * v0) * t**-2`` for every site, each iteration
    multiplies the rates by ``(simulated / observed) ** fit_step`` (clamped
    below at ``10 ** -err_threshold``) and recomputes the simulated profile
    with the closed-form model in ``fp``.

    NOTE: a later definition of ``fitfunction`` in this file (with extra
    ``cell_line``/``chr_number`` arguments) replaces this one when both
    definitions are executed in order.

    Args:
        list: Observed timing profile, one value per site. The name shadows
            the builtin but is kept so keyword callers keep working.
        v0: Speed parameter ``v`` of the model (presumably the fork speed).
        st0: Window divisor; neighbours up to ``len(list) // st0`` sites away
            on each side contribute to a site's simulated value.
        fit_step: Exponent applied to the simulated/observed ratio.
        maxiter: Number of update iterations.
        err_threshold: Rates are never allowed below ``10 ** -err_threshold``.
        saveQ: When true, write the rates and simulated profile to
            ``data/whole-genome_firing_rates/fire_rates_{info}.txt`` and
            ``data/whole-genome_timing_simulation/time_sim_{info}.txt``.
        info: Suffix used in the output file names.

    Returns:
        ``[fire_rates, time_sim]`` where ``fire_rates`` is a list of strings
        formatted with 30 decimals and ``time_sim`` is a NumPy array.
    """
    timel = np.asarray(list, dtype=float)
    n_sites = len(timel)
    v = v0
    st = st0
    rate_floor = 10**(-err_threshold)
    x00 = (math.pi / (4 * v)) * timel ** (-2.0)

    # The `lm` sites at each end are excluded from the error. Profiles too
    # short to trim are scored in full instead of dividing by an empty slice.
    lm = 1000
    trim = lm if n_sites > 2 * lm else 0
    scored = slice(trim, n_sites - trim)

    def mse(y_true, y_pred):
        return float(np.mean((y_true - y_pred) ** 2))

    def fast_roll_add(dst, src, shift):
        # In-place equivalent of ``dst += np.roll(src, shift)``.
        dst[shift:] += src[:-shift]
        dst[:shift] += src[-shift:]

    # Expected replication time computation (replaces bcs)
    def fp(x, L, v):
        n = len(x)
        y = np.zeros(n)

        last_exp_2_raw = np.zeros(n)
        last_exp_2 = np.ones(n)
        # `unitary` is the wrap-around sum of x over the 2k+1 sites centred
        # on each site; it grows by one neighbour per side per step.
        unitary = x.copy()
        for k in range(L+1):
            if k != 0:
                fast_roll_add(unitary, x, k)
                fast_roll_add(unitary, x, -k)
            exp_1 = last_exp_2
            exp_2_raw = last_exp_2_raw + unitary / v
            exp_2 = np.exp(-exp_2_raw)

            # Add this window's weighted contribution to the total
            y += (exp_1 - exp_2) / unitary

            last_exp_2_raw = exp_2_raw
            last_exp_2 = exp_2
        return y

    def cfit(time, lst, x0, fit_step):
        # Multiplicative update, clamped so that rates never drop below the floor.
        return np.maximum(x0 * (lst / time) ** fit_step, rate_floor)

    xs = x00
    ys = fp(xs, n_sites // st, v)

    for j in range(maxiter):
        xs = cfit(timel, ys, xs, fit_step)
        ys = fp(xs, n_sites // st, v)

        new_err = mse(timel[scored], ys[scored])
        print(str(j+1) + '/' + str(maxiter) + ' err: ' + str('{:.30f}'.format(new_err)), end="\r")

    fire_rates = ['{:.30f}'.format(i) for i in xs]
    time_sim = ys

    if saveQ:
        with open(r'data/whole-genome_firing_rates/fire_rates_'+info+'.txt', 'w') as f:
            for rate in fire_rates:
                f.write(rate + '\n')
        np.savetxt(r'data/whole-genome_timing_simulation/time_sim_'+info+'.txt', time_sim, fmt='%.30f')

    return [fire_rates, time_sim]

In [17]:
def fitfunction(list, v0, st0, fit_step, maxiter, err_threshold, saveQ, info, cell_line='', chr_number=''):
    """Iteratively fit per-site firing rates to a timing profile, logging the MSE.

    Starting from ``pi / (4 * v0) * t**-2`` for every site, each iteration
    multiplies the rates by ``(simulated / observed) ** fit_step`` (clamped
    below at ``10 ** -err_threshold``) and recomputes the simulated profile
    with the closed-form model in ``fp``. The mean squared error before the
    first iteration and after every iteration is appended, one value per
    line, to ``data/whole-genome_mse/mse_{cell_line}_chr[{chr_number}].txt``.

    Args:
        list: Observed timing profile, one value per site. The name shadows
            the builtin but is kept so keyword callers keep working.
        v0: Speed parameter ``v`` of the model (presumably the fork speed).
        st0: Window divisor; neighbours up to ``len(list) // st0`` sites away
            on each side contribute to a site's simulated value.
        fit_step: Exponent applied to the simulated/observed ratio.
        maxiter: Number of update iterations.
        err_threshold: Rates are never allowed below ``10 ** -err_threshold``.
        saveQ: When true, write the rates and simulated profile to
            ``data/whole-genome_firing_rates/fire_rates_{info}.txt`` and
            ``data/whole-genome_timing_simulation/time_sim_{info}.txt``.
        info: Suffix used in the output file names.
        cell_line: Label used in the MSE log file name.
        chr_number: Chromosome label used in the MSE log file name.

    Returns:
        ``[fire_rates, time_sim]`` where ``fire_rates`` is a list of strings
        formatted with 30 decimals and ``time_sim`` is a NumPy array.
    """
    timel = np.asarray(list, dtype=float)
    n_sites = len(timel)
    v = v0
    st = st0
    rate_floor = 10**(-err_threshold)
    x00 = (math.pi / (4 * v)) * timel ** (-2.0)

    # The `lm` sites at each end are excluded from the error. Profiles too
    # short to trim are scored in full instead of dividing by an empty slice.
    lm = 1000
    trim = lm if n_sites > 2 * lm else 0
    scored = slice(trim, n_sites - trim)

    def mse(y_true, y_pred):
        return float(np.mean((y_true - y_pred) ** 2))

    def fast_roll_add(dst, src, shift):
        # In-place equivalent of ``dst += np.roll(src, shift)``.
        dst[shift:] += src[:-shift]
        dst[:shift] += src[-shift:]

    # Expected replication time computation (replaces bcs)
    def fp(x, L, v):
        n = len(x)
        y = np.zeros(n)

        last_exp_2_raw = np.zeros(n)
        last_exp_2 = np.ones(n)
        # `unitary` is the wrap-around sum of x over the 2k+1 sites centred
        # on each site; it grows by one neighbour per side per step.
        unitary = x.copy()
        for k in range(L+1):
            if k != 0:
                fast_roll_add(unitary, x, k)
                fast_roll_add(unitary, x, -k)
            exp_1 = last_exp_2
            exp_2_raw = last_exp_2_raw + unitary / v
            exp_2 = np.exp(-exp_2_raw)

            # Add this window's weighted contribution to the total
            y += (exp_1 - exp_2) / unitary

            last_exp_2_raw = exp_2_raw
            last_exp_2 = exp_2
        return y

    def cfit(time, lst, x0, fit_step):
        # Multiplicative update, clamped so that rates never drop below the floor.
        return np.maximum(x0 * (lst / time) ** fit_step, rate_floor)

    xs = x00
    ys = fp(xs, n_sites // st, v)
    new_err0 = mse(timel[scored], ys[scored])

    # The log is opened in append mode, so repeated runs accumulate in one file.
    os.makedirs('data/whole-genome_mse', exist_ok=True)
    with open(f'data/whole-genome_mse/mse_{cell_line}_chr[{chr_number}].txt', 'a') as mse_file:
        # Error of the initial guess, before any update
        mse_file.write(f'{new_err0:.30f}\n')

        for j in range(maxiter):
            xs = cfit(timel, ys, xs, fit_step)
            ys = fp(xs, n_sites // st, v)

            new_err = mse(timel[scored], ys[scored])
            print(str(j+1) + '/' + str(maxiter) + ' err: ' + str('{:.30f}'.format(new_err)), end="\r")

            mse_file.write(f'{new_err:.30f}\n')

    fire_rates = ['{:.30f}'.format(i) for i in xs]
    time_sim = ys

    if saveQ:
        with open(r'data/whole-genome_firing_rates/fire_rates_'+info+'.txt', 'w') as f:
            for rate in fire_rates:
                f.write(rate + '\n')
        np.savetxt(r'data/whole-genome_timing_simulation/time_sim_'+info+'.txt', time_sim, fmt='%.30f')

    return [fire_rates, time_sim]


#### Error generation ####

In [18]:
def compute_squared_error(time_data, time_simulation):
    """Return the squared difference between data and simulation.

    Works element-wise when both arguments are NumPy arrays (or anything
    else supporting ``-`` and ``**``).
    """
    residual = time_data - time_simulation
    return residual ** 2

def process_files_and_compute_squared_error(cell_lines, chr_numbers, base_path):
    """Write per-site squared-error files for every cell line / chromosome pair.

    For each pair, the timing data and timing simulation text files under
    ``base_path`` are loaded, their squared difference is computed with
    ``compute_squared_error`` and the result is saved (30 decimals) to
    ``whole-genome_error/error_{cell_line}_chr[{chr_number}].txt``.
    """
    def read_profile(relative_path):
        return np.loadtxt(os.path.join(base_path, relative_path), dtype=float)

    for cell_line in cell_lines:
        for chr_number in chr_numbers:
            observed = read_profile(f'whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}].txt')
            simulated = read_profile(f'whole-genome_timing_simulation/time_sim_{cell_line}_chr[{chr_number}].txt')
            destination = os.path.join(base_path, f'whole-genome_error/error_{cell_line}_chr[{chr_number}].txt')

            np.savetxt(destination, compute_squared_error(observed, simulated), fmt='%.30f')

#### bedgraph file generation ####

In [19]:
# Chromosome sizes (in bp) for the hg18, hg19 and hg38 genome builds
def chr_size_fun(genome_build):
    """Return a fresh ``{chromosome name: size}`` dict for a genome build.

    Chromosome names are the strings '1'..'22', 'X' and 'Y'. Supported builds
    are 'hg18', 'hg19' and 'hg38'; any other value yields ``None``.
    """
    names = [str(number) for number in range(1, 23)] + ['X', 'Y']

    # One row per build, sizes listed in the same order as `names`.
    builds = (
        ('hg18', (
            247249719, 242951149, 199501827, 191273063, 180857866,
            170899992, 158821424, 146274826, 140273252, 135374737,
            134452384, 132349534, 114142980, 106368585, 100338915,
            88827254, 78774742, 76117153, 63811651, 62435964,
            46944323, 49691432, 154913754, 57772954,
        )),
        ('hg19', (
            249250621, 243199373, 198022430, 191154276, 180915260,
            171115067, 159138663, 146364022, 141213431, 135534747,
            135006516, 133851895, 115169878, 107349540, 102531392,
            90354753, 81195210, 78077248, 59128983, 63025520,
            48129895, 51304566, 155270560, 59373566,
        )),
        ('hg38', (
            248956422, 242193529, 198295559, 190214555, 181538259,
            170805979, 159345973, 145138636, 138394717, 133797422,
            135086622, 133275309, 114364328, 107043718, 101991189,
            90338345, 83257441, 80373285, 58617616, 64444167,
            46709983, 50818468, 156040895, 57227415,
        )),
    )

    for build, sizes in builds:
        if genome_build == build:
            return dict(zip(names, sizes))
    return None

def txt_to_bedgraph(cell_line, data_type='error', genome_build='hg19'):
    """Merge per-chromosome text files of one cell line into a bedGraph file.

    Each input file holds one value per line; line ``i`` becomes the interval
    ``[i * 1000, min((i + 1) * 1000, chromosome size))``. Lines that would
    start beyond the chromosome end are ignored, and chromosomes without an
    input file are skipped.

    Args:
        cell_line: Label used in the input and output file names.
        data_type: ``'error'`` or ``'fire_rates'``; selects the data folder.
        genome_build: Build name understood by ``chr_size_fun``.

    Raises:
        ValueError: If ``data_type`` or ``genome_build`` is not supported.
            Both are checked before the output file is opened, so a bad
            argument never truncates an existing bedGraph file.
    """
    layouts = {
        'error': ('data/whole-genome_error', 'error'),
        'fire_rates': ('data/whole-genome_firing_rates', 'fire_rates'),
    }
    if data_type not in layouts:
        raise ValueError(f"unsupported data_type {data_type!r}; expected one of {sorted(layouts)}")
    folder, prefix = layouts[data_type]

    # Look the size table up once rather than once per chromosome.
    chr_sizes = chr_size_fun(genome_build)
    if chr_sizes is None:
        raise ValueError(f"unsupported genome_build {genome_build!r}")

    output_file = f'{folder}/bedgraph_files/{prefix}_{cell_line}.bedgraph'

    with open(output_file, 'w') as bedgraph:
        # Write the header line
        bedgraph.write('track type=bedGraph\n')

        for chr_number in [str(number) for number in range(1, 23)] + ['X', 'Y']:
            input_file = f'{folder}/{prefix}_{cell_line}_chr[{chr_number}].txt'
            if not os.path.isfile(input_file):
                continue
            chr_size = chr_sizes[chr_number]

            with open(input_file, 'r') as infile:
                for position, value in enumerate(infile):
                    start = position * 1000
                    if start >= chr_size:
                        break
                    end = min(start + 1000, chr_size)
                    bedgraph.write(f'chr{chr_number}\t{start}\t{end}\t{value.strip()}\n')

    print(f'BEDGRAPH file created: {output_file}', end="\r")

#### BCS file generation ####

In [20]:
def bcs_gen(cell_line, chr_number, chrpos_min, chrpos_max, fork_speed, fire_rates, resolution):
    """Generate a BCS script for one genomic interval from the template.

    Reads ``code/DNAReplication.bc``, fills in the chromosome length, fast
    rate and fork velocity, replaces the process list that follows the
    ``// PROCESS INITIATION`` marker with one ``ORI[position,rate]`` term per
    origin (joined with ``||``), and writes the result to
    ``code/bcs_scripts/DNAReplication_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.bc``.

    Args:
        cell_line: Label used in the output file name.
        chr_number: Chromosome label used in the output file name.
        chrpos_min: Interval start; ``chrpos_max - chrpos_min`` is written as ``L``.
        chrpos_max: Interval end.
        fork_speed: Value written as ``v``.
        fire_rates: Indexable of firing rates with at least
            ``int((chrpos_max - chrpos_min) * resolution / 1000)`` entries.
        resolution: Scales the number of origins (see above).

    Raises:
        ValueError: If a required marker line is missing from the template.
            The template's last line must be exactly ``// END`` without a
            trailing newline.
    """
    bcfile = 'code/DNAReplication.bc'
    new_bcfile = f'code/bcs_scripts/DNAReplication_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.bc'

    chrLength = chrpos_max - chrpos_min
    orign = int(chrLength * resolution / 1000)
    fast = 100000

    with open(bcfile, 'r') as file:
        bcsfile = file.readlines()
    bcsfile[bcsfile.index("// Chromosome length\n")+1] = "L = "+str(chrLength)+";\n"
    bcsfile[bcsfile.index("// Fast rate\n")+1] = "fast = "+str(fast)+";\n"
    bcsfile[bcsfile.index("// Fork velocity\n")+1] = "v = "+str(fork_speed)+";\n"

    # Origins are spread evenly over 1..chrLength and floored to integers.
    oriarr = np.array([
        'ORI[' + str(floor(position)) + ',' + '{:.30f}'.format(fire_rates[i1]) + ']'
        for i1, position in enumerate(np.linspace(1, chrLength, num=orign))
    ])

    # The process line is derived from NumPy's text rendering of the array
    # (which keeps the original line wrapping). Force the full array to be
    # printed: with the default print threshold, more than 1000 origins
    # would be abbreviated to "..." and silently dropped from the script.
    with np.printoptions(threshold=sys.maxsize):
        rendered = str(oriarr)
    process_line = rendered.replace('"','').replace("'",'').replace(" "," || ")[1:-1]+';\n'

    # Drop the template's old process lines: everything after the marker up
    # to, but excluding, the two lines just before "// END". The first
    # surviving line after the marker is then overwritten with the new list.
    init_index = bcsfile.index("// PROCESS INITIATION\n")
    end_index = bcsfile.index("// END")
    stale = range(init_index + 1, end_index - 2)
    script = [line for number, line in enumerate(bcsfile) if number not in stale]
    script[init_index + 1] = process_line

    with open(new_bcfile, 'w') as file:
        file.writelines(script)

#### BCS simulation output ####

##### Replication timing, fork directionality and origins #####

In [21]:
def process_bcs_output(cell_line, chr_number, chrpos_min, chrpos_max, fork_speed, resolution, scale_factor, sim_number, compute_replication_time, compute_fork_directionality, compute_origin_positions):
    """Summarise a BCS simulation output file for one genomic interval.

    The input is
    ``data/bcs_output/bcs_output_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.simulation.bcs``.
    Lines starting with ``>`` open a new simulation; every other line is
    tab-separated with the time in column 0, the event label in column 2 and
    the position in column 4. Within a simulation only the first ``FL``/``FR``
    event at a position is counted. Reading stops once ``sim_number + 1``
    header lines have been seen.

    Outputs (written under ``data/`` when the matching flag is true):
        * mean replication time per position (sum of event times / ``sim_number``),
        * mean fork directionality per position (+1 for ``FR``, -1 for ``FL``,
          summed and divided by ``sim_number``),
        * origin positions, one line per simulation that fired an origin.

    ``fork_speed``, ``resolution`` and ``scale_factor`` are accepted for
    interface compatibility but are not used here.
    """
    file_path = f'data/bcs_output/bcs_output_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.simulation.bcs'

    n_positions = chrpos_max - chrpos_min
    DNA_replicationtime = [0.0] * n_positions if compute_replication_time else None
    DNA_forkdirectionality = [0.0] * n_positions if compute_fork_directionality else None
    DNA_originpositions = [] if compute_origin_positions else None  # One list of origins per simulation
    current_origins = []
    sim_iteration = 0
    fork_direction = {"FL": -1, "FR": 1}

    # Positions already replicated in the current simulation. A set keeps the
    # membership test O(1); it is defined up front so a file that lacks a
    # leading header line does not fail with a NameError.
    already_done = set()

    with open(file_path) as f:
        for line in f:
            if sim_iteration == sim_number + 1:
                break
            if line[0] == '>':
                already_done = set()
                if compute_origin_positions and current_origins:
                    DNA_originpositions.append(current_origins)
                current_origins = []
                print(sim_iteration, end="\r")
                sim_iteration += 1
                continue
            splitLine = line.split('\t')
            event = splitLine[2]
            if compute_origin_positions and event == "ORI":
                current_origins.append(int(splitLine[4]))
            if event in fork_direction:
                pos = int(splitLine[4]) - 1  # positions in the file are 1-based
                time = float(splitLine[0])
                if pos not in already_done:
                    if compute_replication_time:
                        DNA_replicationtime[pos] += time
                    if compute_fork_directionality:
                        DNA_forkdirectionality[pos] += fork_direction[event]
                    already_done.add(pos)

    # The last simulation has no following header to flush its origins
    if compute_origin_positions and current_origins:
        DNA_originpositions.append(current_origins)

    # Average the results over the number of simulations
    if compute_replication_time:
        DNA_replicationtime = [total / float(sim_number) for total in DNA_replicationtime]

    if compute_fork_directionality:
        DNA_forkdirectionality = [total / float(sim_number) for total in DNA_forkdirectionality]

    # Define file paths for saving the results
    base_path = 'data'
    replication_time_path = os.path.join(base_path, 'whole-genome_timing_bcs', f'time_bcs_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt')
    fork_directionality_path = os.path.join(base_path, 'whole-genome_fork_directionality', f'fork_directionality_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt')
    origin_positions_path = os.path.join(base_path, 'whole-genome_origins', f'origin_positions_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt')

    # Create directories if they do not exist
    for path in (replication_time_path, fork_directionality_path, origin_positions_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # Save the results to text files
    if compute_replication_time:
        np.savetxt(replication_time_path, DNA_replicationtime, fmt='%.6f')

    if compute_fork_directionality:
        np.savetxt(fork_directionality_path, DNA_forkdirectionality, fmt='%.6f')

    if compute_origin_positions:
        with open(origin_positions_path, 'w') as f:
            for origins in DNA_originpositions:
                f.write(' '.join(map(str, origins)) + '\n')

In [22]:
def process_intervals(cell_lines, chr_numbers, fork_speed=1.4, resolution=1000, scale_factor=6, sim_number=5, compute_replication_time=True, compute_fork_directionality=True, compute_origin_positions=True, interval=None):
    """Run ``process_bcs_output`` over every interval of every chromosome.

    When ``interval`` is given, only ``(interval[0], interval[1])`` is
    processed for each cell line / chromosome pair; otherwise the chromosome
    (length taken from the module-level ``chr_lengths``) is tiled into
    windows of 10000 positions, the last one clipped to the chromosome end.
    """
    # Options that are the same for every call
    shared_options = dict(
        fork_speed=fork_speed,
        resolution=resolution,
        scale_factor=scale_factor,
        sim_number=sim_number,
        compute_replication_time=compute_replication_time,
        compute_fork_directionality=compute_fork_directionality,
        compute_origin_positions=compute_origin_positions,
    )

    for cell_line in cell_lines:
        for chr_number in chr_numbers:
            chr_length = chr_lengths[chr_number - 1]
            if interval:
                windows = [(interval[0], interval[1])]
            else:
                windows = [
                    (lower, min(lower + 10000, chr_length))
                    for lower in range(0, chr_length, 10000)
                ]
            for lower, upper in windows:
                process_bcs_output(
                    cell_line=cell_line,
                    chr_number=chr_number,
                    chrpos_min=lower,
                    chrpos_max=upper,
                    **shared_options
                )

##### Interorigin distances #####

In [23]:
def compute_interorigin_distances(cell_line, chr_number, chrpos_min, chrpos_max):
    """Compute inter-origin distances for each simulation of one interval.

    Reads ``data/whole-genome_origins/origin_positions_…txt`` (one simulation
    per line, whitespace-separated integer positions), sorts each line's
    positions, takes consecutive differences and writes them to
    ``data/whole-genome_interorigin_distances/iod_…txt`` as one Python-style
    list per line (``[]`` for a simulation with fewer than two origins).

    Raises:
        FileNotFoundError: If the origins file does not exist.
    """
    base_path = 'data'

    origins_path = os.path.join(base_path, 'whole-genome_origins', f'origin_positions_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt')
    if not os.path.exists(origins_path):
        raise FileNotFoundError(f"Origins data not found at {origins_path}")

    with open(origins_path, 'r') as f:
        origins_data = [
            [int(token) for token in line.strip().strip('[]').split()]
            for line in f
        ]

    # .tolist() converts NumPy scalars to plain Python ints. Formatting a list
    # of NumPy scalars would write "np.int64(5)" under NumPy >= 2, which the
    # float-based loader of these files cannot parse.
    interorigin_distances = [np.diff(sorted(origins)).tolist() for origins in origins_data]

    iod_path = os.path.join(base_path, 'whole-genome_interorigin_distances', f'iod_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt')
    os.makedirs(os.path.dirname(iod_path), exist_ok=True)

    with open(iod_path, 'w') as f:
        for distances in interorigin_distances:
            f.write(f"{distances}\n")

def compute_interorigin_intervals(cell_lines, chr_numbers, interval=None):
    """Run ``compute_interorigin_distances`` over every interval of every chromosome.

    With ``interval`` set, only ``(interval[0], interval[1])`` is processed
    per cell line / chromosome pair; otherwise each chromosome (length from
    the module-level ``chr_lengths``) is tiled into windows of 10000
    positions, the last one clipped to the chromosome end.
    """
    for cell_line in cell_lines:
        for chr_number in chr_numbers:
            chr_length = chr_lengths[chr_number - 1]
            if interval:
                windows = [(interval[0], interval[1])]
            else:
                windows = [
                    (lower, min(lower + 10000, chr_length))
                    for lower in range(0, chr_length, 10000)
                ]
            for lower, upper in windows:
                compute_interorigin_distances(
                    cell_line=cell_line,
                    chr_number=chr_number,
                    chrpos_min=lower,
                    chrpos_max=upper,
                )

def average_iod_data(cell_lines, chr_numbers, factor_min=5, show_per_cell_line=False):
    """Collect inter-origin distances across chromosomes and cell lines.

    Loads the "iod" metric for every 10000-position window of every requested
    chromosome (lengths from the module-level ``chr_lengths``), keeping only
    values of at least ``factor_min``.

    Returns:
        A list with one list of distances per cell line when
        ``show_per_cell_line`` is true, otherwise a one-element list holding
        all distances pooled together.
    """
    pooled = []
    grouped = []

    for cell_line in cell_lines:
        collected = []
        for chr_number in chr_numbers:
            chr_length = chr_lengths[chr_number - 1]
            for lower in range(0, chr_length, 10000):
                upper = min(lower + 10000, chr_length)
                collected.extend(
                    load_function_metrics(cell_line, chr_number, "iod", lower, upper, factor_min=factor_min)
                )
        grouped.append(collected)
        pooled.extend(collected)

    return grouped if show_per_cell_line else [pooled]

#### File joining functions ####

In [24]:
def join_files(cell_line, chr_number, datatype, interval=10000, file_prefix='fork_directionality'):
    """Concatenate per-interval result files of a chromosome into one file.

    Chunks are read from
    ``data/whole-genome_{datatype}/{file_prefix}_{cell_line}_chr[{chr_number}]_{start}-{end}.txt``
    for consecutive windows of ``interval`` positions (the last one clipped
    to the chromosome length from the module-level ``chr_lengths``). Missing
    chunks are reported and skipped. The joined data is written to
    ``data/whole-genome_{datatype}/{file_prefix}_{cell_line}_chr[{chr_number}].txt``.

    Args:
        cell_line: Label used in the file names.
        chr_number: 1-based chromosome number.
        datatype: Suffix of the ``data/whole-genome_*`` folder.
        interval: Window length used when the chunks were produced.
        file_prefix: File-name prefix of the chunks. Defaults to
            ``'fork_directionality'``, the only prefix previously supported.
    """
    all_data = []
    max_length = chr_lengths[chr_number - 1]
    folder = f'data/whole-genome_{datatype}'

    for start in range(0, max_length, interval):
        end = min(start + interval, max_length)  # Ensure the end does not exceed max_length
        file_name = f'{folder}/{file_prefix}_{cell_line}_chr[{chr_number}]_{start}-{end}.txt'

        if os.path.exists(file_name):
            # ndmin=1: a chunk holding a single value would otherwise load as
            # a 0-d array, which np.concatenate rejects.
            all_data.append(np.loadtxt(file_name, dtype=float, ndmin=1))
        else:
            print(f'Warning: {file_name} does not exist and will be skipped.')

    # Concatenate all data into a single array
    concatenated_data = np.concatenate(all_data) if all_data else np.array([])

    # Save concatenated data to a new file
    output_file = f'{folder}/{file_prefix}_{cell_line}_chr[{chr_number}].txt'
    np.savetxt(output_file, concatenated_data)

#### Loading functions ####

In [25]:
def load_function(cell_line, chr_number, load_type, replace_missingQ=True):
    """Load a per-chromosome data track as a float array.

    Args:
        cell_line: Label used in the file names.
        chr_number: Chromosome label used in the file names.
        load_type: One of 'time_data', 'time_sim', 'error', 'fire_rates',
            'forkd', 'rna_seq', 'gro_seq', 'DNaseIHS', 'chip_seq', 'prom',
            'prom_smooth', 'coding', 'speed_data', 'speed_sim'.
        replace_missingQ: When true, entries at the indices listed in
            ``data/whole-genome_missing_data/missing_data_{cell_line}_chr[{chr_number}].txt``
            are set to NaN. A missing or empty index file means that nothing
            is masked.

    Returns:
        The loaded array, or an empty integer array when the data file does
        not exist.

    Raises:
        ValueError: If ``load_type`` is not one of the supported names.
    """
    path_templates = {
        'time_data': 'data/whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}].txt',
        'time_sim': 'data/whole-genome_timing_simulation/time_sim_{cell_line}_chr[{chr_number}].txt',
        'error': 'data/whole-genome_error/error_{cell_line}_chr[{chr_number}].txt',
        'fire_rates': 'data/whole-genome_firing_rates/fire_rates_{cell_line}_chr[{chr_number}].txt',
        'forkd': 'data/whole-genome_fork_directionality/fork_directionality_{cell_line}_chr[{chr_number}].txt',
        'rna_seq': 'data/rna-seq_files/rna_seq_{cell_line}_chr[{chr_number}].txt',
        'gro_seq': 'data/gro-seq_files/gro_seq_{cell_line}_chr[{chr_number}].txt',
        'DNaseIHS': 'data/DNaseIHS_files/DNaseIHS_{cell_line}_chr[{chr_number}].txt',
        'chip_seq': 'data/chip-seq_files/chip_seq_{cell_line}_chr[{chr_number}].txt',
        'prom': 'data/promoter_files/prom_{cell_line}_chr[{chr_number}].txt',
        'prom_smooth': 'data/promoter_files/prom_{cell_line}_chr[{chr_number}]_smooth.txt',
        'coding': 'data/genome_regions/coding/coding_chr[{chr_number}]_smooth.txt',
        'speed_data': 'data/fork_speed/speed_data_{cell_line}_chr[{chr_number}].txt',
        'speed_sim': 'data/fork_speed/speed_sim_{cell_line}_chr[{chr_number}].txt',
    }
    if load_type not in path_templates:
        raise ValueError(f"unknown load_type {load_type!r}; expected one of {sorted(path_templates)}")
    file_path = path_templates[load_type].format(cell_line=cell_line, chr_number=chr_number)

    if not os.path.exists(file_path):
        return np.array([], dtype=int)
    data = np.loadtxt(file_path, dtype=float)

    if replace_missingQ:
        missing_data_path = f'data/whole-genome_missing_data/missing_data_{cell_line}_chr[{chr_number}].txt'
        # Same tolerance as load_missing_data: absent or empty means no gaps.
        if os.path.exists(missing_data_path) and os.path.getsize(missing_data_path) > 0:
            missing_positions = np.loadtxt(missing_data_path, dtype=int)
            data[missing_positions] = np.nan
    return data

def load_function_pos(chr_number, load_type, cell_line=None, site_letter='A', base='A', gene_name='WWOX'):
    """Load a list of positions for a genomic region as an integer array.

    Args:
        chr_number: Chromosome label used in the file names.
        load_type: One of 'centromeres', 'telomeres', 'fragile_sites',
            'bases', 'coding', 'gene'.
        cell_line: Unused; accepted for interface compatibility.
        site_letter: Fragile-site letter (used by 'fragile_sites').
        base: Base letter (used by 'bases').
        gene_name: Gene name (used by 'gene').

    Returns:
        The loaded positions, or an empty integer array when the file does
        not exist.

    Raises:
        ValueError: If ``load_type`` is not one of the supported names.
    """
    path_templates = {
        'centromeres': 'data/genome_regions/centromeres/positions_centromeres_chr[{chr_number}].txt',
        'telomeres': 'data/genome_regions/telomeres/positions_telomeres_chr[{chr_number}].txt',
        'fragile_sites': 'data/genome_regions/fragile_sites/positions_fragile_site_{chr_number}{site_letter}.txt',
        'bases': 'data/genome_regions/bases/positions_{base}_chr[{chr_number}].txt',
        'coding': 'data/genome_regions/coding/codingpos_chr[{chr_number}].txt',
        'gene': 'data/genome_regions/genes/{gene_name}_pos.txt',
    }
    if load_type not in path_templates:
        raise ValueError(f"unknown load_type {load_type!r}; expected one of {sorted(path_templates)}")
    file_path = path_templates[load_type].format(
        chr_number=chr_number, site_letter=site_letter, base=base, gene_name=gene_name
    )

    if not os.path.exists(file_path):
        return np.array([], dtype=int)
    return np.loadtxt(file_path, dtype=int)

def load_function_metrics(cell_line, chr_number, load_type, chrpos_min, chrpos_max, factor_min=5, replace_missingQ=True):
    """Load a derived metric for one genomic interval.

    Only ``load_type == "iod"`` is supported: inter-origin distances are read
    from ``data/whole-genome_interorigin_distances/iod_…txt`` (one bracketed,
    comma-separated list per line), values below ``factor_min`` are dropped
    and the rest are returned as one flat float array. Empty lists (``[]``),
    written for simulations with fewer than two origins, contribute nothing.

    For any other ``load_type`` the function returns ``None``.
    ``replace_missingQ`` is accepted for interface compatibility but unused.
    """
    if load_type != "iod":
        return None

    file_path = f'data/whole-genome_interorigin_distances/iod_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt'
    iod_data = []
    with open(file_path, 'r') as file:
        for line in file:
            tokens = line.strip().strip('[]').split(',')
            # Skip blank tokens so "[]" and empty lines do not reach float('').
            iod_values = [float(token) for token in tokens if token.strip()]
            iod_data.extend(iod for iod in iod_values if iod >= factor_min)
    return np.array(iod_data)

def load_missing_data(cell_line, chr_number):
    """Return the missing-data indices for a cell line and chromosome.

    Reads ``data/whole-genome_missing_data/missing_data_{cell_line}_chr[{chr_number}].txt``
    as integers; an absent or zero-byte file yields an empty integer array.
    """
    file_path = f'data/whole-genome_missing_data/missing_data_{cell_line}_chr[{chr_number}].txt'
    has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    if not has_content:
        return np.array([], dtype=int)
    return np.loadtxt(file_path, dtype=int)

#### Genome regions ####

In [26]:
def generate_all_data(cell_line, chr_numbers, load_type1, load_type2):
    """Load two tracks for several chromosomes and chain each across chromosomes.

    Returns:
        ``[values1, values2]``: two flat lists holding, in chromosome order,
        every value that ``load_function`` returned for ``load_type1`` and
        ``load_type2`` respectively.
    """
    first_track = []
    second_track = []
    for chr_number in chr_numbers:
        chunk1 = load_function(cell_line, chr_number, load_type1)
        chunk2 = load_function(cell_line, chr_number, load_type2)
        first_track += list(chunk1)
        second_track += list(chunk2)
    return [first_track, second_track]

In [27]:
# Centromeres and telomeres

def gen_positions_centromere_telomeres(chr_number):
    """Write telomere and centromere positions (in kb) for one autosome.

    Telomeres are taken as the first and last 500 kb of the chromosome;
    centromeres come from a fixed table of (start, end) pairs. Both are saved
    as one integer per line under ``data/genome_regions/telomeres`` and
    ``data/genome_regions/centromeres``.

    Args:
        chr_number: 1-based chromosome number (tables cover 1..22).
    """
    telomere_span_kb = 500  # width of each telomeric region, in kb

    # Centromere (start, end) in kb, indexed by chr_number - 1.
    # NOTE(review): the original code labels this table "hg38", but the
    # chromosome lengths below are the hg19 sizes rounded up to kb — confirm
    # which build the centromere coordinates refer to.
    centromere_bounds_kb = [
        (121535, 124535), (92326, 95326), (90505, 93505), (49660, 52660),
        (46406, 49406), (58830, 61830), (58054, 61054), (43839, 46839),
        (47368, 50368), (39255, 42255), (51644, 54644), (34857, 37857),
        (16000, 19000), (16000, 19000), (17000, 20000), (35336, 38336),
        (22263, 25263), (15461, 18461), (24682, 27682), (26370, 29370),
        (11288, 14288), (13000, 16000),
    ]

    # Chromosome lengths in kb, indexed by chr_number - 1
    lengths_kb = [
        249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365,
        141214, 135535, 135007, 133852, 115170, 107350, 102532, 90355,
        81196, 78078, 59129, 63026, 48130, 51305,
    ]

    index = chr_number - 1
    total_kb = lengths_kb[index]
    cen_start, cen_end = centromere_bounds_kb[index]

    leading = np.arange(0, telomere_span_kb)
    trailing = np.arange(total_kb - telomere_span_kb, total_kb)
    positions_telomeres = np.concatenate([leading, trailing])
    positions_centromeres = np.arange(cen_start, cen_end)

    np.savetxt(f'data/genome_regions/telomeres/positions_telomeres_chr[{chr_number}].txt', positions_telomeres, fmt='%d')
    np.savetxt(f'data/genome_regions/centromeres/positions_centromeres_chr[{chr_number}].txt', positions_centromeres, fmt='%d')

In [28]:
# Fragile sites

def gen_positions_fragile_sites(chr_number, site_letter):
    """Write the kb positions covered by one fragile site to a text file.

    The site is looked up in ``humCFS-fragile_sites.csv`` (no header row):
    column ``chr_number - 1`` selects the chromosome and the row is given by
    the letter's offset from 'A'. The cell holds a ``min-max`` range in base
    pairs; every kb index from ``min // 1000`` up to (excluding)
    ``max // 1000`` is saved, one per line, to
    ``positions_fragile_site_{chr_number}{site_letter}.txt``.

    Nothing is written when the chromosome column, the letter row or the
    cell itself is absent; only a missing column prints a warning.
    """
    table = pd.read_csv('data/genome_regions/fragile_sites/humCFS-fragile_sites.csv', header=None)
    n_rows, n_cols = table.shape[0], table.shape[1]

    column = chr_number - 1  # chromosome 1 is column 0
    if column >= n_cols:
        print(f"Warning: Chromosome {chr_number} not found in the CSV file.")
        return

    row = ord(site_letter.upper()) - ord('A')  # 'A' is row 0
    if row >= n_rows:
        return

    cell = table.iloc[row, column]
    if pd.isna(cell):
        return

    # "min-max" in base pairs, converted to kb indices
    lower_bp, upper_bp = map(int, cell.split('-'))
    positions_kb = np.arange(lower_bp // 1000, upper_bp // 1000)

    np.savetxt(f'data/genome_regions/fragile_sites/positions_fragile_site_{chr_number}{site_letter}.txt', positions_kb, fmt='%d')

In [29]:
# Base regions

def gen_positions_bases(local_genome_file, chr_lengths):
    """Record, per chromosome, which kb bins start with each base.

    The gzipped FASTA file is scanned for records whose id is ``chrN`` or
    ``N`` with N in 1..22. For each, the base at position ``kb * 1000`` is
    inspected for every ``kb`` below ``chr_lengths[N - 1]`` that lies inside
    the sequence; the kb index is appended to the list of that base
    (case-insensitive, only A/T/G/C are kept). One file per base is written
    to ``data/genome_regions/bases/positions_{base}_chr[{N}].txt``.

    Args:
        local_genome_file: Path to a gzip-compressed FASTA file.
        chr_lengths: Chromosome lengths in kb, indexed by chromosome number - 1.
    """
    bases = ['A', 'T', 'G', 'C']

    output_dir = 'data/genome_regions/bases'
    os.makedirs(output_dir, exist_ok=True)

    with gzip.open(local_genome_file, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Remove a literal "chr" prefix. str.lstrip("chr") would instead
            # strip any leading run of the characters c, h and r.
            name = record.id
            if name.startswith("chr"):
                name = name[3:]
            if not name.isdigit():
                continue
            chr_number = int(name)
            if chr_number not in range(1, 23):  # autosomes 1-22 only
                continue

            # Take every 1000th base first, then upper-case only that sample
            # rather than the whole chromosome.
            n_bins = max(chr_lengths[chr_number - 1], 0)
            sampled = str(record.seq)[::1000][:n_bins].upper()

            base_files = {base: [] for base in bases}
            for kb, base_pair in enumerate(sampled):
                if base_pair in base_files:
                    base_files[base_pair].append(kb)

            for base, locations in base_files.items():
                with open(os.path.join(output_dir, f'positions_{base}_chr[{chr_number}].txt'), 'w') as file:
                    file.writelines(f'{loc}\n' for loc in locations)

In [30]:
# Function to generate coding intervals and exact positions for each chromosome
def generate_coding_intervals_and_positions():
    """Export coding-transcript coverage for chromosomes 1-22 at 1 kb resolution.

    Reads ``data/genome_regions/coding/gencodeV45lift37.bb`` through pybigtools
    and, for each chromosome, writes two files into
    ``data/genome_regions/coding/``:

    * ``coding_chr[N].txt``: one 0/1 flag per kb bin (``chr_lengths[N - 1]``
      bins, taken from the module-level ``chr_lengths``), 1 where a record
      classed ``'coding'`` overlaps the bin.
    * ``codingpos_chr[N].txt``: the kb bin indices covered by each coding
      record, in record order. Duplicates are kept, and these indices are not
      clipped to the chromosome length.

    The transcript class is read from index 17 of the fields that follow the
    start and end of each record.
    """
    coding_dir = r'data/genome_regions/coding/'

    # Open the BigBed source before touching the output directory
    bb = pybigtools.open(r'data/genome_regions/coding/gencodeV45lift37.bb')
    os.makedirs(coding_dir, exist_ok=True)

    for chrom_number in range(1, 23):
        n_bins = chr_lengths[chrom_number - 1]

        coding_mask = [0] * n_bins   # 0/1 flag per kb bin
        coding_bins = []             # kb indices covered by coding records

        for entry in bb.records(f"chr{chrom_number}"):
            extra_fields = entry[2:]
            if extra_fields[17] != 'coding':
                continue

            first_bin = entry[0] // 1000
            end_bin = entry[1] // 1000 + 1

            # The mask is clipped to the chromosome length ...
            for kb in range(first_bin, min(end_bin, n_bins)):
                coding_mask[kb] = 1

            # ... whereas the position list keeps the full bin range.
            coding_bins.extend(range(first_bin, end_bin))

        mask_path = os.path.join(coding_dir, f"coding_chr[{chrom_number}].txt")
        with open(mask_path, 'w') as mask_out:
            mask_out.write("\n".join(str(flag) for flag in coding_mask))

        positions_path = os.path.join(coding_dir, f"codingpos_chr[{chrom_number}].txt")
        with open(positions_path, 'w') as positions_out:
            positions_out.write("\n".join(str(kb) for kb in coding_bins))

def process_coding_file(chr_number):
    """Deduplicate and sort the coding positions of one chromosome, in place.

    Reads ``data/genome_regions/coding/codingpos_chr[{chr_number}].txt`` (one
    integer per line), removes duplicates, sorts ascending and overwrites the
    same file with one value per line, each terminated by a newline.

    Blank or whitespace-only lines are skipped, so an empty trailing line no
    longer raises ``ValueError``. Any other non-integer line still does.
    """
    # Input and output are deliberately the same path: the file is rewritten.
    path = f"data/genome_regions/coding/codingpos_chr[{chr_number}].txt"

    with open(path, 'r') as file:
        unique_positions = {int(line) for line in file if line.strip()}

    with open(path, 'w') as file:
        file.writelines(f"{value}\n" for value in sorted(unique_positions))

#### Replication timing plots

In [31]:
def rt_plotf(cell_line, chr_number, chrpos_min, chrpos_max, scale_factor, file_name, spec_fileQ, saveQ, ax=None, show_ticks=True, show_title=True, simQ=False):
    """Plot measured and simulated replication timing over a chromosome window.

    Parameters
    ----------
    cell_line, chr_number : used to build the input file names.
    chrpos_min, chrpos_max : int
        Window bounds. They slice the whole-chromosome files, or are embedded
        in the file names when a window-specific file is loaded.
    scale_factor : the y axis runs from ``100 * scale_factor`` down to 0.
    file_name : suffix of the saved PDF (``figures/plot_RT_<file_name>.pdf``).
    spec_fileQ : bool
        If true, load window-specific files (``..._{chrpos_min}-{chrpos_max}.txt``);
        otherwise load whole-chromosome files and slice them.
    saveQ : bool
        Save the current figure as a PDF.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : bool
        Toggle ticks and tick labels on the bottom and left axes.
    show_title : bool
        Toggle the title. It also selects the long axis labels over the short ones.
    simQ : bool
        If true, the simulated trace is read from the window-specific
        ``whole-genome_timing_bcs`` file instead of the fitted simulation.

    Side effects
    ------------
    Rebinds the module-level global ``time_data`` to the loaded data trace.
    """
    global time_data

    # Remember whether this call owns the figure: `ax` is rebound below, so it
    # cannot be tested against None at the end of the function.
    owns_figure = ax is None

    # Data loading (Warning: requires saving data in fitting procedure)
    if spec_fileQ:
        time_data = np.loadtxt(f'data/whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt', dtype=float)
    else:
        time_data = np.loadtxt(f'data/whole-genome_timing_data/time_data_{cell_line}_chr[{chr_number}].txt', dtype=float)[chrpos_min:chrpos_max]

    # Only read the simulation file that is actually plotted.
    if simQ:
        time_sim = np.loadtxt(f'data/whole-genome_timing_bcs/time_bcs_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt', dtype=float)
    elif spec_fileQ:
        time_sim = np.loadtxt(f'data/whole-genome_timing_simulation/time_sim_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt', dtype=float)
    else:
        time_sim = np.loadtxt(f'data/whole-genome_timing_simulation/time_sim_{cell_line}_chr[{chr_number}].txt', dtype=float)[chrpos_min:chrpos_max]

    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if owns_figure:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    ax.plot(x, time_data, label='data', color='gray', linewidth=4, alpha=0.6)
    ax.plot(x, time_sim, label='bcs', color='red', linewidth=4, alpha=0.6)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Time in S-phase (min)' if show_title else "Replication time")
    ax.set_ylim(100 * scale_factor, 0)  # inverted: upper limit first
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.legend(loc='lower right')
    ax.grid(False)
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Save plot
    if saveQ:
        plt.savefig('figures/plot_RT_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if owns_figure:
        plt.show()

#### KDE plots

In [32]:
def plot_relative_kdes(data_list, labels, bw_adjust=1, saveQ=False, x_grid_size=1000, normalize=False, log_scale=False, x_min=None, x_max=None, plot_title="Relative density plots", x_title="Error", save_name="savedfile"):
    """Draw filled kernel-density estimates of several datasets on one 4x4 figure.

    Parameters
    ----------
    data_list : sequence of arrays
        One dataset per curve. Each must support ``.min()``/``.max()`` and,
        when ``log_scale`` is true, boolean-mask indexing.
    labels : legend labels, paired in order with the default colour cycle.
    bw_adjust : forwarded to ``sns.kdeplot``.
    saveQ : save to ``figures/fig_kdeplot_<save_name>.pdf``.
    x_grid_size : unused. Kept so existing calls remain valid.
    normalize : bool
        If true each dataset is drawn by its own ``kdeplot`` call; otherwise
        the whole list is passed to a single call.
    log_scale : bool
        Drop non-positive values and use a logarithmic x axis.
    x_min, x_max : x limits. Default to the extremes of the (filtered) data.
    plot_title, x_title : figure title and x-axis label.
    """
    if log_scale:
        # Filter out non-positive values for log scale
        data_list = [data[data > 0] for data in data_list]

    # Default the x range to the extremes of the data that will be drawn.
    if x_min is None:
        x_min = min(data.min() for data in data_list)
    if x_max is None:
        x_max = max(data.max() for data in data_list)

    plt.figure(figsize=(4, 4))

    # Compute and plot the KDEs
    if normalize:
        for data in data_list:
            sns.kdeplot(data, fill=True, bw_adjust=bw_adjust, log_scale=log_scale)
    else:
        sns.kdeplot(data_list, fill=True, bw_adjust=bw_adjust, log_scale=log_scale)

    # Legend patches are built by hand from the default colour cycle.
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    handles = [mpatches.Patch(facecolor=color, label=label, alpha=0.5) for color, label in zip(cycle_colors, labels)]

    plt.title(plot_title)
    plt.xlabel(x_title)

    plt.gca().yaxis.set_visible(False)  # Remove y-axis ticks
    plt.ylabel('')  # Remove y-axis label
    if log_scale:
        plt.xscale('log')

    plt.xlim(x_min, x_max)
    plt.legend(handles=handles, loc='upper left')
    if saveQ:
        plt.savefig(f'figures/fig_kdeplot_{save_name}.pdf', bbox_inches='tight', transparent=True)
    plt.show()

#### Data vs data scatter plots ####

In [33]:
def create_modified_cmap(map_to_white):
    """Return a 256-entry viridis ``ListedColormap``.

    When ``map_to_white`` is true the first (lowest) entry is replaced by
    opaque white.
    """
    palette = plt.cm.viridis(np.linspace(0, 1, 256))
    if map_to_white:
        palette[0] = np.array([1, 1, 1, 1])
    return mcolors.ListedColormap(palette)

def plot_replication_data_vs_data(data, labels, colors, xmin=1e-15, xmax=1e5, ymin=0, ymax=500, sizep=0.1, title_x="Error", title_y="Replication time (min)",title="",
                                  log_x=True, log_y=False, use_density=True, dpi=100, map_to_white=False, invertyQ=False, saveQ=False):
    """Scatter one quantity against another, as point density or as plain points.

    Parameters
    ----------
    data : sequence of ``(data1, data2)`` pairs
        ``data1`` is drawn on the y axis and ``data2`` on the x axis.
    labels, colors : one entry per pair. Only used in the plain-scatter mode.
    xmin, xmax, ymin, ymax : axis limits.
    sizep : marker size in plain-scatter mode (the pair labelled
        ``'Whole-genome'`` always uses 0.02).
    title_x, title_y, title : axis labels and figure title.
    log_x, log_y : use a logarithmic x / y axis.
    use_density : bool
        If true, draw each pair with ``ScatterDensityArtist`` (NaN pairs are
        dropped) and add a colour bar for the last pair drawn. Otherwise draw
        labelled scatter points and a legend.
    dpi : resolution of the density artist and of the saved figure.
    map_to_white : forwarded to ``create_modified_cmap``.
    invertyQ : reverse the y axis.
    saveQ : save to ``figures/fig_scatter_density.pdf``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    modified_cmap = create_modified_cmap(map_to_white)

    if use_density:
        density = None
        for (data1, data2), label, color in zip(data, labels, colors):
            # Filter out NaN values
            mask = ~np.isnan(data2) & ~np.isnan(data1)
            data2 = np.array(data2)[mask]
            data1 = np.array(data1)[mask]

            # Rasterize the density artist for reduced file size
            density = ScatterDensityArtist(ax, data2, data1, cmap=modified_cmap, dpi=dpi, rasterized=True)
            ax.add_artist(density)

        # With no data there is no artist to attach a colour bar to.
        if density is not None:
            cbar = plt.colorbar(density, ax=ax)
            cbar.ax.set_ylabel('Density')
    else:
        for (data1, data2), label, color in zip(data, labels, colors):
            # Rasterize the scatter points for reduced file size
            ax.scatter(data2, data1, s=0.02 if label == 'Whole-genome' else sizep, color=color, label=label, rasterized=True)

        # Custom legend with visible markers (the scatter points are tiny)
        handles, legend_labels = ax.get_legend_handles_labels()
        new_handles = [plt.Line2D([], [], color=handle.get_facecolor()[0], marker='o', linestyle='', markersize=3) for handle in handles]
        ax.legend(handles=new_handles, labels=legend_labels)

    if log_x:
        ax.set_xscale('log')
    if log_y:
        ax.set_yscale('log')

    ax.set_ylim((ymin, ymax))
    if invertyQ:
        ax.set_ylim(ax.get_ylim()[::-1])  # Swap the limits to invert the y-axis

    ax.set_xlim((xmin, xmax))
    ax.set_aspect(aspect='auto')

    ax.set_title(title)
    ax.set_xlabel(title_x)
    ax.set_ylabel(title_y)

    if saveQ:
        plt.savefig('figures/fig_scatter_density.pdf', bbox_inches='tight', transparent=True, dpi=dpi)  # Adjust dpi for saving

    plt.show()

In [82]:
def generate_plot(load_type1, load_type2, cell_line, chr_numbers,
                  show_all = True,
                  show_telomeres=False, show_centromeres=False, show_fragile_sites=False,
                  chr_number_fragile_sites=[1], site_letters=['A'],
                  xmin=1e-15, xmax=1e-3, ymin=0, ymax=500, log_x=True, log_y=False, invertyQ=False,
                  custom_positions=[], custom_labels=[""], sizep=0.1,
                  use_density=True, dpi=100, map_to_white=False, title="", saveQ=False):
    """Assemble data-vs-data series and plot them with ``plot_replication_data_vs_data``.

    The first series is ``generate_all_data(cell_line, chr_numbers, load_type1,
    load_type2)`` labelled "Whole-genome". Further series are subsets of it,
    picked by position:

    * centromeres / telomeres of each chromosome in ``chr_numbers``
      (``show_centromeres`` / ``show_telomeres``);
    * each entry of ``custom_positions`` with the matching ``custom_labels``
      entry (added once per chromosome in ``chr_numbers``);
    * fragile sites ``FRA<chr><letter>`` for every combination of
      ``chr_number_fragile_sites`` and ``site_letters`` (``show_fragile_sites``).

    Axis titles come from ``title_map[load_type2]`` (x) and
    ``title_map[load_type1]`` (y). With ``show_all`` false the whole-genome
    series is dropped before plotting. The remaining keyword arguments are
    forwarded unchanged to ``plot_replication_data_vs_data``.

    Side effects
    ------------
    Rebinds the module-level global ``all_data`` to the assembled series.
    """
    global all_data

    title_x = title_map[load_type2]
    title_y = title_map[load_type1]

    all_data = []
    labels = []
    colours = []

    all_data.append(generate_all_data(cell_line, chr_numbers, load_type1, load_type2))
    labels.append("Whole-genome")
    colours.append('lightgrey')

    def subset(positions):
        # Pick the given positions out of both whole-genome sub-series.
        return [[sublist[pos] for pos in positions] for sublist in all_data[0]]

    # The custom palette does not depend on the chromosome: build it once.
    if len(custom_positions) != 0:
        cmap = plt.get_cmap('tab20')
        color_list = [cmap(i / 10) for i in range(len(custom_positions)+1)]

    for chr_number in chr_numbers:

        if show_centromeres:
            all_data.append(subset(load_function_pos(chr_number, "centromeres")))
            # Only the first occurrence carries the label
            labels.append("" if "Centromeres" in labels else "Centromeres")
            colours.append('#1f77b4')

        if show_telomeres:
            all_data.append(subset(load_function_pos(chr_number, "telomeres")))
            labels.append("" if "Telomeres" in labels else "Telomeres")
            colours.append('orange')

        if len(custom_positions) != 0:
            for i, positions_custom in enumerate(custom_positions):
                all_data.append(subset(positions_custom))
                labels.append(custom_labels[i])
                colours.append(color_list[i+1])  # entry 0 of the palette is skipped

    if show_fragile_sites:
        for chr_number in chr_number_fragile_sites:
            for site_letter in site_letters:
                positions_fragile_sites = load_function_pos(chr_number, "fragile_sites", site_letter=site_letter)
                all_data.append(subset(positions_fragile_sites))
                labels.append(f'FRA{chr_number}{site_letter}')
                # NOTE(review): labels is one longer than colours here, so this
                # adds a single colour sampled at 0 of the autumn map each time.
                colours.extend(list(plt.cm.autumn(np.linspace(0, .5, len(labels) - len(colours)))))

    if load_type1 == 'DNaseIHS':
        # Non-positive values of the first whole-genome sub-series become NaN
        all_data[0][0] = np.where(np.array(all_data[0][0]) <= 0, np.nan, all_data[0][0])

    if not show_all:
        all_data = all_data[1:]
        labels = labels[1:]
        colours = colours[1:]

    plot_replication_data_vs_data(all_data, labels, colours, xmin, xmax, ymin, ymax, sizep=sizep, title_x=title_x, title_y=title_y, title=title, log_x=log_x, log_y=log_y,
                                  use_density=use_density, dpi=dpi, map_to_white=map_to_white, invertyQ=invertyQ, saveQ=saveQ)

In [35]:
def heat_scatter(speed_sim, speed_data, error, x_lim=100, y_lim=100, saveQ=False):
    """Scatter simulated against observed replication rate, coloured by log10 error.

    ``error`` is offset by 1e-10 before taking the logarithm so that zero
    entries stay finite. The x axis spans 1.2 to ``x_lim`` and the y axis 0 to
    ``y_lim``. With ``saveQ`` the figure is written to
    ``figures/ratescatter.pdf``.
    """
    # Colour values: log10 of the error, shifted away from zero
    colour_values = np.log10(error + 1e-10)
    green_to_red = plt.get_cmap('RdYlGn_r')

    plt.figure(figsize=(8, 6))
    points = plt.scatter(
        speed_sim,
        speed_data,
        c=colour_values,
        cmap=green_to_red,
        s=.5,
        edgecolor=None,
        alpha=0.7,
        rasterized=True,  # keep the vector output small
    )

    colour_bar = plt.colorbar(points)
    colour_bar.set_label('Error (log)', rotation=90, labelpad=15)

    plt.xlabel('Simulated replication rate (kb/min)')
    plt.ylabel('Observed replication rate (kb/min)')
    plt.title('Error Heatmap (Green to Red)')

    plt.xlim(1.2, x_lim)
    plt.ylim(0, y_lim)

    if saveQ:
        plt.savefig('figures/ratescatter.pdf', bbox_inches='tight', transparent=True, dpi=300)  # Use rasterized save

    plt.show()

In [36]:
def find_local_maxima_filter_by_region(data_error, data_time,
                                       point1=(0.01, 100), point2=(0.1, 500),
                                       point3=(0.1, 100), point4=(1, 500)):
    """Return indices of strict local maxima of ``data_error`` lying in a band.

    A position ``i`` is a local maximum when ``data_error[i]`` is strictly
    greater than both neighbours (the first and last positions are never
    candidates). It is kept when ``data_time[i]`` lies between two bounds
    evaluated at ``x = data_error[i]``:

    * lower bound: the line through ``point3`` and ``point4``, floored at 0;
    * upper bound: the line through ``point1`` and ``point2``, capped at 300.

    Both bounds are inclusive. The result is a list of indices in ascending
    order.
    """
    def _line_through(p, q):
        # Straight line y = gradient * x + offset passing through p and q
        gradient = (q[1] - p[1]) / (q[0] - p[0])
        offset = p[1] - gradient * p[0]

        def evaluate(x):
            return gradient * x + offset

        return evaluate

    # Strict local maxima of the error trace
    peaks = [
        idx
        for idx in range(1, len(data_error) - 1)
        if data_error[idx - 1] < data_error[idx] > data_error[idx + 1]
    ]

    upper_line = _line_through(point1, point2)
    lower_line = _line_through(point3, point4)

    # Keep the maxima whose time value falls inside the band
    selected = []
    for idx in peaks:
        err = data_error[idx]
        time_value = data_time[idx]
        lower_bound = max(lower_line(err), 0)
        upper_bound = min(upper_line(err), 300)
        if lower_bound <= time_value <= upper_bound:
            selected.append(idx)

    return selected

In [84]:
def gene_scatter(cell_line, chr_numbers, genes, saveQ=False):
    """Scatter fit error against replication time for a set of genes.

    ``chr_numbers[i]`` is the chromosome of ``genes[i]``. For each gene the
    positions come from ``load_function_pos(chr, 'gene', gene_name=...)`` and
    the ``'error'`` (x, log scale) and ``'time_data'`` (y, inverted) traces
    from ``load_function`` are sampled at those positions. Each gene gets one
    colour from the ``tab10`` palette, cycling when there are more genes than
    colours. With ``saveQ`` the figure is written to
    ``figures/scatter_genes.pdf``.

    A gene without any position is drawn as an empty series instead of raising.
    """
    # Colors for the scatter plot
    colors = plt.get_cmap('tab10').colors

    # Load every series first so a loading error leaves no half-drawn figure.
    gene_series = []
    for i, chr_number in enumerate(chr_numbers):
        positions = load_function_pos(chr_number, 'gene', gene_name=genes[i])
        time_vals = load_function(cell_line, chr_number, 'time_data')[positions]
        error_vals = load_function(cell_line, chr_number, 'error')[positions]
        gene_series.append((error_vals, time_vals))

    fig, ax = plt.subplots(figsize=(6, 6))

    # The arrays are plotted directly: going through zip(...)/zip(*...) pairs
    # fails to unpack when a gene has no positions.
    for i, (error_vals, time_vals) in enumerate(gene_series):
        ax.scatter(error_vals, time_vals, color=colors[i % len(colors)],
                   label=genes[i], alpha=0.7, s=1, rasterized=True)  # Rasterized scatter points

    # Set labels and title
    ax.set_xlabel('Error (Log Scale)')
    ax.set_ylabel('Replication Time (min)')
    ax.set_xlim(1e-4, 1e4)
    ax.set_title('')

    # Invert the y-axis
    ax.invert_yaxis()

    # Set x-axis to logarithmic scale
    ax.set_xscale('log')

    # Set scientific notation for log scale tick labels
    ax.xaxis.set_major_formatter(ticker.LogFormatterMathtext())

    # Create a custom legend with larger marker sizes
    handles, labels = ax.get_legend_handles_labels()
    new_handles = [plt.Line2D([], [], marker='o', color=colors[i % len(colors)], linestyle='None', markersize=5)
                   for i in range(len(handles))]
    ax.legend(new_handles, labels, title='Large genes')

    if saveQ:
        plt.savefig('figures/scatter_genes.pdf', bbox_inches='tight', transparent=True, dpi=300)

    # Show the plot
    plt.show()


#### Firing rate plots

In [38]:
def fire_plotf(cell_line, chr_numbers, resolution, file_name, saveQ, ax=None, aspect_ratio=(10, 6), replace_missing_with_nan=True, show_ticks=True, show_title=True):
    """Plot whole-chromosome firing-rate profiles, one stacked panel per chromosome.

    Parameters
    ----------
    cell_line : used to build the input file names, the title and the PDF name.
    chr_numbers : sequence of int
        Chromosomes to draw, top to bottom.
    resolution, file_name : unused. Kept so existing calls remain valid.
    saveQ : bool
        Save to ``figures/fig_plot_fire_signatures_<cell_line>.pdf``.
    ax : matplotlib Axes, optional
        Draw on this single axes instead of creating a figure. Only one
        chromosome can be drawn this way.
    aspect_ratio : figure size used when the figure is created here.
    replace_missing_with_nan : bool
        Blank out the positions listed in the chromosome's missing-data file.
    show_ticks : label and tick the x axis of the last panel.
    show_title : put ``cell_line`` as the title of the first panel.

    When the figure is created here, a shared "Firing rate" y label is added,
    the margins are adjusted and the figure is shown.
    """
    # Chromosome lengths in kb (1 kb resolution)
    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535, 135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]

    # Remember ownership up front: the loop below reuses a per-panel variable.
    owns_figure = ax is None

    if owns_figure:
        # squeeze=False always yields a 2-D array of axes, so a single
        # chromosome can be indexed like several.
        fig, axes = plt.subplots(len(chr_numbers), 1, figsize=aspect_ratio, sharey=True, squeeze=False)
        axes = axes[:, 0]
    else:
        axes = [ax]  # Ensure axes is a list even if it's just one subplot

    for idx, chr_number in enumerate(chr_numbers):
        # Data loading: Read firing rates from a text file
        firing_rates_file_path = f'data/whole-genome_firing_rates/fire_rates_{cell_line}_chr[{chr_number}].txt'
        firing_rates = np.loadtxt(firing_rates_file_path, dtype=float)

        if replace_missing_with_nan:
            missing_data_path = f'data/whole-genome_missing_data/missing_data_{cell_line}_chr[{chr_number}].txt'
            missing_positions = np.loadtxt(missing_data_path, dtype=int)
            firing_rates[missing_positions] = np.nan

        # Generate chromosome positions in Mb
        x = np.linspace(0, chr_lengths[chr_number - 1] / 1000, len(firing_rates))

        # Plotting
        panel = axes[idx]
        panel.plot(x, firing_rates, color='#1f77b4', linewidth=2, alpha=.9)

        if idx == 0 and show_title:
            panel.set_title(f'{cell_line}')
        if idx == len(chr_numbers) - 1:
            if show_ticks:
                panel.set_xlabel('Chromosome position (Mb)')
                panel.set_xticks(np.arange(0, chr_lengths[0] / 1000, 20))  # Set x-axis ticks to show every 20 Mb
        else:
            panel.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)  # Remove x-axis ticks and labels for all but the last plot

        panel.set_yscale('log')
        panel.set_ylim(10**-12, 10**-3)
        panel.set_xlim(0, chr_lengths[0] / 1000)  # Set x-axis limit to the largest chromosome length in Mb
        panel.tick_params(axis='both', which='both', direction='out')
        for spine in panel.spines.values():
            spine.set_visible(True)

        # Add chromosome number on the top right corner with a transparent box
        panel.text(0.995, 0.93, f'chr {chr_number}', transform=panel.transAxes,
                   fontsize=12, verticalalignment='top', horizontalalignment='right',
                   bbox=dict(facecolor='white', alpha=0., edgecolor='none'))

        panel.grid(False)  # Remove grid for each plot

    # Add a shared y-axis label if no axis is provided
    if owns_figure:
        fig.text(0.065, 0.5, 'Firing rate', va='center', rotation='vertical')

        # Adjust layout
        plt.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.05, hspace=0.15)  # Adjust margins and spacing

    # Save plot if saveQ is True
    if saveQ:
        plt.savefig(f'figures/fig_plot_fire_signatures_{cell_line}.pdf', bbox_inches='tight', transparent=True, dpi=300)

    if owns_figure:
        plt.show()


In [39]:
def fire_plotf2(cell_line, chr_number, chrpos_min, chrpos_max, file_name, saveQ, ax=None, show_ticks=True, show_title=True):
    """Plot the firing-rate profile of one chromosome window on a log y axis.

    Parameters
    ----------
    cell_line, chr_number : used to build the input file name.
    chrpos_min, chrpos_max : int
        Bounds used to slice the whole-chromosome firing-rate file.
    file_name : suffix of the saved PDF (``figures/plot_fire_<file_name>.pdf``).
    saveQ : bool
        Save the current figure as a PDF.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : toggle ticks and tick labels on the bottom and left axes.
    show_title : toggle the title. It also selects the long axis labels.

    Side effects
    ------------
    Rebinds the module-level global ``fire_data`` to the loaded slice.
    """
    global fire_data

    # `ax` is rebound below, so ownership has to be recorded before that.
    owns_figure = ax is None

    # Data loading
    fire_data = np.loadtxt('data/whole-genome_firing_rates/fire_rates_' + cell_line + '_chr[' + str(chr_number) + '].txt', dtype=float)[chrpos_min:chrpos_max]
    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if owns_figure:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    ax.plot(x, fire_data, color='#1f77b4', linewidth=2, alpha=.9)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Firing rate' if show_title else "Firing")
    ax.set_yscale('log')
    ax.set_ylim(10**-12, 10**-3)
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Set logarithmic ticks for y-axis
    ax.yaxis.set_major_formatter(LogFormatterMathtext())
    ax.yaxis.set_minor_formatter(LogFormatterMathtext())

    # Save plot
    if saveQ:
        plt.savefig('figures/plot_fire_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if owns_figure:
        plt.show()

#### Error plots ####

In [40]:
def error_plotf(cell_line, chr_number, chrpos_min, chrpos_max, file_name, saveQ, ax=None, show_ticks=True, show_title=True):
    """Plot the fit-error profile of one chromosome window on a log y axis.

    Parameters
    ----------
    cell_line, chr_number : used to build the input file name.
    chrpos_min, chrpos_max : int
        Bounds used to slice the whole-chromosome error file.
    file_name : suffix of the saved PDF (``figures/plot_error_<file_name>.pdf``).
    saveQ : bool
        Save the current figure as a PDF.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : toggle ticks and tick labels on the bottom and left axes.
    show_title : toggle the title and the x-axis label.

    Side effects
    ------------
    Rebinds the module-level global ``error_data`` to the loaded slice.
    """
    global error_data

    # `ax` is rebound below, so ownership has to be recorded before that.
    owns_figure = ax is None

    # Data loading
    error_data = np.loadtxt('data/whole-genome_error/error_' + cell_line + '_chr[' + str(chr_number) + '].txt', dtype=float)[chrpos_min:chrpos_max]
    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if owns_figure:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    ax.plot(x, error_data, color='#ff7f0e', linewidth=2, alpha=.9)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Error')
    ax.set_yscale('log')
    ax.set_ylim(10**-12, 10**6)
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Set logarithmic ticks for y-axis
    ax.yaxis.set_major_formatter(LogFormatterMathtext())
    ax.yaxis.set_minor_formatter(LogFormatterMathtext())

    # Save plot
    if saveQ:
        plt.savefig('figures/plot_error_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if owns_figure:
        plt.show()

In [41]:
def plot_goodness_of_fit(chr_number, saveQ, ax=None, cell_lines=None, chrpos_min=0, chrpos_max=10, alld=True, base_path='data/', shift_param=1.0):
    """Draw the fit error of several cell lines on one chromosome as stacked colour bars.

    For every cell line the ``'error'`` trace returned by ``load_function`` is
    transformed with ``log1p(error + shift_param)``, rescaled to [0, 1] using
    the minimum and maximum over all cell lines, and drawn as a band three
    rows high with one empty row between bands. Positions returned by
    ``load_missing_data`` are drawn in translucent gray.

    Parameters
    ----------
    chr_number : int
        Chromosome (1-22). It selects the expected number of kb positions.
    saveQ : bool
        Save to ``figures/fig_goodness_of_fit_chr[<chr_number>].pdf``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new figure is created and shown.
    cell_lines : list of str, optional
        Defaults to the eight BigWig plus three high-resolution cell lines.
    chrpos_min, chrpos_max : int
        Window bounds. Only used when ``alld`` is false.
    alld : bool
        True draws the whole chromosome with colour bar, ticks and
        annotations. False draws only the window, without decorations.
    base_path : unused. Kept so existing calls remain valid.
    shift_param : offset added to the error before ``log1p``.

    Side effects
    ------------
    Rebinds the module-level globals ``spaced_data`` (the image array) and
    ``missing_data`` (missing positions of the last cell line processed).
    """
    global spaced_data, missing_data

    if cell_lines is None:
        cell_lines_BigWig = ["HeLa-S3","BJ1","IMR90","HUVEC","K562","GM12878","HepG2","MCF-7"]
        cell_lines_HighRes = ["H1","H9","HCT"]
        cell_lines = cell_lines_BigWig + cell_lines_HighRes

    chr_lengths = [249251, 243200, 198023, 191155, 180916, 171116, 159139, 146365, 141214, 135535, 135007, 133852, 115170, 107350, 102532, 90355, 81196, 78078, 59129, 63026, 48130, 51305]
    num_positions = chr_lengths[chr_number - 1]
    num_cell_lines = len(cell_lines)

    # Per-position fit error of every cell line
    goodness_of_fit = [load_function(cell_line, chr_number, 'error') for cell_line in cell_lines]

    # Apply shifted log transformation
    transformed_goodness_of_fit = [np.log1p(data + shift_param) for data in goodness_of_fit]

    # Normalize the transformed data to [0, 1] range
    min_val = np.nanmin(transformed_goodness_of_fit)
    max_val = np.nanmax(transformed_goodness_of_fit)
    normalized_goodness_of_fit = [(data - min_val) / (max_val - min_val) for data in transformed_goodness_of_fit]

    # Reversed red-yellow-green colormap with one extra entry for "no data"
    cmap = plt.get_cmap('RdYlGn_r')
    colors = cmap(np.linspace(0, 1, 256))
    colormap_gray = np.array([[0., 0., 0., 0.17]])  # RGBA for gray in colormap
    new_colors = np.vstack((colors, colormap_gray))
    extended_cmap = ListedColormap(new_colors)

    if not alld:
        normalized_goodness_of_fit = [trace[chrpos_min:chrpos_max] for trace in normalized_goodness_of_fit]
        num_positions = chrpos_max - chrpos_min

    # Three identical rows per cell line, separated by one NaN row
    spaced_data = np.full((num_cell_lines * 4 - 1, num_positions), np.nan)
    for i in range(num_cell_lines):
        for row in range(i * 4, i * 4 + 3):
            spaced_data[row] = normalized_goodness_of_fit[i]

    # Load missing data and mark it with the index of the gray entry
    for i, cell_line in enumerate(cell_lines):
        missing_data = load_missing_data(cell_line, chr_number)
        if not alld:
            # Keep the missing positions inside the window, relative to its
            # start. A set makes each membership test O(1).
            missing_lookup = set(missing_data)
            missing_data = [pos - chrpos_min for pos in range(chrpos_min, chrpos_max) if pos in missing_lookup]
        for row in range(i * 4, i * 4 + 3):
            spaced_data[row, missing_data] = 256

    # Scale to the colormap index range, keeping real data below the top
    # entry and the missing-data marker at 256.
    spaced_data0 = spaced_data
    spaced_data = np.clip(spaced_data * 255, 0, 255 - 1)
    spaced_data[spaced_data0 == 256] = 256  # Ensure missing data stays at 256

    # `ax` is rebound below, so ownership has to be recorded before that.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(15, .5*num_cell_lines))
    else:
        # The colour bar needs the figure of the supplied axes.
        fig = ax.figure

    cax = ax.imshow(spaced_data, aspect='auto', cmap=extended_cmap, interpolation='nearest', vmin=0, vmax=255)

    # Add color bar to the right
    if alld:
        cbar = fig.colorbar(cax, orientation='vertical', pad=0.02)
        cbar.set_label('Normalized error')
        cbar.set_ticks(np.linspace(0, 255, 6))
        cbar.set_ticklabels(np.round(np.linspace(0, 1, 6), 2))

    # Set ticks and labels
    yticks_positions = np.arange(1, num_cell_lines * 4, 4)
    ax.set_yticks(yticks_positions if alld else [])
    ax.set_yticklabels(cell_lines if alld else [])
    xtick_positions = np.arange(0, num_positions, 20000)
    xtick_labels = (xtick_positions / 1000).astype(int)
    ax.set_xticks(xtick_positions if alld else [])
    ax.set_xticklabels(xtick_labels if alld else [])
    ax.set_xlabel('Chromosome position (Mb)' if alld else None)

    ax.grid(False)

    # Chromosome label (top left) and "Missing data" key with gray square (top right)
    if alld:
        ax.text(0, 1.04, f'Chromosome {chr_number}', transform=ax.transAxes, fontsize=7, verticalalignment='top', horizontalalignment='left', bbox=dict(facecolor="none", edgecolor='none'))
        ax.text(1, 1.04, 'Missing data', transform=ax.transAxes, fontsize=7, verticalalignment='top', horizontalalignment='right', bbox=dict(facecolor="none", edgecolor='none'))
        ax.add_patch(plt.Rectangle((.924, 1.02), 0.008, 0.02, transform=ax.transAxes, color=[0., 0., 0., 0.17], clip_on=False))

    if saveQ:
        plt.savefig(f'figures/fig_goodness_of_fit_chr[{chr_number}].pdf', bbox_inches='tight', transparent=True)

    if owns_figure:
        plt.show()

In [42]:
# MSE plots
def read_mse_file(file_path, num_elements=100):
    """Return the first ``num_elements`` MSE values parsed from a text file.

    For each line, the text after the last ``':'`` (or the whole line when
    there is no colon) is converted to ``float``. Lines that cannot be
    converted are ignored.
    """
    def _parse(line):
        # rpartition yields the whole line as the tail when no ':' is present
        tail = line.rpartition(':')[2].strip()
        try:
            return float(tail)
        except ValueError:
            return None

    with open(file_path, 'r') as handle:
        parsed = [_parse(line) for line in handle]

    values = [value for value in parsed if value is not None]
    return values[:num_elements]

def average_mse_across_chromosomes(cell_line, num_elements=100):
    """Average the per-iteration MSE of a cell line over chromosomes 1-22.

    Each chromosome's series is read with ``read_mse_file`` from
    ``data/whole-genome_mse/mse_<cell_line>_chr[<n>].txt`` (at most
    ``num_elements`` values); chromosomes whose file yields no value are
    left out. Series of different lengths are truncated to the shortest one,
    so every averaged iteration covers the same chromosomes.

    Returns
    -------
    numpy.ndarray with the mean MSE per iteration, or an empty list when no
    chromosome provided any value.
    """
    num_chromosomes = 22  # chromosomes 1-22
    all_mse_values = []

    for chr_number in range(1, num_chromosomes + 1):
        file_path = f'data/whole-genome_mse/mse_{cell_line}_chr[{chr_number}].txt'
        mse_values = read_mse_file(file_path, num_elements)
        if mse_values:
            all_mse_values.append(mse_values)

    if not all_mse_values:
        return []

    # A ragged list cannot be stacked into a 2-D array; align on the
    # shortest series before averaging.
    common_length = min(len(values) for values in all_mse_values)
    stacked = np.array([values[:common_length] for values in all_mse_values])

    # Mean across chromosomes for each iteration
    return np.mean(stacked, axis=0)

def plot_averaged_mse(cell_lines, num_elements=100, log_scale=False, saveQ=False):
    """Plot the chromosome-averaged MSE per iteration, one curve per cell line.

    Cell lines for which ``average_mse_across_chromosomes`` returns nothing
    are reported on stdout and skipped. The y axis is fixed to 100-1600 and
    is logarithmic when ``log_scale`` is true. With ``saveQ`` the figure is
    written to ``figures/mseplots.pdf``.
    """
    plt.figure(figsize=(10, 8))

    for name in cell_lines:
        curve = average_mse_across_chromosomes(name, num_elements)
        if len(curve) > 0:
            # Iterations are numbered from 1
            plt.plot(np.arange(1, len(curve) + 1), curve, label=f'{name}')
        else:
            print(f"No data found for {name}.")

    plt.xlabel('Iteration Number')
    plt.ylabel('Averaged MSE')
    plt.ylim(100,1600)

    if log_scale:
        plt.yscale('log')

    plt.title('Averaged MSE Over Iterations for Different Cell Lines')
    plt.grid(False)
    plt.legend()

    if saveQ:
        plt.savefig('figures/mseplots.pdf', bbox_inches='tight', transparent=True)

    plt.show()

#### Fork directionality plots ####

In [43]:
def forkd_plotf(cell_line, chr_number, chrpos_min, chrpos_max, file_name, spec_fileQ, saveQ, ax=None, show_ticks=True, show_title=True):
    """Plot fork directionality over a chromosome window (y axis fixed to [-1, 1]).

    Parameters
    ----------
    cell_line, chr_number : used to build the input file name.
    chrpos_min, chrpos_max : int
        Window bounds. They slice the whole-chromosome file, or are embedded
        in the file name when a window-specific file is loaded.
    file_name : suffix of the saved PDF (``figures/plot_forkd_<file_name>.pdf``).
    spec_fileQ : bool
        If true, load the window-specific file
        (``..._{chrpos_min}-{chrpos_max}.txt``); otherwise load the
        whole-chromosome file and slice it.
    saveQ : bool
        Save the current figure as a PDF.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : toggle ticks and tick labels on the bottom and left axes.
    show_title : toggle the title. It also selects the long axis labels.
    """
    # `ax` is rebound below, so ownership has to be recorded before that.
    owns_figure = ax is None

    # Data loading (Warning: requires saving data in fitting procedure)
    # Choose between whole-genome files or particular simulation
    if spec_fileQ:
        forkd_data = np.loadtxt(f'data/whole-genome_fork_directionality/fork_directionality_{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}.txt', dtype=float)
    else:
        forkd_data = np.loadtxt(f'data/whole-genome_fork_directionality/fork_directionality_{cell_line}_chr[{chr_number}].txt', dtype=float)[chrpos_min:chrpos_max]
    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if owns_figure:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    ax.plot(x, forkd_data, label='', color='black', linewidth=2, alpha=0.6)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Fork directionality' if show_title else "Fork dir.")
    ax.set_ylim(-1, 1)
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.grid(False)
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Save plot
    if saveQ:
        plt.savefig('figures/plot_forkd_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if owns_figure:
        plt.show()

#### RNA-Seq plots ####

In [44]:
def rna_seq_plotf(cell_line, chr_number, chrpos_min, chrpos_max, file_name, saveQ, ax=None, show_ticks=True, show_title=True):
    """Plot the RNA-Seq track over a chromosome interval.

    Parameters
    ----------
    cell_line : str
        Cell line name; used in the data file name and the title.
    chr_number : int
        Chromosome number; used in the data file name and the title.
    chrpos_min, chrpos_max : int
        Interval bounds; used to slice the loaded array and as x-limits.
    file_name : str
        Suffix of the PDF written to ``figures/`` when ``saveQ`` is true.
    saveQ : bool
        Save the figure as ``figures/plot_rna_seq_<file_name>.pdf``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : bool
        Toggle ticks and tick labels on the bottom and left axes.
    show_title : bool
        Toggle the title; also switches between long and short axis labels.
    """
    # `ax` is rebound below, so record now whether this call owns the figure;
    # otherwise the final plt.show() could never run.
    standalone = ax is None

    # Data loading
    rna_seq_data = np.loadtxt(f'data/rna-seq_files/rna_seq_{cell_line}_chr[{chr_number}].txt', dtype=float)[chrpos_min:chrpos_max]
    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if standalone:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    ax.plot(x, rna_seq_data, color='black', linewidth=1, alpha=.9)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Transcription levels (RNA-Seq)' if show_title else "RNA-Seq")
    ax.set_ylim(0, 100)
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Use the log-style mathtext formatter for the y tick labels
    # (the axis scale itself is left linear)
    ax.yaxis.set_major_formatter(LogFormatterMathtext())
    ax.yaxis.set_minor_formatter(LogFormatterMathtext())

    # Save plot (through the axes' own figure)
    if saveQ:
        ax.figure.savefig('figures/plot_rna_seq_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if standalone:
        plt.show()

#### GRO-Seq plots ####

In [45]:
def gro_seq_plotf(cell_line, chr_number, chrpos_min, chrpos_max, file_name, saveQ, ax=None, show_ticks=True, show_title=True):
    """Plot the GRO-Seq track (as ``log1p`` of the signal) over a chromosome interval.

    Parameters
    ----------
    cell_line : str
        Cell line name; used in the data file name and the title.
    chr_number : int
        Chromosome number; used in the data file name and the title.
    chrpos_min, chrpos_max : int
        Interval bounds; used to slice the loaded array and as x-limits.
    file_name : str
        Suffix of the PDF written to ``figures/`` when ``saveQ`` is true.
    saveQ : bool
        Save the figure as ``figures/plot_gro_seq_<file_name>.pdf``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new 10x6 figure is created and shown.
    show_ticks : bool
        Toggle ticks and tick labels on the bottom and left axes.
    show_title : bool
        Toggle the title; also switches between long and short axis labels.
    """
    # `ax` is rebound below, so record now whether this call owns the figure;
    # otherwise the final plt.show() could never run.
    standalone = ax is None

    # Data loading
    gro_seq_data = np.loadtxt(f'data/gro-seq_files/gro_seq_{cell_line}_chr[{chr_number}].txt', dtype=float)[chrpos_min:chrpos_max]
    x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min)  # Chromosome positions

    # Plotting
    if standalone:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    # log1p compresses the dynamic range while keeping zeros at zero
    ax.plot(x, np.log1p(gro_seq_data), color='black', linewidth=1, alpha=.9)
    if show_title:
        ax.set_title(cell_line + ' - Chromosome ' + str(chr_number))
    ax.set_xlabel('Chromosome position (kb)' if show_title else None)
    ax.set_ylabel('Transcription levels (GRO-Seq)' if show_title else "GRO-Seq")
    ax.set_ylim(0, 5)
    ax.set_xlim(chrpos_min, chrpos_max)  # Ensure the x-axis covers the full range
    ax.tick_params(axis='both', which='both', direction='out', bottom=show_ticks, labelbottom=show_ticks, left=show_ticks, labelleft=show_ticks)
    for spine in ax.spines.values():
        spine.set_visible(True)

    # Use the log-style mathtext formatter for the y tick labels
    # (the axis scale itself is left linear)
    ax.yaxis.set_major_formatter(LogFormatterMathtext())
    ax.yaxis.set_minor_formatter(LogFormatterMathtext())

    # Save plot (through the axes' own figure)
    if saveQ:
        ax.figure.savefig('figures/plot_gro_seq_' + file_name + '.pdf', bbox_inches='tight', transparent=True)

    if standalone:
        plt.show()

#### Ideogram plots ####

In [46]:
def show_genome(cell_line, chr_number, chrpos_min, chrpos_max,
                show_genes_allQ=True, show_genes_bandsQ=True, show_genesQ=False,
                show_rt_plotQ=True, show_fire_plotQ=True, show_forkd_plotQ=False, show_error_plotQ=True, show_rna_seq_plotQ=False, show_gro_seq_plotQ=False,
                show_error_heat_plotQ=True,
                show_axisQ=True, saveQ=False):
    """Draw a stacked multi-track figure for one chromosome interval.

    Each ``show_*Q`` flag switches one row of the figure on or off. Rows are
    stacked in this order: whole-chromosome ideogram (plus a spacer row),
    zoomed ideogram, gene track, replication timing, firing rate, fork
    directionality, error, RNA-Seq, GRO-Seq, error heat strip and a bare
    x-axis labelled in Mb.

    Parameters
    ----------
    cell_line : str
        Cell line forwarded to every track-plotting function.
    chr_number : int
        Chromosome number.
    chrpos_min, chrpos_max : int
        Interval bounds. They are multiplied by 1000 to get the coordinates
        used for the ideogram, gene track and x-axis rows, and passed
        unchanged to the track-plotting functions.
    show_genes_allQ, show_genes_bandsQ, show_genesQ : bool
        Toggle the whole-chromosome ideogram, the zoomed ideogram and the
        gene track.
    show_rt_plotQ, show_fire_plotQ, show_forkd_plotQ, show_error_plotQ,
    show_rna_seq_plotQ, show_gro_seq_plotQ, show_error_heat_plotQ : bool
        Toggle the corresponding data tracks.
    show_axisQ : bool
        Toggle the final row that carries only the x-axis.
    saveQ : bool
        Save the figure as ``figures/ideogram_<cell_line>.pdf``.
    """
    chrom = f"chr{chr_number}"
    start = chrpos_min * 1000
    end = chrpos_max * 1000

    fig = plt.figure(figsize=(10, 10), dpi=150)

    fig.suptitle(f"{cell_line} - Chromosome {chr_number}")

    # One height per potential row; the final entry is always kept and is
    # never assigned an axes below.
    height_ratios0 = [0.1, 0.1, 0.05, .2, .5, .2, .2, .2, .3, .3, .1, 0., 1]
    show_components = [show_genes_allQ, show_genes_allQ, show_genes_bandsQ, show_genesQ, show_rt_plotQ, show_fire_plotQ, show_forkd_plotQ, show_error_plotQ, show_rna_seq_plotQ, show_gro_seq_plotQ, show_error_heat_plotQ, show_axisQ, True]
    height_ratios = [ratio for ratio, shown in zip(height_ratios0, show_components) if shown]

    gs_i = -1  # index of the last grid row that has been filled

    gs = fig.add_gridspec(
        nrows=len(height_ratios),
        ncols=3,
        width_ratios=[2, 20, 2],
        height_ratios=height_ratios,
        hspace=0.01,
        left=0.01,
        right=0.99,
        top=0.94,
    )

    # Axes the whole-chromosome ideogram zooms onto; stays None when neither
    # the zoomed ideogram nor the gene track is drawn.
    zoom_ax = None

    # Add the full chromosome ideogram
    if show_genes_allQ:
        gs_i += 1
        all_chrom_ax = fig.add_subplot(gs[gs_i, 1])
        all_chrom_ax.axis("off")
        all_chrom_ax.set_xticks([])
        pyideogram.ideogramh(chrom, ax=all_chrom_ax)

        # Draw a red rectangle on top of the region depicted in the full chromosome ideogram
        rect_start = start  # Start position in base pairs
        rect_end = end  # End position in base pairs
        rectangle_thickness = 3  # Set the thickness of the rectangle outline
        all_chrom_ax.add_patch(plt.Rectangle((rect_start, -0.5), rect_end - rect_start, 1, edgecolor='red', facecolor='none', linewidth=rectangle_thickness, clip_on=False))

        # Spacer row between the full ideogram and the zoomed tracks
        gs_i += 1
        empty_ax = fig.add_subplot(gs[gs_i, :])
        empty_ax.axis("off")
        empty_ax.tick_params(labelbottom=False)

    # Add the zoomed ideogram within the same interval
    if show_genes_bandsQ:
        gs_i += 1
        interval_ideogram_ax = fig.add_subplot(gs[gs_i, :])
        interval_ideogram_ax.axis("off")
        interval_ideogram_ax.set_xlim(start, end)
        interval_ideogram_ax.set_xticks([])
        pyideogram.ideogramh(chrom, ax=interval_ideogram_ax, names=True)
        zoom_ax = interval_ideogram_ax

    # Add the gene track plot
    if show_genesQ:
        gs_i += 1
        gene_track_ax = fig.add_subplot(gs[gs_i, :])
        pyideogram.genetrack(
            f"{chrom}:{start}-{end}",
            ax=gene_track_ax,
            textlane=True,
            transcriptstyle="arrowed",
            exonstyle="Box",
        )
        # Set genome ticks on the gene track plot
        pyideogram.set_genome_xticks(gene_track_ax)
        # Remove axis labels and ticks for the gene track plot
        gene_track_ax.axis("on")
        gene_track_ax.set_xticks([])
        gene_track_ax.set_ylabel('Genes')
        gene_track_ax.tick_params(labelbottom=False)
        if not show_genes_bandsQ:
            zoom_ax = gene_track_ax

    # Add RT plot
    if show_rt_plotQ:
        gs_i += 1
        rt_ax = fig.add_subplot(gs[gs_i, :])
        rt_plotf(cell_line, chr_number, chrpos_min, chrpos_max, 6, 'example_file', False, False, ax=rt_ax, show_ticks=False, show_title=False)

    # Add firing plot
    if show_fire_plotQ:
        gs_i += 1
        fire_ax = fig.add_subplot(gs[gs_i, :])
        fire_plotf2(cell_line, chr_number, chrpos_min, chrpos_max, 'example_file', False, ax=fire_ax, show_ticks=False, show_title=False)

    # Add fork directionality plot
    if show_forkd_plotQ:
        gs_i += 1
        forkd_ax = fig.add_subplot(gs[gs_i, :])
        forkd_plotf(cell_line, chr_number, chrpos_min, chrpos_max, 'example_file', False, False, ax=forkd_ax, show_ticks=False, show_title=False)

    # Add the error plot
    if show_error_plotQ:
        gs_i += 1
        error_ax = fig.add_subplot(gs[gs_i, :])
        error_plotf(cell_line, chr_number, chrpos_min, chrpos_max, 'example_file', False, ax=error_ax, show_ticks=False, show_title=False)

    # Add the RNA-Seq plot
    if show_rna_seq_plotQ:
        gs_i += 1
        rna_seq_ax = fig.add_subplot(gs[gs_i, :])
        rna_seq_plotf(cell_line, chr_number, chrpos_min, chrpos_max, 'example_file', False, ax=rna_seq_ax, show_ticks=False, show_title=False)

    # Add the GRO-Seq plot
    if show_gro_seq_plotQ:
        gs_i += 1
        gro_seq_ax = fig.add_subplot(gs[gs_i, :])
        gro_seq_plotf(cell_line, chr_number, chrpos_min, chrpos_max, 'example_file', False, ax=gro_seq_ax, show_ticks=False, show_title=False)

    # Add the error heat plot
    if show_error_heat_plotQ:
        gs_i += 1
        error_heat_ax = fig.add_subplot(gs[gs_i, :])
        error_heat_ax.set_ylabel('Error')
        plot_goodness_of_fit(chr_number, False, ax=error_heat_ax, cell_lines=[cell_line], chrpos_min=chrpos_min, chrpos_max=chrpos_max, alld=False)

    # Add a new row showing just the x-axis with the ticks
    if show_axisQ:
        gs_i += 1
        x_ticks_ax = fig.add_subplot(gs[gs_i, :])
        x_ticks_ax.set_xlim(start, end)
        x_ticks_ax.xaxis.set_visible(True)
        x_ticks_ax.spines['top'].set_visible(False)
        x_ticks_ax.spines['left'].set_visible(False)
        x_ticks_ax.spines['right'].set_visible(False)
        x_ticks_ax.tick_params(axis='x', which='both', bottom=True, top=False, labelbottom=True)
        # Hide y-axis and labels
        x_ticks_ax.tick_params(left=False, labelleft=False)
        x_ticks_ax.set_yticks([])
        # Set a dummy plot for ticks visibility
        x_ticks_ax.plot([start, end], [0, 0], color='white', alpha=0)  # Ensure the axis is plotted correctly

        # Determine tick step and format: five major intervals across the range
        tick_step = (end - start) // 5
        if tick_step < 1:
            tick_step = 1
        tick_labels = []
        tick_positions = []
        for x in range(start, end + tick_step, tick_step):
            tick_positions.append(x)
            if (end - start) < 10000000:  # Less than 10 Mb range
                tick_labels.append(f'{x / 10**6:.1f}'.rstrip('0').rstrip('.'))
            else:
                tick_labels.append(f'{x / 10**6:.0f}')

        x_ticks_ax.set_xticks(tick_positions)
        x_ticks_ax.set_xticklabels(tick_labels)

        # Add a label to the x-axis
        x_ticks_ax.set_xlabel('Chromosome position (Mb)')

        # Add extra (minor) ticks, five per major interval
        minor_tick_step = tick_step // 5
        if minor_tick_step > 0:
            minor_ticks = []
            for x in range(start, end, minor_tick_step):
                if x not in tick_positions:
                    minor_ticks.append(x)
            x_ticks_ax.set_xticks(minor_ticks, minor=True)
            x_ticks_ax.tick_params(axis='x', which='minor', bottom=True, top=False, labelbottom=False)

    # Zoom the ideogram; skipped when there is no zoom target, which would
    # otherwise raise a NameError on the unbound axes
    if show_genes_allQ and zoom_ax is not None:
        pyideogram.zoom(zoom_ax, all_chrom_ax)

    if saveQ:
        fig.savefig(f'figures/ideogram_{cell_line}.pdf', bbox_inches='tight', transparent=True, dpi=300)

    plt.show()

#### General plots ####

In [47]:
def plot_data(cell_line, chr_number, data_type, chrmin=1, chrmax='max', aspect_ratio=5, ylimits=None, log_scale=False, smoothQ = False, sigma=5, standQ=False, saveQ=False):
    """Draw a bare line plot of one data track of a chromosome.

    The track is loaded with ``load_function``. It is sliced to
    ``[chrmin:chrmax]`` unless ``chrmax`` is ``'max'``, in which case the
    whole track is used and ``chrmin`` is ignored. Optional steps are
    Gaussian smoothing (``smoothQ``/``sigma``; samples beyond the array edges
    are treated as NaN), standardisation through ``zscore_with_nan``
    (``standQ``), fixed y-limits (``ylimits``) and a log y-scale
    (``log_scale``). The figure is 10 wide and ``10 / aspect_ratio`` high,
    has no x-axis and no y ticks, and is written to
    ``figures/plotsdetail_<data_type>.pdf`` when ``saveQ`` is true.
    """
    # Load the track and restrict it to the requested window, if any
    series = load_function(cell_line, chr_number, data_type, True)
    if chrmax != 'max':
        series = series[chrmin:chrmax]

    if smoothQ:
        series = gaussian_filter1d(series, sigma=sigma, mode='constant', cval=np.nan)

    # aspect_ratio controls height relative to width
    plt.figure(figsize=(10, 10 / aspect_ratio))
    axis = plt.gca()

    if standQ:
        series = zscore_with_nan(series)

    # Draw the curve; only the y-label carries text
    axis.plot(series)
    axis.set_xlabel('')
    axis.set_ylabel(title_map[data_type])
    axis.set_title('')

    # Optional y-range and log scale
    if ylimits is not None:
        axis.set_ylim(ylimits)
    if log_scale:
        axis.set_yscale('log')

    # Strip ticks, the x-axis and the grid
    axis.set_yticks([])
    axis.get_xaxis().set_visible(False)
    axis.grid(False)

    if saveQ:
        plt.savefig(f'figures/plotsdetail_{data_type}.pdf', bbox_inches='tight', transparent=True)

    plt.show()

#### Correlations ####

In [48]:
# Function to calculate average correlations and export results to CSV
def correlation_tests_to_csv(cell_lines, chr_numbers, data_types1, data_types2, output_file, chrmin=1, chrmax='max', smoothQ1=False, smoothQ2=False, sigma=50, standQ=True):
    """Average per-chromosome correlations between data tracks and write them to CSV.

    For every cell line and every pair ``(data_type1, data_type2)``, the
    Pearson, Spearman and Kendall's tau coefficients are computed on each
    chromosome and averaged across chromosomes. One CSV row is written per
    (cell line, pair); pairs with no usable chromosome get ``'N/A'``.

    Parameters
    ----------
    cell_lines : iterable of str
    chr_numbers : iterable of int
        Chromosomes whose coefficients are averaged.
    data_types1, data_types2 : iterable of str
        Data-type keys understood by ``load_function`` and ``title_map``.
    output_file : str
        Path of the CSV file to write.
    chrmin, chrmax : int or 'max'
        Slice ``[chrmin:chrmax]`` applied to both tracks. When ``chrmax`` is
        ``'max'`` the whole track is used and ``chrmin`` is ignored.
    smoothQ1, smoothQ2 : bool
        Gaussian-smooth the first / second track before correlating.
    sigma : float
        Standard deviation of the Gaussian kernel.
    standQ : bool
        Z-score both tracks (after masking) before correlating.
    """
    results = []

    for cell_line in cell_lines:
        for data_type1 in data_types1:
            for data_type2 in data_types2:
                # Per-chromosome coefficients for this pair
                pearson_corr_list = []
                spearman_corr_list = []
                kendall_corr_list = []

                for chr_number in chr_numbers:
                    # Load both tracks, restricted to the requested window if any
                    data1 = load_function(cell_line, chr_number, data_type1, True)
                    data2 = load_function(cell_line, chr_number, data_type2, True)
                    if chrmax != 'max':
                        data1 = data1[chrmin:chrmax]
                        data2 = data2[chrmin:chrmax]

                    # Skip chromosomes with no data for either track
                    if data1.size == 0 or data2.size == 0:
                        continue

                    # Apply Gaussian smoothing; samples beyond the array
                    # edges are treated as NaN
                    if smoothQ1:
                        data1 = gaussian_filter1d(data1, sigma=sigma, mode='constant', cval=np.nan)
                    if smoothQ2:
                        data2 = gaussian_filter1d(data2, sigma=sigma, mode='constant', cval=np.nan)

                    # Keep positions where both tracks are finite
                    # (np.isfinite already rejects NaN)
                    valid_mask = np.isfinite(data1) & np.isfinite(data2)
                    data1, data2 = data1[valid_mask], data2[valid_mask]

                    # A correlation needs at least two paired samples; a single
                    # sample gives a NaN z-score and makes pearsonr fail.
                    if len(data1) < 2:
                        continue

                    if standQ:
                        data1 = zscore(data1)
                        data2 = zscore(data2)

                    pearson_corr, _ = pearsonr(data1, data2)
                    pearson_corr_list.append(pearson_corr)

                    spearman_corr, _ = spearmanr(data1, data2)
                    spearman_corr_list.append(spearman_corr)

                    kendall_corr, _ = kendalltau(data1, data2)
                    kendall_corr_list.append(kendall_corr)

                # The three lists always grow together, so one check is enough
                have_results = bool(pearson_corr_list)

                # Store the averaged results (column order is part of the CSV format)
                results.append({
                    "Cell line": cell_line,
                    "Data type 1": title_map[data_type1],
                    "Data type 2": title_map[data_type2],
                    "Spearman": round(np.mean(spearman_corr_list), 3) if have_results else 'N/A',
                    "Kendall's Tau": round(np.mean(kendall_corr_list), 3) if have_results else 'N/A',
                    "Pearson": round(np.mean(pearson_corr_list), 3) if have_results else 'N/A'
                })

    # Convert results to DataFrame and write to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)


In [49]:
def plot_combined_heatmap(data, figsize=(12, 4), saveQ=False):
    """Plot Pearson / Spearman / Kendall heatmaps for every cell line.

    The table is read from ``data/tables/correlations.csv``; the ``data``
    argument is not used. The figure has one row per cell line and one
    column per coefficient, in CSV order, with a shared horizontal colour
    bar on top. ``figsize`` is (width, height per cell-line row). When
    ``saveQ`` is true the figure is written to ``figures/correlations.pdf``.
    """
    table = pd.read_csv('data/tables/correlations.csv')

    # Turn the label columns into ordered categoricals so that the order of
    # first appearance in the CSV drives the layout
    for column in ('Cell line', 'Data type 1', 'Data type 2'):
        table[column] = pd.Categorical(table[column], categories=table[column].unique(), ordered=True)

    cell_types = table['Cell line'].cat.categories
    n_rows = len(cell_types)

    # Figure height scales with the number of cell lines
    fig = plt.figure(figsize=(figsize[0], figsize[1] * n_rows))
    gs = gridspec.GridSpec(n_rows, 3, width_ratios=[1, 1, 1], wspace=0.05, hspace=0.1)

    vmin, vmax = -1, 1  # shared colour range of the heatmaps

    # CSV column name of each coefficient; also used as the panel title
    measures = ('Pearson', 'Spearman', "Kendall's Tau")

    for row, cell_type in enumerate(cell_types):
        subset = table[table['Cell line'] == cell_type]
        row_order = subset['Data type 1'].cat.categories
        col_order = subset['Data type 2'].cat.categories
        last_row = row == n_rows - 1

        for col, measure in enumerate(measures):
            matrix = subset.pivot(index='Data type 1', columns='Data type 2', values=measure).reindex(index=row_order, columns=col_order)

            # Force the diagonal (a data type against itself) to 1
            for name in matrix.columns:
                if name in matrix.index:
                    matrix.loc[name, name] = 1

            ax = fig.add_subplot(gs[row, col])
            sns.heatmap(matrix, annot=True, annot_kws={"rotation": 90}, cmap="coolwarm", cbar=False, vmin=vmin, vmax=vmax, ax=ax)
            ax.set_xlabel('')
            if not last_row:
                ax.set_xticks([])  # x ticks only on the bottom row
            if col > 0:
                ax.set_yticks([])  # y ticks only on the first column
            if row == 0:
                ax.set_title(measure)
            # The cell line name labels the row, on the first column only
            ax.set_ylabel(cell_type if col == 0 else '')

    # Single horizontal colour bar above the grid; [left, bottom, width, height]
    cbar_ax = fig.add_axes([0.05, 0.95, 0.85, 0.02])
    sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=plt.Normalize(vmin=vmin, vmax=vmax))
    sm.set_array([])
    cbar = fig.colorbar(sm, cax=cbar_ax, orientation='horizontal')

    # Ticks on top, labels vertical, axis direction reversed
    cbar.ax.xaxis.set_ticks_position('top')
    cbar.ax.xaxis.set_tick_params(rotation=90)
    cbar.ax.invert_xaxis()

    # Adjust layout
    plt.subplots_adjust(left=0.05, right=0.9, top=0.9, bottom=0.05, wspace=0.05, hspace=0.2)
    if saveQ:
        plt.savefig('figures/correlations.pdf', bbox_inches='tight', transparent=True)
    plt.show()

#### Fragile sites plots ####

In [50]:
# Function to calculate continuous segment lengths where values > threshold
def find_continuous_segments(data, threshold):
    """Return the lengths of consecutive runs in `data` strictly above `threshold`.

    Runs are reported in order of appearance; a run that reaches the end of
    `data` is included. An input with no value above the threshold gives [].
    """
    run_lengths = []
    run_start = None  # index where the current run began, None outside a run
    position = -1
    for position, value in enumerate(data):
        above = value > threshold
        if above and run_start is None:
            run_start = position
        elif not above and run_start is not None:
            run_lengths.append(position - run_start)
            run_start = None
    # Close a run that is still open at the end of the data
    if run_start is not None:
        run_lengths.append(position + 1 - run_start)
    return run_lengths

In [51]:
def fragiles_plot(cell_line, common_fragile_sites, rare_fragile_sites, threshold, saveQ=False):
    """Summarise model misfit inside common and rare fragile sites.

    Both site lists hold ``(site_label, chrom)`` pairs. For each site the
    error track of `cell_line` on `chrom` is read at the site's positions;
    values above `threshold` count as misfit. The figure has two stacked
    panels sharing the x-axis: box plots of the lengths of continuous misfit
    runs per site (top), and bars of total site length with the misfit
    length overlaid (bottom). Common sites are drawn first, then rare sites
    after a gap. When `saveQ` is true the figure is written to
    ``figures/fragile_box.pdf``.
    """
    n_common = len(common_fragile_sites)
    n_rare = len(rare_fragile_sites)

    # Collect per-site error values, labels, total lengths and misfit lengths
    site_errors = []
    site_labels = []
    site_lengths = []
    site_misfit_lengths = []
    for site_label, chrom in common_fragile_sites + rare_fragile_sites:
        # The last character of the label is passed as the site letter
        positions = load_function_pos(chrom, "fragile_sites", cell_line=None, site_letter=site_label[-1], base='A')
        data = load_function(cell_line, chrom, "error")
        errors_here = data[positions]

        site_errors.append(errors_here)
        site_labels.append(site_label)
        site_lengths.append(len(errors_here))
        site_misfit_lengths.append(np.sum(np.array(errors_here) > threshold))

    # Lengths of the continuous above-threshold runs in each site
    run_lengths = [find_continuous_segments(errors_here, threshold) for errors_here in site_errors]

    # Two stacked panels: box plots on top, bars below
    fig, (box_ax, bar_ax) = plt.subplots(2, 1, figsize=(8, 5), sharex=True, gridspec_kw={'height_ratios': [1, .8]})

    spacing = 2      # gap between the common and rare groups
    bar_width = 0.5  # width of boxes and bars

    # x positions of the two groups
    index_common = np.arange(1, n_common + 1)
    index_rare = np.arange(n_common + spacing, n_common + spacing + n_rare)
    index = np.concatenate([index_common, index_rare])
    divider_x = n_common + spacing / 2

    # --- Top panel: continuous misfit regions -------------------------------
    box_style = dict(widths=bar_width, patch_artist=True,
                     medianprops=dict(color='red'), whiskerprops=dict(color='black'))
    box_ax.boxplot(run_lengths[:n_common], positions=index_common,
                   boxprops=dict(facecolor='lightblue', color='black'), **box_style)
    box_ax.boxplot(run_lengths[n_common:], positions=index_rare,
                   boxprops=dict(facecolor='lightgreen', color='black'), **box_style)

    # Dashed separator between the two groups
    box_ax.axvline(divider_x, color='gray', linestyle='--')

    # Group captions just above the panel
    caption_y = box_ax.get_ylim()[1] * 1.04
    box_ax.text(n_common / 2 + 0.5, caption_y, 'Common fragile sites',
                ha='center', va='bottom', fontsize=10)
    box_ax.text(n_common + spacing - 0.5 + n_rare / 2, caption_y,
                'Rare fragile sites', ha='center', va='bottom', fontsize=10)

    box_ax.set_ylabel('Continuous misfit regions (kb)', fontsize=10)
    box_ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

    # --- Bottom panel: site length with misfit length overlaid --------------
    lengths_scaled = np.array(site_lengths) / 1000
    misfit_scaled = np.array(site_misfit_lengths) / 1000

    bar_ax.bar(index_common, lengths_scaled[:n_common], bar_width, label='Common FS', color='lightblue')
    bar_ax.bar(index_rare, lengths_scaled[n_common:], bar_width, label='Rare FS', color='lightgreen')
    bar_ax.bar(index_common, misfit_scaled[:n_common], bar_width, color='firebrick', label='Misfit fraction')
    bar_ax.bar(index_rare, misfit_scaled[n_common:], bar_width, color='firebrick')

    bar_ax.axvline(divider_x, color='gray', linestyle='--')
    bar_ax.set_ylabel('Length (Mb)', fontsize=10)

    # Site labels appear on the bottom panel only
    bar_ax.set_xticks(index)
    bar_ax.set_xticklabels(site_labels, rotation=45, ha='right')
    bar_ax.legend(loc='upper right', fontsize=9)

    # Leave room between the outermost sites and the frame
    bar_ax.set_xlim(0, n_common + n_rare + spacing)

    # Layout
    plt.subplots_adjust(hspace=1)
    plt.tight_layout(pad=-0.1)

    if saveQ:
        plt.savefig('figures/fragile_box.pdf', bbox_inches='tight', transparent=True)

    plt.show()

#### Genes plots ####

In [52]:
def genes_plot(cell_line, genes, threshold, fragile_sites, saveQ=False):
    """Bar chart of gene lengths with the misfit part overlaid.

    `genes` holds ``(gene_name, chrom)`` pairs and `fragile_sites` the label
    shown under each gene name, in the same order. For each gene the error
    track of `cell_line` is read at the gene's positions; values above
    `threshold` count as misfit. Genes are drawn from longest to shortest.
    When `saveQ` is true the figure is written to ``figures/gene_bars.pdf``.
    """
    tick_labels = []
    lengths = []
    misfit_fractions = []

    for idx, (gene_name, chrom) in enumerate(genes):
        # Error values of this chromosome at the gene's positions
        gene_positions = load_function_pos(chrom, "gene", gene_name=gene_name)
        chrom_errors = load_function(cell_line, chrom, "error")
        gene_errors = chrom_errors[gene_positions]

        # Two-line tick label: gene name over its fragile site
        tick_labels.append(f"{gene_name}\n{fragile_sites[idx]}")

        n_bins = len(gene_errors)
        n_misfit = np.sum(np.array(gene_errors) > threshold)
        lengths.append(n_bins)
        misfit_fractions.append(n_misfit / n_bins if n_bins > 0 else 0)

    # Order everything by gene length, longest first
    order = np.argsort(lengths)[::-1]
    lengths = np.array(lengths)[order]
    misfit_fractions = np.array(misfit_fractions)[order]
    tick_labels = np.array(tick_labels)[order]

    # Lengths divided by 1000 for the 'Length (Mb)' axis
    lengths_mb = lengths / 1000
    misfit_mb = lengths * misfit_fractions / 1000

    fig, axis = plt.subplots(figsize=(8, 4))

    bar_width = 0.4
    x_pos = np.arange(len(genes))

    # Total length first, misfit part drawn over it from the baseline
    axis.bar(x_pos, lengths_mb, bar_width, label='Gene', color='wheat')
    axis.bar(x_pos, misfit_mb, bar_width, color='firebrick', label='Misfit fraction', bottom=0)

    axis.set_ylabel('Length (Mb)', fontsize=12)
    axis.set_title('Large genes', fontsize=14)

    axis.set_xticks(x_pos)
    axis.set_xticklabels(tick_labels, rotation=45, ha='center', fontsize=10)

    axis.legend(loc='upper right', fontsize=10)

    plt.tight_layout()

    if saveQ:
        plt.savefig('figures/gene_bars.pdf', bbox_inches='tight', transparent=True)

    plt.show()

In [None]:
def gene_detector_error(cell_line="H1", threshold=10**2.8, nmax=10):
    """List, per chromosome, the coding genes that overlap high-error peaks.

    For chromosomes 1-22 the error track of `cell_line` is scanned for local
    maxima above `threshold`; each peak is widened by `nmax` bins on both
    sides. Every BigBed record marked ``'coding'`` whose 1 kb bins intersect
    one of those windows contributes its gene name. Names are sorted by gene
    length (longest first), de-duplicated and written to
    ``data/genome_regions/genes/gene_error_chr[N].txt``.

    Parameters
    ----------
    cell_line : str
        Cell line whose error track is analysed.
    threshold : float
        Minimum error value for a local maximum to count as a peak.
    nmax : int
        Number of bins added on each side of a peak.
    """
    chr_numbers = range(1, 23)
    data_type = "error"

    # One set of flagged bins per chromosome. Building the sets once here
    # avoids rebuilding them for every BigBed record below.
    positions = []
    for chr_number in chr_numbers:
        error = load_function(cell_line, chr_number, data_type)
        local_max0 = argrelextrema(error, np.greater)[0]

        flagged_bins = set()
        for maxi in local_max0:
            if error[maxi] > threshold:
                # Window clipped to [0, chromosome length]
                flagged_bins.update(range(max(0, maxi - nmax), min(chr_lengths[chr_number-1], maxi + nmax) + 1))
        positions.append(flagged_bins)

    # Input BigBed file and output directory
    bigbed_path = r'data/genome_regions/genes/gencodeV46.bb'
    output_dir = r'data/genome_regions/genes/'

    # Open the BigBed file using pybigtools
    bb = pybigtools.open(bigbed_path)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # List of chromosomes as strings (1-22)
    chromosomes = [str(i) for i in range(1, 23)]

    for i, chrom in enumerate(chromosomes):
        flagged_bins = positions[i]
        gene_names = []
        gene_lengths = []

        # Query all records for the chromosome
        for record in bb.records(f"chr{chrom}"):
            chrom_start = record[0]
            chrom_end = record[1]
            fields = record[2:]  # Additional fields, including transcript class

            # NOTE(review): assumes the transcript class is field 17 and the
            # gene name field 14 of this BigBed schema; confirm against the file.
            if fields[17] != 'coding':
                continue

            # 1 kb bins covered by the record, clipped to the chromosome length
            rangepos = range(chrom_start // 1000, min(chrom_end // 1000 + 1, chr_lengths[i]))

            if not flagged_bins.isdisjoint(rangepos):
                gene_names.append(fields[14])
                gene_lengths.append(chrom_end // 1000 - chrom_start // 1000)

        # Longest genes first; dict.fromkeys removes repeated names while
        # keeping the first (longest) occurrence
        genes_with_lengths = list(zip(gene_names, gene_lengths))
        genes_with_lengths.sort(key=lambda x: x[1], reverse=True)
        sorted_gene_names = list(dict.fromkeys(gene for gene, length in genes_with_lengths))

        # Save the sorted and deduplicated gene names to a text file
        np.savetxt(f"data/genome_regions/genes/gene_error_chr[{i+1}].txt", sorted_gene_names, fmt='%s')


In [None]:
def gene_detector_fragile_sites(chr_number, max_genes=12):
    """Find top-ranked genes of a chromosome that overlap fragile sites.

    The ranked gene list is read from
    ``data/genome_regions/genes/gene_error_chr[<chr_number>].txt`` and cut to
    its first ``max_genes`` entries. Each gene's span is looked up in the
    BigBed file ``data/genome_regions/genes/gencodeV46.bb`` (first record
    whose gene-name field matches) and converted to kb by integer division by
    1000. A gene matches a fragile site when at least one position listed in
    ``positions_fragile_site_<chr_number><letter>.txt`` lies inside the
    gene's inclusive ``[start, end]`` span.

    Args:
        chr_number: Chromosome identifier used in the file names and in the
            BigBed query ``chr<chr_number>``.
        max_genes: How many gene names to take from the top of the gene file.

    Returns:
        Two equally long lists ``(matching_genes, matching_fragile_sites)``.
        A gene is listed once per overlapping fragile site; the second list
        holds a description naming the site and the first overlapping
        position found in the site file. Both lists are empty when the gene
        file does not exist.
    """
    gene_file = f"data/genome_regions/genes/gene_error_chr[{chr_number}].txt"

    if not os.path.exists(gene_file):
        print(f"No gene file found for chromosome {chr_number}.")
        return [], []

    # ndmin=1 keeps a one-line file iterable (otherwise loadtxt returns 0-d).
    gene_names = np.loadtxt(gene_file, dtype=str, ndmin=1)[:max_genes]

    # Load BigBed file for gene data
    bigbed_path = r'data/genome_regions/genes/gencodeV46.bb'
    bb = pybigtools.open(bigbed_path)

    # Fragile-site positions keyed by site letter (only letters with a file).
    fragile_sites = {}
    for site_letter in string.ascii_uppercase:
        site_file = f"data/genome_regions/fragile_sites/positions_fragile_site_{chr_number}{site_letter}.txt"
        if os.path.exists(site_file):
            # ndmin=1: a site file holding a single position must still be a
            # 1-D array so it can be scanned below.
            fragile_sites[site_letter] = np.loadtxt(site_file, dtype=int, ndmin=1)

    # Scan the chromosome's records once and remember the first record of
    # every requested gene, instead of re-reading all records per gene.
    wanted = {str(name) for name in gene_names}
    gene_spans = {}
    if wanted:
        for record in bb.records(f"chr{chr_number}"):
            fields = record[2:]  # Additional fields, including the gene name
            name = fields[14]
            if name in wanted and name not in gene_spans:
                gene_spans[name] = (record[0] // 1000, record[1] // 1000)
                if len(gene_spans) == len(wanted):
                    break  # every requested gene has been located

    matching_genes = []
    matching_fragile_sites = []

    # Walk the genes in file order so the output keeps the ranking.
    for gene_name in gene_names:
        span = gene_spans.get(str(gene_name))
        if span is None:
            continue  # gene not present in the BigBed records
        chrom_start, chrom_end = span

        for site_letter, site_positions in fragile_sites.items():
            inside = np.flatnonzero(
                (site_positions >= chrom_start) & (site_positions <= chrom_end)
            )
            if inside.size:
                # Report only the first overlapping position of this site.
                fragile_site_pos = site_positions[inside[0]]
                matching_genes.append(gene_name)
                matching_fragile_sites.append(f"Fragile site {chr_number}{site_letter} at {fragile_site_pos} kb")

    return matching_genes, matching_fragile_sites

#### Pie and bar charts ####

In [53]:
def generate_pie_chart(data_lists, labels, ax, title=''):
    """Draw a pie chart on ``ax`` whose slices are sized by list length.

    Args:
        data_lists: One sequence per slice; only the number of elements in
            each sequence is used.
        labels: Slice labels, one per entry of ``data_lists``.
        ax: Axes object the pie is drawn on.
        title: Title placed above the pie.

    Raises:
        ValueError: If ``data_lists`` and ``labels`` differ in length.
    """
    n_slices = len(data_lists)
    if n_slices != len(labels):
        raise ValueError("The number of data lists and labels must be the same.")

    # Slice size = element count of the corresponding list.
    slice_sizes = list(map(len, data_lists))

    # Request at least three 'flare' shades and skip the first one, then take
    # one shade per slice.
    palette = sns.color_palette('flare', max(n_slices + 1, 3))
    slice_colors = palette[1:n_slices + 1]

    _, _, pct_texts = ax.pie(
        slice_sizes,
        labels=labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=slice_colors,
    )

    # Percentages are printed in white on top of the slices.
    for pct_text in pct_texts:
        pct_text.set_color('white')

    # Equal aspect ratio so the pie is a circle.
    ax.axis('equal')
    ax.set_title(title, fontsize=11)


def plot_four_pie_charts(cell_line, chr_numbers, error_threshold, time_threshold, saveQ=False):
    """Plot four pie charts showing where bins with high error are located.

    For every chromosome the per-bin ``"error"`` and ``"time_data"`` arrays
    are loaded with ``load_function``. Bins whose error exceeds
    ``error_threshold`` are then split by:

    1. replication timing (``time > time_threshold`` is "Late"),
    2. coding vs non-coding positions,
    3. AT vs GC positions,
    4. telomeres, centromeres, common/rare fragile sites and everything else.

    Args:
        cell_line: Cell line name forwarded to ``load_function``.
        chr_numbers: Iterable of chromosome numbers to pool.
        error_threshold: Only bins with error strictly above this are counted.
        time_threshold: Boundary between early (``<=``) and late (``>``) bins.
        saveQ: When true, the figure is written to ``figures/piecharts.pdf``.
    """
    # Set up the figure for subplots (4 pie charts)
    fig, axs = plt.subplots(2, 2, figsize=(6, 6))
    axs = axs.flatten()

    ### Initialize data lists for all pie charts ###
    late_data, early_data = [], []
    coding_data, noncoding_data = [], []
    at_data, gc_data = [], []
    telomere_data, centromere_data, cfs_data, rfs_data, other_data = [], [], [], [], []

    common_sites = [
        "1A", "1B", "1C", "1D", "2A", "2B", "2C", "3B", "3C", "3D", "6E", "6F",
        "7C", "7G", "7H", "7Q", "9A", "9B", "9C", "9D", "10A", "10B", "10C",
        "11A", "11B", "12A", "12B", "12C", "16B", "16C", "16D", "17A", "17B",
        "18B", "18C", "19A", "19B", "20A", "21A"
    ]

    rare_sites = [
        "1E", "1F", "1G", "1H", "4A", "4B", "5A", "5B", "5C", "8A", "8B", "8C",
        "13A", "13B", "13C", "14A", "14B", "15A", "15B", "16E", "17C", "18A",
        "20B", "21B", "22A", "22B"
    ]

    def collect_site_positions(chr_number, site_names, n_bins):
        """Return the in-bounds positions of all listed sites on one chromosome."""
        collected = [np.array([], dtype=int)]
        for site in site_names:
            if re.match(f"^{chr_number}[A-Z]$", site):
                site_pos = load_function_pos(chr_number, "fragile_sites", site_letter=site[-1])
                # Bounds check: keep only positions that index into `error`.
                collected.append(np.intersect1d(site_pos, np.arange(n_bins)))
        return np.concatenate(collected).astype(int)

    ### Loop through all chromosomes and collect data for all charts ###
    for chr_number in chr_numbers:
        # Load the data for the current chromosome
        error = load_function(cell_line, chr_number, "error")
        # Named `timing` so the imported `time` module is not shadowed.
        timing = load_function(cell_line, chr_number, "time_data")
        n_bins = len(error)
        all_positions = np.arange(n_bins)
        high_error = error > error_threshold

        # Replication timing (Late vs Early)
        late_data.extend(error[(timing > time_threshold) & high_error])
        early_data.extend(error[(timing <= time_threshold) & high_error])

        # Coding vs non-coding regions
        positions_coding = load_function_pos(chr_number, "coding")
        positions_noncoding = np.setdiff1d(all_positions, positions_coding)
        error_coding = error[positions_coding]
        error_noncoding = error[positions_noncoding]
        coding_data.extend(error_coding[error_coding > error_threshold])
        noncoding_data.extend(error_noncoding[error_noncoding > error_threshold])

        # Base composition (AT vs GC)
        positions_A = load_function_pos(chr_number, "bases", base='A')
        positions_T = load_function_pos(chr_number, "bases", base='T')
        positions_G = load_function_pos(chr_number, "bases", base='G')
        positions_C = load_function_pos(chr_number, "bases", base='C')
        error_AT = error[np.concatenate((positions_A, positions_T))]
        error_GC = error[np.concatenate((positions_G, positions_C))]
        at_data.extend(error_AT[error_AT > error_threshold])
        gc_data.extend(error_GC[error_GC > error_threshold])

        # Genomic regions (Telomeres, Centromeres, CFS, RFS, Others)
        positions_telomeres = load_function_pos(chr_number, "telomeres")
        positions_centromeres = load_function_pos(chr_number, "centromeres")
        error_telomeres = error[positions_telomeres]
        error_centromeres = error[positions_centromeres]

        # Positions of *every* common / rare site on this chromosome. They are
        # rebuilt per chromosome so that "Other" excludes all of them and
        # never reuses positions left over from a previous chromosome.
        positions_common_sites = collect_site_positions(chr_number, common_sites, n_bins)
        positions_rare_sites = collect_site_positions(chr_number, rare_sites, n_bins)
        error_common_sites = error[positions_common_sites]
        error_rare_sites = error[positions_rare_sites]

        # Other regions: everything outside telomeres, centromeres and fragile sites
        positions_fragile = np.concatenate(
            (positions_telomeres, positions_centromeres, positions_common_sites, positions_rare_sites)
        )
        error_other = error[np.setdiff1d(all_positions, positions_fragile)]

        telomere_data.extend(error_telomeres[error_telomeres > error_threshold])
        centromere_data.extend(error_centromeres[error_centromeres > error_threshold])
        cfs_data.extend(error_common_sites[error_common_sites > error_threshold])
        rfs_data.extend(error_rare_sites[error_rare_sites > error_threshold])
        other_data.extend(error_other[error_other > error_threshold])

    ### Generate Pie Charts ###
    # Pie chart 1: Replication timing (Late vs Early)
    generate_pie_chart([late_data, early_data], ['Late', 'Early'], axs[0], title='Replication timing')

    # Pie chart 2: Coding vs Non-coding regions
    generate_pie_chart([noncoding_data, coding_data], ['Non-coding','Coding'], axs[1], title='Coding regions')

    # Pie chart 3: Base composition (AT vs GC)
    generate_pie_chart([at_data, gc_data], ['AT', 'GC'], axs[2], title='Base composition')

    # Pie chart 4: Genomic regions (Telomeres, Centromeres, CFS, RFS, Others)
    data_lists_4 = [rfs_data, cfs_data, other_data, centromere_data, telomere_data]
    labels_4 = ['RFS','CFS','Other','Centromeres','Telomeres']
    generate_pie_chart(data_lists_4, labels_4, axs[3], title='Genomic regions')

    # Adjust the layout first so the saved file matches what is displayed.
    plt.tight_layout()

    # Save plot
    if saveQ:
        plt.savefig('figures/piecharts.pdf', bbox_inches='tight', transparent=True)

    plt.show()

In [None]:
def barplots(cell_line="H1", chr_numbers=range(1, 23), error_threshold=10**2,
             time_threshold=250, saveQ=True):
    """Bar plot of the high-error fraction per category, plus error KDE plots.

    For each chromosome and each category (early/late timing, coding /
    non-coding, AT / GC, telomeres, fragile sites, other) the fraction of
    bins whose error exceeds ``error_threshold`` is computed. The bar plot
    shows the mean of these per-chromosome fractions. Afterwards the pooled
    error values of the categories are passed to ``plot_relative_kdes``.

    Args:
        cell_line: Cell line name forwarded to ``load_function``.
        chr_numbers: Iterable of chromosome numbers to process.
        error_threshold: A bin counts as high-error when its error is
            strictly above this value.
        time_threshold: Boundary between early (``<=``) and late (``>``) bins.
        saveQ: When true, the bar plot is written to
            ``figures/barplots_err.pdf``; the flag is also forwarded to
            ``plot_relative_kdes``.
    """
    late_data, early_data = [], []
    coding_data, noncoding_data = [], []
    at_data, gc_data = [], []
    telomere_data, centromere_data, cfs_data, rfs_data, other_data = [], [], [], [], []

    err_early_data, err_late_data = [], []
    err_coding_data, err_noncoding_data = [], []
    err_at_data, err_gc_data = [], []
    err_centromeres_data, err_telomeres_data = [], []
    err_cfs_data, err_rfs_data = [], []

    common_sites = ["1A", "1B", "1C", "1D", "2A", "2B", "2C", "3B", "3C", "3D", "6E", "6F",
        "7C", "7G", "7H", "7Q", "9A", "9B", "9C", "9D", "10A", "10B", "10C",
        "11A", "11B", "12A", "12B", "12C", "16B", "16C", "16D", "17A", "17B",
        "18B", "18C", "19A", "19B", "20A", "21A"]
    rare_sites = ["1E", "1F", "1G", "1H", "4A", "4B", "5A", "5B", "5C", "8A", "8B", "8C",
        "13A", "13B", "13C", "14A", "14B", "15A", "15B", "16E", "17C", "18A",
        "20B", "21B", "22A", "22B"]

    def record_fraction(target, values):
        """Append the share of ``values`` above the threshold; skip empty input."""
        values = np.asarray(values)
        # An empty category has no defined fraction; skipping it avoids a
        # division by zero and leaves it out of the mean.
        if values.size:
            target.append(np.count_nonzero(values > error_threshold) / values.size)

    def collect_site_positions(chr_number, site_names, n_bins):
        """Return the in-bounds positions of all listed sites on one chromosome."""
        collected = [np.array([], dtype=int)]
        for site in site_names:
            if re.match(f"^{chr_number}[A-Z]$", site):
                site_pos = load_function_pos(chr_number, "fragile_sites", site_letter=site[-1])
                # Bounds check: keep only positions that index into `error`.
                collected.append(np.intersect1d(site_pos, np.arange(n_bins)))
        return np.concatenate(collected).astype(int)

    ### Loop through all chromosomes and collect data for all charts ###
    for chr_number in chr_numbers:
        # Load the data for the current chromosome
        error = load_function(cell_line, chr_number, "error")
        # Named `timing` so the imported `time` module is not shadowed.
        timing = load_function(cell_line, chr_number, "time_data")
        n_bins = len(error)
        all_positions = np.arange(n_bins)

        # Replication timing (Late vs Early)
        error_late = error[timing > time_threshold]
        error_early = error[timing <= time_threshold]
        record_fraction(late_data, error_late)
        record_fraction(early_data, error_early)
        err_late_data.extend(error_late)
        err_early_data.extend(error_early)

        # Coding vs non-coding regions
        positions_coding = load_function_pos(chr_number, "coding")
        positions_noncoding = np.setdiff1d(all_positions, positions_coding)
        error_coding = error[positions_coding]
        error_noncoding = error[positions_noncoding]
        record_fraction(coding_data, error_coding)
        record_fraction(noncoding_data, error_noncoding)
        err_coding_data.extend(error_coding)
        err_noncoding_data.extend(error_noncoding)

        # Base composition (AT vs GC)
        positions_A = load_function_pos(chr_number, "bases", base='A')
        positions_T = load_function_pos(chr_number, "bases", base='T')
        positions_G = load_function_pos(chr_number, "bases", base='G')
        positions_C = load_function_pos(chr_number, "bases", base='C')
        error_at = error[np.concatenate((positions_A, positions_T))]
        error_gc = error[np.concatenate((positions_G, positions_C))]
        record_fraction(at_data, error_at)
        record_fraction(gc_data, error_gc)
        err_at_data.extend(error_at)
        err_gc_data.extend(error_gc)

        # Genomic regions (Telomeres, Centromeres, CFS, RFS, Others)
        positions_telomeres = load_function_pos(chr_number, "telomeres")
        positions_centromeres = load_function_pos(chr_number, "centromeres")
        error_telomeres = error[positions_telomeres]
        error_centromeres = error[positions_centromeres]

        # Positions of *every* common / rare site on this chromosome. They are
        # rebuilt per chromosome so that "Other" excludes all of them and
        # never reuses positions left over from a previous chromosome.
        positions_common_sites = collect_site_positions(chr_number, common_sites, n_bins)
        positions_rare_sites = collect_site_positions(chr_number, rare_sites, n_bins)
        error_common_sites = error[positions_common_sites]
        error_rare_sites = error[positions_rare_sites]

        # Other regions: everything outside telomeres, centromeres and fragile sites
        positions_fragile = np.concatenate(
            (positions_telomeres, positions_centromeres, positions_common_sites, positions_rare_sites)
        )
        error_other = error[np.setdiff1d(all_positions, positions_fragile)]

        record_fraction(telomere_data, error_telomeres)
        # The centromere fraction is taken from the error values at the
        # centromere positions, like every other category.
        # NOTE(review): centromere_data is not part of the bar plot below;
        # confirm whether it should be added.
        record_fraction(centromere_data, error_centromeres)
        record_fraction(cfs_data, error_common_sites)
        record_fraction(rfs_data, error_rare_sites)
        record_fraction(other_data, error_other)
        err_centromeres_data.extend(error_centromeres)
        err_telomeres_data.extend(error_telomeres)
        err_cfs_data.extend(error_common_sites)
        err_rfs_data.extend(error_rare_sites)

    data_total = [early_data, late_data,
                  coding_data, noncoding_data,
                  at_data, gc_data,
                  telomere_data, cfs_data, rfs_data, other_data]

    # Mean over chromosomes of the per-chromosome fractions (values in [0, 1]).
    mean_data_total = [np.mean(dat) for dat in data_total]
    labels = ["Early", "Late", "Coding", "Non-coding", "AT", "GC", "Telomeres", "CFS", "RFS", "Other"]

    plt.figure(figsize=(10, 6))
    plt.bar(labels, mean_data_total)

    # Add labels and title
    plt.xlabel("Categories")
    plt.ylabel("Percentage")
    plt.title("Percentage Bar Plot by Category")

    plt.tight_layout()

    if saveQ:
        plt.savefig('figures/barplots_err.pdf', bbox_inches='tight', transparent=True)
    plt.show()

    # Error distributions of the pooled bins, one KDE figure per grouping.
    kde_groups = [
        (["Early", "Late"], [err_early_data, err_late_data]),
        (["Coding", "Non-coding"], [err_coding_data, err_noncoding_data]),
        (["GC", "AT"], [err_gc_data, err_at_data]),
        (["RFS", "CFS", "Centromeres", "Telomeres"],
         [err_rfs_data, err_cfs_data, err_centromeres_data, err_telomeres_data]),
    ]
    for kde_labels, kde_values in kde_groups:
        plot_relative_kdes([np.array(values) for values in kde_values], kde_labels,
                           log_scale=True, x_min=10**-8, x_max=10**6,
                           plot_title="", x_title="error", normalize=False,
                           bw_adjust=1, saveQ=saveQ)
        

#### Tests ####