##### <font color='red'>Warning:</font> The following code modifies data, code, and fitting files. Proceed with caution. ######

#### Import functions ####

In [1]:
from ipynb.fs.full.utilities_v2 import *

#### Data generation ####

In [None]:
### Generate data (replace bigWig and MAT files) ###
# Skip this cell if the data/whole-genome_timing_data and
# data/whole-genome_missing_data files already exist.

# Cell lines, grouped by the generator function that handles them
cell_lines_BigWig = ["HeLa-S3", "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7"]
cell_lines_HighRes = ["H1", "H9", "HCT"]
cell_lines = cell_lines_BigWig + cell_lines_HighRes
chr_numbers = range(1, 23)

# One generator call per (cell line, chromosome) pair
for cell_line in cell_lines:
    # cell_lines is the concatenation of the two groups, so any line that is
    # not a BigWig line is a HighRes line.
    generate = datagenBigWig if cell_line in cell_lines_BigWig else datagenHighRes
    for chr_number in chr_numbers:
        file_name = f'{cell_line}_chr[{chr_number}]'
        # Progress indicator, overwritten in place via the carriage return
        print(f'{cell_line} - chr {chr_number}', end="\r")
        generate(
            cell_line, chr_number, 0, 1,
            resolution=1000, alld=True, dtscale=6, saveQ=True, info=file_name,
        )

#### Fitting ####

In [None]:
# Model parameters
cell_line = "HeLa-S3"  # Default cell line
chr_number = 1  # Default chromosome number
hpcQ = False # Option to run in HPC for whole-genome results
if hpcQ:
    # Optional command-line overrides of the defaults above:
    #   -cl <cell line>   -cn <chromosome number>
    parser = argparse.ArgumentParser()
    parser.add_argument("-cl", required=False)
    parser.add_argument("-cn", required=False, type=int)
    args = parser.parse_args()
    # argparse leaves an omitted option as None. Checking the parsed value
    # (instead of searching sys.argv for the literal flag) also honours the
    # "-cl=VALUE" / "-cn=VALUE" spelling, which argparse accepts but which
    # never appears as a bare "-cl" / "-cn" element in sys.argv.
    if args.cl is not None:
        cell_line = str(args.cl)
    if args.cn is not None:
        # Already converted by type=int; a non-integer value is reported by
        # argparse as a usage error.
        chr_number = args.cn

# Bounds of the chromosome window
chrpos_min = 10000
chrpos_max = 20000
# Chromosome positions: (chrpos_max - chrpos_min) points starting at chrpos_min.
# endpoint=False keeps the spacing at exactly 1 (chrpos_min, chrpos_min + 1, ...,
# chrpos_max - 1); including the endpoint with the same number of points would
# stretch the step to (max - min) / (max - min - 1).
x = np.linspace(chrpos_min, chrpos_max, chrpos_max - chrpos_min, endpoint=False)
fork_speed = 1.4 # Fork speed
resolution = 1000 # (1 kb)
scale_factor = 6 # Scales the data
all_dataQ = False # Picks whether to fit an entire genome


# Fitting parameters
int_width = 2000
fit_step = 2
iterations = 100
err_threshold = 15


def int_widthf(time_data):
    """Return len(time_data) divided by int_width, truncated to an int.

    int_width is read from the module namespace at call time.
    """
    n_points = len(time_data)
    return int(n_points / int_width)

# Saving (Warning: replaces existing files)
saveQ = False
# The file name carries the position range only when all_dataQ is off
if all_dataQ:
    file_name = f'{cell_line}_chr[{chr_number}]'
else:
    file_name = f'{cell_line}_chr[{chr_number}]_{chrpos_min}-{chrpos_max}'

# Single files
sing_filesQ = True
if sing_filesQ:
    # Data generation
    time_data = datagenfs(
        cell_line, chr_number, chrpos_min, chrpos_max,
        resolution, all_dataQ, scale_factor, saveQ, file_name,
    )
    # Fitting
    fire_rates, time_sim = fitfunction(
        time_data, fork_speed, int_widthf(time_data),
        fit_step, iterations, err_threshold, saveQ, file_name,
    )

# Multiple file fitting (long computation)
mult_fileQ = False
if mult_fileQ:
    # Whole-genome parameters
    # NOTE(review): H1, H9 and HCT appear in the data-generation, bedgraph and
    # BCS cells but not in this list — confirm the omission is intended.
    cell_lines = ["HeLa-S3", "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7"]
    chr_range = range(1, 23)
    for cell_line_i in cell_lines:
        for chr_number_i in chr_range:
            print(f'{cell_line_i} chr {chr_number_i}/22')
            # Same naming scheme as the single-file case: the position range
            # is appended only when all_dataQ is off
            if all_dataQ:
                file_name = f'{cell_line_i}_chr[{chr_number_i}]'
            else:
                file_name = f'{cell_line_i}_chr[{chr_number_i}]_{chrpos_min}-{chrpos_max}'
            # Data generation
            time_data = datagenfs(
                cell_line_i, chr_number_i, chrpos_min, chrpos_max,
                resolution, all_dataQ, scale_factor, saveQ, file_name,
            )
            # Fitting
            fire_rates, time_sim = fitfunction(
                time_data, fork_speed, int_widthf(time_data),
                fit_step, iterations, err_threshold, saveQ, file_name,
            )

#### bedgraph file generation ####

In [2]:
data_types = ['error', 'fire_rates']

# Each genome build is paired with the cell lines converted under it; the
# groups are processed in order, every data type for all lines of a group.
for genome_build, cell_lines in (
    ('hg19', ["HeLa-S3", "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7"]),
    ('hg38', ["H1", "H9", "HCT"]),
):
    for data_type in data_types:
        for cell_line in cell_lines:
            txt_to_bedgraph(cell_line, data_type=data_type, genome_build=genome_build)

BEDGRAPH file created: data/whole-genome_firing_rates/bedgraph_files/fire_rates_HCT.bedgraph

#### BCS file generation ####

In [None]:
# Skip if data/bcs_scripts files already exist
# These should be run using Beacon Calculus. HPC recommended.
# Single files #
sing_filesQ = False
if sing_filesQ:
    cell_line = "HeLa-S3"
    chr_number = 1
    chrpos_min = 3000
    chrpos_max = 3200
    fork_speed = 1.4
    resolution = 1000 # (1 kb)

    # Load the saved firing rates for this chromosome and keep only the
    # [chrpos_min, chrpos_max) slice
    fire_rates = np.loadtxt(
        f'data/whole-genome_firing_rates/fire_rates_{cell_line}_chr[{chr_number}].txt',
        dtype='float64',
    )
    fire_rates = fire_rates[chrpos_min:chrpos_max]

    bcs_gen(cell_line, chr_number, chrpos_min, chrpos_max, fork_speed, fire_rates, resolution)

# Multiple bcs generation (long computation) #
mult_fileQ = False
if mult_fileQ:
    cell_lines = ["HeLa-S3", "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7", "H1", "H9", "HCT"]
    chr_range = range(1, 23)
    fork_speed = 1.4
    resolution = 1000
    int_step = 10000 # sets the number of bcs simulation points

    for cell_line_i in cell_lines:
        for chr_number_i in chr_range:
            fire_rates = np.loadtxt(
                f'data/whole-genome_firing_rates/fire_rates_{cell_line_i}_chr[{chr_number_i}].txt',
                dtype='float64',
            )
            # Consecutive [start, end] index pairs of at most int_step entries
            # that together cover the whole fire_rates array
            partition_intervals = [
                [start, min(start + int_step, len(fire_rates))]
                for start in range(0, len(fire_rates), int_step)
            ]

            for partition_intervals_i in partition_intervals:
                start_i, end_i = partition_intervals_i
                bcs_gen(cell_line_i, chr_number_i, start_i, end_i, fork_speed, fire_rates[start_i:end_i], resolution)

#### BCS simulation output ####

In [None]:
# Example usage
# Runs process_intervals for the listed cell lines and chromosome numbers,
# passing sim_number through as a keyword argument.
# Only "HeLa-S3" is enabled; entries previously listed but disabled:
# "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7"
cell_lines = ["HeLa-S3"]
chr_numbers = [1]
sim_number=500

process_intervals(cell_lines, chr_numbers, sim_number=sim_number)

In [9]:
# Join all 10,000 kb files into one
# Calls join_files for one cell line and one chromosome.
cell_line = "HeLa-S3"
chr_number = 1

# Third argument presumably selects which output type is joined — TODO confirm
join_files(cell_line, chr_number, "fork_directionality")

In [4]:
# Runs compute_interorigin_intervals for the listed cell lines and chromosome numbers.
# Only "HeLa-S3" is enabled; entries previously listed but disabled:
# "BJ1", "IMR90", "HUVEC", "K562", "GM12878", "HepG2", "MCF-7"
cell_lines = ["HeLa-S3"]
chr_numbers = [1]

compute_interorigin_intervals(cell_lines, chr_numbers)

#### Genome location generation ####

In [2]:
# Centromeres and telomeres
# Calls gen_positions_centromere_telomeres once for each chromosome number 1-22.
for chr_number in range(1,23):
    gen_positions_centromere_telomeres(chr_number)

In [2]:
# Fragile sites
# One call per chromosome number (1-22) and site letter
for chr_number in range(1, 23):
    # The first 13 uppercase letters, 'A' through 'M'
    for site_letter in string.ascii_uppercase[:13]:
        gen_positions_fragile_sites(chr_number, site_letter)

In [2]:
# Base regions
# Requires hg38.fa.gz from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
# The downloaded archive is expected at the local path below.
local_genome_file = 'data/genome_regions/reference_genome/hg38.fa.gz'
# chr_lengths is not defined in this cell; presumably it is provided by the
# utilities_v2 star import — TODO confirm
gen_positions_bases(local_genome_file, chr_lengths)

#### GRO-Seq generation ####

In [None]:
# Requires BED files from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM340901 (IMR90)