In [41]:
import bisect
import gzip
import os
from itertools import groupby

import hicstraw
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

## функции

функция читает файл merged_nodups.txt, оставляет только межхромосомные контакты (chr1 ≠ chr2) и возвращает их в виде DataFrame

In [42]:
# Read and filter inter-chromosomal interactions
def read_and_filter_data(merged_nodups_file):
    """Load a gzipped merged_nodups file and keep only inter-chromosomal read pairs.

    Parameters
    ----------
    merged_nodups_file : str
        Path to a gzip-compressed, space-delimited text file without a header row.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows in which ``chr1`` differs from ``chr2``; the 16 columns
        carry the names assigned below.
    """
    field_names = [
        'str1', 'chr1', 'pos1', 'frag1',
        'str2', 'chr2', 'pos2', 'frag2',
        'mapq1', 'cigar1', 'sequence1',
        'mapq2', 'cigar2', 'sequence2',
        'readname1', 'readname2',
    ]

    with gzip.open(merged_nodups_file, 'rt') as handle:
        contacts = pd.read_csv(handle, delimiter=' ', header=None, names=field_names, index_col=False)

    # Same-chromosome pairs are discarded; the copy detaches the result from the full table.
    is_interchromosomal = contacts['chr1'] != contacts['chr2']
    return contacts[is_interchromosomal].copy()

достает hic файл, соответствующий merged_nodups файлу, чтобы проверить потом версию референсной сборки

In [43]:
def get_hic_file(merged_nodups_file, suffix='inter_30.hic'):
    """Build the path of the .hic file that belongs to a merged_nodups file.

    The sample prefix is the first three underscore-separated tokens of the file
    name. The .hic file is looked up in a ``hic`` directory that sits next to the
    directory containing ``merged_nodups_file``.

    Parameters
    ----------
    merged_nodups_file : str
        Path to the merged_nodups file.
    suffix : str
        Trailing part of the .hic file name, appended to the prefix after ``_``.

    Returns
    -------
    str
        ``<grandparent dir>/hic/<prefix>_<suffix>``; the path is not checked for existence.
    """
    containing_dir, file_name = os.path.split(merged_nodups_file)
    sample_prefix = '_'.join(file_name.split('_', 3)[:3])
    data_root = os.path.dirname(containing_dir)

    return os.path.join(data_root, 'hic', f'{sample_prefix}_{suffix}')

создает словарь с длинами хромосом

In [44]:
# Extract chrom lengths
def extract_chromosome_lengths(file_path):
    """Read chromosome lengths from a tab-separated assembly report.

    Lines starting with ``#`` are skipped. For every other line the 10th column
    (index 9) is used as the chromosome name and the 9th column (index 8) as its
    length. Rows with fewer than 10 columns are ignored.

    Parameters
    ----------
    file_path : str
        Path to the report file.

    Returns
    -------
    dict
        Mapping ``chromosome name -> length (int)``.

    Raises
    ------
    ValueError
        If the length column of a sufficiently long row is not an integer.
    """
    chromosome_lengths = {}

    with open(file_path, 'r') as report:
        for line in report:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Index 9 is read below, so a row needs at least 10 columns
            # (the previous ">= 9" guard raised IndexError on 9-column rows).
            if len(fields) >= 10:
                chromosome_lengths[fields[9]] = int(fields[8])

    return chromosome_lengths

проверка референсной сборки и возврат словаря с длинами хромосом, соответствующего версии сборки

In [45]:
def check_genomeV_and_extract_chrom_dict(hicfile_path, t2t_path):
    """Return chromosome lengths matching the genome ID stored in a .hic file.

    The genome ID is read with ``hicstraw`` and reduced to its last ``/``-separated
    component:

    * ``hg19_canonical.chrom.sizes`` - lengths are parsed from
      ``../genome/male.hg19.chrom.sizes`` relative to the .hic file's directory
      (two tab-separated columns: name, size);
    * ``hgT2T.chromsizes`` - lengths are taken from ``t2t_path`` via
      ``extract_chromosome_lengths``;
    * anything else - an error is printed.

    Parameters
    ----------
    hicfile_path : str
        Path to the .hic file.
    t2t_path : str
        Path to the assembly report used for the T2T genome.

    Returns
    -------
    dict or None
        Mapping ``chromosome name -> length``, or ``None`` when the genome ID is not
        recognised or the hg19 sizes file is missing.
    """
    hic = hicstraw.HiCFile(os.path.join(hicfile_path))
    genome_id = hic.getGenomeID().split('/')[-1]

    if genome_id == "hgT2T.chromsizes":
        return extract_chromosome_lengths(t2t_path)

    if genome_id != "hg19_canonical.chrom.sizes":
        print("Error: Genome version chrom size file doesn't exist")
        return None

    chrom_file = os.path.join(os.path.dirname(hicfile_path), '../genome/male.hg19.chrom.sizes')
    try:
        with open(chrom_file, 'r') as sizes:
            name_size_pairs = (line.strip().split('\t') for line in sizes)
            return {name: int(size) for name, size in name_size_pairs}
    except FileNotFoundError:
        print(f"Error: Chromosome size file not found at {chrom_file}")
        return None

разбивка хромосом на бины с заданным размером бина

In [46]:
# Create bin df with bin coordinates and chrom names
def create_bins(data, binsize, chromosome_lengths):
    """Tile every chromosome with consecutive bins of at most ``binsize`` positions.

    Bins are 1-based and inclusive; the last bin of a chromosome is truncated at the
    chromosome length. ``bin_index`` is a running counter over all chromosomes, in
    the iteration order of ``chromosome_lengths``.

    Parameters
    ----------
    data : any
        Not used by this function.
    binsize : int
        Maximum bin width.
    chromosome_lengths : dict
        Mapping ``chromosome name -> length``.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``bin_start``, ``bin_end``, ``bin_index``.
    """
    def _intervals(length):
        # Yield (start, end) pairs covering 1..length in steps of binsize.
        start = 1
        while start <= length:
            end = min(start + binsize - 1, length)
            yield start, end
            start = end + 1

    records = []
    for chrom, chrom_length in chromosome_lengths.items():
        for start, end in _intervals(chrom_length):
            # len(records) is the number of bins emitted so far, i.e. the next global index.
            records.append((chrom, start, end, len(records)))

    return pd.DataFrame(records, columns=['chrom', 'bin_start', 'bin_end', 'bin_index'])

соотнесение размеченных бинов с координатами замапленных ридов

In [47]:
# Add bin indexes to df with all information
def assign_bins_to_interactions(data, bins_df):
    """Annotate each read pair with the global bin index of both of its ends.

    Two columns are added to ``data`` in place: ``bin1_index`` (from ``chr1``/``pos1``)
    and ``bin2_index`` (from ``chr2``/``pos2``). A value of ``-1`` means that the
    chromosome is absent from ``bins_df`` or that the position falls outside every bin.

    The bins of one chromosome are assumed not to overlap; they are sorted by start
    and searched by bisection instead of being scanned linearly for every row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``chr1``, ``pos1``, ``chr2``, ``pos2``. Modified in place.
    bins_df : pandas.DataFrame
        Must contain ``chrom``, ``bin_start``, ``bin_end``, ``bin_index``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two index columns added.
    """
    # Group (start, end, index) triples per chromosome.
    intervals_by_chrom = {}
    for chrom, start, end, index in zip(bins_df['chrom'], bins_df['bin_start'],
                                        bins_df['bin_end'], bins_df['bin_index']):
        intervals_by_chrom.setdefault(chrom, []).append((start, end, index))

    # Split into parallel, start-sorted lists so bisect can work on the starts alone.
    lookup = {}
    for chrom, intervals in intervals_by_chrom.items():
        intervals.sort(key=lambda interval: interval[0])
        lookup[chrom] = (
            [interval[0] for interval in intervals],
            [interval[1] for interval in intervals],
            [interval[2] for interval in intervals],
        )

    def find_bin_index(chrom, pos):
        table = lookup.get(chrom)
        if table is None:
            return -1
        starts, ends, indices = table
        # Right-most bin whose start is <= pos; -1 when pos precedes the first bin.
        slot = bisect.bisect_right(starts, pos) - 1
        if slot >= 0 and pos <= ends[slot]:
            return int(indices[slot])
        return -1

    # A plain loop over the column pairs also handles an empty frame, where
    # DataFrame.apply(axis=1) returns a DataFrame rather than a Series.
    data['bin1_index'] = np.array(
        [find_bin_index(chrom, pos) for chrom, pos in zip(data['chr1'], data['pos1'])],
        dtype=np.int64,
    )
    data['bin2_index'] = np.array(
        [find_bin_index(chrom, pos) for chrom, pos in zip(data['chr2'], data['pos2'])],
        dtype=np.int64,
    )

    return data

построение матрицы по размеченным бинам с уже исключёнными внутрихромосомными взаимодействиями

In [48]:
def create_symmetric_matrix(bin_df, data_with_bins):
    """Count contacts between bins and return them as a symmetric square matrix.

    Every row of ``data_with_bins`` adds 1 to both ``[bin1, bin2]`` and
    ``[bin2, bin1]``. Bin indices are used as positional offsets into the matrix,
    so ``bin_df['bin_index']`` is expected to run from 0 to ``len(bin_df) - 1``.

    Rows in which either index is negative (the ``-1`` "no bin" sentinel) are
    ignored; used as a NumPy index, ``-1`` would otherwise be counted against the
    last bin.

    Parameters
    ----------
    bin_df : pandas.DataFrame
        Must contain ``bin_index``; its length sets the matrix size.
    data_with_bins : pandas.DataFrame
        Must contain integer columns ``bin1_index`` and ``bin2_index``.

    Returns
    -------
    pandas.DataFrame
        Integer count matrix labelled by ``bin_index`` on both axes
        (axis names ``bin1_index`` / ``bin2_index``).

    Raises
    ------
    IndexError
        If an index is greater than or equal to ``len(bin_df)``.
    """
    matrix_size = len(bin_df)
    symmetric_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

    first = data_with_bins['bin1_index'].to_numpy(dtype=np.int64)
    second = data_with_bins['bin2_index'].to_numpy(dtype=np.int64)

    # Drop read pairs with an unassigned end.
    assigned = (first >= 0) & (second >= 0)
    first = first[assigned]
    second = second[assigned]

    # np.add.at accumulates correctly when the same cell occurs more than once.
    np.add.at(symmetric_matrix, (first, second), 1)
    np.add.at(symmetric_matrix, (second, first), 1)

    symmetric_matrix_df = pd.DataFrame(symmetric_matrix, index=bin_df['bin_index'], columns=bin_df['bin_index'])
    symmetric_matrix_df.index.name = 'bin1_index'
    symmetric_matrix_df.columns.name = 'bin2_index'

    return symmetric_matrix_df

нормализация матрицы, где значение пикселя делится на произведение длин взаимодействующих хромосом для этого пикселя

дальше в алгоритме не будет применяться функция, с ней еще хуже детектировались перестройки

In [49]:
def normalize_contact_matrix(contact_matrix, bins_df, chromosome_lengths):
    """Divide every matrix cell by the product of the lengths of its two chromosomes.

    Each row/column label of ``contact_matrix`` is mapped to a chromosome through
    ``bins_df`` and then to a length through ``chromosome_lengths``. Cells whose
    result is undefined (unknown bin or chromosome, or 0/0) are set to 0.

    The division is done directly on the matrix with an outer product of the two
    length vectors, instead of a stack / groupby / pivot round-trip.

    Parameters
    ----------
    contact_matrix : pandas.DataFrame
        Square contact matrix labelled by bin index on both axes.
    bins_df : pandas.DataFrame
        Must contain ``bin_index`` and ``chrom``.
    chromosome_lengths : dict
        Mapping ``chromosome name -> length``.

    Returns
    -------
    pandas.DataFrame
        Float matrix sorted by label on both axes, with axis names
        ``bin1_index`` / ``bin2_index``.
    """
    bin_to_chrom = bins_df.set_index('bin_index')['chrom'].to_dict()

    def _lengths_for(labels):
        # bin label -> chromosome -> length; unknown labels become NaN.
        chroms = pd.Series(list(labels)).map(bin_to_chrom)
        return chroms.map(chromosome_lengths).to_numpy(dtype=float)

    normalization_factor = np.outer(_lengths_for(contact_matrix.index),
                                    _lengths_for(contact_matrix.columns))

    # Zero-length chromosomes give inf/NaN without a warning, as pandas division does.
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized_values = contact_matrix.to_numpy(dtype=float) / normalization_factor

    normalized_matrix = pd.DataFrame(
        normalized_values,
        index=list(contact_matrix.index),
        columns=list(contact_matrix.columns),
    ).fillna(0)

    normalized_matrix = normalized_matrix.sort_index(axis=0).sort_index(axis=1)
    # rename_axis returns a new frame, so the caller's index objects are never renamed.
    return normalized_matrix.rename_axis(index='bin1_index', columns='bin2_index')

функция "пробегает" по всей матрице скользящим окном размера `region_size` и создает список с координатами подматриц и суммой пикселей. список сортируется по убыванию сумм в подматрицах

In [51]:
# Detect translocation regions
def find_top_balanced_high_contact_regions(matrix, region_size=10, threshold=20, tolerance=0.1):
    """Scan a matrix with a square window and return clustered, balanced high-contact windows.

    For every ``region_size`` x ``region_size`` window, entries that are not above
    ``threshold`` are zeroed. The window is kept when every row sum and every column
    sum of the thresholded window deviates from the respective mean by a relative
    amount smaller than ``tolerance``. A position ``(i, j)`` is skipped when
    ``(j, i)`` has already been examined.

    Kept windows are ordered by descending thresholded sum and then reduced to the
    coordinates returned by ``cluster_and_select_best_region`` (``eps=2``,
    ``min_samples=5``).

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array supporting ``matrix[a:b, c:d]`` slicing.
    region_size : int
        Side length of the sliding window.
    threshold : number
        Entries must be strictly greater than this to contribute to the sums.
    tolerance : float
        Maximum relative deviation of each row/column sum from its mean.

    Returns
    -------
    list of ((int, int), number)
        ``((row, col), region_sum)`` for the selected windows, in descending sum order.
    """
    def _is_balanced(sums, centre):
        return np.all(np.abs(sums - centre) / centre < tolerance)

    candidates = []
    examined = set()
    row_positions = range(matrix.shape[0] - region_size + 1)
    col_positions = range(matrix.shape[1] - region_size + 1)

    for top in row_positions:
        for left in col_positions:
            if (top, left) in examined or (left, top) in examined:
                continue

            window = matrix[top:top + region_size, left:left + region_size]
            kept = window * (window > threshold)

            per_row = np.sum(kept, axis=1)
            per_col = np.sum(kept, axis=0)
            row_centre = np.mean(per_row)
            col_centre = np.mean(per_col)

            # Division by a zero mean produces NaN/inf, which fail the "< tolerance" test.
            with np.errstate(divide='ignore', invalid='ignore'):
                balanced = _is_balanced(per_row, row_centre) and _is_balanced(per_col, col_centre)

            if balanced:
                candidates.append(((top, left), np.sum(kept), window))

            examined.update([(top, left), (left, top)])

    ranked = sorted(candidates, key=lambda item: item[1], reverse=True)
    selected = cluster_and_select_best_region([item[0] for item in ranked], eps=2, min_samples=5)

    return [(coords, total) for coords, total, _ in ranked if coords in selected]


функция выполняет кластеризацию координатных точек и выбирает лучшие регионы на основе результатов кластеризации.

функция возвращает только одну точку для каждого кластера — первую в списке, поскольку она содержит максимальное значение сумм пикселей.
не находит кластеры, если точки расположены слишком далеко друг от друга (параметр `eps`) или если в данных есть много шума. минимальное количество подматриц, необходимое для кластеризации, задается параметром `min_samples`

In [50]:
# Cluster windows with high values
def cluster_and_select_best_region(coords_list, eps=1.5, min_samples=2):
    """Cluster coordinates with DBSCAN and keep one representative per cluster.

    The representative is the cluster member that appears first in ``coords_list``.
    Points labelled as noise (``-1``) are dropped.

    Parameters
    ----------
    coords_list : list
        Coordinates to cluster; converted to an array with ``np.array``.
    eps : float
        ``eps`` passed to ``DBSCAN``.
    min_samples : int
        ``min_samples`` passed to ``DBSCAN``.

    Returns
    -------
    list
        One element of ``coords_list`` per cluster; empty when the input is empty.
    """
    if len(coords_list) == 0:
        return []

    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(np.array(coords_list)).labels_

    representatives = []
    for cluster_id in set(labels):
        if cluster_id != -1:
            # Earliest position carrying this label.
            first_member = np.where(labels == cluster_id)[0][0]
            representatives.append(coords_list[first_member])

    return representatives


выполняет фильтрацию регионов на основе их сумм пикселей и выделяет потенциально истинные регионы.

возвращает список регионов с пометкой о том, является ли регион истинным или ложным, в зависимости от значения разницы между соседними величинами сумм подматриц. вычисляется на основе среднего значения и стандартного отклонения сумм пикселей. 

условия попадания в true перестройки, выдаваемые системой:
1. разница между значениями не должна быть значительной. если есть резкий "скачок" или "ступень" в значениях, алгоритм отсекает все значения после ступени
2. значения попадают в 90-100% самых больших сумм подматриц
3. значение является выбросом в выборке сумм подматриц (большое по выборке)

In [52]:
def filter_true_regions(region_list, threshold_factor=2, percentile=90):
    """Flag the leading regions of a list as true or false based on their sums.

    A region is a candidate when its sum exceeds ``mean + threshold_factor * std``
    of all sums and, unless it is the first entry, differs from the preceding sum by
    less than 10 % of that preceding sum. For more than 50 regions
    ``threshold_factor`` is raised to at least ``log10(number of regions)``.
    Candidates at or above the given percentile of all sums are accepted. Walking
    through ``region_list`` in order, regions are flagged ``True`` while they are
    accepted; from the first non-accepted region on, everything is ``False``.

    Parameters
    ----------
    region_list : list of (region, sum)
        Regions with their sums; ``region`` must be hashable.
    threshold_factor : float
        Number of standard deviations above the mean required for a candidate.
    percentile : float
        Percentile of all sums that a candidate has to reach.

    Returns
    -------
    list of (region, sum, bool)
        Same order and values as ``region_list``, with the flag appended.
    """
    if not region_list:
        return []

    sums = np.array([total for _, total in region_list])
    if len(sums) > 50:
        threshold_factor = max(threshold_factor, np.log10(len(sums)))
    outlier_cutoff = np.mean(sums) + threshold_factor * np.std(sums)

    candidates = []
    for position, total in enumerate(sums):
        if not total > outlier_cutoff:
            continue
        if position > 0:
            preceding = sums[position - 1]
            # Reject a candidate that sits after a step of 10 % or more.
            if not abs(preceding - total) / preceding < 0.1:
                continue
        candidates.append((region_list[position][0], total))

    if not candidates:
        return [(region, total, False) for region, total in region_list]

    percentile_cutoff = np.percentile(sums, percentile)
    accepted = {region for region, total in candidates if total >= percentile_cutoff}

    flagged = []
    still_true = True
    for region, total in region_list:
        # Once a region fails, every later region is flagged False as well.
        still_true = (region in accepted) and still_true
        flagged.append((region, total, still_true))

    return flagged

полученные регионы (первые координаты бинов), определенные как true или false, мапятся с хромосомами.

In [53]:
def map_coords_to_chromosome(coords_list, bin1_df):
    """Translate bin-index pairs into chromosome-name pairs.

    Entries whose two bins lie on the same chromosome are dropped.

    Parameters
    ----------
    coords_list : list of ((int, int), number, bool)
        ``(bin coordinates, region sum, flag)`` triples.
    bin1_df : pandas.DataFrame
        Must contain ``bin_index`` and ``chrom``.

    Returns
    -------
    list of ((chrom1, chrom2), number, bool)
        Input order is preserved.

    Raises
    ------
    IndexError
        If a bin index is not present in ``bin1_df``.
    """
    def _chrom_for(bin_index):
        matches = bin1_df['bin_index'] == bin_index
        return bin1_df.loc[matches, 'chrom'].values[0]

    mapped_results = []
    for coords, region_sum, flag in coords_list:
        pair = (_chrom_for(coords[0]), _chrom_for(coords[1]))
        if pair[0] != pair[1]:
            mapped_results.append((pair, region_sum, flag))

    return mapped_results

функция читает translocations.txt и создает множество пар истинных транслокаций

In [54]:
def read_true_rearrangements(file_path):
    """Read the chromosome pairs listed in a tab-separated translocations file.

    The first line is treated as a header and skipped. From every other non-blank
    line the 2nd and 5th columns (indices 1 and 4) are taken as the pair. Blank
    lines are ignored and an empty file yields an empty set.

    Parameters
    ----------
    file_path : str
        Path to the translocations file.

    Returns
    -------
    set of (str, str)
        Pairs in the column order of the file.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than five columns.
    """
    true_rearrangements = set()

    with open(file_path, 'r') as file:
        # A default avoids StopIteration on an empty file.
        next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            parts = stripped.split('\t')
            true_rearrangements.add((parts[1], parts[4]))

    return true_rearrangements


изначальный вариант, который был предложен. если заранее не знать количество перестроек

TP  - система выдает пару (true), которая присутствует в файле translocations.txt

FP  - система выдает пару (true), которой нет в файле translocations.txt

FN  - система выдает пару (false), которая есть в файле 

TN  - система выдает пару (false), которой нет в файле

In [55]:
def calculate_tpr_fpr(predictions, true_rearrangements):
    """Compute the true and false positive rate of flagged chromosome pairs.

    Each prediction is counted as TP, FP, FN or TN from its flag and from whether
    its chromosome pair is contained in ``true_rearrangements``.

    Parameters
    ----------
    predictions : list of (chromosome_pair, score, is_true)
        Predicted pairs with their flag; ``score`` is not used.
    true_rearrangements : container
        Known true chromosome pairs, tested with ``in``.

    Returns
    -------
    (TPR, FPR)
        ``TP / (TP + FN)`` and ``FP / (FP + TN)``; a rate is 0 when its denominator
        is 0. ``(None, None)`` is returned, after printing a message, when
        ``predictions`` is empty.
    """
    if len(predictions) == 0:
        print("No predictions found. Skipping TPR and FPR calculation.")
        return None, None

    # Counts keyed by (flagged as true, present in the reference set).
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for chromosome_pair, _score, is_true in predictions:
        known = True if chromosome_pair in true_rearrangements else False
        tally[(bool(is_true), known)] += 1

    TP, FP = tally[(True, True)], tally[(True, False)]
    FN, TN = tally[(False, True)], tally[(False, False)]

    positives = TP + FN
    negatives = FP + TN
    TPR = TP / positives if positives > 0 else 0
    FPR = FP / negatives if negatives > 0 else 0

    return TPR, FPR



функции для рассчета TPR, FPR, написанные для последней схемы, где по ID проверяется образец и задается нужное количество перестроек для него.

по схеме 11*276 для эталона, где 14 истины

In [None]:
def read_translocations(file_path, embryo_id_of_file):
    """Collect the translocations listed for one sample ID.

    The first line of the tab-separated file is treated as a header and skipped.
    Every other non-blank line must have exactly seven columns; the 1st is the
    sample ID and the 2nd and 5th are the two chromosomes. Blank lines are ignored
    and an empty file yields an empty set.

    Parameters
    ----------
    file_path : str
        Path to the translocations file.
    embryo_id_of_file : str
        Sample ID to select.

    Returns
    -------
    set of str
        Strings of the form ``"<id> <chr1> <chr2>"`` for the matching rows.

    Raises
    ------
    ValueError
        If a non-blank data line does not have exactly seven columns.
    """
    translocations_set = set()
    with open(file_path, 'r') as file:
        # A default avoids StopIteration on an empty file.
        next(file, None)
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            embryo_id, chr1, _, _, chr2, _, _ = stripped.split('\t')
            if embryo_id == embryo_id_of_file:
                translocations_set.add(f"{embryo_id} {chr1} {chr2}")

    return translocations_set

def get_embryo_id_from_filename(filename):
    """Return the first two underscore-separated tokens of ``filename`` joined by ``_``.

    For example ``HAN_K5_BGI_merged_nodups.txt.gz`` gives ``HAN_K5``. A name with
    fewer than two tokens is returned unchanged.
    """
    # Everything after the second underscore is irrelevant, so at most two splits are made.
    leading_tokens = filename.split('_', 2)[:2]
    return '_'.join(leading_tokens)

def generate_predictions_set(embryo_id, mapped_chromosomes):
    """Build ``"<id> <chr1> <chr2>"`` strings for every chromosome pair flagged as true.

    Parameters
    ----------
    embryo_id : str
        Sample ID placed in front of each pair.
    mapped_chromosomes : iterable of ((chr1, chr2), score, is_true)
        Chromosome pairs with their flag; ``score`` is not used.

    Returns
    -------
    set of str
        One string per distinct flagged pair.
    """
    return {
        f"{embryo_id} {chr1} {chr2}"
        for (chr1, chr2), _score, is_true in mapped_chromosomes
        if is_true
    }

def calculate_metrics(predictions_set, translocations_set, total_possible_combinations):
    """Derive confusion-matrix counts and rates from predicted and reference sets.

    Parameters
    ----------
    predictions_set : set
        Items predicted as true.
    translocations_set : set
        Reference items.
    total_possible_combinations : int
        Size of the whole universe; TN is what remains after TP, FP and FN.

    Returns
    -------
    (TPR, FPR, TP, FP, FN, TN)
        ``TPR = TP / (TP + FN)`` and ``FPR = FP / (FP + TN)``; a rate is 0 when its
        denominator is 0.
    """
    def _rate(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    confirmed = predictions_set & translocations_set
    TP = len(confirmed)
    # The confirmed items are a subset of both sets, so the differences follow by subtraction.
    FP = len(predictions_set) - TP
    FN = len(translocations_set) - TP
    TN = total_possible_combinations - (TP + FP + FN)

    return _rate(TP, TP + FN), _rate(FP, FP + TN), TP, FP, FN, TN

## запуск кода для всех образцов по схеме проверки 11*276

In [56]:
# Run the whole pipeline for every *_merged_nodups.txt.gz sample and score the
# predictions against the translocations listed for that sample's ID.

# os.listdir() and open() do not expand "~", so the home directory is resolved explicitly.
merged_nodups_dir = os.path.expanduser('~/test_embrio/data/merged_nodups/')
chrom_len_t2t = os.path.expanduser('~/test_embrio/data/genome/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt')
true_rearrangements_file = os.path.expanduser('~/test_embrio/data/genome/translocations.txt')

translocations_file = os.path.expanduser('~/test_embrio/data/genome/translocations.txt')

# Universe size used to derive TN. 276 equals the number of unordered pairs of 24
# chromosomes; 11 is presumably the number of reference samples -- TODO confirm.
# NOTE(review): this pooled total is applied to each individual sample below -- confirm intended.
total_possible_combinations = 11 * 276

# Bin width, in the same coordinate units as pos1/pos2 and the chromosome lengths.
binsize = 5000000

total_TPR, total_FPR = 0, 0
sample_count = 0

# Sorted so that the output order does not depend on the file system.
for filename in sorted(os.listdir(merged_nodups_dir)):
    if not filename.endswith('_merged_nodups.txt.gz'):
        continue

    merged_nodups_file = os.path.join(merged_nodups_dir, filename)

    hic_file = get_hic_file(merged_nodups_file)
    chromosome_lengths = check_genomeV_and_extract_chrom_dict(hic_file, chrom_len_t2t)
    if chromosome_lengths is None:
        # The helper has already printed the reason; binning is impossible without lengths.
        print(f"Chromosome lengths unavailable for sample: {filename}, skipping this sample.")
        continue
    data = read_and_filter_data(merged_nodups_file)

    bins_df = create_bins(data, binsize, chromosome_lengths)
    data_with_bins = assign_bins_to_interactions(data, bins_df)

    contact_matrix = create_symmetric_matrix(bins_df, data_with_bins)
    top_balanced_regions = find_top_balanced_high_contact_regions(contact_matrix.values, region_size=10, threshold=0, tolerance=0.99)

    true_regions = filter_true_regions(top_balanced_regions, threshold_factor=2, percentile=90)
    mapped_chromosomes = map_coords_to_chromosome(true_regions, bins_df)

    embryo_id = get_embryo_id_from_filename(filename)
    translocations_set = read_translocations(translocations_file, embryo_id)
    predictions_set = generate_predictions_set(embryo_id, mapped_chromosomes)

    if not predictions_set:
        print(f"No potential regions found for sample: {filename}, skipping this sample.")
        continue

    TPR, FPR, TP, FP, FN, TN = calculate_metrics(predictions_set, translocations_set, total_possible_combinations)

    print(f"Sample: {filename}")
    print(predictions_set)
    print(f"True Positive Rate (TPR): {TPR:.2f}")
    print(f"False Positive Rate (FPR): {FPR:.2f}")
    print(f"True Positives (TP): {TP}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"True Negatives (TN): {TN}")
    print("-------------------------------")

    total_TPR += TPR
    total_FPR += FPR
    sample_count += 1

if sample_count > 0:
    avg_TPR = total_TPR / sample_count
    avg_FPR = total_FPR / sample_count
    print(f"Average True Positive Rate (TPR): {avg_TPR:.2f}")
    print(f"Average False Positive Rate (FPR): {avg_FPR:.2f}")
else:
    print("No samples processed.")


No potential regions found for sample: IlI_K3_BGI_merged_nodups.txt.gz, skipping this sample.
Sample: HAN_K5_BGI_merged_nodups.txt.gz
{'HAN_K5 chr1 chr4'}
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
True Positives (TP): 0
False Positives (FP): 1
False Negatives (FN): 0
True Negatives (TN): 3035
-------------------------------
Sample: Kaz3_K_Moscow_merged_nodups.txt.gz
{'Kaz3_K chr17 chr19'}
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
True Positives (TP): 0
False Positives (FP): 1
False Negatives (FN): 0
True Negatives (TN): 3035
-------------------------------
Sample: Fuks2_K1_ENC_merged_nodups.txt.gz
{'Fuks2_K1 chr17 chr19'}
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
True Positives (TP): 0
False Positives (FP): 1
False Negatives (FN): 0
True Negatives (TN): 3035
-------------------------------
Sample: Kira1_K1_ENC_merged_nodups.txt.gz
{'Kira1_K1 chr17 chr19'}
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
True P

## запуск кода по первой схеме проверки на TPR и FPR

In [None]:
# Run the whole pipeline for every *_merged_nodups.txt.gz sample and score the
# flagged chromosome pairs against all pairs listed in the translocations file.

# os.listdir() and open() do not expand "~", so the home directory is resolved explicitly.
merged_nodups_dir = os.path.expanduser('~/test_embrio/data/merged_nodups/')
chrom_len_t2t = os.path.expanduser('~/test_embrio/data/genome/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt')
true_rearrangements_file = os.path.expanduser('~/test_embrio/data/genome/translocations.txt')


true_rearrangements = read_true_rearrangements(true_rearrangements_file)

# Bin width, in the same coordinate units as pos1/pos2 and the chromosome lengths.
binsize = 5000000

total_TPR = 0
total_FPR = 0
sample_count = 0

# Sorted so that the output order does not depend on the file system.
for filename in sorted(os.listdir(merged_nodups_dir)):
    if not filename.endswith('_merged_nodups.txt.gz'):
        continue

    merged_nodups_file = os.path.join(merged_nodups_dir, filename)

    hic_file = get_hic_file(merged_nodups_file)
    chromosome_lengths = check_genomeV_and_extract_chrom_dict(hic_file, chrom_len_t2t)
    if chromosome_lengths is None:
        # The helper has already printed the reason; binning is impossible without lengths.
        print(f"Chromosome lengths unavailable for sample: {filename}, skipping this sample.")
        continue
    data = read_and_filter_data(merged_nodups_file)

    bins_df = create_bins(data, binsize, chromosome_lengths)
    data_with_bins = assign_bins_to_interactions(data, bins_df)

    contact_matrix = create_symmetric_matrix(bins_df, data_with_bins)
    top_balanced_regions = find_top_balanced_high_contact_regions(contact_matrix.values, region_size=10, threshold=0, tolerance=0.99)

    true_regions = filter_true_regions(top_balanced_regions, threshold_factor=2, percentile=90)
    mapped_chromosomes = map_coords_to_chromosome(true_regions, bins_df)

    TPR, FPR = calculate_tpr_fpr(mapped_chromosomes, true_rearrangements)

    if TPR is None or FPR is None:
        print(f"No potential regions found for sample: {filename}, skipping this sample.")
        continue

    # Both rates are known to be set from here on.
    print(f"Sample: {filename}")
    print(mapped_chromosomes[:5])
    print(f"True Positive Rate (TPR): {TPR:.2f}")
    print(f"False Positive Rate (FPR): {FPR:.2f}")
    print("-------------------------------")

    total_TPR += TPR
    total_FPR += FPR
    sample_count += 1

if sample_count > 0:
    average_TPR = total_TPR / sample_count
    average_FPR = total_FPR / sample_count
    print(f"Average True Positive Rate (TPR) across all samples: {average_TPR:.2f}")
    print(f"Average False Positive Rate (FPR) across all samples: {average_FPR:.2f}")
else:
    print("No valid samples found.")

Sample: IlI_K3_BGI_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
-------------------------------
Sample: HAN_K5_BGI_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.07
-------------------------------
Sample: Kaz3_K_Moscow_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.06
-------------------------------
Sample: Fuks2_K1_ENC_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
-------------------------------
Sample: Kira1_K1_ENC_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
-------------------------------
Sample: Pash_e2_Moscow_merged_nodups.txt.gz
True Positive Rate (TPR): 0.00
False Positive Rate (FPR): 0.00
-------------------------------
Sample: Vla1_e_BGI_merged_nodups.txt.gz
True Positive Rate (TPR): 1.00
False Positive Rate (FPR): 0.00
-------------------------------
Sample: BTR_e3_Moscow_merged_nodups.txt.gz
T

## запуск кода для сета образцов с перестройками

In [None]:
# Run the whole pipeline only for files whose name starts with a sample ID listed
# in the translocations file, and score the predictions per sample ID.

# os.listdir() and open() do not expand "~", so the home directory is resolved explicitly.
merged_nodups_dir = os.path.expanduser('~/test_embrio/data/merged_nodups/')
chrom_len_t2t = os.path.expanduser('~/test_embrio/data/genome/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt')
true_rearrangements_file = os.path.expanduser('~/test_embrio/data/genome/translocations.txt')

# Defined here so that this cell does not rely on another run cell having been executed.
translocations_file = true_rearrangements_file
# Universe size used to derive TN. 276 equals the number of unordered pairs of 24
# chromosomes; 11 is presumably the number of reference samples -- TODO confirm.
total_possible_combinations = 11 * 276

prefix_set_df = pd.read_csv(true_rearrangements_file, sep='\t')
# str.startswith() needs a tuple of strings; missing IDs are dropped.
prefix_set = tuple(prefix_set_df['ID'].dropna().astype(str).unique())

true_rearrangements = read_true_rearrangements(true_rearrangements_file)

# Bin width, in the same coordinate units as pos1/pos2 and the chromosome lengths.
binsize = 5000000

total_TPR = 0
total_FPR = 0
sample_count = 0

# Sorted so that the output order does not depend on the file system.
for filename in sorted(os.listdir(merged_nodups_dir)):
    if not filename.startswith(prefix_set):
        continue

    merged_nodups_file = os.path.join(merged_nodups_dir, filename)

    hic_file = get_hic_file(merged_nodups_file)
    chromosome_lengths = check_genomeV_and_extract_chrom_dict(hic_file, chrom_len_t2t)
    if chromosome_lengths is None:
        # The helper has already printed the reason; binning is impossible without lengths.
        print(f"Chromosome lengths unavailable for sample: {filename}, skipping this sample.")
        continue
    data = read_and_filter_data(merged_nodups_file)

    bins_df = create_bins(data, binsize, chromosome_lengths)
    data_with_bins = assign_bins_to_interactions(data, bins_df)

    contact_matrix = create_symmetric_matrix(bins_df, data_with_bins)
    top_balanced_regions = find_top_balanced_high_contact_regions(contact_matrix.values, region_size=10, threshold=0, tolerance=0.99)

    true_regions = filter_true_regions(top_balanced_regions, threshold_factor=2, percentile=90)
    mapped_chromosomes = map_coords_to_chromosome(true_regions, bins_df)

    embryo_id = get_embryo_id_from_filename(filename)
    translocations_set = read_translocations(translocations_file, embryo_id)
    predictions_set = generate_predictions_set(embryo_id, mapped_chromosomes)

    if not predictions_set:
        print(f"No potential regions found for sample: {filename}, skipping this sample.")
        continue

    TPR, FPR, TP, FP, FN, TN = calculate_metrics(predictions_set, translocations_set, total_possible_combinations)

    print(f"Sample: {filename}")
    print(predictions_set)
    print(f"True Positive Rate (TPR): {TPR:.2f}")
    print(f"False Positive Rate (FPR): {FPR:.2f}")
    print(f"True Positives (TP): {TP}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"True Negatives (TN): {TN}")
    print("-------------------------------")

    total_TPR += TPR
    total_FPR += FPR
    sample_count += 1

if sample_count > 0:
    avg_TPR = total_TPR / sample_count
    avg_FPR = total_FPR / sample_count
    print(f"Average True Positive Rate (TPR): {avg_TPR:.2f}")
    print(f"Average False Positive Rate (FPR): {avg_FPR:.2f}")
else:
    print("No samples processed.")