In [18]:
import numpy as np
import pandas as pd


def calculate_array_coverage(real_start, real_end, start_indices, window_length=40):
    """Return the fraction of a reference interval covered by fixed-size windows.

    The reference interval is ``[real_start, real_end]`` (both ends inclusive).
    Each entry ``s`` of ``start_indices`` describes a window spanning
    ``[s, s + window_length - 1]`` (inclusive). A reference position counts as
    covered when at least one window contains it; overlapping windows are not
    counted twice.

    Args:
        real_start: First position of the reference interval (integer).
        real_end: Last position of the reference interval (integer, inclusive).
        start_indices: Window start positions. Any array-like of integers is
            accepted (NumPy array, list, tuple, or a single scalar).
        window_length: Number of positions in each window. The default of 40
            reproduces the previously hard-coded ``start + 39`` window end.

    Returns:
        Covered positions divided by the interval length, in ``[0.0, 1.0]``.
        ``nan`` is returned when the interval is empty
        (``real_end < real_start``), since coverage is undefined there.
    """
    real_length = real_end - real_start + 1
    if real_length <= 0:
        # Nothing to cover: avoid a division by zero / a negative ratio.
        return float("nan")

    # Coerce so plain lists (and empty inputs) behave the same as arrays.
    starts = np.asarray(start_indices, dtype=int).ravel()
    covered = np.zeros(real_length, dtype=bool)

    for start in starts:
        overlap_start = max(real_start, start)
        overlap_end = min(real_end, start + window_length - 1)
        if overlap_start <= overlap_end:
            # Mark the whole overlapping stretch at once (mask is 0-based
            # relative to real_start).
            covered[overlap_start - real_start:overlap_end - real_start + 1] = True

    return np.count_nonzero(covered) / real_length


# Input tables. Both are read as tab-separated text even though the files
# carry a .csv extension.
# NOTE(review): file_path1 presumably holds the model's per-sequence results and
# file_path2 the reference domain coordinates — confirm against the pipeline.
file_path1 = './/results/coords/bass_ntm_domain_test.ProteinBERTcomb123456.csv'
file_path2 = './/data(test)/coords/bass_ntm_domain_test_coord.csv'

data1 = pd.read_csv(file_path1, sep="\t")
data2 = pd.read_csv(file_path2, sep="\t")
# Combine the two tables on their shared 'id' column. pd.merge defaults to an
# inner join, so ids present in only one of the files are dropped.
merged_data = pd.merge(data1, data2, on='id')


def convert_to_int_list(indexes_str):
    """Parse a bracketed string of integers into a list of ints.

    Accepts both whitespace-separated values (``"[0 1 2]"``, the NumPy array
    repr) and comma-separated values (``"[0, 1, 2]"``, the Python list repr).
    Whitespace around the brackets and line breaks between values are ignored.

    Args:
        indexes_str: String such as ``"[3 4 5]"`` or ``"[]"``.

    Returns:
        The integers in order of appearance; an empty list for ``"[]"``.

    Raises:
        ValueError: If any token between the brackets is not an integer.
    """
    inner = indexes_str.strip().strip('[]')
    # Treat commas as plain separators so both repr styles split the same way;
    # an empty interior simply yields no tokens.
    return [int(token) for token in inner.replace(',', ' ').split()]


# Give the raw 'indexes' column a descriptive name (modifies merged_data in place).
merged_data.rename(columns={'indexes': 'indexes_above_cutoff'}, inplace=True)

# Turn each stored string (e.g. "[0 1 2]") into a real list of ints.
merged_data['indexes_above_cutoff'] = merged_data['indexes_above_cutoff'].apply(convert_to_int_list)

# Per-row fraction of [real_start, real_end] covered by the windows starting at
# the listed indexes. Rows with no indexes get 0.0 here.
# NOTE: the 'coverage' column is assigned again further down in this cell from
# calculate_differences, which reports NaN (not 0.0) for rows without indexes.
merged_data['coverage'] = merged_data.apply(
    lambda row: calculate_array_coverage(row['real_start'], row['real_end'], np.array(row['indexes_above_cutoff'])),
    axis=1
)


def calculate_differences(row):
    """Compute boundary offsets and coverage figures for one merged row.

    Two predictions are compared against the reference interval
    ``[real_start, real_end]``:

    * the span implied by ``indexes_above_cutoff`` (first index up to the last
      index plus 39), and
    * the single window that starts at ``max_indexes`` (ending 39 later).

    Args:
        row: Mapping/Series providing ``indexes_above_cutoff`` (list of ints),
            ``max_indexes``, ``max_indexes_pred``, ``real_start`` and
            ``real_end``.

    Returns:
        Tuple ``(coverage, max_indexes_coverage, diff_start, max_diff_start,
        diff_end, max_diff_end, max_indexes, max_indexes_pred)``. When
        ``indexes_above_cutoff`` is empty, ``coverage``, ``diff_start`` and
        ``diff_end`` are ``nan``; the ``max_*`` values are always computed.
    """
    hits = row['indexes_above_cutoff']
    ref_start = row['real_start']
    ref_end = row['real_end']

    # Values derived from the max_indexes window do not depend on whether any
    # index passed the cutoff, so they are computed once up front.
    max_indexes = int(row['max_indexes'])
    max_diff_start = row['max_indexes'] - ref_start
    max_diff_end = row['max_indexes'] + 39 - ref_end
    max_indexes_coverage = calculate_array_coverage(ref_start, ref_end, np.array([max_indexes]))
    max_indexes_pred = row['max_indexes_pred']

    if hits:
        # Offsets of the predicted span's edges from the reference edges.
        diff_start = hits[0] - ref_start
        diff_end = hits[-1] + 39 - ref_end
        coverage = calculate_array_coverage(ref_start, ref_end, np.array(hits))
    else:
        # No index above the cutoff: there is no predicted span to compare.
        coverage = np.nan
        diff_start = np.nan
        diff_end = np.nan

    return (
        coverage,
        max_indexes_coverage,
        diff_start,
        max_diff_start,
        diff_end,
        max_diff_end,
        max_indexes,
        max_indexes_pred,
    )


# Run calculate_differences on every row. Wrapping the returned tuple in a
# pd.Series makes apply() produce one column per tuple element, which are then
# stored under the eight names listed here (same order as the tuple).
# 'coverage', 'max_indexes' and 'max_indexes_pred' already exist and are overwritten.
merged_data[
    ['coverage', 'max_indexes_coverage', 'diff_start', 'max_diff_start', 'diff_end', 'max_diff_end', 'max_indexes',
     'max_indexes_pred']] = merged_data.apply(
    lambda row: pd.Series(calculate_differences(row)), axis=1
)

# Bare expression: shows the resulting frame when run as a notebook cell.
merged_data

Unnamed: 0,id,cutoff,indexes_above_cutoff,max_indexes,max_indexes_pred,real_start,real_end,coverage,max_indexes_coverage,diff_start,max_diff_start,diff_end,max_diff_end
0,C01_SEU47807.1,0.5,[],0.0,0.250307,1,22,,1.0,,-1.0,,17.0
1,C01_ATW48405.1,0.5,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]",0.0,0.942380,13,36,1.0,1.0,-13.0,-13.0,15.0,3.0
2,C01_OKJ57450.1,0.5,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.000000,13,34,1.0,1.0,-13.0,-13.0,20.0,5.0
3,C01_ADI09086.1,0.5,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",3.0,0.999799,12,31,1.0,1.0,-12.0,-9.0,23.0,11.0
4,C01_OKH98982.1,0.5,"[0, 1, 2, 3, 4, 5, 6, 7]",2.0,0.999895,11,31,1.0,1.0,-11.0,-9.0,15.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Ccya03_OLP17403.1,0.5,"[25, 26, 27, 28, 29, 30, 31, 32]",28.0,0.977103,42,64,1.0,1.0,-17.0,-14.0,7.0,3.0
123,Cnlr26_EPF21720.1,0.5,[],0.0,0.000092,8,36,,1.0,,-8.0,,3.0
124,Cnlr26_ALB42078.1,0.5,[],1.0,0.280639,11,36,,1.0,,-10.0,,4.0
125,Cnlr26_OBQ14685.1,0.5,"[0, 1, 2, 3]",1.0,0.986887,11,39,1.0,1.0,-11.0,-10.0,3.0,1.0
