In [3]:
from generate_pdb_utils import count_dataset

# Print the summary tuple that generate_pdb_utils.count_dataset returns for
# each dataset's binding file.
for dataset in ('imatinib', 'dasatinib', 'bosutinib'):
    binding_path = f'../../data/{dataset}/{dataset}_binding.txt'
    print(count_dataset(binding_path))

(16, 4, 359, 5032, 14.016713091922005)
(16, 4, 328, 5066, 15.445121951219512)
(8, 3, 196, 2817, 14.372448979591837)


In [27]:
from generate_pdb_utils import read_file

# NOTE: despite the `_train` suffix, these names hold the contents of the ATP
# *test* file.
num_samples_train, sequences_train, targets_train, pdb_ids_train = read_file('../atp/test.txt')

# (length, id) pairs for every sequence longer than 500, in ascending order.
lens = [
    (len(sequence), pdb_id)
    for sequence, pdb_id in zip(sequences_train, pdb_ids_train)
    if len(sequence) > 500
]
lens.sort()

print(len(lens))
# Show (at most) the ten longest entries.
print(lens[-10:])

7
[(505, '5DN6A'), (530, '5BSMA'), (569, '5ECKA'), (606, '5E84F'), (630, '5D6JA'), (664, '5CYRB'), (863, '4XJXB')]


In [23]:

def count_dataset(targets, do_slice=None):
    '''
    Summarise the per-residue labels of a dataset.

    NOTE: this definition shadows the count_dataset imported from
    generate_pdb_utils in an earlier cell, which is called with a file path
    instead of a list of targets.

    Parameters
    - targets: sequence of per-protein label sequences. Each one is summed to
      count positives, so labels are assumed to be 0/1 -- TODO confirm.
    - do_slice: 'train' keeps the first 80% of the proteins, 'test' keeps the
      remaining ones; any other value counts all of them.

    Return
    - Total number of proteins (after slicing)
    - total number of positive residues
    - total number of negative residues
    - N/P ratio (negative residues per positive residue), rounded to 2
      decimals. It is inf when there are negatives but no positives, and nan
      when there are no residues at all.
    '''
    # The split point is based on the unsliced number of proteins.
    split_index = int(len(targets) * 0.8)
    if do_slice == 'train':
        targets = targets[:split_index]
    elif do_slice == 'test':
        targets = targets[split_index:]

    total_num_positive_residues = 0
    total_num_negative_residues = 0
    for target in targets:
        num_positive = sum(target)
        total_num_positive_residues += num_positive
        total_num_negative_residues += len(target) - num_positive

    # Guard the division so an empty or all-negative slice yields a value
    # instead of raising ZeroDivisionError.
    if total_num_positive_residues:
        ratio = round(total_num_negative_residues / total_num_positive_residues, 2)
    elif total_num_negative_residues:
        ratio = float('inf')
    else:
        ratio = float('nan')

    return len(targets), total_num_positive_residues, total_num_negative_residues, ratio


# ATP comes as separate train/test files, so each file is counted whole.
print('ATP')
print(count_dataset(read_file('../atp/train.txt')[2]))
print(count_dataset(read_file('../atp/test.txt')[2]))

# The remaining datasets are one file each; count_dataset applies the 80/20
# train/test slice itself.
for dataset in ['imatinib', 'dasatinib', 'bosutinib']:
    print(dataset)
    # Read the file once and reuse the targets (index 2 of read_file's result)
    # for both slices instead of parsing the same file twice.
    dataset_targets = read_file(f'../../data/{dataset}/{dataset}_binding.txt')[2]
    print(count_dataset(dataset_targets, do_slice='train'))
    print(count_dataset(dataset_targets, do_slice='test'))

ATP
(388, 5657, 142086, 25.12)
(41, 674, 14159, 21.01)
imatinib
(16, 290, 3945, 13.6)
(4, 69, 1087, 15.75)
dasatinib
(16, 269, 4158, 15.46)
(4, 59, 908, 15.39)
bosutinib
(8, 136, 2045, 15.04)
(3, 60, 772, 12.87)


In [3]:
from generate_pdb_utils import find_close_edit_distance
import os


def test(dataset_type, log_limit):
    '''
    Run find_close_edit_distance on one dataset's binding file.

    Parameters
    - dataset_type: dataset name; used both as the directory under ../../data
      and as the prefix of the `<name>_binding.txt` file
    - log_limit: forwarded unchanged to find_close_edit_distance

    The ratio threshold is fixed at 0.6. Nothing is returned.
    '''
    data_dir = f'../../data/{dataset_type}'
    binding_file = f'{dataset_type}_binding.txt'
    find_close_edit_distance(
        data_dir,
        binding_file,
        ratio_threshold=0.6,
        log_limit=log_limit,
    )

In [10]:
# Report the groups and the 20 closest pairs for imatinib.
test(dataset_type='imatinib', log_limit=20)

8 groups:
[['2HYYA', '2OIQA', '3MS9A', '3OEZA', '4CSVA', '1IEPA', '7N9GA', '6NPUA', '3GVUA', '2PL0A', '3K5VA'], ['4R7IA', '6JOLA', '1T46A'], ['5MQTA'], ['3FW1A'], ['1XBBA'], ['3HECA'], ['6KTNA'], ['4BKJA']]
20 closest pairs:
2OIQA-3OEZA: ratio:0.019011406844106463, distance: 5
2HYYA-6NPUA: ratio:0.03415559772296015, distance: 9
3MS9A-1IEPA: ratio:0.04081632653061224, distance: 11
2HYYA-1IEPA: ratio:0.0446927374301676, distance: 12
3MS9A-6NPUA: ratio:0.045368620037807186, distance: 12
1IEPA-3K5VA: ratio:0.05, distance: 14
1IEPA-6NPUA: ratio:0.055762081784386616, distance: 15
2HYYA-3MS9A: ratio:0.06439393939393939, distance: 17
2HYYA-7N9GA: ratio:0.06627680311890838, distance: 17
7N9GA-6NPUA: ratio:0.07003891050583658, distance: 18
3MS9A-3K5VA: ratio:0.07622504537205081, distance: 21
6NPUA-3K5VA: ratio:0.08363636363636363, distance: 23
3MS9A-7N9GA: ratio:0.0854368932038835, distance: 22
2HYYA-3K5VA: ratio:0.0947176684881603, distance: 26
1IEPA-7N9GA: ratio:0.09541984732824428, distance: 

In [33]:
# find close edit distance and save
import random
from generate_pdb_utils import save_lines

def edit_and_save(dataset_type, ratio_threshold=0.6, seed=None):
    '''
    Reorder a dataset's binding file by edit-distance groups and save it.

    The groups returned by find_close_edit_distance are concatenated in the
    order they are returned. The first 80% of the resulting ids are shuffled,
    and the remaining 20% keep their grouped order. The new ordering is then
    passed to save_lines together with the path of the same binding file that
    was given to find_close_edit_distance.

    Parameters
    - dataset_type: dataset name; used both as the directory under ../../data
      and as the prefix of the `<name>_binding.txt` file
    - ratio_threshold: forwarded to find_close_edit_distance
    - seed: optional seed for the shuffle. When given, a dedicated
      random.Random(seed) is used so the saved order is reproducible. None
      (the default) keeps using the module-level random state.

    NOTE(review): the 80% cut is taken by id count, not at a group boundary,
    so a group could straddle it. Presumably the cut is meant to line up with
    the 0.8 slicing used when counting train/test -- confirm.
    '''
    print('Finding close edit distance..')
    base_path = f'../../data/{dataset_type}'
    binding_file = f'{dataset_type}_binding.txt'
    unions = find_close_edit_distance(base_path, binding_file, ratio_threshold=ratio_threshold)

    # Flatten the groups while keeping their order.
    pdb_id_order = [pdb_id for union in unions for pdb_id in union]

    split_index = int(len(pdb_id_order) * 0.8)
    head = pdb_id_order[:split_index]
    # Only the leading 80% is shuffled; the tail stays in grouped order.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(head)
    pdb_id_order = head + pdb_id_order[split_index:]

    print(f'pdb_id_order: {pdb_id_order[:10]}..')
    save_lines(os.path.join(base_path, binding_file), pdb_id_order)


# Regroup the imatinib entries and save the new ordering.
edit_and_save(dataset_type='imatinib', ratio_threshold=0.6)

Finding close edit distance..
8 groups:
[['2PL0A', '7N9GA', '2OIQA', '3OEZA', '3GVUA', '4CSVA', '3K5VA', '1IEPA', '6NPUA', '3MS9A', '2HYYA'], ['1T46A', '6JOLA', '4R7IA'], ['3FW1A'], ['5MQTA'], ['1XBBA'], ['3HECA'], ['6KTNA'], ['4BKJA']]
10 closest pairs:
2OIQA-3OEZA: ratio:0.019011406844106463, distance: 5
6NPUA-2HYYA: ratio:0.03415559772296015, distance: 9
1IEPA-3MS9A: ratio:0.04081632653061224, distance: 11
1IEPA-2HYYA: ratio:0.0446927374301676, distance: 12
6NPUA-3MS9A: ratio:0.045368620037807186, distance: 12
3K5VA-1IEPA: ratio:0.05, distance: 14
1IEPA-6NPUA: ratio:0.055762081784386616, distance: 15
3MS9A-2HYYA: ratio:0.06439393939393939, distance: 17
7N9GA-2HYYA: ratio:0.06627680311890838, distance: 17
7N9GA-6NPUA: ratio:0.07003891050583658, distance: 18
pdb_id_order: ['2PL0A', '5MQTA', '6NPUA', '3MS9A', '4CSVA', '3GVUA', '7N9GA', '2OIQA', '3K5VA', '6JOLA']..


In [12]:
# Report the groups and the 20 closest pairs for dasatinib.
test(dataset_type='dasatinib', log_limit=20)

7 groups:
[['2Y6OA', '2ZVAA', '3G5DA', '3QLGA', '3K54A', '3OCTA', '3SXRA', '4XEYB', '4XLIA', '7N9GA', '5H2UA', '5I9YA'], ['3LFAA', '3OHTA'], ['5BVWA', '6BSDA'], ['4QMSA'], ['5OWRA'], ['5VCVA'], ['7ERKA']]
20 closest pairs:
5BVWA-6BSDA: ratio:0.021739130434782608, distance: 6
3K54A-3OCTA: ratio:0.053388090349075976, distance: 13
3G5DA-3QLGA: ratio:0.05825242718446602, distance: 15
4XLIA-7N9GA: ratio:0.13618677042801555, distance: 35
3LFAA-3OHTA: ratio:0.15963855421686746, distance: 53
2Y6OA-5I9YA: ratio:0.3527272727272727, distance: 97
4XEYB-7N9GA: ratio:0.3973727422003284, distance: 121
4XEYB-4XLIA: ratio:0.40770465489566615, distance: 127
3OCTA-3SXRA: ratio:0.4166666666666667, distance: 105
2ZVAA-3QLGA: ratio:0.4269230769230769, distance: 111
2ZVAA-3G5DA: ratio:0.4448742746615087, distance: 115
3K54A-3SXRA: ratio:0.4532803180914513, distance: 114
3G5DA-5H2UA: ratio:0.5335892514395394, distance: 139
3QLGA-5H2UA: ratio:0.5381679389312977, distance: 141
3G5DA-7N9GA: ratio:0.5415019762845

In [35]:
# Regroup the dasatinib entries and save the new ordering.
edit_and_save(dataset_type='dasatinib', ratio_threshold=0.6)

Finding close edit distance..
7 groups:
[['3G5DA', '5H2UA', '2ZVAA', '7N9GA', '3QLGA', '4XLIA', '3SXRA', '3OCTA', '3K54A', '4XEYB', '5I9YA', '2Y6OA'], ['6BSDA', '5BVWA'], ['3LFAA', '3OHTA'], ['4QMSA'], ['5OWRA'], ['5VCVA'], ['7ERKA']]
10 closest pairs:
6BSDA-5BVWA: ratio:0.021739130434782608, distance: 6
3OCTA-3K54A: ratio:0.053388090349075976, distance: 13
3G5DA-3QLGA: ratio:0.05825242718446602, distance: 15
7N9GA-4XLIA: ratio:0.13618677042801555, distance: 35
3LFAA-3OHTA: ratio:0.15963855421686746, distance: 53
5I9YA-2Y6OA: ratio:0.3527272727272727, distance: 97
7N9GA-4XEYB: ratio:0.3973727422003284, distance: 121
4XLIA-4XEYB: ratio:0.40770465489566615, distance: 127
3OCTA-3SXRA: ratio:0.4166666666666667, distance: 105
2ZVAA-3QLGA: ratio:0.4269230769230769, distance: 111
pdb_id_order: ['3K54A', '2Y6OA', '7N9GA', '5I9YA', '3OCTA', '5H2UA', '3SXRA', '4XLIA', '3OHTA', '4XEYB']..


In [13]:
# Report the groups and the 20 closest pairs for bosutinib.
test(dataset_type='bosutinib', log_limit=20)

5 groups:
[['3UE4A', '4MXOA', '4MXXA', '4MXYA', '4MXZA', '5I9XA'], ['4QMNA', '5AJQA'], ['5VC3A'], ['5VCYA'], ['6OP9A']]
20 closest pairs:
4MXYA-4MXZA: ratio:0.0, distance: 0
4MXOA-4MXXA: ratio:0.0037593984962406013, distance: 1
4MXOA-4MXYA: ratio:0.007518796992481203, distance: 2
4MXOA-4MXZA: ratio:0.007518796992481203, distance: 2
4MXXA-4MXYA: ratio:0.011278195488721804, distance: 3
4MXXA-4MXZA: ratio:0.011278195488721804, distance: 3
3UE4A-4MXOA: ratio:0.5597014925373134, distance: 150
3UE4A-4MXXA: ratio:0.5634328358208955, distance: 151
3UE4A-4MXYA: ratio:0.5671641791044776, distance: 152
3UE4A-4MXZA: ratio:0.5671641791044776, distance: 152
4MXOA-5I9XA: ratio:0.5927272727272728, distance: 163
4MXXA-5I9XA: ratio:0.5927272727272728, distance: 163
4QMNA-5AJQA: ratio:0.5932504440497336, distance: 167
4MXYA-5I9XA: ratio:0.6, distance: 165
4MXZA-5I9XA: ratio:0.6, distance: 165
3UE4A-5I9XA: ratio:0.628158844765343, distance: 174
4MXOA-6OP9A: ratio:0.6715867158671587, distance: 182
3UE4A-6O

In [36]:
# Regroup the bosutinib entries and save the new ordering.
edit_and_save(dataset_type='bosutinib', ratio_threshold=0.6)

Finding close edit distance..
5 groups:
[['4MXYA', '4MXXA', '4MXOA', '3UE4A', '4MXZA', '5I9XA'], ['4QMNA', '5AJQA'], ['5VC3A'], ['5VCYA'], ['6OP9A']]
10 closest pairs:
4MXYA-4MXZA: ratio:0.0, distance: 0
4MXXA-4MXOA: ratio:0.0037593984962406013, distance: 1
4MXYA-4MXOA: ratio:0.007518796992481203, distance: 2
4MXOA-4MXZA: ratio:0.007518796992481203, distance: 2
4MXYA-4MXXA: ratio:0.011278195488721804, distance: 3
4MXXA-4MXZA: ratio:0.011278195488721804, distance: 3
4MXOA-3UE4A: ratio:0.5597014925373134, distance: 150
4MXXA-3UE4A: ratio:0.5634328358208955, distance: 151
4MXYA-3UE4A: ratio:0.5671641791044776, distance: 152
3UE4A-4MXZA: ratio:0.5671641791044776, distance: 152
pdb_id_order: ['4MXYA', '3UE4A', '4MXOA', '4MXXA', '4MXZA', '5AJQA', '5I9XA', '4QMNA', '5VC3A', '5VCYA']..
