In [7]:
import numpy as np
import os

In [8]:
print(os.getcwd())
os.chdir('/usr/users/fatma.chafra01/ColabDesign')
print(os.getcwd())
#can't find colabdesign script if not in ColabDesign main directory

/home/mpg01/MBPC/fatma.chafra01/ColabDesign/af/examples
/home/mpg01/MBPC/fatma.chafra01/ColabDesign


In [9]:
from colabdesign.af.alphafold.common import residue_constants

In [4]:
# forming the bias matrix to fix the nb core positions
# total sequence length of nb is: 127
total_seq = "MAEVQLVESGGGLVQPGGSLRLSCTTSTSLFSITTMGWYRQAPGKQRELVASIKRGGGTNYADSMKGRFTISRDNARNTVFLEMNNLTTEDTAVYYCNAAILAYTGEVTNYWGQGTQVTVSSGQAGQ"
print('total seq length', len(total_seq))
# structurally represented seq length 117 (including the 7 residue gap in between the two sides of the structure) 
seq = 'VQLVESGGGLVQPGGSLRLSCTTSTSLFSITTMGWYRQAPGKQRELVASIKRGGGTNYADSMKGRFTISRDNARNTVFLEMNNLTTEDTAVYYCNAAILAYTGEVTNYWGQGTQVTV'
print('seq length in structure', len(seq))
# make sure to also delete the last part that is not in the structure from the original sequence because it will not be present in the later generated sequence
print('seq length', len(seq))


total seq length 127
seq length in structure 117
seq length 117


In [5]:
# previously, thought that there would be a truncation of 19 aa from the start of the sequence upto the point of the first aa of the longest uninterrupted stretch (19-120 positions)
# because of the flanking missing structures of the sequence, the aa seq got truncated so had to change the intervals from (1, 27), (35,53), (59,99), (112,127) to:
# fix_pos = [(19, 27), (35,53), (59,99), (112,120)]
# but this is wrong because then the binder length should have been 120-19 +1 = 102 but it is 117!
# fixing this means removing the first 3 aa that are flanking from the first interval and the last 7 amino acids that are flanking from the last interval:
fix_pos = [(4, 27), (35,53), (59,99), (112,120)]
# then have to restart the indeces from 1 because the first aa is the fourth aa (subtract 3 from each position):
fix_pos = [(1, 24), (32,50), (56,96), (109,117)]
print(fix_pos)

[(1, 24), (32, 50), (56, 96), (109, 117)]


In [10]:
# instead trying to make a bias matrix as suggested here: https://github.com/sokrypton/ColabDesign/issues/107
binder_len = 117
# get a bias matrix of the dimensions (no of rows: binder length, no of cols: all 20 amino acids as a one hot)
bias = np.zeros((binder_len,20))
for item in fix_pos:
  start = item[0] -1
  end = item[1] -1
  print(start, end)
  while start <= end:
    aa = seq[start]
    print(start, aa)
    bias[start,residue_constants.restype_order[str(aa)]] = 1e8
    print(f'bias added to:{start} as {aa}')
    start += 1
    # because the index changed once the pdb file was truncated
    # bias[start-19,residue_constants.restype_order[str(aa)]] = 1e8
    # order comes from residue_constants.restype_order[str(aa)]
    
print('bias matrix', bias)

0 23
0 V
bias added to:0 as V
1 Q
bias added to:1 as Q
2 L
bias added to:2 as L
3 V
bias added to:3 as V
4 E
bias added to:4 as E
5 S
bias added to:5 as S
6 G
bias added to:6 as G
7 G
bias added to:7 as G
8 G
bias added to:8 as G
9 L
bias added to:9 as L
10 V
bias added to:10 as V
11 Q
bias added to:11 as Q
12 P
bias added to:12 as P
13 G
bias added to:13 as G
14 G
bias added to:14 as G
15 S
bias added to:15 as S
16 L
bias added to:16 as L
17 R
bias added to:17 as R
18 L
bias added to:18 as L
19 S
bias added to:19 as S
20 C
bias added to:20 as C
21 T
bias added to:21 as T
22 T
bias added to:22 as T
23 S
bias added to:23 as S
31 49
31 T
bias added to:31 as T
32 M
bias added to:32 as M
33 G
bias added to:33 as G
34 W
bias added to:34 as W
35 Y
bias added to:35 as Y
36 R
bias added to:36 as R
37 Q
bias added to:37 as Q
38 A
bias added to:38 as A
39 P
bias added to:39 as P
40 G
bias added to:40 as G
41 K
bias added to:41 as K
42 Q
bias added to:42 as Q
43 R
bias added to:43 as R
44 E
bias 

In [11]:
# check whether the bias matrix makes sense by randomly printing out a row that has to contain a serine (pos 19 according to prev numbering, now pos 19 - 3 so 16)
print(bias[16,:])
# I don't know the order of the one hot but it is not exactly the alphabetically ordered classical one hot
def find_rows_without_value(arr, value):
    # Check each row for the presence of the value
    rows_with_value = np.any(np.isclose(arr, value), axis=1)

    # Get the indices of rows that don't have the value
    rows_without_value = np.where(~rows_with_value)[0]

    return rows_without_value

print("Rows without 1.0e+08 in bias matrix:", find_rows_without_value(bias, 1.0e+08))

[0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00
 1.e+08 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00 0.e+00]
Rows without 1.0e+08 in bias matrix: [ 24  25  26  27  28  29  30  50  51  52  53  54  96  97  98  99 100 101
 102 103 104 105 106 107]


In [13]:
# check the bias matrix shape:
bias.shape == (binder_len,20)

True

In [14]:
# write a function to output the bias matrix into a csv file with filename bias_matrix_8ee2.csv
# Save the matrix to a CSV file
np.savetxt("bias_matrix_8ee2.csv",bias, delimiter=",", fmt="%.2f")