In [9]:
# code from Phil Romero

In [16]:
from random import choice
import numpy
from itertools import product 

# Start with a family alignment, stored column-wise: one row per alignment
# position, one entry per family member (here 4 members, 10 positions).
# Each string below is one alignment column; list() splits it into residues.
alignment = [list(column) for column in (
    'AAPF',
    'STIL',
    'QQQN',
    'FTFT',
    'RRRR',
    'SKKR',
    'AVIL',
    'VHKD',
    'WWWW',
    'ILMF',
)]

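# Alternatively, build the alignment from an aligned FASTA file
# (uses the `screed` package, which the cells shown here do not import;
# every record must have the same length):
# records = [list(record.sequence) for record in screed.open('my_fasta')]
# alignment = [list(column) for column in zip(*records)]  # transpose: one row per position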

# Contacts for the alignment, as pairs of position indices.
# For N positions there could be up to N-choose-2 possible contacts.
contacts = (
    (0, 4), (0, 9),
    (1, 2),
    (3, 4),
    (5, 8),
    (6, 7),
    (7, 9),
    (8, 9),
)

In [17]:
# Amino acid options observed at every position in the alignment
# (there could be up to 21 options per position).
# This is something like the resfile for Rosetta design.
# set() removes duplicates within a position; sorted() fixes the option order.
solution_space = [sorted(set(pos)) for pos in alignment]

# print(...) with a single argument prints the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(solution_space)

[['A', 'F', 'P'], ['I', 'L', 'S', 'T'], ['N', 'Q'], ['F', 'T'], ['R'], ['K', 'R', 'S'], ['A', 'I', 'L', 'V'], ['D', 'H', 'K', 'V'], ['W'], ['F', 'I', 'L', 'M']]


In [18]:
# Regression parameters for the "Hamming kernel": one (position, amino acid)
# term for every option in the solution space, ordered by position and then
# by the option order within solution_space.
one_body_terms = [
    (pos, aa)
    for pos, options in enumerate(solution_space)
    for aa in options
]

# Single-argument print(...) works on both Python 2 and Python 3.
print(one_body_terms)

[(0, 'A'), (0, 'F'), (0, 'P'), (1, 'I'), (1, 'L'), (1, 'S'), (1, 'T'), (2, 'N'), (2, 'Q'), (3, 'F'), (3, 'T'), (4, 'R'), (5, 'K'), (5, 'R'), (5, 'S'), (6, 'A'), (6, 'I'), (6, 'L'), (6, 'V'), (7, 'D'), (7, 'H'), (7, 'K'), (7, 'V'), (8, 'W'), (9, 'F'), (9, 'I'), (9, 'L'), (9, 'M')]


In [19]:
# Regression parameters for the "structure-based kernel": for every contact
# (pos1, pos2), one term per combination of an amino acid option at pos1 with
# an amino acid option at pos2.
# product() iterates the pos1 options in the outer loop and the pos2 options
# in the inner loop, which is the same order as two nested for-loops.
pairwise_terms = [
    ((pos1, aa1), (pos2, aa2))
    for pos1, pos2 in contacts
    for aa1, aa2 in product(solution_space[pos1], solution_space[pos2])
]

# Single-argument print(...) works on both Python 2 and Python 3.
print(pairwise_terms)

[((0, 'A'), (4, 'R')), ((0, 'F'), (4, 'R')), ((0, 'P'), (4, 'R')), ((0, 'A'), (9, 'F')), ((0, 'A'), (9, 'I')), ((0, 'A'), (9, 'L')), ((0, 'A'), (9, 'M')), ((0, 'F'), (9, 'F')), ((0, 'F'), (9, 'I')), ((0, 'F'), (9, 'L')), ((0, 'F'), (9, 'M')), ((0, 'P'), (9, 'F')), ((0, 'P'), (9, 'I')), ((0, 'P'), (9, 'L')), ((0, 'P'), (9, 'M')), ((1, 'I'), (2, 'N')), ((1, 'I'), (2, 'Q')), ((1, 'L'), (2, 'N')), ((1, 'L'), (2, 'Q')), ((1, 'S'), (2, 'N')), ((1, 'S'), (2, 'Q')), ((1, 'T'), (2, 'N')), ((1, 'T'), (2, 'Q')), ((3, 'F'), (4, 'R')), ((3, 'T'), (4, 'R')), ((5, 'K'), (8, 'W')), ((5, 'R'), (8, 'W')), ((5, 'S'), (8, 'W')), ((6, 'A'), (7, 'D')), ((6, 'A'), (7, 'H')), ((6, 'A'), (7, 'K')), ((6, 'A'), (7, 'V')), ((6, 'I'), (7, 'D')), ((6, 'I'), (7, 'H')), ((6, 'I'), (7, 'K')), ((6, 'I'), (7, 'V')), ((6, 'L'), (7, 'D')), ((6, 'L'), (7, 'H')), ((6, 'L'), (7, 'K')), ((6, 'L'), (7, 'V')), ((6, 'V'), (7, 'D')), ((6, 'V'), (7, 'H')), ((6, 'V'), (7, 'K')), ((6, 'V'), (7, 'V')), ((7, 'D'), (9, 'F')), ((7, 'D')

In [21]:
# Let's generate 50 random sequences from our solution space: each sequence
# takes one randomly chosen option at every position.
# NOTE(review): no random seed is set in the cells shown here, so a re-run
# produces a different sample than the stored output.
sequences = [
    ''.join(choice(options) for options in solution_space)
    for sample_index in range(50)
]
# Single-argument print(...) works on both Python 2 and Python 3.
print(sequences)

['FTNTRSLVWM', 'FSNFRRVVWF', 'FLNFRRAHWF', 'PLQTRRLHWF', 'PLQTRSVVWM', 'PSQFRSADWM', 'PSNFRSIHWF', 'FLQFRRVVWL', 'PTNTRRLHWF', 'ALNFRSVVWL', 'FTNFRRVVWM', 'FSQFRRIHWM', 'ALQTRKVHWL', 'PLQFRKLDWL', 'FTNFRKVDWI', 'AIQTRKVKWF', 'FSNFRRAVWM', 'FSQFRRIVWL', 'ASNTRRIVWI', 'FTNTRSLDWI', 'PTQFRRIKWF', 'FTNTRKIDWM', 'PLQTRKIVWF', 'PINFRSLKWL', 'PTNFRRLKWM', 'PTQTRKLKWI', 'PSQTRRAVWL', 'ALNFRKAHWI', 'ATQFRSIKWF', 'PIQTRSIDWL', 'FTQFRSLHWI', 'FLQTRKAVWI', 'ASQFRKIDWF', 'PLNTRKVKWL', 'PIQTRSAKWI', 'FSQFRKAVWL', 'AINTRRLHWF', 'ASNFRSIVWI', 'FLQFRKVHWI', 'ATNTRRIKWI', 'PSQTRRVDWF', 'ATQFRSLDWM', 'PIQFRKLKWF', 'PSQFRSLHWI', 'ATNTRKLVWI', 'PIQTRSVVWM', 'FLQFRKAKWM', 'FLNFRSLVWL', 'PTNTRSLDWF', 'AIQTRRIKWL']


In [22]:
# Now we want to generate M-by-N feature matrices, where M is the number of
# sequences and N is the number of regression parameters.
# These X matrices are binary and indicate the presence (=1) or absence (=0)
# of a particular interaction within a protein sequence.

# X matrix for the one-body terms (for Hamming kernel): if a sequence has a
# particular amino acid at a particular position, put a 1 at that term,
# else put a 0.
X_one_body = [
    [1 if seq[pos] == aa else 0 for pos, aa in one_body_terms]
    for seq in sequences
]

# Single-argument print(...) works on both Python 2 and Python 3.
print(X_one_body)

[[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1], [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0

In [23]:
# X matrix for the pairwise terms (for structure-based kernel): if a sequence
# has a particular amino acid pair at a particular contact, put a 1 at that
# term, else put a 0.
X_pairwise = [
    [
        1 if (seq[pos1] == aa1 and seq[pos2] == aa2) else 0
        for (pos1, aa1), (pos2, aa2) in pairwise_terms
    ]
    for seq in sequences
]

# Single-argument print(...) works on both Python 2 and Python 3.
print(X_pairwise)

[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0

In [24]:

# These X matrices can then be used as inputs for any type of regression or
# learning algorithm.

# We can generate the kernel functions. X is binary, so entry (i, j) of
# X . X^T counts the one-body terms that sequences i and j have in common.
X = numpy.array(X_one_body)
K_hamming = numpy.dot(X, X.T)

# Single-argument print(...) works on both Python 2 and Python 3.
print(X)
print(K_hamming)

[[0 1 0 ..., 0 0 1]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 1 0]
 [0 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]]
[[10  5  4 ...,  7  7  3]
 [ 5 10  7 ...,  6  4  3]
 [ 4  7 10 ...,  6  4  3]
 ..., 
 [ 7  6  6 ..., 10  5  3]
 [ 7  4  4 ...,  5 10  3]
 [ 3  3  3 ...,  3  3 10]]


In [25]:


# Structure-based kernel: X is binary, so entry (i, j) of X . X^T counts the
# pairwise contact terms that sequences i and j have in common.
# NOTE: X is rebound here to the pairwise feature matrix.
X = numpy.array(X_pairwise)
K_structure = numpy.dot(X, X.T)

# Single-argument print(...) works on both Python 2 and Python 3.
print(X)
print(K_structure)

[[0 1 0 ..., 0 0 1]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 1 0]
 [0 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]]
[[8 1 1 ..., 3 3 1]
 [1 8 5 ..., 2 1 1]
 [1 5 8 ..., 3 1 1]
 ..., 
 [3 2 3 ..., 8 1 1]
 [3 1 1 ..., 1 8 1]
 [1 1 1 ..., 1 1 8]]
