In [None]:
import pandas 

df = pandas.read_csv( '../crystal_structures/

In [4]:
from random import choice
import numpy

# Start with a family alignment: one row per alignment position, one column
# per family member (here 10 positions x 4 members).
alignment = [
    ['A', 'A', 'P', 'F'],
    ['S', 'T', 'I', 'L'],
    ['Q', 'Q', 'Q', 'N'],
    ['F', 'T', 'F', 'T'],
    ['R', 'R', 'R', 'R'],
    ['S', 'K', 'K', 'R'],
    ['A', 'V', 'I', 'L'],
    ['V', 'H', 'K', 'D'],
    ['W', 'W', 'W', 'W'],
    ['I', 'L', 'M', 'F'],
]

# Contacts for the alignment, as pairs of 0-based position indices.
# There could be up to N-choose-2 possible contacts.
contacts = ((0, 4), (0, 9), (1, 2), (3, 4), (5, 8), (6, 7), (7, 9), (8, 9))

# The amino acid options observed at every position in the alignment
# (sorted, duplicates removed). There could be up to 21 options per position.
# This is something like the resfile for Rosetta design.
solution_space = [sorted(set(column)) for column in alignment]

# Regression parameters for the "Hamming kernel": one (position, amino acid)
# term for every option at every position.
one_body_terms = [
    (pos, aa)
    for pos, options in enumerate(solution_space)
    for aa in options
]

# Regression parameters for the "structure-based kernel": for every contact,
# one ((pos1, aa1), (pos2, aa2)) term for every combination of options at the
# two contacting positions.
pairwise_terms = []
for pos1, pos2 in contacts:
    pairwise_terms.extend(
        ((pos1, aa1), (pos2, aa2))
        for aa1 in solution_space[pos1]
        for aa2 in solution_space[pos2]
    )

# Let's generate random sequences from our solution space by drawing one
# option per position. No seed is set, so the sequences differ on every run.
n_sequences = 50
sequences = [
    ''.join([choice(options) for options in solution_space])
    for _ in range(n_sequences)
]

# Now we want to generate M-by-N feature matrices, where M is the number of
# sequences and N is the number of regression parameters. These X matrices are
# binary and indicate the presence (=1) or absence (=0) of a particular
# interaction within a protein sequence.

# X matrix for the one-body terms (for the Hamming kernel): 1 where the
# sequence has that amino acid at that position, else 0.
X_one_body = [
    [int(seq[pos] == aa) for pos, aa in one_body_terms]
    for seq in sequences
]

# X matrix for the pairwise terms (for the structure-based kernel): 1 where
# the sequence has that amino acid pair at that contact, else 0.
X_pairwise = [
    [
        int(seq[pos1] == aa1 and seq[pos2] == aa2)
        for (pos1, aa1), (pos2, aa2) in pairwise_terms
    ]
    for seq in sequences
]

# These X matrices can then be used as inputs for any type of regression or
# learning algorithm.

# We can also generate the kernel (Gram) matrices K = X X^T.
# K_hamming[i, j] counts the positions at which sequences i and j carry the
# same amino acid.
X = numpy.array(X_one_body)
K_hamming = numpy.dot(X, X.T)

# K_structure[i, j] counts the contacts at which sequences i and j carry the
# same amino acid pair. X is deliberately rebound here, as in the original,
# so it ends up holding the pairwise feature matrix.
X = numpy.array(X_pairwise)
K_structure = numpy.dot(X, X.T)

# Notebook display of the one-body feature matrix.
X_one_body

[[1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0],
 [0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0],
 [0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0],
 [0,
  1,
  0