I restructure some of the code here, and also some of the data. The researchers were not consistent in how they enumerated the peptides or the domains, so it is somewhat difficult to bring everything together into one structure.

In [1]:
import os 
# Raw string: the backslashes of the Windows path are taken literally instead
# of relying on "\E", "\Y" and "\P" happening not to be escape sequences.
os.chdir(r'E:\Ecole\Year 3\Projet 3A')
import pandas as pd
import numpy as np 

class Domain:
    """A named domain whose model parameters are attached after construction."""

    def __init__(self, name):
        self.name = name
        # Both parameter sets start unset; Data.create_domains assigns them.
        self.thetas = self.thresholds = None

class Peptide:
    """A named peptide; its sequence data is attached after construction."""

    def __init__(self, name):
        self.name = name
        # Full sequence and "sequence bis" (the last five amino acids);
        # both start unset and are filled in by Data.create_peptides.
        self.sequence = self.sequence_bis = None
        # Placeholder for the ground-state energy computed later for the peptide.
        self.energy_ground = 0.0
        
class Data:
    """Load the domain parameters and the peptide list from disk.

    Construction reads the domain spreadsheet and the peptide file and builds
    bare ``Domain`` / ``Peptide`` objects; ``create_domains`` and
    ``create_peptides`` then fill in their attributes.

    Parameters
    ----------
    domains_path : str, optional
        Excel file holding the domain parameters (one row per domain before
        the transpose). Defaults to the path used originally.
    peptides_path : str, optional
        Text file with one peptide per line: name, whitespace, sequence.
        Defaults to the path used originally.
    """

    # Layout of a domain's parameter vector: the first
    # N_POSITIONS * N_AMINO_ACIDS entries form the theta matrix, everything
    # after them is stored as thresholds.
    N_POSITIONS = 5
    N_AMINO_ACIDS = 20

    def __init__(self, domains_path='Data_PDZ/MDSM_01_stiffler_bis.xls',
                 peptides_path='Data_PDZ/peptides.free'):
        temp_df = pd.read_excel(domains_path)
        # The first N_AMINO_ACIDS column labels are the amino-acid letters;
        # encode() turns the labels returned by pandas into byte strings.
        self.aminoacids = [acid.encode('utf-8')
                           for acid in list(temp_df.columns[:self.N_AMINO_ACIDS])]
        # After the transpose each column of self.df belongs to one domain.
        self.df = temp_df.T
        self.domains = [Domain(domain.encode('utf-8')) for domain in list(self.df.columns)]
        self.domain_names = [domain.name for domain in self.domains]
        self.pep_seqs = []
        self.pep_names = []
        with open(peptides_path) as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or incomplete line: skip it instead of raising
                    # IndexError on the missing field.
                    continue
                self.pep_names.append(fields[0])
                self.pep_seqs.append(fields[1])
        self.peptides = [Peptide(name) for name in self.pep_names]

    def create_domains(self):
        """Fill ``thetas`` (5 x 20 array) and ``thresholds`` on every domain."""
        n_thetas = self.N_POSITIONS * self.N_AMINO_ACIDS
        for domain in self.domains:
            column = self.df[domain.name]
            domain.thetas = np.asarray(column[:n_thetas]).reshape(
                self.N_POSITIONS, self.N_AMINO_ACIDS)
            domain.thresholds = np.asarray(column[n_thetas:])

    def create_peptides(self):
        """Fill ``sequence`` and ``sequence_bis`` on every peptide."""
        for peptide, seq in zip(self.peptides, self.pep_seqs):
            peptide.sequence = seq
            # Last N_POSITIONS residues. The previous `[5:]` slice only gave
            # the last five residues for sequences of exactly ten residues.
            peptide.sequence_bis = list(seq)[-self.N_POSITIONS:]

In [2]:
# Build the data container; the constructor reads the spreadsheet and the
# peptide file from disk.
PDZ_Data = Data()

In [3]:
# Populate the per-domain parameters and the per-peptide sequences.
PDZ_Data.create_domains()
PDZ_Data.create_peptides()

In [4]:
# Inspect the stored sequence tail of the peptide at index 10.
PDZ_Data.peptides[10].sequence_bis

['D', 'D', 'L', 'E', 'I']

Now we have created the preliminary data with the binding energy values and the peptide sequences. The last thing left to do is to get the data from the interaction matrix for each of the domains.

In [5]:
# Load the interaction matrix from its Excel sheet.
fp_interaction_matrix = pd.read_excel('Data_PDZ/fp_interaction_matrix.xlsx')
# Recode every 0.0 entry as -1.0, one column at a time, in place.
for column in fp_interaction_matrix.columns:
    fp_interaction_matrix.loc[fp_interaction_matrix[column] == 0.0, column] = -1.0
# Stringify the column labels and strip spaces from them.
# NOTE(review): presumably so the labels match the peptide names used for
# column lookups in later cells - confirm against the spreadsheet.
fp_interaction_matrix = fp_interaction_matrix.rename(columns=lambda x: str(x).replace(" ", ""))

In [6]:
def evaluate_score(domain, peptide):
    """Score a peptide against a domain.

    Adds up ``domain.thetas[i, j]`` over the five positions ``i`` of
    ``peptide.sequence_bis``, where ``j`` is the index of the residue at that
    position in ``PDZ_Data.aminoacids``, and subtracts the domain's first
    threshold.
    """
    alphabet = PDZ_Data.aminoacids
    tail = peptide.sequence_bis
    total = 0.0
    for position in range(5):
        residue_index = alphabet.index(tail[position])
        total += domain.thetas[position, residue_index]
    return total - domain.thresholds[0]
    

In [7]:
# Example: score of the peptide at index 9 against the domain at index 16.
evaluate_score(PDZ_Data.domains[16], PDZ_Data.peptides[9])

10.72625

In [8]:
def sigmoid(x, a=1):
    """Logistic function ``1 / (1 + exp(-a * x))``; ``a`` scales the slope."""
    exponent = -1.0 * a * x
    denominator = 1 + np.exp(exponent)
    return 1.0 / denominator
def log_modified(x):
    """Return ``log(1 + exp(-x))`` evaluated without overflow.

    ``np.logaddexp(0, -x)`` computes ``log(exp(0) + exp(-x))``, which is the
    same quantity the two hand-written branches produced. Unlike the
    scalar-only ``if x > 0`` version it also accepts sequences and arrays
    (element-wise), and it keeps the tiny positive value for large ``x``
    instead of rounding ``1 + exp(-x)`` to exactly 1.

    Parameters
    ----------
    x : float or array-like
        Signed score(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``log(1 + exp(-x))``, with the same shape as ``x``.
    """
    return np.logaddexp(0.0, np.negative(x))

Let us take one particular ligand and make mutations to this ligand. 

In [9]:
# Use the peptide at index 3 as the test ligand and show its name.
test_peptide = PDZ_Data.peptides[3]
print test_peptide.name

ASIC2


In [10]:
# Show the stored sequence tail of the test ligand.
print test_peptide.sequence_bis

['E', 'E', 'I', 'A', 'C']


Let us calculate the **energy** associated with each peptide in our data set. We first calculate it for one peptide and then for all the peptides in the data set. These values are then treated as fixed for the purposes of modeling the robustness of the specificity of the peptide-domain interaction.

In [11]:
# Energy of the current test peptide: the sum over all domains of
# log_modified(score * alpha), where alpha is the peptide's entry in the
# interaction matrix for that domain.
score_natural = 0.0
print test_peptide.name
for i in range(len(PDZ_Data.domain_names)):
    temp = evaluate_score(PDZ_Data.domains[i], test_peptide)
    alpha = fp_interaction_matrix[test_peptide.name][i]
    ## As a sanity check we can print the values of alpha as well.
    ## ASIC2 does not bind to any of the PDZ domains that we consider, so all values should be -1.
    #print alpha
    # Any positive entry is treated as +1; other entries are used unchanged.
    if alpha > 0:
        alpha = +1.0
    score = temp*alpha
    temp2 = log_modified(score)
    score_natural += temp2 
print score_natural

ASIC2
2.58576358448


Now that we have calculated the energies for one peptide, let us calculate the ground state energies for all the peptides in the system. We shall write a simple function which does this given a peptide

In [12]:
def evaluate_energy(peptide):
    """Return the energy of ``peptide`` summed over all domains.

    For each domain the peptide's score (``evaluate_score``) is multiplied by
    the peptide's entry in ``fp_interaction_matrix`` for that domain, with any
    positive entry treated as +1, and passed through ``log_modified``; the
    results are added up.
    """
    total = 0.0
    for idx, _ in enumerate(PDZ_Data.domain_names):
        margin = evaluate_score(PDZ_Data.domains[idx], peptide)
        entry = fp_interaction_matrix[peptide.name][idx]
        # Positive entries count as +1; anything else is used as stored.
        weight = 1.0 if entry > 0 else entry
        total += log_modified(margin * weight)
    return total

In [13]:
# Store each peptide's energy on the peptide object itself.
for pep in PDZ_Data.peptides:
    pep.energy_ground = evaluate_energy(pep)

In [14]:
#for pep in PDZ_Data.peptides:
   # print pep.name, pep.energy_ground


## Simulation 
Now we shall start with the real Monte Carlo step. Our algorithm is based on the famous Metropolis algorithm. We start with a given peptide and its sequence. Each peptide has an associated energy (which we calculated above). We expect this energy to change under mutations of the sequence. Depending on how the energy changes after a point mutation, we shall accept or reject the mutation.

Let us first start off by writing some convenience functions to make point mutations

In [15]:
def convert2seq(seq_int):
    """Map a list of amino-acid indices to the corresponding letters."""
    letters = []
    for code in seq_int:
        letters.append(PDZ_Data.aminoacids[code])
    return letters
def convert2int(seq_pep):
    """Map a sequence of amino-acid letters to their indices in ``PDZ_Data.aminoacids``."""
    codes = []
    for residue in seq_pep:
        codes.append(PDZ_Data.aminoacids.index(residue))
    return codes

Let's take a peptide from the Claudin family. The advantage of the Claudin family is that its members bind more than one of the given PDZ domains. Let us take Claudin14.

In [16]:
# Find the position of Claudin14 in the list of peptide names.
print PDZ_Data.pep_names.index('Claudin14')

81


In [17]:
# 81 is the index of 'Claudin14' printed by the previous cell.
test_peptide = PDZ_Data.peptides[81]

In [18]:
# Show the name, sequence tail and stored energy of the selected peptide.
print test_peptide.name
print test_peptide.sequence_bis
print test_peptide.energy_ground

Claudin14
['L', 'N', 'D', 'Y', 'V']
21.3830491799


In [19]:
# Integer-encoded sequence tail of the selected peptide.
base_seq = convert2int(test_peptide.sequence_bis)

In [20]:
# Show the encoded tail next to the amino-acid list that defines the encoding.
print base_seq
print PDZ_Data.aminoacids

[3, 11, 18, 13, 2]
['G', 'A', 'V', 'L', 'I', 'M', 'P', 'F', 'W', 'S', 'T', 'N', 'Q', 'Y', 'C', 'K', 'R', 'H', 'D', 'E']


To make a mutation we need two numbers: a number between 0 and 4, which tells us the position to be mutated, and a number between 0 and 19, which tells us the amino acid to put in that position. We can do this easily by making two calls to the `randint` function in `numpy.random`.

In [21]:
# y: position to mutate; z: index of the replacement amino acid.
y = np.random.randint(5)
z = np.random.randint(20)
print y, z

4 5


In [22]:
# Copy before mutating: `mut_seq = base_seq` only creates a second name for
# the same list, so the assignment below would also overwrite base_seq.
mut_seq = list(base_seq)
mut_seq[y] = z

In [23]:
# Show the mutated sequence as indices and as amino-acid letters.
print mut_seq
print convert2seq(mut_seq)

[3, 11, 18, 13, 5]
['L', 'N', 'D', 'Y', 'M']


In [24]:
def eval_score(domain, sequence):
    """Score an integer-encoded five-residue sequence against a domain.

    Adds up ``domain.thetas[i, sequence[i]]`` for the five positions ``i`` and
    subtracts the domain's first threshold.
    """
    weights = domain.thetas
    total = 0.0
    for position in range(5):
        residue_index = sequence[position]
        total += weights[position, residue_index]
    return total - domain.thresholds[0]
print evaluate_score(PDZ_Data.domains[16], PDZ_Data.peptides[9])
## Sanity check: eval_score on the integer-encoded tail should print the same
## value as evaluate_score on the peptide itself.
temp = PDZ_Data.peptides[9]
print eval_score(PDZ_Data.domains[16], convert2int(temp.sequence_bis))


10.72625
10.72625


In [25]:
def eval_energy(peptide, sequence):
    """Return the energy of an integer-encoded ``sequence`` over all domains.

    The sequence is scored against every domain with ``eval_score``; each
    score is multiplied by the entry of ``peptide`` in
    ``fp_interaction_matrix`` for that domain (any positive entry treated as
    +1) and passed through ``log_modified``. ``peptide`` is only used to pick
    the matrix column.
    """
    total = 0.0
    for idx, _ in enumerate(PDZ_Data.domain_names):
        margin = eval_score(PDZ_Data.domains[idx], sequence)
        entry = fp_interaction_matrix[peptide.name][idx]
        # Positive entries count as +1; anything else is used as stored.
        weight = 1.0 if entry > 0 else entry
        total += log_modified(margin * weight)
    return total
## Sanity check: eval_energy on the peptide's own encoded tail should print
## the same value as the stored energy_ground.
print eval_energy(test_peptide, convert2int(test_peptide.sequence_bis))
print test_peptide.energy_ground

21.3830491799
21.3830491799


In [26]:
# Energy of the single-point mutant built above.
print eval_energy(test_peptide, mut_seq)

13.1859409601


In [30]:
Nb_runs = 1000
mutated_sequences = []
mutated_energies = []
# Work on a copy: `mut_seq = base_seq` would only alias the list, so every
# mutation in the loop would also overwrite the reference sequence.
mut_seq = list(base_seq)
# NOTE(review): the loop performs Nb_runs + 1 mutations, not Nb_runs; left
# unchanged in case later cells rely on that length - confirm the intent.
for i in range(Nb_runs+1):
    # Overwrite one random position with a random amino-acid index. Every
    # mutation is kept; there is no accept/reject step in this loop.
    y = np.random.randint(5)
    z = np.random.randint(20)
    mut_seq[y] = z
    # Store a snapshot, since mut_seq keeps changing in place.
    mutated_sequences.append(list(mut_seq))
    # Evaluate the energy once and reuse it for the report and the record.
    energy = eval_energy(test_peptide, mut_seq)
    if energy < test_peptide.energy_ground:
        # Same output as `print mut_seq, energy`, written so that it is also
        # valid under Python 3.
        print('%s %s' % (mut_seq, energy))
    mutated_energies.append(energy)

[16, 0, 11, 2, 2] 15.8447112062
[16, 0, 11, 2, 8] 14.7346642042
[0, 14, 18, 7, 5] 17.6475912333
[3, 14, 18, 7, 5] 11.3647506798
[3, 14, 12, 7, 5] 20.0155851587
[3, 14, 12, 7, 5] 20.0155851587
[16, 15, 19, 14, 2] 19.7455689788
[4, 15, 19, 14, 2] 20.7329508673
[15, 10, 18, 13, 16] 10.7714246891
[0, 10, 18, 13, 16] 20.7491059733
[0, 10, 18, 13, 3] 17.1377532276
[0, 10, 18, 13, 15] 20.8336830225
[15, 11, 2, 7, 17] 18.5848713302
[15, 9, 19, 2, 17] 20.927889282
[0, 17, 18, 13, 1] 16.2000511073
[15, 17, 18, 13, 1] 10.3800784238
[3, 15, 18, 11, 16] 19.8423142432
