In [1]:
import ecc_tools as tools
import numpy as np
import pandas as pd
from scipy import linalg
from sklearn.preprocessing import OneHotEncoder
#import emachine as EM
from direct_info import direct_info

from data_processing import data_processing
import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from scipy.sparse import csr_matrix
from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt
%matplotlib inline

import sys
import numpy as np
from scipy import linalg
from sklearn.preprocessing import OneHotEncoder
import expectation_reflection as ER
from direct_info import direct_info
from joblib import Parallel, delayed

tools.hide_toggle()

# Compare DCA and ER Sensitivity
* ER does better than traditional methods with small sample size
* We want to analyse a protein family with a large number of sequences to test sensitivity
    * DCA paper uses PF00071 and PF04542

## PF04542

#### Read In Protein Structure

In [2]:
# Read in Protein structure
# Root folder of the Pfam data and the family analysed in this notebook.
data_path = '../protein/Pfam-A.full'
pfam_id = 'PF04542'
pdb = np.load('%s/%s/pdb_refs.npy'%(data_path,pfam_id))

# Pre-Process Structure Data
# Entries are stored as bytes (python 2 --> python 3); decode every field to str
# while keeping the original 2-D (structure, field) layout.
pdb = np.array([[field.decode('UTF-8') for field in row] for row in pdb]).reshape(pdb.shape)

# Print number of pdb structures in Protein ID folder
npdb = pdb.shape[0]
print('number of pdb structures:',npdb)

# Print PDB array
print(pdb)
print(pdb[0])

# Create pandas dataframe for protein structure.
# The 4th/5th fields are the uniprot start and END of the match (e.g. '25', '92'
# in the first row); the original labelled both 'uniprot_start', which produced
# duplicate column names and made the end coordinate unreachable by name.
df = pd.DataFrame(pdb,columns = ['PF','seq','id','uniprot_start','uniprot_end',
                                 'pdb_id','chain','pdb_start','pdb_end'])
# NOTE: not the last expression of the cell, so Jupyter does not render it.
df.head()
tools.hide_toggle()

number of pdb structures: 163
[['PF04542' '893' 'RPOE_ECOLI' ... 'A' '25' '92']
 ['PF04542' '13722' 'RPOE_RHOS4' ... 'C' '29' '97']
 ['PF04542' '13722' 'RPOE_RHOS4' ... 'G' '29' '97']
 ...
 ['PF04542' '100994' 'O67268_AQUAE' ... 'A' '14' '86']
 ['PF04542' '100994' 'O67268_AQUAE' ... 'A' '14' '86']
 ['PF04542' '100994' 'O67268_AQUAE' ... 'G' '14' '86']]
['PF04542' '893' 'RPOE_ECOLI' '25' '92' '1OR7' 'A' '25' '92']


#### Process Protein Data
* Want to implement some changes to original code
    * Enforce AA distance requirement &rightarrow; |i-j|>5

In [3]:
# Row of `pdb` whose structure is used as the reference for this analysis.
ipdb = 0
print('seq:',int(pdb[ipdb,1]))

# data processing
# Returns the processed sequence matrix (s0) and the alignment columns that were
# dropped (cols_removed). The keyword thresholds are forwarded unchanged; their
# exact semantics live in data_processing -- TODO confirm there.
s0,cols_removed = data_processing(
    data_path,
    pfam_id,
    ipdb,
    gap_seqs=0.2,
    gap_cols=0.2,
    prob_low=0.004,
    conserved_cols=0.9,
)

tools.hide_toggle()

seq: 893


In [4]:
# number of positions (columns of the processed sequence matrix)
n_var = s0.shape[1]
print("Number of residue positions:",n_var)

# number of distinct symbols observed in each column of s0
mx = np.array([np.unique(s0[:,col]).size for col in range(n_var)])
print("Number of different amino acids at each position",mx)

# Running offsets: position k owns one-hot columns mx_cumsum[k] .. mx_cumsum[k+1]-1
mx_cumsum = np.concatenate(([0], np.cumsum(mx)))
# One (start, stop) row per position
i1i2 = np.column_stack((mx_cumsum[:-1], mx_cumsum[1:]))
print("(Sanity Check) Column indices of first and (",i1i2[0],") and last (",i1i2[-1],") positions")
print("(Sanity Check) Column indices of second and (",i1i2[1],") and second to last (",i1i2[-2],") positions")


# total number of one-hot variables (sum of per-position alphabet sizes)
mx_sum = mx.sum()
print("Total number of variables",mx_sum)

# number of bias term
n_linear = mx_sum - n_var

tools.hide_toggle()

Number of residue positions: 63
Number of different amino acids at each position [16 18 17 18 15 20  7 20 16 16 10 18 17 16 18 18 18 20 19 16 13  2  9 12
  9 11  8 13 12 14 12 15 16 19 20 15 15 11 16 18 19 17 16 10 18 13  4  9
 18 17 11  9 17 11 16 15 16 13 19 17 14 13 16]
(Sanity Check) Column indices of first and ( [ 0 16] ) and last ( [905 921] ) positions
(Sanity Check) Column indices of second and ( [16 34] ) and second to last ( [892 905] ) positions
Total number of variables 921


#### Generate OneHot Encoded Sequence (s0 &rightarrow; s)

In [5]:
# s is the OneHot-encoded matrix, s0 is the original sequence matrix.
# scikit-learn renamed the `sparse` keyword to `sparse_output` (1.2) and later
# removed `sparse`, so `OneHotEncoder(sparse=False)` raises TypeError on current
# releases. Try the new keyword first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False,categories='auto')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

# Dense one-hot matrix: one block of columns per column of s0.
s = onehot_encoder.fit_transform(s0)

tools.hide_toggle()

#### Use Expectation Reflection to infer w and h0
* I want to start saving these files to read in (save on computation time)

In [8]:
# Weight matrix with one row/column for each possible amino acid at each sequence position
w = np.zeros((mx.sum(),) * 2)
# Bias vector with one entry per row of w
h0 = np.zeros(w.shape[0])

# Expectation Reflection
#=========================================================================================
def predict_w(s,i0,i1i2,niter_max,l2):
    """Fit the one-hot columns of position ``i0`` from all remaining columns.

    Parameters
    ----------
    s : 2-D array
        One-hot matrix; columns are sliced by the ranges in ``i1i2``.
    i0 : int
        Index of the position to predict (row of ``i1i2``).
    i1i2 : 2-D array
        ``i1i2[i0, 0]`` / ``i1i2[i0, 1]`` are the start / stop columns of
        position ``i0`` inside ``s``.
    niter_max, l2
        Forwarded unchanged to ``ER.fit`` (presumably iteration cap and L2
        regularisation strength -- confirm in expectation_reflection).

    Returns
    -------
    tuple
        The two values returned by ``ER.fit``, in order (named ``h01`` and
        ``w1`` by the caller).
    """
    col_start = i1i2[i0,0]
    col_stop = i1i2[i0,1]

    # Target: the block of columns belonging to position i0.
    target = s[:,col_start:col_stop]
    # Predictors: every other column, left part followed by right part.
    predictors = np.concatenate((s[:,:col_start], s[:,col_stop:]), axis=1)

    bias, weights = ER.fit(predictors,target,niter_max,l2)
    return bias, weights

#-------------------------------
# parallel: fit every position independently with 8 workers and time the sweep
start_time = timeit.default_timer()
res = Parallel(n_jobs=8)(
    delayed(predict_w)(s, i0, i1i2, niter_max=10, l2=100.0)
    for i0 in range(n_var)
)
run_time = timeit.default_timer() - start_time
print('run time:',run_time)

#-------------------------------
# Scatter each position's fit into the global bias vector and weight matrix.
for i0, (h01, w1) in enumerate(res):
    i1 = i1i2[i0,0]
    i2 = i1i2[i0,1]

    h0[i1:i2] = h01
    # Rows of w1 follow the predictor columns (all columns of s except i1:i2):
    # the first i1 rows map to rows [:i1] of w, the rest to rows [i2:].
    # (assumes w1 has one row per predictor column -- confirm in ER.fit)
    w[:i1,i1:i2] = w1[:i1,:]
    w[i2:,i1:i2] = w1[i1:,:]

# make w symmetric by averaging it with its transpose
w = (w + w.T) / 2.

# Calculate Direct Information from inferred weights and Original Sequence Data
di = direct_info(s0,w)

# Save w and di for future use
np.save('ER_w.npy',w)
np.save('ER_DI.npy',di)

tools.hide_toggle()

KeyboardInterrupt: 

#### Calculate Direct Information and Plot against Contact Map

In [None]:
# Contact map of the reference structure, restricted to the retained columns
ct = tools.contact_map(pdb,ipdb,cols_removed)

# Calculate Direct Information from inferred weights and Original Sequence Data
di = direct_info(s0,w)

# Plot Contact Map and Direct Information side by side
fig, (ax_ct, ax_di) = plt.subplots(1, 2)

im_ct = ax_ct.imshow(ct, cmap='rainbow_r', origin='lower')
ax_ct.set_title('Contact Map')
ax_ct.set_xlabel('i')
ax_ct.set_ylabel('j')
fig.colorbar(im_ct, ax=ax_ct, fraction=0.045, pad=0.05)

im_di = ax_di.imshow(di, cmap='rainbow', origin='lower')
ax_di.set_title('Direct Information')
ax_di.set_xlabel('i')
ax_di.set_ylabel('j')
# Clip the colour scale to [0, 0.01] before drawing the colorbar
im_di.set_clim(0, 0.01)
fig.colorbar(im_di, ax=ax_di, fraction=0.045, pad=0.05)

fig.tight_layout(h_pad=1, w_pad=1.5)
plt.show()

tools.hide_toggle()