In [1]:
import os.path

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt
%matplotlib inline

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing
import ecc_tools as tools
from pathlib import Path
np.random.seed(1)


def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [2]:
data_path = Path('/home', 'eclay','Pfam-A.full')
data_path = Path('Pfam-A.full')
DCA_ER_dir = Path('/home/evan/PycharmProjects/DCA_ER/')



pfam_id = 'PF00023'
pfam_id = 'PF00011'
pfam_id = 'PF00186'

n_jobs = 26
create_new = True


if 0:
    DCA_ER_dir = '/home/eclay/DCA_ER/'
    msa_npy_file = '/home/eclay/Pfam-A.full/%s/msa.npy' % pfam_id # Hurricane Location
    msa_fa_file  = '/home/eclay/Pfam-A.full/%s/msa.fa' % pfam_id # Hurricane Location
    pdb_ref_file = '/home/eclay/Pfam-A.full/%s/pdb_refs.npy' % pfam_id # Hurricane Location
if 1:
    DCA_ER_dir = '/home/evan/DCA_ER/'
    msa_npy_file = '/home/evan/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/home/evan/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/home/evan/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir
if 0:
    DCA_ER_dir = '/home/ecresswell/DCA_ER/'
    msa_npy_file = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir
if 1:
    DCA_ER_dir = '/data/cresswellclayec/DCA_ER/'
    msa_npy_file = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir




# Set DCA_ER directory
DCA_dir = os.getcwd()

# Define data directories
# Need to think on best way to do this..
# Referencing the same dataframe may be useful so we dont always have to load individual ref files...
individual_pdb_ref_file = Path(data_path, pfam_id, 'pdb_refs.npy')
pdb = np.load(individual_pdb_ref_file)
processed_data_dir = "%s/protein_data/data_processing_output" % DCA_dir

# delete 'b' in front of letters (python 2 --> python 3)
pdb = np.array([pdb[t,i].decode('UTF-8') for t in range(pdb.shape[0]) \
         for i in range(pdb.shape[1])]).reshape(pdb.shape[0],pdb.shape[1])


# Print number of pdb structures in Protein ID folder
npdb = pdb.shape[0]
print('number of pdb structures:',npdb)

# Create pandas dataframe for protein structure
pdb_df = pd.DataFrame(pdb,columns = ['PF','seq','id','uniprot_start','uniprot_end',\
                                 'pdb_id','chain','pdb_start','pdb_end'])
print(pdb_df.head())

# print("Direct Information from Expectation reflection:\n",di)
def no_diag(mat, diag_l, s_index=None, make_big=False):
    rows, columns = mat.shape
    if make_big:
        new_mat = 100. * np.ones((rows,columns))
    else:
        new_mat = np.zeros((rows,columns))
    for row in range(rows):
        for col in range(columns):
            if s_index is None:
                if abs(row-col) > diag_l:
                    new_mat[row, col] = mat[row ,col]
            else:
                if abs(s_index[row]-s_index[col]) > diag_l:
                    new_mat[row, col] = mat[row ,col]    
    return new_mat


number of pdb structures: 372
        PF seq            id uniprot_start uniprot_end pdb_id chain pdb_start  \
0  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     B         1   
1  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     A         1   
2  PF00186  83  Q81R22_BACAN             2         160   3JWK     B         2   
3  PF00186  83  Q81R22_BACAN             2         160   3S9U     B         2   
4  PF00186  83  Q81R22_BACAN             2         160   3FL9     H         2   

  pdb_end  
0     160  
1     160  
2     160  
3     160  
4     160  


In [3]:
import ecc_tools as tools
ipdb = 0
printing = True
print('seq:',int(pdb[ipdb,1]))



s0,cols_removed, s_index, tpdb, orig_seq_len = data_processing(data_path, pfam_id, ipdb,\
                gap_seqs=0.2, gap_cols=0.2, prob_low=0.004, conserved_cols=0.9, printing=printing, \
                                                               out_dir=processed_data_dir)
print('\n\n\nPreprocessed reference Sequence: ', s0[tpdb])

# npy2fa does not remove cols this way we are as close to original as possible
msa_outfile, ref_outfile, s0, cols_removed, s_index, tpdb, orig_seq_len  = tools.npy2fa(pfam_id, msa_npy_file,\
                                                                                        pdb_ref_file=pdb_ref_file,\
                                                                                        ipdb=ipdb, preprocess=True,\
                                                                                        gap_seqs=.2, gap_cols=.2, \
                                                                                        prob_low=.004, \
                                                                                        conserved_cols=.9, \
                                                                                        letter_format=False, \
                                                                                        first_10=True)

print('\n\nfirst 50 S: ', s0.shape, '\n\n')
pdb_id = pdb_df.iloc[ipdb]['pdb_id']
pdb_chain = pdb_df.iloc[ipdb]['chain']

pdb_pfam_map_file = Path('%s/protein_data/pdb_data/pdb_pfam_mapping.csv' % DCA_dir)
pdb_map_df = pd.read_csv(pdb_pfam_map_file, sep=',', header=1)
print(pdb_map_df.head())

pdb_id_map_df = pdb_map_df.loc[pdb_map_df['PDB']==pdb_id.lower()]
pdb_pfam_map = pdb_id_map_df.loc[pdb_id_map_df['CHAIN']==pdb_chain]


seq: 69
pdb ref example (pdb[0])  (after UTF-8 decode, removing 'b'):
 ['PF00186' '69' 'Q5KZ26_GEOKA' '1' '160' '1ZDR' 'B' '1' '160']
tpdb (s_ipdb) is :  69
#

-------------------------Remove Gaps--------------------------#
Shape of s is :  (7750, 918)
s = 
 [['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ...
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']]
s[tpdb] shape is  (160,)
s = 
 [['-' '-' '-' ... 'Y' 'E' 'K']
 ['M' 'I' 'S' ... 'W' 'E' 'R']
 ['-' 'L' 'A' ... 'Y' 'E' 'R']
 ...
 ['-' '-' '-' ... '-' '-' '-']
 ['-' 'V' 'S' ... 'Y' 'E' 'R']
 ['-' 'F' 'S' ... 'Y' 'E' 'K']]
though s still has gaps, s[69] does not:
 ['M' 'I' 'S' 'H' 'I' 'V' 'A' 'M' 'D' 'E' 'N' 'R' 'V' 'I' 'G' 'K' 'D' 'N'
 'R' 'L' 'P' 'W' 'H' 'L' 'P' 'A' 'D' 'L' 'A' 'Y' 'F' 'K' 'R' 'V' 'T' 'M'
 'G' 'H' 'A' 'I' 'V' 'M' 'G' 'R' 'K' 'T' 'F' 'E' 'A' 'I' 'G' 'R' 'P' 'L'
 'P' 'G' 'R' 'D' 'N' 'V' 'V' 'V' 'T' 'R' 'N' 'R' 'S' 'F' 'R' 'P' 

In [4]:
# Compute Frequencies printing everything out
from inference_dca import direct_info_dca
print(s0.shape)
seq_wt_file = None
seq_wt_file = '%s/protein_data/data_processing_output/seq_weight_%s.npy' % (DCA_ER_dir, pfam_id)

# ----------- DCA DI (MF) calculation --------------------------------------------- #

mf_di, fi, fij, c, cinv, w, w2d, fi_pydca, fij_pydca, c_pydca, c_inv_pydca, w_pydca, w2d_pydcak, di_pydca, ma_inv,seq_ints \
= direct_info_dca(s0, seq_wt_outfile=seq_wt_file, first10=True)
print('c[0]:',c[0])
print('c_pydca[0]', c_pydca[0])
def compute_sequences_weight(alignment_data=None, seqid=None):
    """Computes weight of sequences. The weights are calculated by lumping
    together sequences whose identity is greater that a particular threshold.
    For example, if there are m similar sequences, each of them will be assigned
    a weight of 1/m. Note that the effective number of sequences is the sum of
    these weights.

    Parameters
    ----------
        alignmnet_data : np.array()
            Numpy 2d array of the alignment data, after the alignment is put in
            integer representation
        seqid : float
            Value at which beyond this sequences are considered similar. Typical
            values could be 0.7, 0.8, 0.9 and so on

    Returns
    -------
        seqs_weight : np.array()
            A 1d numpy array containing computed weights. This array has a size
            of the number of sequences in the alignment data.
    """
    alignment_shape = alignment_data.shape
    num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    seqs_weight = np.zeros((num_seqs,), dtype=np.float64)
    inv_seqs_weight = np.zeros((num_seqs,), dtype=np.float64)

    #count similar sequences
    for i in range(num_seqs):
        seq_i = alignment_data[i]
        for j in range(num_seqs):
            seq_j = alignment_data[j]
            iid = np.sum(seq_i==seq_j)
            if np.float64(iid)/np.float64(seqs_len) > seqid:
                seqs_weight[i] += 1
    #compute the weight of each sequence in the alignment
    for i in range(num_seqs): inv_seqs_weight[i] = 1.0/float(seqs_weight[i])
    np.save('first10_seq_weight_pydca.npy', seqs_weight)
    return seqs_weight, inv_seqs_weight


def compute_single_site_freqs(alignment_data=None,
        num_site_states=None, seqs_weight=None):
    """Computes single site frequency counts for a particular aligmnet data.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array of alignment data represented in integer form.

        num_site_states : int
            An integer value fo the number of states a sequence site can have
            including a gap state. Typical value is 5 for RNAs and 21 for
            proteins.

        seqs_weight : np.array()
            A 1d numpy array of sequences weight

    Returns
    -------
        single_site_freqs : np.array()
            A 2d numpy array of of data type float64. The shape of this array is
            (seqs_len, num_site_states) where seqs_len is the length of sequences
            in the alignment data.
    """
    alignment_shape = alignment_data.shape
    #num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    m_eff = np.sum(seqs_weight)
    print('pydca m_eff = %f' % m_eff)
    print(m_eff)
    single_site_freqs = np.zeros(shape = (seqs_len, num_site_states),
        dtype = np.float64)
    for i in range(seqs_len):
        
        # CHANGE
        #for a in range(1, num_site_states + 1):#we need gap states single site freqs too
        for a in range(num_site_states):#we need gap states single site freqs too

            column_i = alignment_data[:,i]
            freq_ia = np.sum((column_i==a)*seqs_weight)
            if freq_ia >0:
                print('site %d-%d freq and count: ' % (i, a), freq_ia, np.sum((column_i==a)))
            single_site_freqs[i, a-1] = freq_ia/m_eff
    return single_site_freqs


def get_reg_single_site_freqs(single_site_freqs = None, seqs_len = None,
        num_site_states = None, pseudocount = None):
    """Regularizes single site frequencies.

    Parameters
    ----------
        single_site_freqs : np.array()
            A 2d numpy array of single site frequencies of shape
            (seqs_len, num_site_states). Note that gap state frequencies are
            included in this data.
        seqs_len : int
            The length of sequences in the alignment data
        num_site_states : int
            Total number of states that a site in a sequence can accommodate. It
            includes gap states.
        pseudocount : float
            This is the value of the relative pseudo count of type float.
            theta = lambda/(meff + lambda), where meff is the effective number of
            sequences and lambda is the real pseudo count.

    Returns
    -------
        reg_single_site_freqs : np.array()
            A 2d numpy array of shape (seqs_len, num_site_states) of single site
            frequencies after they are regularized.
    """
    reg_single_site_freqs = single_site_freqs
    theta_by_q = np.float64(pseudocount)/np.float64(num_site_states)
    for i in range(seqs_len):
        for a in range(num_site_states):
            reg_single_site_freqs[i, a] = theta_by_q + \
                (1.0 - pseudocount)*reg_single_site_freqs[i, a]
    return reg_single_site_freqs


def compute_pair_site_freqs(alignment_data=None, num_site_states=None, seqs_weight=None):
    """Computes pair site frequencies for an alignmnet data.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array conatining alignment data. The residues in the
            alignment are in integer representation.
        num_site_states : int
            The number of possible states including gap state that sequence
            sites can accomodate. It must be an integer
        seqs_weight:
            A 1d numpy array of sequences weight

    Returns
    -------
        pair_site_freqs : np.array()
            A 3d numpy array of shape
            (num_pairs, num_site_states, num_site_states) where num_pairs is
            the number of unique pairs we can form from sequence sites. The
            pairs are assumed to in the order (0, 1), (0, 2) (0, 3), ...(0, L-1),
            ... (L-1, L). This ordering is critical and any change must be
            documented.
    """
    alignment_shape = alignment_data.shape
    num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    num_site_pairs = (seqs_len -1)*seqs_len/2
    num_site_pairs = np.int64(num_site_pairs)
    m_eff = np.sum(seqs_weight)
    pair_site_freqs = np.zeros(
        shape=(num_site_pairs, num_site_states - 1, num_site_states - 1),
        dtype = np.float64
    )
    for i in range(seqs_len - 1):
        column_i = alignment_data[:, i]
        for j in range(i+1, seqs_len):
            pair_site = int((seqs_len * (seqs_len - 1)/2) - (seqs_len - i) * ((seqs_len - i) - 1)/2  + j  - i - 1)
            column_j = alignment_data[:, j]
            for a in range(1, num_site_states):
                count_ai = column_i==a
                for b in range(1, num_site_states):
                    count_bj = column_j==b
                    count_ai_bj = count_ai * count_bj
                    freq_ia_jb = np.sum(count_ai_bj*seqs_weight)
                    if freq_ia_jb > 0.0:
                        print('freq for %d-%d, %d-%d:' %(i,a,j,b), freq_ia_jb)
                    pair_site_freqs[pair_site, a-1, b-1] += freq_ia_jb/m_eff
    return pair_site_freqs


(6573, 10)
(6573, 10)
ma_inv (sequences weight shape:  (6573,)
tais meff = 1498.723575
(10, 21)
meff for our MF =  1498.7235751716441
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for

freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211
freq for 3-0, 9-0: 0.013483309669821211


freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.07331402303915104
freq for 6-0, 9-0: 0.0733

(45, 20, 20)
200
c[0]: [ 0.02324263 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689
 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00057484 -0.00056689
 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689
 -0.00056689 -0.00056689  0.00056689  0.00056689  0.00056689  0.00056689
  0.00055895  0.00056689  0.00056689  0.00056557  0.00056689  0.00056425
  0.00055895  0.00056689  0.00056689  0.00056689  0.00056689  0.00056689
  0.00056689  0.00056292  0.00056689  0.00056689  0.00056623  0.00056292
  0.00056689  0.0005664   0.00056576  0.00056557  0.00056674  0.00056292
  0.00055895  0.00056689  0.00056689  0.0005661   0.00056689  0.00056682
  0.00056557  0.00055895  0.00056672  0.00056658  0.00056689  0.00055895
  0.00056292  0.00056676  0.00056689  0.00056689  0.00056658  0.00055895
  0.00055895  0.00055895  0.00056689  0.00056557  0.00056557  0.00056689
  0.00056689  0.00056689  0.00056689  0.00055895  0.00056689  0.00056682
  0.00056689  0.00056292  0.

In [5]:
theta = .2
#try:
 #   seq_weight = np.load('first10_seq_weight_pydca.npy')
#except(FileNotFoundError):
seq_int_count, seq_weight = compute_sequences_weight(alignment_data=s0, seqid=float(1.-theta))
pydca_fi = compute_single_site_freqs(alignment_data=s0, num_site_states=21, seqs_weight=seq_weight)


pydca m_eff = 1498.723575
1498.7235751716441
site 0-10 freq and count:  1.0 1
site 0-20 freq and count:  1497.7235751716441 6572
site 1-4 freq and count:  61.1207123556262 159
site 1-7 freq and count:  519.9221889424228 2630
site 1-9 freq and count:  378.2686072612425 1789
site 1-10 freq and count:  21.444616790205025 51
site 1-17 freq and count:  213.30826684450244 735
site 1-20 freq and count:  304.65918297764506 1209
site 2-0 freq and count:  167.87334367100928 606
site 2-1 freq and count:  26.846696448712578 65
site 2-3 freq and count:  27.15833333333333 59
site 2-4 freq and count:  13.116733511586453 46
site 2-5 freq and count:  63.114436667055834 373
site 2-6 freq and count:  31.332367557660326 95
site 2-7 freq and count:  102.63651432077248 275
site 2-8 freq and count:  71.1758604275334 188
site 2-11 freq and count:  106.75429455658062 550
site 2-13 freq and count:  24.73790872386737 158
site 2-14 freq and count:  32.32732683982684 68
site 2-15 freq and count:  376.9330772841479

In [6]:
pydca_fij = compute_pair_site_freqs(alignment_data=s0, num_site_states=21, seqs_weight=seq_weight)

freq for 0-10, 1-7: 1.0
freq for 0-20, 1-4: 61.1207123556262
freq for 0-20, 1-7: 518.9221889424228
freq for 0-20, 1-9: 378.2686072612425
freq for 0-20, 1-10: 21.444616790205025
freq for 0-20, 1-17: 213.30826684450244
freq for 0-20, 1-20: 304.65918297764506
freq for 0-10, 2-15: 1.0
freq for 0-20, 2-1: 26.846696448712578
freq for 0-20, 2-3: 27.15833333333333
freq for 0-20, 2-4: 13.116733511586453
freq for 0-20, 2-5: 63.114436667055834
freq for 0-20, 2-6: 31.332367557660326
freq for 0-20, 2-7: 102.63651432077248
freq for 0-20, 2-8: 71.1758604275334
freq for 0-20, 2-11: 106.75429455658062
freq for 0-20, 2-13: 24.73790872386737
freq for 0-20, 2-14: 32.32732683982684
freq for 0-20, 2-15: 375.9330772841479
freq for 0-20, 2-16: 192.56629459298549
freq for 0-20, 2-17: 98.89363526346244
freq for 0-20, 2-19: 25.243509984639015
freq for 0-20, 2-20: 138.01324198847078
freq for 0-10, 3-6: 1.0
freq for 0-20, 3-1: 20.056524255549927
freq for 0-20, 3-4: 97.20576749284079
freq for 0-20, 3-5: 20.08242296

freq for 1-4, 7-1: 1.5
freq for 1-4, 7-3: 4.0
freq for 1-4, 7-4: 1.3333333333333333
freq for 1-4, 7-6: 1.6666666666666665
freq for 1-4, 7-7: 4.7
freq for 1-4, 7-8: 2.0
freq for 1-4, 7-9: 3.0
freq for 1-4, 7-10: 11.465151515151515
freq for 1-4, 7-13: 1.0833333333333335
freq for 1-4, 7-14: 1.5
freq for 1-4, 7-16: 2.7111111111111112
freq for 1-4, 7-17: 18.843621269324593
freq for 1-7, 7-1: 18.925833333333333
freq for 1-7, 7-3: 34.73899935770645
freq for 1-7, 7-4: 9.442589763177999
freq for 1-7, 7-6: 16.056856977445214
freq for 1-7, 7-7: 47.00348368453369
freq for 1-7, 7-8: 11.716988011988011
freq for 1-7, 7-9: 38.69142364241281
freq for 1-7, 7-10: 91.14830417534019
freq for 1-7, 7-13: 47.63732877116044
freq for 1-7, 7-14: 48.551340006001126
freq for 1-7, 7-15: 5.5
freq for 1-7, 7-16: 14.521177436491836
freq for 1-7, 7-17: 78.35602207898123
freq for 1-7, 7-19: 20.064547435135673
freq for 1-9, 7-1: 10.990873015873015
freq for 1-9, 7-3: 22.653308211396443
freq for 1-9, 7-4: 5.069444444444445

freq for 2-14, 5-18: 13.93982683982684
freq for 2-14, 5-19: 1.0625
freq for 2-15, 5-4: 14.471059240323946
freq for 2-15, 5-9: 17.42941919191919
freq for 2-15, 5-15: 11.204799586378533
freq for 2-15, 5-17: 205.88768243485754
freq for 2-15, 5-18: 24.572650225628756
freq for 2-15, 5-19: 2.537581699346405
freq for 2-15, 5-20: 0.3333333333333333
freq for 2-16, 5-4: 2.0
freq for 2-16, 5-9: 9.157818749518079
freq for 2-16, 5-17: 90.82648777531826
freq for 2-16, 5-18: 23.464466089466086
freq for 2-16, 5-19: 4.417247386759582
freq for 2-17, 5-4: 0.5
freq for 2-17, 5-9: 4.9213744588744595
freq for 2-17, 5-15: 4.0
freq for 2-17, 5-17: 41.13316883737397
freq for 2-17, 5-18: 3.2695406445406445
freq for 2-17, 5-19: 3.670670995670996
freq for 2-19, 5-9: 1.0
freq for 2-19, 5-17: 11.932795698924732
freq for 2-19, 5-18: 2.2916666666666665
freq for 2-19, 5-19: 1.5
freq for 2-20, 5-4: 3.0
freq for 2-20, 5-9: 9.916666666666668
freq for 2-20, 5-15: 2.0
freq for 2-20, 5-17: 50.235500339695
freq for 2-20, 5-1

freq for 3-7, 4-10: 3.0
freq for 3-7, 4-17: 102.04653632759283
freq for 3-7, 4-19: 1.2750000000000001
freq for 3-7, 4-20: 1.0
freq for 3-9, 4-6: 1.7857142857142856
freq for 3-9, 4-7: 327.11234955746124
freq for 3-9, 4-9: 20.055232294938175
freq for 3-9, 4-10: 14.000548471136707
freq for 3-9, 4-17: 147.2269250279664
freq for 3-9, 4-19: 1.0
freq for 3-10, 4-7: 143.3625754554074
freq for 3-10, 4-10: 4.066666666666666
freq for 3-10, 4-17: 34.18345864661654
freq for 3-15, 4-7: 15.156509592102813
freq for 3-15, 4-9: 1.0
freq for 3-15, 4-10: 2.0
freq for 3-15, 4-17: 7.79766573295985
freq for 3-17, 4-7: 42.67783605283605
freq for 3-17, 4-9: 2.25
freq for 3-17, 4-17: 23.821690087433275
freq for 3-19, 4-7: 9.33066378066378
freq for 3-19, 4-9: 3.0
freq for 3-19, 4-10: 2.6666666666666665
freq for 3-19, 4-17: 8.166666666666668
freq for 3-20, 4-6: 2.0
freq for 3-20, 4-7: 5.25
freq for 3-20, 4-9: 2.0
freq for 3-20, 4-10: 1.0
freq for 3-20, 4-17: 7.125
freq for 3-20, 4-19: 1.0
freq for 3-20, 4-20: 20.

freq for 4-6, 5-4: 0.5
freq for 4-6, 5-9: 3.94757326007326
freq for 4-6, 5-17: 2.9166666666666665
freq for 4-6, 5-19: 1.5
freq for 4-7, 5-4: 19.844406378222168
freq for 4-7, 5-9: 42.09390408081979
freq for 4-7, 5-15: 20.90479958637853
freq for 4-7, 5-17: 397.3444296330416
freq for 4-7, 5-18: 158.06013634521315
freq for 4-7, 5-19: 10.972992851201731
freq for 4-7, 5-20: 0.35416666666666663
freq for 4-9, 5-4: 2.895518207282913
freq for 4-9, 5-9: 2.575847763347763
freq for 4-9, 5-17: 23.345383661088796
freq for 4-9, 5-18: 20.245920745920746
freq for 4-9, 5-19: 0.42647058823529416
freq for 4-10, 5-4: 2.0
freq for 4-10, 5-9: 2.333333333333333
freq for 4-10, 5-17: 12.144141670390283
freq for 4-10, 5-18: 7.244755244755245
freq for 4-17, 5-4: 10.144444444444446
freq for 4-17, 5-9: 14.731165916301956
freq for 4-17, 5-15: 3.0
freq for 4-17, 5-17: 221.20979998178413
freq for 4-17, 5-18: 54.70823028091515
freq for 4-17, 5-19: 13.109806295399515
freq for 4-19, 5-9: 1.3333333333333335
freq for 4-19, 

freq for 5-4, 8-2: 16.80439257365728
freq for 5-4, 8-5: 5.333333333333334
freq for 5-4, 8-11: 1.0
freq for 5-4, 8-15: 2.142857142857143
freq for 5-9, 8-1: 1.0
freq for 5-9, 8-2: 23.48239538239538
freq for 5-9, 8-5: 6.833333333333333
freq for 5-9, 8-11: 1.0
freq for 5-9, 8-15: 7.0
freq for 5-9, 8-16: 5.416439545248597
freq for 5-15, 8-1: 1.0
freq for 5-15, 8-2: 2.833333333333333
freq for 5-15, 8-5: 2.263157894736842
freq for 5-15, 8-15: 6.25
freq for 5-15, 8-16: 1.0
freq for 5-17, 8-1: 22.42834765405098
freq for 5-17, 8-2: 170.8151461546316
freq for 5-17, 8-5: 62.47442639876463
freq for 5-17, 8-11: 8.827380952380953
freq for 5-17, 8-15: 115.88546756568982
freq for 5-17, 8-16: 81.61338035977333
freq for 5-18, 8-1: 0.08333333333333333
freq for 5-18, 8-2: 93.06094939011942
freq for 5-18, 8-5: 20.98514073645653
freq for 5-18, 8-11: 3.0
freq for 5-18, 8-15: 41.261602536462775
freq for 5-18, 8-16: 31.728673901223644
freq for 5-19, 8-2: 10.426470588235293
freq for 5-19, 8-15: 3.5
freq for 5-19

freq for 7-10, 9-13: 24.384349573257396
freq for 7-10, 9-14: 38.460923245141515
freq for 7-10, 9-15: 14.427093628549866
freq for 7-10, 9-16: 11.151103471691707
freq for 7-10, 9-17: 3.1736146632566067
freq for 7-13, 9-2: 2.1870370370370367
freq for 7-13, 9-3: 14.176478599135558
freq for 7-13, 9-5: 1.1798941798941798
freq for 7-13, 9-6: 5.789468124468124
freq for 7-13, 9-7: 1.9861111111111112
freq for 7-13, 9-8: 17.296964721102654
freq for 7-13, 9-9: 12.451864035087718
freq for 7-13, 9-10: 1.2
freq for 7-13, 9-11: 7.665656565656565
freq for 7-13, 9-12: 8.296283245739767
freq for 7-13, 9-13: 11.566443001443002
freq for 7-13, 9-14: 30.05780105310946
freq for 7-13, 9-15: 9.21220950102529
freq for 7-13, 9-16: 8.847374670342486
freq for 7-13, 9-17: 4.844088161580421
freq for 7-14, 9-2: 15.889787740639132
freq for 7-14, 9-3: 7.599127187106728
freq for 7-14, 9-5: 1.0
freq for 7-14, 9-6: 1.0136986301369864
freq for 7-14, 9-7: 3.5
freq for 7-14, 9-8: 9.262364243943193
freq for 7-14, 9-9: 6.75
fre

In [7]:
# Compute Frequencies printing everything out
from inference_dca import direct_info_dca
print(s0.shape)
seq_wt_file = None
seq_wt_file = '%s/protein_data/data_processing_output/first10_seq_weight_%s.npy' % (DCA_ER_dir, pfam_id)

# ----------- DCA DI (MF) calculation --------------------------------------------- #

mf_di, fi, fij, c, cinv, w, w2d, fi_pydca, fij_pydca, c_pydca, c_inv_pydca, w_pydca, w2d_pydcak, di_pydca, ma_inv, seq_ints \
= direct_info_dca(s0, seq_wt_outfile=seq_wt_file, first10=True)
print('c[0]:',c[0])
print('c_pydca[0]', c_pydca[0])

(6573, 10)
(6573, 10)
ma_inv (sequences weight shape:  (6573,)
tais meff = 1498.723575
(10, 21)
meff for our MF =  1498.7235751716441
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for 0-0, 9-0: 0.07331402303915104
freq for

freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.01185407392989529
freq for 2-0, 9-0: 0.0118

freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417
freq for 5-0, 9-0: 0.021632144291582417


freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 8-0, 9-0: 0.008608913055166482
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.0006672344497453262
freq for 9-0, 9-0: 0.00066

c[0]: [ 0.02324263 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689
 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00057484 -0.00056689
 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689 -0.00056689
 -0.00056689 -0.00056689  0.00056689  0.00056689  0.00056689  0.00056689
  0.00055895  0.00056689  0.00056689  0.00056557  0.00056689  0.00056425
  0.00055895  0.00056689  0.00056689  0.00056689  0.00056689  0.00056689
  0.00056689  0.00056292  0.00056689  0.00056689  0.00056623  0.00056292
  0.00056689  0.0005664   0.00056576  0.00056557  0.00056674  0.00056292
  0.00055895  0.00056689  0.00056689  0.0005661   0.00056689  0.00056682
  0.00056557  0.00055895  0.00056672  0.00056658  0.00056689  0.00055895
  0.00056292  0.00056676  0.00056689  0.00056689  0.00056658  0.00055895
  0.00055895  0.00055895  0.00056689  0.00056557  0.00056557  0.00056689
  0.00056689  0.00056689  0.00056689  0.00055895  0.00056689  0.00056682
  0.00056689  0.00056292  0.00056689  0.00056

In [8]:
print('pydca sequences weight array length: ', seq_weight.shape)

pydca sequences weight array length:  (6573,)


In [9]:
print(ma_inv.shape)

(6573,)


In [10]:
print(seq_int_count[:15])
print(seq_ints[:15])
print('\n\nthe difference between our seq_weight and their seq_weights for all %d sequences is %d.... QED\n\n' % 
      (s0.shape[0], abs(seq_weight - seq_ints).sum()))


print(seq_weight[:15])
print(ma_inv[:15])

[ 1.  6.  2. 98.  3. 34. 98. 11. 20. 22.  3.  4. 19. 24. 23.]
[ 1.  6.  2. 98.  3. 34. 98. 11. 20. 22.  3.  4. 19. 24. 23.]


the difference between our seq_weight and their seq_weights for all 6573 sequences is 140010.... QED


[1.         0.16666667 0.5        0.01020408 0.33333333 0.02941176
 0.01020408 0.09090909 0.05       0.04545455 0.33333333 0.25
 0.05263158 0.04166667 0.04347826]
[1.         0.16666667 0.5        0.01020408 0.33333333 0.02941176
 0.01020408 0.09090909 0.05       0.04545455 0.33333333 0.25
 0.05263158 0.04166667 0.04347826]


In [11]:
print(fi.shape)
print(fij.shape)

(10, 21)
(10, 10, 21, 21)
