In [1]:
import os.path

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt
%matplotlib inline

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing
import ecc_tools as tools
from pathlib import Path
np.random.seed(1)


def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [2]:
data_path = Path('/home', 'eclay','Pfam-A.full')
data_path = Path('Pfam-A.full')
DCA_ER_dir = Path('/home/evan/PycharmProjects/DCA_ER/')



pfam_id = 'PF00023'
pfam_id = 'PF00011'
pfam_id = 'PF00186'

n_jobs = 26
create_new = True


if 0:
    DCA_ER_dir = '/home/eclay/DCA_ER/'
    msa_npy_file = '/home/eclay/Pfam-A.full/%s/msa.npy' % pfam_id # Hurricane Location
    msa_fa_file  = '/home/eclay/Pfam-A.full/%s/msa.fa' % pfam_id # Hurricane Location
    pdb_ref_file = '/home/eclay/Pfam-A.full/%s/pdb_refs.npy' % pfam_id # Hurricane Location
if 1:
    DCA_ER_dir = '/home/evan/DCA_ER/'
    msa_npy_file = '/home/evan/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/home/evan/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/home/evan/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir
if 0:
    DCA_ER_dir = '/home/ecresswell/DCA_ER/'
    msa_npy_file = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/home/ecresswell/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir
if 1:
    DCA_ER_dir = '/data/cresswellclayec/DCA_ER/'
    msa_npy_file = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/msa.npy' % pfam_id
    msa_fa_file  = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/msa.fa' % pfam_id
    pdb_ref_file = '/data/cresswellclayec/DCA_ER/Pfam-A.full/%s/pdb_refs.npy' % pfam_id
    out_dir = '%sprotein_data/di/' % DCA_ER_dir




# Set DCA_ER directory
DCA_dir = os.getcwd()

# Define data directories
# Need to think on best way to do this..
# Referencing the same dataframe may be useful so we dont always have to load individual ref files...
individual_pdb_ref_file = Path(data_path, pfam_id, 'pdb_refs.npy')
pdb = np.load(individual_pdb_ref_file)
processed_data_dir = "%s/protein_data/data_processing_output" % DCA_dir

# delete 'b' in front of letters (python 2 --> python 3)
pdb = np.array([pdb[t,i].decode('UTF-8') for t in range(pdb.shape[0]) \
         for i in range(pdb.shape[1])]).reshape(pdb.shape[0],pdb.shape[1])


# Print number of pdb structures in Protein ID folder
npdb = pdb.shape[0]
print('number of pdb structures:',npdb)

# Create pandas dataframe for protein structure
pdb_df = pd.DataFrame(pdb,columns = ['PF','seq','id','uniprot_start','uniprot_end',\
                                 'pdb_id','chain','pdb_start','pdb_end'])
print(pdb_df.head())

# print("Direct Information from Expectation reflection:\n",di)
def no_diag(mat, diag_l, s_index=None, make_big=False):
    rows, columns = mat.shape
    if make_big:
        new_mat = 100. * np.ones((rows,columns))
    else:
        new_mat = np.zeros((rows,columns))
    for row in range(rows):
        for col in range(columns):
            if s_index is None:
                if abs(row-col) > diag_l:
                    new_mat[row, col] = mat[row ,col]
            else:
                if abs(s_index[row]-s_index[col]) > diag_l:
                    new_mat[row, col] = mat[row ,col]    
    return new_mat


number of pdb structures: 372
        PF seq            id uniprot_start uniprot_end pdb_id chain pdb_start  \
0  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     B         1   
1  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     A         1   
2  PF00186  83  Q81R22_BACAN             2         160   3JWK     B         2   
3  PF00186  83  Q81R22_BACAN             2         160   3S9U     B         2   
4  PF00186  83  Q81R22_BACAN             2         160   3FL9     H         2   

  pdb_end  
0     160  
1     160  
2     160  
3     160  
4     160  


In [3]:
import ecc_tools as tools
ipdb = 0
printing = True
print('seq:',int(pdb[ipdb,1]))



s0,cols_removed, s_index, tpdb, orig_seq_len = data_processing(data_path, pfam_id, ipdb,\
                gap_seqs=0.2, gap_cols=0.2, prob_low=0.004, conserved_cols=0.9, printing=printing, \
                                                               out_dir=processed_data_dir)
print('\n\n\nPreprocessed reference Sequence: ', s0[tpdb])

# npy2fa does not remove cols this way we are as close to original as possible
msa_outfile, ref_outfile, s0, cols_removed, s_index, tpdb, orig_seq_len  = tools.npy2fa(pfam_id, msa_npy_file,\
                                                                                        pdb_ref_file=pdb_ref_file,\
                                                                                        ipdb=ipdb, preprocess=True,\
                                                                                        gap_seqs=.2, gap_cols=.2, \
                                                                                        prob_low=.004, \
                                                                                        conserved_cols=.9, \
                                                                                        letter_format=False, \
                                                                                        first_10=True)

print('\n\nfirst 50 S: ', s0.shape, '\n\n')
pdb_id = pdb_df.iloc[ipdb]['pdb_id']
pdb_chain = pdb_df.iloc[ipdb]['chain']

pdb_pfam_map_file = Path('%s/protein_data/pdb_data/pdb_pfam_mapping.csv' % DCA_dir)
pdb_map_df = pd.read_csv(pdb_pfam_map_file, sep=',', header=1)
print(pdb_map_df.head())

pdb_id_map_df = pdb_map_df.loc[pdb_map_df['PDB']==pdb_id.lower()]
pdb_pfam_map = pdb_id_map_df.loc[pdb_id_map_df['CHAIN']==pdb_chain]


seq: 69
pdb ref example (pdb[0])  (after UTF-8 decode, removing 'b'):
 ['PF00186' '69' 'Q5KZ26_GEOKA' '1' '160' '1ZDR' 'B' '1' '160']
tpdb (s_ipdb) is :  69
#

-------------------------Remove Gaps--------------------------#
Shape of s is :  (7750, 918)
s = 
 [['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ...
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']
 ['-' '-' '-' ... '-' '-' '-']]
s[tpdb] shape is  (160,)
s = 
 [['-' '-' '-' ... 'Y' 'E' 'K']
 ['M' 'I' 'S' ... 'W' 'E' 'R']
 ['-' 'L' 'A' ... 'Y' 'E' 'R']
 ...
 ['-' '-' '-' ... '-' '-' '-']
 ['-' 'V' 'S' ... 'Y' 'E' 'R']
 ['-' 'F' 'S' ... 'Y' 'E' 'K']]
though s still has gaps, s[69] does not:
 ['M' 'I' 'S' 'H' 'I' 'V' 'A' 'M' 'D' 'E' 'N' 'R' 'V' 'I' 'G' 'K' 'D' 'N'
 'R' 'L' 'P' 'W' 'H' 'L' 'P' 'A' 'D' 'L' 'A' 'Y' 'F' 'K' 'R' 'V' 'T' 'M'
 'G' 'H' 'A' 'I' 'V' 'M' 'G' 'R' 'K' 'T' 'F' 'E' 'A' 'I' 'G' 'R' 'P' 'L'
 'P' 'G' 'R' 'D' 'N' 'V' 'V' 'V' 'T' 'R' 'N' 'R' 'S' 'F' 'R' 'P' 

In [15]:

def compute_sequences_weight(alignment_data=None, seqid=None):
    """Computes weight of sequences. The weights are calculated by lumping
    together sequences whose identity is greater that a particular threshold.
    For example, if there are m similar sequences, each of them will be assigned
    a weight of 1/m. Note that the effective number of sequences is the sum of
    these weights.

    Parameters
    ----------
        alignmnet_data : np.array()
            Numpy 2d array of the alignment data, after the alignment is put in
            integer representation
        seqid : float
            Value at which beyond this sequences are considered similar. Typical
            values could be 0.7, 0.8, 0.9 and so on

    Returns
    -------
        seqs_weight : np.array()
            A 1d numpy array containing computed weights. This array has a size
            of the number of sequences in the alignment data.
    """
    alignment_shape = alignment_data.shape
    num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    seqs_weight = np.zeros((num_seqs,), dtype=np.float64)
    inv_seqs_weight = np.zeros((num_seqs,), dtype=np.float64)

    #count similar sequences
    for i in range(num_seqs):
        seq_i = alignment_data[i]
        for j in range(num_seqs):
            seq_j = alignment_data[j]
            iid = np.sum(seq_i==seq_j)
            if np.float64(iid)/np.float64(seqs_len) > seqid:
                seqs_weight[i] += 1
    #compute the weight of each sequence in the alignment
    for i in range(num_seqs): inv_seqs_weight[i] = 1.0/float(seqs_weight[i])
    np.save('first10_seq_weight_pydca.npy', seqs_weight)
    return seqs_weight, inv_seqs_weight


def compute_single_site_freqs(alignment_data=None,
        num_site_states=None, seqs_weight=None):
    """Computes single site frequency counts for a particular aligmnet data.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array of alignment data represented in integer form.

        num_site_states : int
            An integer value fo the number of states a sequence site can have
            including a gap state. Typical value is 5 for RNAs and 21 for
            proteins.

        seqs_weight : np.array()
            A 1d numpy array of sequences weight

    Returns
    -------
        single_site_freqs : np.array()
            A 2d numpy array of of data type float64. The shape of this array is
            (seqs_len, num_site_states) where seqs_len is the length of sequences
            in the alignment data.
    """
    alignment_shape = alignment_data.shape
    #num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    m_eff = np.sum(seqs_weight)
    print('pydca m_eff = %f' % m_eff)
    print(m_eff)
    single_site_freqs = np.zeros(shape = (seqs_len, num_site_states),
        dtype = np.float64)
    for i in range(seqs_len):
        
        #for a in range(1, num_site_states + 1):#we need gap states single site freqs too
        for a in range(num_site_states-1):#we need gap states single site freqs too  ## ECC CHANGE 
                                            ## why do you need gap states? also shifted aa range to 0-20

            column_i = alignment_data[:,i]
            freq_ia = np.sum((column_i==a)*seqs_weight)
            # single_site_freqs[i, a-1] = freq_ia/m_eff
            single_site_freqs[i, a] = freq_ia/m_eff ## ECC CHANGE -- index offset no longer necessary
            if freq_ia >0:
                print('site %d-%d freq and count: ' % (i, a), single_site_freqs[i,a-1], np.sum((column_i==a)))
            
    return single_site_freqs


def get_reg_single_site_freqs(single_site_freqs = None, seqs_len = None,
        num_site_states = None, pseudocount = None):
    """Regularizes single site frequencies.

    Parameters
    ----------
        single_site_freqs : np.array()
            A 2d numpy array of single site frequencies of shape
            (seqs_len, num_site_states). Note that gap state frequencies are
            included in this data.
        seqs_len : int
            The length of sequences in the alignment data
        num_site_states : int
            Total number of states that a site in a sequence can accommodate. It
            includes gap states.
        pseudocount : float
            This is the value of the relative pseudo count of type float.
            theta = lambda/(meff + lambda), where meff is the effective number of
            sequences and lambda is the real pseudo count.

    Returns
    -------
        reg_single_site_freqs : np.array()
            A 2d numpy array of shape (seqs_len, num_site_states) of single site
            frequencies after they are regularized.
    """
    reg_single_site_freqs = single_site_freqs
    theta_by_q = np.float64(pseudocount)/np.float64(num_site_states)
    for i in range(seqs_len):
        # for a in range(num_site_states):
        for a in range(num_site_states-1): ## ECC CHANGE -- took away gap state
            reg_single_site_freqs[i, a] = theta_by_q + \
                (1.0 - pseudocount)*reg_single_site_freqs[i, a]
            # print('site %d-%d regularized freq: ' % (i, a), reg_single_site_freqs[i,a])
    return reg_single_site_freqs


def compute_pair_site_freqs(alignment_data=None, num_site_states=None, seqs_weight=None):
    """Computes pair site frequencies for an alignmnet data.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array conatining alignment data. The residues in the
            alignment are in integer representation.
        num_site_states : int
            The number of possible states including gap state that sequence
            sites can accomodate. It must be an integer
        seqs_weight:
            A 1d numpy array of sequences weight

    Returns
    -------
        pair_site_freqs : np.array()
            A 3d numpy array of shape
            (num_pairs, num_site_states, num_site_states) where num_pairs is
            the number of unique pairs we can form from sequence sites. The
            pairs are assumed to in the order (0, 1), (0, 2) (0, 3), ...(0, L-1),
            ... (L-1, L). This ordering is critical and any change must be
            documented.
    """
    alignment_shape = alignment_data.shape
    num_seqs = alignment_shape[0]
    seqs_len = alignment_shape[1]
    num_site_pairs = (seqs_len -1)*seqs_len/2
    num_site_pairs = np.int64(num_site_pairs)
    m_eff = np.sum(seqs_weight)
    pair_site_freqs = np.zeros(
        shape=(num_site_pairs, num_site_states - 1, num_site_states - 1),
        dtype = np.float64
    )
    for i in range(seqs_len - 1):
        column_i = alignment_data[:, i]
        for j in range(i+1, seqs_len):
            pair_site = int((seqs_len * (seqs_len - 1)/2) - (seqs_len - i) * ((seqs_len - i) - 1)/2  + j  - i - 1)
            column_j = alignment_data[:, j]
            ## ECC CHANGE
            # for a in range(1, num_site_states):
            for a in range(num_site_states-1):
                count_ai = column_i==a
                
                ## ECC CHANGE
                # for b in range(1, num_site_states):
                for b in range(num_site_states-1):
                    count_bj = column_j==b
                    count_ai_bj = count_ai * count_bj
                    freq_ia_jb = np.sum(count_ai_bj*seqs_weight)
                    #if freq_ia_jb > 0.0:
                        # print('freq for %d-%d, %d-%d:' %(i,a,j,b), freq_ia_jb)
                    # pair_site_freqs[pair_site, a-1, b-1] += freq_ia_jb/m_eff
                    pair_site_freqs[pair_site, a, b] += freq_ia_jb/m_eff ## ECC CHANGE -- shift index not needed

    return pair_site_freqs

def get_reg_single_site_freqs(single_site_freqs = None, seqs_len = None,
        num_site_states = None, pseudocount = None):
    """Regularizes single site frequencies.

    Parameters
    ----------
        single_site_freqs : np.array()
            A 2d numpy array of single site frequencies of shape
            (seqs_len, num_site_states). Note that gap state frequencies are                         
            included in this data.                                                                   
        seqs_len : int
            The length of sequences in the alignment data                                            
        num_site_states : int
            Total number of states that a site in a sequence can accommodate. It                     
            includes gap states.                                                                     
        pseudocount : float
            This is the value of the relative pseudo count of type float.
            theta = lambda/(meff + lambda), where meff is the effective number of
            sequences and lambda is the real pseudo count.

    Returns
    -------
        reg_single_site_freqs : np.array()
            A 2d numpy array of shape (seqs_len, num_site_states) of single site
            frequencies after they are regularized.
    """
    reg_single_site_freqs = single_site_freqs
    theta_by_q = np.float64(pseudocount)/np.float64(num_site_states)
    for i in range(seqs_len):
        for a in range(num_site_states-1): ## ECC CHANGE -- not includeing gap states
            reg_single_site_freqs[i, a] = theta_by_q + \
                (1.0 - pseudocount)*reg_single_site_freqs[i, a]
            print('site %d-%d regularized freq: ' % (i, a), reg_single_site_freqs[i,a])

    return reg_single_site_freqs

def get_reg_pair_site_freqs(pair_site_freqs = None, seqs_len = None,
        num_site_states = None, pseudocount = None):
    """Regularizes pair site frequencies

    Parameters
    ----------
        pair_site_freqs : np.array()
            A 3d numpy array of shape (num_unique_site_pairs, num_site_states -1,
            num_site_states -1) containing raw pair site frequency counts where
            num_unique_site_pairs is the total number of unique site pairs
            excluding self pairing. Note that the order in with the pairing is
            done is important. It must be taken in (0, 1), (0,2), ...,
            (0, seqs_len-1), (1, 2)... order. Note that this data does not
            contain pairings with gap states.
        seqs_len : int
            The length of sequences in the alignment.
        num_site_states : int
            The total number of states that a site in the sequences can
            accommodate. This includes gap states.

    Returns
    -------
        reg_pair_site_freqs : np.array()
            A numpy array of shape the same as pair_site_freqs
    """
    reg_pair_site_freqs = pair_site_freqs
    theta_by_qsqrd = pseudocount/float(num_site_states * num_site_states)
    pair_counter = 0
    for i in range(seqs_len - 1):
        for j in range(i + 1, seqs_len):
            for a in range(num_site_states-1):
                for b in range(num_site_states-1):
                    reg_pair_site_freqs[pair_counter, a, b] = theta_by_qsqrd + \
                        (1.0 - pseudocount)*reg_pair_site_freqs[pair_counter, a, b]
            pair_counter += 1
    return reg_pair_site_freqs



# Comparing Frequency (single-site/pair-site) for first 10 AA
* We want to number of effective to be the same (m_eff/meff)
* We want counts to be the same
* from here single site should be the same

In [5]:
# Compute Frequencies printing everything out
from inference_dca import direct_info_dca
print(s0.shape)
seq_wt_file = None
seq_wt_file = '%s/protein_data/data_processing_output/seq_weight_%s.npy' % (DCA_ER_dir, pfam_id)

# ----------- DCA DI (MF) calculation --------------------------------------------- #

mf_di, fi, fij, c, cinv, w, w2d, fi_pydca, fij_pydca, c_pydca, c_inv_pydca, w_pydca, w2d_pydcak, di_pydca, ma_inv,seq_ints \
= direct_info_dca(s0, seq_wt_outfile=seq_wt_file, first10=True)
print('c[0]:',c[0])
print('c_pydca[0]', c_pydca[0])

(6573, 10)
(6573, 10)
ma_inv (sequences weight shape:  (6573,)
tais meff = 1498.723575
site 0-10 freq and count:  0.0006672344497453262 1.0
site 1-4 freq and count:  0.04078184487664864 159.0
site 1-7 freq and count:  0.3469099956493809 2630.0
site 1-9 freq and count:  0.2523938460218855 1789.0
site 1-10 freq and count:  0.014308587084011833 51.0
site 1-17 freq and count:  0.14232662405412083 735.0
site 2-0 freq and count:  0.11201087809123383 606.0
site 2-1 freq and count:  0.017913040732436537 65.0
site 2-3 freq and count:  0.018120975597666813 59.0
site 2-4 freq and count:  0.008751936467059468 46.0
site 2-5 freq and count:  0.04211212642052938 373.0
site 2-6 freq and count:  0.020906035026553794 95.0
site 2-7 freq and count:  0.06848261815659899 275.0
site 2-8 freq and count:  0.04749098606751544 188.0
site 2-11 freq and count:  0.07123014298641069 550.0
site 2-13 freq and count:  0.01650598491521976 158.0
site 2-14 freq and count:  0.02156990613570917 68.0
site 2-15 freq and count

c[0]: [ 2.32426304e-02 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04
 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04
 -5.66893424e-04 -5.66893424e-04 -5.74836691e-04 -5.66893424e-04
 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04
 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04 -5.66893424e-04
  5.66893424e-04  5.66893424e-04  5.66893424e-04  5.66893424e-04
  8.13952707e-05  5.66893424e-04  5.66893424e-04 -3.56298748e-03
  5.66893424e-04 -2.43779522e-03  3.96553102e-04  5.66893424e-04
  5.66893424e-04  5.66893424e-04  5.66893424e-04  5.66893424e-04
  5.66893424e-04 -1.12747115e-03  5.66893424e-04  5.66893424e-04
 -7.66569410e-04  3.53642939e-04  5.66893424e-04  3.51167524e-04
  4.62703704e-04  6.55585857e-05  3.18012055e-04 -2.48375840e-04
  1.52454228e-06  5.66893424e-04  5.66893424e-04 -2.81084469e-04
  5.66893424e-04  3.70393604e-04  3.10108827e-04 -2.42718675e-03
 -9.62712119e-04 -2.18645151e-04  5.66893424e-04  3.66377478e-04
 -7.54622808e-04  4

In [6]:
theta = .2
#try:
 #   seq_weight = np.load('first10_seq_weight_pydca.npy')
#except(FileNotFoundError):
seq_int_count, seq_weight = compute_sequences_weight(alignment_data=s0, seqid=float(1.-theta))
pydca_fi = compute_single_site_freqs(alignment_data=s0, num_site_states=21, seqs_weight=seq_weight)

reg_pydca_fi = get_reg_single_site_freqs(single_site_freqs = pydca_fi, seqs_len = s0.shape[1],
    num_site_states = 21, pseudocount = .5)


pydca m_eff = 1498.723575
1498.7235751716441
site 0-10 freq and count:  0.0 1
site 1-4 freq and count:  0.0 159
site 1-7 freq and count:  0.0 2630
site 1-9 freq and count:  0.0 1789
site 1-10 freq and count:  0.252393846021886 51
site 1-17 freq and count:  0.0 735
site 2-0 freq and count:  0.0 606
site 2-1 freq and count:  0.11201087809123392 65
site 2-3 freq and count:  0.0 59
site 2-4 freq and count:  0.018120975597666816 46
site 2-5 freq and count:  0.008751936467059468 373
site 2-6 freq and count:  0.042112126420529236 95
site 2-7 freq and count:  0.020906035026553797 275
site 2-8 freq and count:  0.06848261815659891 188
site 2-11 freq and count:  0.0 550
site 2-13 freq and count:  0.0 158
site 2-14 freq and count:  0.01650598491521975 68
site 2-15 freq and count:  0.021569906135709176 2133
site 2-16 freq and count:  0.25150273441250093 1004
site 2-17 freq and count:  0.12848686561224706 437
site 2-19 freq and count:  0.0 44
site 3-0 freq and count:  0.0 494
site 3-1 freq and count

In [8]:
# Compute Frequencies printing everything out
# from inference_dca import direct_info_dca
# print(s0.shape)
# seq_wt_file = None
# seq_wt_file = '%s/protein_data/data_processing_output/first10_seq_weight_%s.npy' % (DCA_ER_dir, pfam_id)

# # ----------- DCA DI (MF) calculation --------------------------------------------- #

# mf_di, fi, fij, c, cinv, w, w2d, fi_pydca, fij_pydca, c_pydca, c_inv_pydca, w_pydca, w2d_pydcak, di_pydca, ma_inv, seq_ints \
# = direct_info_dca(s0, seq_wt_outfile=seq_wt_file, first10=True)
# print('c[0]:',c[0])
# print('c_pydca[0]', c_pydca[0])

In [9]:
print('pydca sequences weight array length: ', seq_weight.shape)

pydca sequences weight array length:  (6573,)


In [10]:
print(ma_inv.shape)

(6573,)


### Difference in Sequence weights

In [11]:
print(seq_int_count[:15])
print(seq_ints[:15])
print('\n\nthe difference between our seq_weight (ma_inv) and their seq_weights for all %d sequences is %d.... QED\n\n' % 
      (s0.shape[0], abs(seq_weight - ma_inv).sum()))


print(seq_weight[:15])
print(ma_inv[:15])

[ 1.  6.  2. 98.  3. 34. 98. 11. 20. 22.  3.  4. 19. 24. 23.]
[ 1.  6.  2. 98.  3. 34. 98. 11. 20. 22.  3.  4. 19. 24. 23.]


the difference between our seq_weight (ma_inv) and their seq_weights for all 6573 sequences is 0.... QED


[1.         0.16666667 0.5        0.01020408 0.33333333 0.02941176
 0.01020408 0.09090909 0.05       0.04545455 0.33333333 0.25
 0.05263158 0.04166667 0.04347826]
[1.         0.16666667 0.5        0.01020408 0.33333333 0.02941176
 0.01020408 0.09090909 0.05       0.04545455 0.33333333 0.25
 0.05263158 0.04166667 0.04347826]


### Frequency Matrix Shapes

In [12]:
print(fi.shape)
print(pydca_fi.shape)


(10, 21)
(10, 10, 21, 21)
(10, 21)


### Difference in Single-Site Frequency

In [13]:
fi_diff = fi-reg_pydca_fi
print(fi_diff[0])
print(fi_diff[1])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.52347591]
[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.38777878e-17  0.00000000e+00  0.00000000e+00 -1.05471187e-15
  0.00000000e+00 -2.49800181e-16  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.38777878e-17  0.00000000e+00  0.00000000e+00
  1.25449075e-01]


### Pair-Site Frequency

In [16]:
pydca_fij = compute_pair_site_freqs(alignment_data=s0, num_site_states=21, seqs_weight=seq_weight)
reg_pydca_fij = get_reg_pair_site_freqs(pair_site_freqs = pydca_fij, seqs_len = s0.shape[1],
        num_site_states = 21, pseudocount = .5)


In [24]:
print(fij.shape)
print(pydca_fij.shape)
pair_counter = 0
fij_diffs = []
for i in range(s0.shape[1] - 1):
    for j in range(i + 1, s0.shape[1]):
        for a in range(21-1):
            for b in range(21-1):
                fij_diffs.append(reg_pydca_fij[pair_counter, a, b] - fij[i,j,a,b]) 
        pair_counter +=1

print(np.sum(fij_diffs))

(10, 10, 21, 21)
(45, 20, 20)
-4.54454182619024e-15
