In [1]:
"""pydca demo

Author: Evan Cresswell-Clay
"""
import os
import subprocess
import timeit
import warnings

import numpy as np
from scipy.spatial import distance_matrix

# Import Bio data processing features
import Bio.PDB
from Bio import BiopythonWarning
from Bio.PDB import *

# import pydca-ER module
from pydca.erdca import erdca
from pydca.sequence_backmapper import sequence_backmapper
from pydca.msa_trimmer import msa_trimmer
from pydca.dca_utilities import dca_utilities

import data_processing as dp
import ecc_tools as tools

pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()


# Pfam family identifier; the file names below are derived from it.
pfam_id = 'PF00186'

# FASTA file names for the alignment and the reference sequence of that family.
protein_msa_file = 'orig_MSA_%s.fa' % (pfam_id,)
protein_refseq_file = 'orig_ref_%s.fa' % (pfam_id,)
# Alternative (currently disabled) file names:
#protein_msa_file = 'MSA_PF00186.fa'
#protein_refseq_file = 'ref_PF00186.fa'

## Use Polypeptide Sequence from PDB as your reference sequence in MSA.

In [2]:
# ---- Configuration -----------------------------------------------------
data_path = '../Pfam-A.full'
# Row of pdb_refs.npy that is used as the reference structure.
ipdb = 0
# False: append the PDB polypeptide to the alignment with MUSCLE (default,
#        this is what the notebook has always done).
# True : look the polypeptide up in the alignment with pydca's
#        SequenceBackmapper instead.
use_sequence_backmapper = False

# ---- Read in Reference Protein Structure ----------------------------
pdb_refs_bytes = np.load('%s/%s/pdb_refs.npy'%(data_path,pfam_id))
# convert bytes to str (python 2 to python 3); the result keeps the
# (row, column) layout of the loaded table.
pdb = np.array([[field.decode('UTF-8') for field in row] for row in pdb_refs_bytes])
tpdb = int(pdb[ipdb,1])
print('Ref Sequence # should be : ',tpdb-1)

# ---- Load Multiple Sequence Alignment --------------------------------
s = dp.load_msa(data_path,pfam_id)

# ---- Load Polypeptide Sequence from PDB as reference sequence --------
print(pdb[ipdb,:])
pdb_id = pdb[ipdb,5]
pdb_chain = pdb[ipdb,6]
pdb_start,pdb_end = int(pdb[ipdb,7]),int(pdb[ipdb,8])

# Download (or reuse a cached copy of) the structure file, then pick the
# requested chain out of the first model.
pdb_file = pdb_list.retrieve_pdb_file(str(pdb_id),file_format='pdb')
chain = pdb_parser.get_structure(str(pdb_id),pdb_file)[0][pdb_chain]
# Explicit namespace so the cell does not rely on `from Bio.PDB import *`.
ppb = Bio.PDB.PPBuilder().build_peptides(chain)
print('peptide build of chain produced %d elements'%(len(ppb)))

# Without at least one polypeptide no FASTA files are written and the
# alignment step below has nothing to work with, so fail with a clear message.
if not ppb:
    raise ValueError('No polypeptide could be built for PDB %s chain %s'%(pdb_id,pdb_chain))

# Becomes True once the PDB polypeptide is known to be part of the alignment
# (direct match found, or appended by MUSCLE).
found_match = False
matching_seq_dict = {}
# NOTE(review): every fragment calls write_FASTA with the same arguments apart
# from the sequence, so when the chain yields several fragments only the
# values from the last one reach the steps below -- confirm this is intended.
for i,pp in enumerate(ppb):
    poly_seq = list(str(pp.get_sequence()))
    print('PDB Polypeptide Sequence: \n',poly_seq)
    #check that poly_seq matches up with given MSA
    pp_msa_file, pp_ref_file = tools.write_FASTA(poly_seq, s, pfam_id, number_form=False,processed=False)

if use_sequence_backmapper:
    # Incorporate SequenceBackmapper to see if PP sequence is in the MSA already.
    # Or if theres a close enough match
    seqbackmapper = sequence_backmapper.SequenceBackmapper(
        msa_file = pp_msa_file,
        refseq_file = pp_ref_file,
        biomolecule = 'protein',
    )
    matching_seqs, matching_seqs_indx = seqbackmapper.find_matching_seqs_from_alignment()

    for sequence_match in matching_seqs:
        seq_char_list = np.asarray([char for char in sequence_match])
        print('matching sequence :', seq_char_list)
        # remove gaps '-'
        seq_gaps = seq_char_list == '-'
        seq_char_list = seq_char_list[~seq_gaps]
        print('matching sequence :', seq_char_list)
        if len(seq_char_list) > len(poly_seq):
            # Longer than the PDB polypeptide, so it cannot match position by
            # position; move on to the next candidate instead of giving up.
            continue
        residue_matches = [seq_char == poly_seq[qq] for qq,seq_char in enumerate(seq_char_list)]
        if all(residue_matches):
            print("Direct Match in MSA")
            found_match = True
            break
        print(residue_matches)
else:
    #just add using muscle:
    #https://www.drive5.com/muscle/manual/addtomsa.html
    #https://www.drive5.com/muscle/downloads.htmL
    muscle_msa_file = 'PP_muscle_msa_'+pfam_id+'.fa'
    # Argument list instead of a shell string: no quoting issues with file
    # names, and check=True raises if MUSCLE is missing or exits non-zero
    # rather than silently continuing without the output file.
    muscle_cmd = [
        os.path.expanduser('~/muscle'), '-profile',
        '-in1', str(pp_msa_file),
        '-in2', str(pp_ref_file),
        '-out', muscle_msa_file,
    ]
    subprocess.run(muscle_cmd, check=True)
    found_match = True
    print("PP sequence added to alignment via MUSCLE")


        



Ref Sequence # should be :  68
shape of s (import from msa.npy):
 (7750, 918)
shape of s (after UTF-8 decode):
 (7750, 918)
['PF00186' '69' 'Q5KZ26_GEOKA' '1' '160' '1ZDR' 'B' '1' '160']
Structure exists: '/home/eclay/DCA_ER/zd/pdb1zdr.ent' 
peptide build of chain produced 1 elements
PDB Polypeptide Sequence: 
 ['M', 'I', 'S', 'H', 'I', 'V', 'A', 'M', 'D', 'E', 'N', 'R', 'V', 'I', 'G', 'K', 'D', 'N', 'R', 'L', 'P', 'W', 'H', 'L', 'P', 'A', 'D', 'L', 'A', 'Y', 'F', 'K', 'R', 'V', 'T', 'M', 'G', 'H', 'A', 'I', 'V', 'M', 'G', 'R', 'K', 'T', 'F', 'E', 'A', 'I', 'G', 'R', 'P', 'L', 'P', 'G', 'R', 'D', 'N', 'V', 'V', 'V', 'T', 'G', 'N', 'R', 'S', 'F', 'R', 'P', 'E', 'G', 'C', 'L', 'V', 'L', 'H', 'S', 'L', 'E', 'E', 'V', 'K', 'Q', 'W', 'I', 'A', 'S', 'R', 'A', 'D', 'E', 'V', 'F', 'I', 'I', 'G', 'G', 'A', 'E', 'L', 'F', 'R', 'A', 'T', 'M', 'P', 'I', 'V', 'D', 'R', 'L', 'Y', 'V', 'T', 'K', 'I', 'F', 'A', 'S', 'F', 'P', 'G', 'D', 'T', 'F', 'Y', 'P', 'P', 'I', 'S', 'D', 'D', 'E', 'W', 'E', 'I', '

In [3]:

# NOTE: the index printed by the line below does not point at the correct sequence,
# because pydca trims bad sequences when it loads the alignment (e.g. for PF00186 the shape of s goes from 7750 to 7561).
#print('Polypeptide sequence exists:\n msa[%d] = \n'%matching_seqs_indx[0],s[matching_seqs_indx[0]],'\n\n')

# create MSATrimmer instance
# Input alignment is the MUSCLE output file (muscle_msa_file); the reference
# sequence is the polypeptide FASTA file (pp_ref_file).
trimmer = msa_trimmer.MSATrimmer(
    muscle_msa_file, biomolecule='protein',
    refseq_file=pp_ref_file
)

# Adding the data_processing() curation from tools to erdca.
preprocessed_data,s_index, cols_removed,s_ipdb = trimmer.get_preprocessed_msa(printing=True, saving = False)

#write trimmed msa to file in FASTA format
# The file name follows pfam_id so a different family does not overwrite
# (or get mislabelled as) the PF00186 output.
preprocessed_data_outfile = 'MSA_%s_PreProcessed.fa'%pfam_id
with open(preprocessed_data_outfile, 'w') as fh:
    # preprocessed_data is iterated as (sequence id, sequence) pairs,
    # one FASTA record per pair.
    for seqid, seq in preprocessed_data:
        fh.write('>{}\n{}\n'.format(seqid, seq))
        


sequence indices which match ref seq: 
 [7561]
s_trimmed[7562] =  ['-', 'V', 'V', 'M', 'V', 'A', 'A', 'L', 'T', 'R', 'N', 'G', 'V', 'I', 'G', 'V', 'D', 'N', 'R', 'L', 'P', 'W', 'H', 'L', 'P', 'E', 'D', 'L', 'K', 'F', 'F', 'K', 'R', 'I', 'T', 'L', 'G', 'K', 'P', 'L', 'V', 'M', 'G', 'R', 'K', 'T', 'F', 'D', 'S', 'I', 'G', 'R', 'P', 'L', 'P', 'G', 'R', 'L', 'N', 'I', 'V', 'V', 'T', 'R', 'D', 'T', 'S', 'F', 'Q', 'R', 'E', 'G', 'V', 'R', 'V', 'C', 'H', 'D', 'L', 'A', 'D', 'A', 'L', 'Q', 'A', 'I', 'A', 'E', 'G', 'V', 'D', 'E', 'I', 'A', 'V', 'I', 'G', 'G', 'G', 'E', 'I', 'F', 'T', 'Q', 'A', 'M', 'P', 'R', 'A', 'S', 'R', 'L', 'Y', 'L', 'T', 'E', 'I', 'D', 'T', 'T', 'L', 'E', 'G', 'D', 'A', 'Y', 'F', 'P', 'P', 'L', 'D', 'D', 'R', 'Q', 'W', 'Q', 'E', 'I', '-', '-', '-', '-', '-', 'E', 'R', 'R', 'G', 'T', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
MSA trimmed by internal function



starting shape:  (7750, 160)


#-------------------------Remove Gaps--------------------------#
s