In [1]:
# --- Standard library ---
import os.path, sys

# --- Numerical / data stack ---
import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

# --- Biopython PDB tooling; Biopython warnings are silenced for the whole session ---
import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

# --- Parallelism / timing ---
from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt

# --- Project-local code ---
# (the emachine import below is intentionally disabled)
#import emachine as EM
from direct_info import direct_info

# Path handling, plus the global NumPy seed for this session.
# NOTE: a later cell calls np.random.seed again with a different value.
from pathlib import Path
np.random.seed(1)

# --- Sequence I/O and pairwise alignment ---
from Bio import SeqIO
# NOTE(review): wildcard import - pulls every public Bio.PDB name into the notebook
# namespace; consider importing only the names that are actually used.
from Bio.PDB import *
# NOTE: distance_matrix was already imported above; this repeat is redundant.
from scipy.spatial import distance_matrix
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum62
# NOTE: rebinds pdb_parser, replacing the PDBParser instance created above.
pdb_parser = Bio.PDB.PDBParser()
from Bio.pairwise2 import format_alignment
# NOTE(review): second wildcard import - names exported by prody may shadow names
# imported earlier (including those from Bio.PDB); verify none of them collide.
from prody import *

print('Done with initial import')



Done with initial import


In [2]:
from data_processing import pdb2msa, data_processing_pdb2msa

import ecc_tools as tools


In [3]:
# --- Run options ---
create_new = False
printing = True
removing_cols = True
remove_diagonals = False

# Location of the Pfam MSA data.
# An earlier assignment, Path('/data/cresswellclayec/DCA_ER/Pfam-A.full'), was
# immediately overwritten by this one; only the effective value is kept.
data_path = Path('/data/cresswellclayec/Pfam-A.full')

# Define data directories
DCA_ER_dir = '/data/cresswellclayec/DCA_ER' # Set DCA_ER directory
biowulf_dir = '%s/biowulf_pdb2msa' % DCA_ER_dir

# Output locations for direct-information results and metrics
out_dir = '%s/protein_data/di/' % biowulf_dir
out_metric_dir = '%s/protein_data/metrics/' % biowulf_dir

processed_data_dir = "%s/protein_data/data_processing_output" % biowulf_dir
pdb_dir = '%s/protein_data/pdb_data/' % biowulf_dir

# Gzipped PDB entry to process.
# pdb_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"
pdb_path = "/pdb/pdb/d1/pdb2d10.ent.gz"
n_cpus = 4
print('\n\nUnzipping %s' % pdb_path)

# Name of the decompressed file: drop only a trailing ".gz" so that a ".gz"
# appearing elsewhere in the file name is left intact.
unzipped_pdb_filename = os.path.basename(pdb_path)
if unzipped_pdb_filename.endswith(".gz"):
    unzipped_pdb_filename = unzipped_pdb_filename[:-len(".gz")]

# os.path.join gives the same result whether or not pdb_dir ends with a separator.
pdb_out_path = os.path.join(pdb_dir, unzipped_pdb_filename)


print('Unzipping %s to %s' % (pdb_path, pdb_out_path))

# --------------------- Data Processing (should be saving correct row!!!!) --- #
import gzip, shutil
def gunzip(file_path,output_path):
    """Decompress the gzip file at `file_path` and write the raw bytes to `output_path`.

    The output file is created or truncated. The data is streamed with
    shutil.copyfileobj rather than read into memory in one piece.
    """
    with gzip.open(file_path, "rb") as compressed:
        with open(output_path, "wb") as decompressed:
            shutil.copyfileobj(compressed, decompressed)



# Opening the output file fails if its directory is missing, so create the
# destination directory first (no-op when it already exists).
os.makedirs(os.path.dirname(pdb_out_path) or '.', exist_ok=True)
gunzip(pdb_path, pdb_out_path)

print('Done unzipping pdb file')





Unzipping /pdb/pdb/d1/pdb2d10.ent.gz
Unzipping /pdb/pdb/d1/pdb2d10.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_pdb2msa/protein_data/pdb_data/pdb2d10.ent
Done unzipping pdb file


In [6]:
## print(pdb_out_path)
print(pdb_dir)
# NOTE(review): create_new is hard-coded to True here and does not follow the
# `create_new` flag set in the configuration cell - confirm this is intended.
pdb2msa_results = pdb2msa(pdb_out_path, pdb_dir, create_new=True)

# The result is unpacked positionally: with more than one element the first is
# taken as the FASTA file and the second as the ProDy DataFrame; otherwise the
# only element is the DataFrame. fasta_file is bound in both cases so later
# cells never hit a NameError.
fasta_file = None
if len(pdb2msa_results) > 1:
    fasta_file, prody_df = pdb2msa_results[0], pdb2msa_results[1]
else:
    prody_df = pdb2msa_results[0]



def get_tpdb_new(s, ali_start_indx, ali_end_indx, pfam_start_indx, pfam_end_indx, aligned_pdb_str):
    """Find the MSA row that best matches the aligned PDB segment and trim the MSA to it.

    Every row of `s` is stripped of gaps, cut to the window
    [pfam_start_indx:pfam_end_indx], and globally aligned (pairwise2 globalxx)
    against `aligned_pdb_str`. Rows whose window length differs from
    `ali_end_indx - ali_start_indx` are skipped. The row with the highest
    pairwise score is used as the reference.

    Parameters
    ----------
    s : 2D array of single-character strings
        The MSA; the code relies on elementwise `== '-'` and `s[:, mask]`
        (presumably rows are sequences and columns are alignment positions).
    ali_start_indx, ali_end_indx : int
        Start/end of the aligned segment in the PDB sequence; only their
        difference is used here.
    pfam_start_indx, pfam_end_indx : int
        Slice bounds applied to each gap-stripped MSA row.
    aligned_pdb_str : str
        PDB residues to match against.

    Returns
    -------
    min_indx : int
        Row index of the best-matching MSA sequence.
    best_alignment
        First pairwise2 alignment for that row (seqA = PDB, seqB = MSA row).
    s_tbr_tbp : array
        `s` with the reference row's gap columns removed, then with the columns
        that face a gap in the PDB sequence removed.
    pdb_s_index : array of int
        For each position aligned without a gap on either side, the index of
        that residue within `aligned_pdb_str`.

    Raises
    ------
    ValueError
        If no row qualifies. Previously this surfaced as an UnboundLocalError
        after `min_indx = -1` had silently selected the last row.
    """
    # Requires DataFrame from data_processing.pdb2msa() function as input
    from ecc_tools import hamming_distance

    alignment_len = ali_end_indx - ali_start_indx

    min_ham = alignment_len
    max_pair_score = 0
    min_indx = -1
    best_alignment = None
    for i, seq in enumerate(s):
        gap_seq = seq == '-'  # returns True/False for gaps/no gaps
        seq_str = ''.join(seq[~gap_seq]).upper()
        aligned_seq_str = seq_str[pfam_start_indx:pfam_end_indx]
        if len(aligned_seq_str) != alignment_len:
            continue
        ham_dist = hamming_distance(aligned_pdb_str, aligned_seq_str)
        alignments = pairwise2.align.globalxx(aligned_pdb_str, aligned_seq_str)

        if len(alignments) == 0:
            continue
        pair_score = alignments[0].score
        # Rank by pairwise score; the Hamming distance is only reported.
        if pair_score > max_pair_score:
            min_ham = ham_dist
            max_pair_score = pair_score
            min_indx = i
            best_alignment = alignments[0]
            print(format_alignment(*best_alignment))
            print('match upgrade at' , i)
            print('%d: hamm dist=%d, pairwise score=%f\n' % (i, ham_dist, pair_score))

    if best_alignment is None:
        raise ValueError(
            'No MSA sequence with a window [%d:%d] of length %d could be aligned to the PDB sequence'
            % (pfam_start_indx, pfam_end_indx, alignment_len)
        )

    gap_seq = s[min_indx] == '-'
    print('best match is sequence %d with hamming distance of %d (length %d)' % (min_indx, min_ham, len(s[min_indx][~gap_seq])))

    # Character arrays of the two aligned sequences (alignment gaps included)
    aligned_pdb_char_array = np.array(list(best_alignment.seqA))
    aligned_ref_char_array = np.array(list(best_alignment.seqB))

    # get array of gaps for both sequences
    seqA_gaps = aligned_pdb_char_array == '-'
    seqB_gaps = aligned_ref_char_array == '-'
    aligned_gaps = np.logical_or(seqA_gaps, seqB_gaps)

    # For every alignment position: running index of the residue within its own
    # ungapped sequence, or -1 where that sequence has a gap.
    gap_ref_index = np.where(~seqB_gaps, np.cumsum(~seqB_gaps) - 1, -1).astype(int)
    gap_pdb_index = np.where(~seqA_gaps, np.cumsum(~seqA_gaps) - 1, -1).astype(int)

    # get columns to remove (gap in PDB) in MSA.
    # gap_ref_index counts from the start of the aligned window, i.e. from
    # position pfam_start_indx of the gap-stripped reference row, so the window
    # offset is added to address columns of the gap-stripped MSA.
    pdb_gap_cols_in_window = gap_ref_index[seqA_gaps]
    pdb_gap_cols_in_window = pdb_gap_cols_in_window[pdb_gap_cols_in_window >= 0]
    pdb_gap_cols_in_ref = pdb_gap_cols_in_window + pfam_start_indx
    print(len(pdb_gap_cols_in_ref), pdb_gap_cols_in_ref)

    # get s_index for mapping msa to pdb sequence.
    pdb_s_index = gap_pdb_index[~aligned_gaps]
    print(len(pdb_s_index), pdb_s_index)

    # Residues at positions where neither sequence has a gap
    aligned_pdb_nogap = aligned_pdb_char_array[~aligned_gaps]
    aligned_ref_nogap = aligned_ref_char_array[~aligned_gaps]
    print('\n aligned PDB and ref seq:')
    print(len(aligned_pdb_nogap),aligned_pdb_nogap)
    print(len(aligned_ref_nogap),aligned_ref_nogap)

    # Trim By gaps in Ref seq (tbr). Then Trim By gaps in Pdb seq (tbp)
    # NOTE(review): columns outside [pfam_start_indx:pfam_end_indx] are kept, so
    # when the window does not span the whole reference row the column count
    # will differ from len(pdb_s_index) - confirm whether the MSA should also be
    # cropped to the window.
    s_tbr = s[:, ~gap_seq]
    s_tbr_tbp = np.delete(s_tbr, pdb_gap_cols_in_ref, axis=1)
    # printed ref seq should be the same as the fully aligned, gapless pdb and ref seqs above.
    print(len(s_tbr_tbp[min_indx]), s_tbr_tbp[min_indx])

    return min_indx, best_alignment, s_tbr_tbp, pdb_s_index


print(prody_df.head())

# Work with the first row of the ProDy/Pfam search results.
pdb_df = prody_df.iloc[0]
from data_processing import load_msa
pfam_id = pdb_df['Pfam']
pdb_seq = pdb_df['PDB Sequence']
pdb_id = pdb_df['PDB ID']
# NOTE(review): these values are used as Python slice bounds, whose end is
# exclusive. If ali_end / hmm_end are 1-based inclusive coordinates, the "-1" on
# the end indices drops the last residue of the range - confirm.
ali_start_indx = int(pdb_df['ali_start'])-1
ali_end_indx = int(pdb_df['ali_end'])-1
pfam_start_indx = int(pdb_df['hmm_start'])-1
pfam_end_indx = int(pdb_df['hmm_end'])-1

# Segment of the PDB sequence covered by the Pfam hit.
aligned_pdb_str  = pdb_df['PDB Sequence'][ali_start_indx:ali_end_indx]


print('PDB ID: %s, Pfam ID: %s' % (pdb_id, pfam_id))

np.random.seed(123456789)
# Loading of cached preprocessing output is currently disabled (`if 0`).
#if not create_new and os.path.exists("%s/%s_pdb_df.csv" % (out_dir, pfam_id)):
if 0:
    print('Because create_new is False and files exist we will load preprocessed data:')
    # Uses the `removing_cols` flag from the configuration cell (this branch
    # previously referenced an undefined name, `remove_cols`).
    if removing_cols:
        s = np.load("%s/%s_%s_preproc_msa.npy" % (out_dir, pfam_id, pdb_id))
        s_index = np.load("%s/%s_%s_preproc_sindex.npy" % (out_dir, pfam_id, pdb_id))
        removed_cols = np.load("%s/%s_%s_removed_cols.npy" % (out_dir, pfam_id, pdb_id))
        ref_seq = np.load("%s/%s_%s_preproc_refseq.npy" % (out_dir, pfam_id, pdb_id))
    else:
        s = np.load("%s/%s_%s_allCols_msa.npy" % (out_dir, pfam_id, pdb_id))
        s_index = np.load("%s/%s_%s_allCols_sindex.npy" % (out_dir, pfam_id, pdb_id))
        removed_cols = np.load("%s/%s_%s_removed_cols.npy" % (out_dir, pfam_id, pdb_id))
        ref_seq = np.load("%s/%s_%s_allCols_refseq.npy" % (out_dir, pfam_id, pdb_id))

    # NOTE(review): `letter_format` and `convert_letter2number` are not defined in
    # the notebook cells visible here; define or import them before enabling this branch.
    if not letter_format and isinstance(s[0][0], str):
        s = convert_letter2number(s)



# Load MSA
s = load_msa(data_path, pfam_id)
orig_seq_len = s.shape[1]
print('Original Sequence length: ', orig_seq_len)


# Using given MSA find best matching PDB structure from all available MSA sequences.

if printing:
    print("#\n\n--------------------- Find PDB Sequence in MSA ---------------#")




# Find PDB seq in MSA current
# get_tpdb_new returns four values (row index, alignment, trimmed MSA, PDB index
# map); unpacking only three raised "too many values to unpack".
tpdb, alignment, s_trimmed, pdb_s_index = get_tpdb_new(s, ali_start_indx, ali_end_indx, pfam_start_indx, pfam_end_indx, aligned_pdb_str) # requires prody.searchPfam DF from pdb2msa as input
print(alignment)

print('s trimmed by reference sequences and aligned pdb sequence (both in alignment): ', s_trimmed.shape)


/data/cresswellclayec/DCA_ER/biowulf_pdb2msa/protein_data/pdb_data/
2d10
Record id 2D10:A, chain A
['UNP:P26043', 'UNP:RADI_MOUSE']
Record id 2D10:B, chain B
['UNP:P26043', 'UNP:RADI_MOUSE']
Record id 2D10:C, chain C
['UNP:P26043', 'UNP:RADI_MOUSE']
Record id 2D10:D, chain D
['UNP:P26043', 'UNP:RADI_MOUSE']
Record id 2D10:E, chain E
['UNP:O14745', 'UNP:NHERF_HUMAN']
Record id 2D10:F, chain F
['UNP:O14745', 'UNP:NHERF_HUMAN']
Record id 2D10:G, chain G
['UNP:O14745', 'UNP:NHERF_HUMAN']
Record id 2D10:H, chain H
['UNP:O14745', 'UNP:NHERF_HUMAN']

Chain A polypeptide 0 (length 295):  KPINVRVTTMDAELEFAIQPNTTGKQLFDQVVKTVGLREVWFFGLQYVDSKGYSTWLKLNKKVTQQDVKKENPLQFKFRAKFFPEDVSEELIQEITQRLFFLQVKEAILNDEIYCPPETAVLLASYAVQAKYGDYNKEIHKPGYLANDRLLPQRVLEQHKLTKEQWEERIQNWHEEHRGMLREDSMMEYLKIAQDLEMYGVNYFEIKNKKGTELWLGVDALGLNIYEHDDKLTPKIGFPWSEIRNISFNDKKFVIKPIDKKAPDFVFYAPRLRINKRILALCMGNHELYMRRRKP


@> Submitted Pfam search for sequence "KPINVRVTTMDAELEF...".


{'PF09380': {'accession': 'PF09380.13', 'class': 'Domain', 'id': 'FERM_C', 'locations': {'ali_end': '294', 'ali_start': '208', 'bitscore': '94.64', 'end': '295', 'cond_evalue': '5.9e-31', 'ind_evalue': '3.8e-27', 'evidence': 'hmmer v3.0', 'hmm_end': '91', 'hmm_start': '1', 'start': '208'}, 'type': 'Pfam-A'}, 'PF00373': {'accession': 'PF00373.21', 'class': 'Domain', 'id': 'FERM_M', 'locations': {'ali_end': '204', 'ali_start': '89', 'bitscore': '92.99', 'end': '204', 'cond_evalue': '2.6e-30', 'ind_evalue': '1.7e-26', 'evidence': 'hmmer v3.0', 'hmm_end': '116', 'hmm_start': '4', 'start': '86'}, 'type': 'Pfam-A'}, 'PF09379': {'accession': 'PF09379.13', 'class': 'Domain', 'id': 'FERM_N', 'locations': {'ali_end': '68', 'ali_start': '7', 'bitscore': '68.69', 'end': '69', 'cond_evalue': '4.9e-23', 'ind_evalue': '3.2e-19', 'evidence': 'hmmer v3.0', 'hmm_end': '63', 'hmm_start': '1', 'start': '7'}, 'type': 'Pfam-A'}}

Chain B polypeptide 0 (length 297):  KPINVRVTTMDAELEFAIQPNTTGKQLFDQVVKTVGLREVW

@> Submitted Pfam search for sequence "KPINVRVTTMDAELEF...".


{'PF09380': {'accession': 'PF09380.13', 'class': 'Domain', 'id': 'FERM_C', 'locations': {'ali_end': '296', 'ali_start': '208', 'bitscore': '96.47', 'end': '297', 'cond_evalue': '1.6e-31', 'ind_evalue': '1.0e-27', 'evidence': 'hmmer v3.0', 'hmm_end': '93', 'hmm_start': '1', 'start': '208'}, 'type': 'Pfam-A'}, 'PF00373': {'accession': 'PF00373.21', 'class': 'Domain', 'id': 'FERM_M', 'locations': {'ali_end': '204', 'ali_start': '89', 'bitscore': '92.97', 'end': '204', 'cond_evalue': '2.7e-30', 'ind_evalue': '1.7e-26', 'evidence': 'hmmer v3.0', 'hmm_end': '116', 'hmm_start': '4', 'start': '86'}, 'type': 'Pfam-A'}, 'PF09379': {'accession': 'PF09379.13', 'class': 'Domain', 'id': 'FERM_N', 'locations': {'ali_end': '68', 'ali_start': '7', 'bitscore': '68.67', 'end': '69', 'cond_evalue': '4.9e-23', 'ind_evalue': '3.2e-19', 'evidence': 'hmmer v3.0', 'hmm_end': '63', 'hmm_start': '1', 'start': '7'}, 'type': 'Pfam-A'}}

Chain C polypeptide 0 (length 297):  KPINVRVTTMDAELEFAIQPNTTGKQLFDQVVKTVGLREVW

@> Submitted Pfam search for sequence "KPINVRVTTMDAELEF...".


{'PF09380': {'accession': 'PF09380.13', 'class': 'Domain', 'id': 'FERM_C', 'locations': {'ali_end': '296', 'ali_start': '208', 'bitscore': '96.47', 'end': '297', 'cond_evalue': '1.6e-31', 'ind_evalue': '1.0e-27', 'evidence': 'hmmer v3.0', 'hmm_end': '93', 'hmm_start': '1', 'start': '208'}, 'type': 'Pfam-A'}, 'PF00373': {'accession': 'PF00373.21', 'class': 'Domain', 'id': 'FERM_M', 'locations': {'ali_end': '204', 'ali_start': '89', 'bitscore': '92.97', 'end': '204', 'cond_evalue': '2.7e-30', 'ind_evalue': '1.7e-26', 'evidence': 'hmmer v3.0', 'hmm_end': '116', 'hmm_start': '4', 'start': '86'}, 'type': 'Pfam-A'}, 'PF09379': {'accession': 'PF09379.13', 'class': 'Domain', 'id': 'FERM_N', 'locations': {'ali_end': '68', 'ali_start': '7', 'bitscore': '68.67', 'end': '69', 'cond_evalue': '4.9e-23', 'ind_evalue': '3.2e-19', 'evidence': 'hmmer v3.0', 'hmm_end': '63', 'hmm_start': '1', 'start': '7'}, 'type': 'Pfam-A'}}

Chain D polypeptide 0 (length 297):  KPINVRVTTMDAELEFAIQPNTTGKQLFDQVVKTVGLREVW

@> Submitted Pfam search for sequence "KPINVRVTTMDAELEF...".


{'PF09380': {'accession': 'PF09380.13', 'class': 'Domain', 'id': 'FERM_C', 'locations': {'ali_end': '296', 'ali_start': '208', 'bitscore': '96.47', 'end': '297', 'cond_evalue': '1.6e-31', 'ind_evalue': '1.0e-27', 'evidence': 'hmmer v3.0', 'hmm_end': '93', 'hmm_start': '1', 'start': '208'}, 'type': 'Pfam-A'}, 'PF00373': {'accession': 'PF00373.21', 'class': 'Domain', 'id': 'FERM_M', 'locations': {'ali_end': '204', 'ali_start': '89', 'bitscore': '92.97', 'end': '204', 'cond_evalue': '2.7e-30', 'ind_evalue': '1.7e-26', 'evidence': 'hmmer v3.0', 'hmm_end': '116', 'hmm_start': '4', 'start': '86'}, 'type': 'Pfam-A'}, 'PF09379': {'accession': 'PF09379.13', 'class': 'Domain', 'id': 'FERM_N', 'locations': {'ali_end': '68', 'ali_start': '7', 'bitscore': '68.67', 'end': '69', 'cond_evalue': '4.9e-23', 'ind_evalue': '3.2e-19', 'evidence': 'hmmer v3.0', 'hmm_end': '63', 'hmm_start': '1', 'start': '7'}, 'type': 'Pfam-A'}}

Chain E polypeptide 0 (length 20):  SSKRAPQMDWSKKNELFSNL


@> Submitted Pfam search for sequence "SSKRAPQMDWSKKNEL...".


{'PF09007': {'accession': 'PF09007.14', 'class': 'Domain', 'id': 'EBP50_C', 'locations': {'ali_end': '20', 'ali_start': '1', 'bitscore': '44.55', 'end': '20', 'cond_evalue': '1.5e-15', 'ind_evalue': '2.9e-11', 'evidence': 'hmmer v3.0', 'hmm_end': '128', 'hmm_start': '109', 'start': '1'}, 'type': 'Pfam-A'}}

Chain F polypeptide 0 (length 20):  SSKRAPQMDWSKKNELFSNL


@> Submitted Pfam search for sequence "SSKRAPQMDWSKKNEL...".


{'PF09007': {'accession': 'PF09007.14', 'class': 'Domain', 'id': 'EBP50_C', 'locations': {'ali_end': '20', 'ali_start': '1', 'bitscore': '44.55', 'end': '20', 'cond_evalue': '1.5e-15', 'ind_evalue': '2.9e-11', 'evidence': 'hmmer v3.0', 'hmm_end': '128', 'hmm_start': '109', 'start': '1'}, 'type': 'Pfam-A'}}

Chain G polypeptide 0 (length 20):  SSKRAPQMDWSKKNELFSNL


@> Submitted Pfam search for sequence "SSKRAPQMDWSKKNEL...".


{'PF09007': {'accession': 'PF09007.14', 'class': 'Domain', 'id': 'EBP50_C', 'locations': {'ali_end': '20', 'ali_start': '1', 'bitscore': '44.55', 'end': '20', 'cond_evalue': '1.5e-15', 'ind_evalue': '2.9e-11', 'evidence': 'hmmer v3.0', 'hmm_end': '128', 'hmm_start': '109', 'start': '1'}, 'type': 'Pfam-A'}}

Chain H polypeptide 0 (length 20):  SSKRAPQMDWSKKNELFSNL


@> Submitted Pfam search for sequence "SSKRAPQMDWSKKNEL...".


{'PF09007': {'accession': 'PF09007.14', 'class': 'Domain', 'id': 'EBP50_C', 'locations': {'ali_end': '20', 'ali_start': '1', 'bitscore': '44.55', 'end': '20', 'cond_evalue': '1.5e-15', 'ind_evalue': '2.9e-11', 'evidence': 'hmmer v3.0', 'hmm_end': '128', 'hmm_start': '109', 'start': '1'}, 'type': 'Pfam-A'}}
sorting ProdyDataframe
fetch PF09380.13 MSA
Error during fetPfamMSA:  'https://pfam.xfam.org/family/acc?id=PF09380.13' could not be opened for reading, invalid URL or no internet connection
  PDB ID Chain  Polypeptide Index     Pfam   accession   class      id  \
0   2d10     B                  0  PF09380  PF09380.13  Domain  FERM_C   
1   2d10     C                  0  PF09380  PF09380.13  Domain  FERM_C   
2   2d10     D                  0  PF09380  PF09380.13  Domain  FERM_C   
3   2d10     A                  0  PF09380  PF09380.13  Domain  FERM_C   
4   2d10     A                  0  PF00373  PF00373.21  Domain  FERM_M   

     type                                       PDB Seque

In [None]:
# For each side of the pairwise alignment, show the residues with alignment
# gaps ('-') stripped, followed by the full aligned string, each with its length.
seqA_nogap = list(filter(lambda residue: residue != '-', alignment.seqA))
print(len(seqA_nogap), seqA_nogap)
print(len(alignment.seqA), alignment.seqA)

seqB_nogap = list(filter(lambda residue: residue != '-', alignment.seqB))
print(len(seqB_nogap), seqB_nogap)
print(len(alignment.seqB), alignment.seqB)

In [None]:
# Length and residues of the PDB segment that was matched against the MSA
# (the slice of pdb_df['PDB Sequence'] taken between ali_start_indx and ali_end_indx).
print(len(aligned_pdb_str), aligned_pdb_str)

The MSA can be curated using the reductions above, with confidence that its columns are correctly aligned with the PDB sequence.