In [1]:
import os.path, sys

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing_new
import ecc_tools as tools
from pathlib import Path
np.random.seed(1)

In [2]:

# Pfam family analysed in this notebook.
# (Families tried earlier: PF00023, PF00011, PF14525, PF03496, PF03678.)
pfam_id = 'PF00186'


n_jobs = 6
# Format the id into the message; passing it as a second print() argument
# would emit the literal '%s' followed by the id.
print('Finding MF contacts for %s' % pfam_id)

# Flags forwarded to data_processing_new in a later cell.
create_new = True
printing = True
removing_cols = True


# Root of the per-family Pfam data: <data_path>/<pfam_id>/{msa.npy, msa.fa, pdb_refs.npy}
data_path = Path('/data/cresswellclayec/DCA_ER/Pfam-A.full')

# Define data directories
DCA_ER_dir = '/data/cresswellclayec/DCA_ER' # Set DCA_ER directory
biowulf_dir = '%s/biowulf' % DCA_ER_dir
msa_npy_file = '%s/%s/msa.npy' % (str(data_path), pfam_id)
msa_fa_file  = '%s/%s/msa.fa' %  (str(data_path), pfam_id)
pdb_ref_file = '%s/%s/pdb_refs.npy' %  (str(data_path), pfam_id)

# Output locations; out_dir and pdb_dir deliberately end with a path separator.
out_dir = '%s/protein_data/di/' % biowulf_dir
processed_data_dir = "%s/protein_data/data_processing_output" % biowulf_dir
pdb_dir = '%s/protein_data/pdb_data/' % biowulf_dir



Finding MF contacts for %s PF00186


# Uniprot/Pfam pdb_ref.npy data
* msa --> pdb connection extracted with parse_pfam.py (several flaws) 
* acquired from Pfam website.

In [3]:
# Load this family's PDB cross-reference table (pdb_refs.npy) and expose it as
# a DataFrame, so later cells can share one table instead of re-reading files.
individual_pdb_ref_file = Path(data_path, pfam_id, 'pdb_refs.npy')
pdb = np.load(individual_pdb_ref_file)


try:
    # Reading shape[1] raises IndexError when the loaded array is not 2-D;
    # the handler below reports that case as an empty reference.
    n_rows, n_cols = pdb.shape[0], pdb.shape[1]

    # Entries are bytes (python 2 --> python 3): decode each one to str and
    # rebuild the array with its original 2-D shape.
    pdb = np.array([
        pdb[row, col].decode('UTF-8')
        for row in range(n_rows)
        for col in range(n_cols)
    ]).reshape(n_rows, n_cols)

    # Report how many PDB structures are listed for this family.
    npdb = pdb.shape[0]
    print('number of pdb structures:',npdb)

    # One row per (MSA sequence, PDB chain) reference.
    pdb_df = pd.DataFrame(
        pdb,
        columns=[
            'PF', 'seq', 'id', 'uniprot_start', 'uniprot_end',
            'pdb_id', 'chain', 'pdb_start', 'pdb_end',
        ],
    )
    print(pdb_df.head())

    # Distinct PDB entries referenced by the table.
    pdb_reference_ids = np.unique(pdb_df['pdb_id'].to_numpy())
    print('PDB reference IDs:\n', pdb_reference_ids)

    # Show the 'seq' value of the first reference row; this indexing also
    # raises IndexError when the table has no rows.
    pdb_ref_ipdb = 0
    print('seq:',int(pdb[pdb_ref_ipdb,1]))

except IndexError:
    print('Loaded pdb: ', pdb)
    print('\n\nEMPTY PDB REFERENCE!!!\n\n')
    pdb_reference_ids = None



number of pdb structures: 372
        PF seq            id uniprot_start uniprot_end pdb_id chain pdb_start  \
0  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     B         1   
1  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     A         1   
2  PF00186  83  Q81R22_BACAN             2         160   3JWK     B         2   
3  PF00186  83  Q81R22_BACAN             2         160   3S9U     B         2   
4  PF00186  83  Q81R22_BACAN             2         160   3FL9     H         2   

  pdb_end  
0     160  
1     160  
2     160  
3     160  
4     160  
PDB reference IDs:
 ['1BOZ' '1CZ3' '1D1G' '1DDR' '1DDS' '1DF7' '1DG5' '1DG7' '1DG8' '1DHI'
 '1DHJ' '1DLR' '1DLS' '1DR1' '1DR2' '1DR3' '1DR4' '1DR5' '1DR6' '1DR7'
 '1DRA' '1DRB' '1DRE' '1DRF' '1DRH' '1DYH' '1DYI' '1DYJ' '1HFP' '1HFQ'
 '1HFR' '1JOL' '1JOM' '1JUV' '1KMS' '1KMV' '1MVS' '1MVT' '1OHJ' '1OHK'
 '1PD8' '1PD9' '1PDB' '1RA1' '1RA2' '1RA3' '1RA8' '1RA9' '1RB2' '1RB3'
 '1RC4' '1RD7' '1RE7' '1RF7' '1RG7' 

# ecc_tools.py PDB html Query
* MSA --> PDB: Queries for best PDB structure using every sequence in a given MSA.

In [5]:
# Curate the MSA for this family; per the section header above, this step also
# queries for a matching PDB structure (returned here as pdb_select).
(s0, curated_cols, s_index, tpdb, pdb_select) = data_processing_new(
    data_path,
    pfam_id,
    index_pdb=0,
    gap_seqs=0.2,
    gap_cols=0.2,
    prob_low=0.004,
    conserved_cols=0.9,
    printing=True,
    out_dir=processed_data_dir,
    pdb_dir=pdb_dir,
    letter_format=False,
    remove_cols=removing_cols,
    create_new=create_new,
)


# Build the contact map for the selected structure, passing along the MSA
# columns removed during curation.
(
    pdb_chain,
    ct,
    ct_full,
    n_amino_full,
    poly_seq_curated,
    poly_seq_range,
    poly_seq,
    pp_ca_coords_curated,
    pp_ca_coords_full_range,
) = tools.contact_map_new(
    pdb_id=pdb_select['PDB ID'][:4],
    pdb_range=[pdb_select['Subject Beg'], pdb_select['Subject End']],
    removed_cols=curated_cols,
    queried_seq=pdb_select['Subject Aligned Seq'],
    mismatches=pdb_select['Mismatches'],
    pdb_out_dir=pdb_dir,
)



Original Sequence length:  918
#

--------------------- Find Matching PDB Strucutre for MSA ----#
looking for pdb_references in /data/cresswellclayec/DCA_ER/biowulf/protein_data/pdb_data/ 
Raw-Query PDB dataframe gives 7526 matches... 

PDB dataframe with better than .90 Identity: 199
PDB dataframe with no gaps open: 185
Sorted PDB matches (153 matches): 
       MSA Index  PDB ID  Score  Identity        E-value  Bitscore  \
6896       6896  3INV_1    1.0     1.000  7.568000e-138       432   
4761       4761  3CL9_1    1.0     1.000  1.422000e-137       431   
973         973  3INV_1    1.0     0.956  1.717000e-130       411   
5557       5557  1DR7_1    1.0     1.000  2.420000e-120       380   
250         250  3NXT_1    1.0     1.000  3.318000e-120       379   

      Alignment Length  Mismatches  Gaps Opened  Query Beg  Query End  \
6896               210           0            0          1        210   
4761               210           0            0          1        210   
973    

[[ 7 15  4 ... 18 18  3]
 [ 9  0  9 ... 17 19  3]
 [20 20 17 ... 20 20 20]
 ...
 [17 15  9 ... 17 19  8]
 [17 15  0 ... 17 19  3]
 [ 4 15  7 ...  7 19  3]]
(5798, 138)
In Data Processing Final Reference Sequence (shape= (138,) ): 
 [ 9 11 15  7 17 17  1 13 11 10  5  8  2  5 11  9 12 18 12  9 14 11  4  8
 19 13 14 10 15 13 11  0 17  7  8 18  4 15  7 12 14 12  8  2  7 11  7 17
  9 15 14  3  9  8  3  0 12  8  5  0  6  9 15  8 15  9  2  2  0  9 12  3
  9  8 15  8  2 10 17 18  7 17 16  0 17 19  8  0  0 10 12  7  9  6 14  9
  4 17 14  7  9  6  3  4  3 15 16  4  4  3  7  2 19  8  2  4  8  9  9 16
  3 19 12  0  2  3  3  2  5  7 13 19  8  4  3 17 19 13]


#-----------------------#
Generating Contact Map
#----------------------------#

Structure exists: '/data/cresswellclayec/DCA_ER/biowulf/protein_data/pdb_data/pdb1dr7.ent' 
SLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIMGKKTWFSIPEKNRPLKDRINIVLSRELKEAPKGAHYLSKSLDDALALLDSPELKSKVDMVWIVGGTAVYKAAMEKPINHRLFVTRILHEFESDTFFPEIDYKDFKLLTEYPGVPADIQEEDG

## Using PDB-->MSA with Prody

In [14]:
# Imports for the PDB --> MSA section.
# NOTE(review): Bio.PDB, warnings, distance_matrix, BiopythonWarning and the
# pdb_parser instance below repeat what the first cell already sets up.
from Bio import SeqIO
import Bio.PDB, warnings
# NOTE(review): wildcard import; pdb2msa below uses PPBuilder, which is
# presumably supplied by this line -- confirm and import it by name.
from Bio.PDB import *
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum62
pdb_parser = Bio.PDB.PDBParser()

# NOTE(review): wildcard import; pdb2msa below uses searchPfam and
# fetchPfamMSA, which are presumably supplied by this line -- confirm and
# import them by name.
from prody import *

# Not referenced anywhere else in the visible notebook; presumably the
# location of a local Pfam FASTA database -- TODO confirm or remove.
pfam_dir = "/fdb/fastadb/pfam"



def _is_better_pfam_hit(new_locations, old_locations):
    """Return True when a Pfam hit beats the stored one on bitscore AND e-value.

    The search results carry these fields as strings (e.g. '215.08',
    '4.6e-64'), so they are converted to float before comparing; comparing the
    raw strings would order them lexicographically.
    """
    higher_bitscore = float(new_locations['bitscore']) > float(old_locations['bitscore'])
    lower_evalue = float(new_locations['ind_evalue']) < float(old_locations['ind_evalue'])
    return higher_bitscore and lower_evalue


def pdb2msa(pdb_file):
    """Match the chains of a PDB structure to Pfam families and fetch a Pfam MSA.

    For each chain in the first model of ``pdb_file`` a polypeptide sequence is
    submitted to ``searchPfam``. The best hit per Pfam family across all chains
    is kept in a table, and the FASTA MSA for the accession in the table's
    first row is then requested with ``fetchPfamMSA``. Progress and results
    are printed.

    Parameters
    ----------
    pdb_file : str
        Path to an uncompressed PDB file. Characters 3-6 of its base name are
        taken as the PDB ID (e.g. ``pdb1zdr.ent`` -> ``1zdr``).

    Returns
    -------
    None

    Raises
    ------
    OSError
        Propagated from ``fetchPfamMSA`` when the Pfam URL cannot be opened
        (invalid URL or no internet connection).
    """
    pdb_id = os.path.basename(pdb_file)[3:7]
    print(pdb_id)

    # Report the SEQRES records and their database cross-references.
    for record in SeqIO.parse(pdb_file, "pdb-seqres"):
        print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
        print(record.dbxrefs)

    # Only the first model of the structure is used.
    pdb_model = pdb_parser.get_structure(str(pdb_id), pdb_file)[0]
    prody_columns =  ['PDB ID' , 'Chain', 'accession', 'class', 'id', 'locations', 'type']
    # Indexed by Pfam family key; holds the best hit seen so far per family.
    prody_df = pd.DataFrame(columns = prody_columns)

    for chain in pdb_model.get_chains():
        # NOTE(review): as in the original code, the LAST polypeptide of the
        # chain is the one searched; confirm that this is intended for chains
        # that are split into several polypeptides.
        chain_seq = None
        for i, pp in enumerate(PPBuilder().build_peptides(chain)):
            chain_seq = str(pp.get_sequence())
            print('Chain %s polypeptide %d: ' % (chain.get_id(), i), chain_seq)

        if not chain_seq:
            # Nothing to search; never reuse the previous chain's sequence.
            print('Chain %s has no polypeptide sequence, skipping' % chain.get_id())
            continue

        try:
            prody_search = searchPfam(chain_seq)
            print(prody_search)
        except OSError as err:
            # OSError is the base class of urllib's and requests' HTTPError, so
            # this covers HTTP failures without depending on an HTTPError name
            # being in scope. Skip the chain instead of falling through with a
            # missing or stale search result.
            print('Pfam search failed for chain %s: %s' % (chain.get_id(), err))
            continue

        if not prody_search:
            # No Pfam match reported for this chain.
            continue

        for pfam_key, ps in prody_search.items():
            candidate_row = [pdb_id, chain.get_id(), ps['accession'], ps['class'],
                             ps['id'], ps['locations'], ps['type']]
            if pfam_key not in prody_df.index:
                prody_df.loc[pfam_key] = candidate_row
            elif _is_better_pfam_hit(ps['locations'], prody_df.loc[pfam_key, 'locations']):
                prody_df.loc[pfam_key] = candidate_row
            else:
                print('Pfam %s already found and the match is not an improvement' % pfam_key)
            print(prody_df.head())

    if prody_df.empty:
        # Without any hit there is no accession to fetch.
        print('No Pfam match found for any chain of %s' % pdb_id)
        return

    pfam_accession = prody_df.iloc[0]['accession']
    print('fetch %s MSA' % pfam_accession)
    # Requests the family alignment in FASTA format; this needs access to the
    # Pfam web service.
    fasta_file = fetchPfamMSA(pfam_accession, format='fasta')
    print(fasta_file)





In [15]:
import gzip
import shutil

# Gzipped PDB entry to decompress (structure 1ZDR).
pf00186_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"


def gunzip(file_path,output_path):
    """Decompress the gzip file at ``file_path`` into a new file at ``output_path``.

    The contents are streamed from the archive to the destination, and an
    existing ``output_path`` is overwritten. Returns None.
    """
    with gzip.open(file_path,"rb") as compressed:
        with open(output_path,"wb") as decompressed:
            shutil.copyfileobj(compressed, decompressed)
# Name of the decompressed file: the archive's base name without a trailing
# ".gz" (only the suffix is dropped, not every ".gz" occurrence in the name).
unzipped_pdb_path = os.path.basename(pf00186_path)
if unzipped_pdb_path.endswith(".gz"):
    unzipped_pdb_path = unzipped_pdb_path[:-len(".gz")]

# Join rather than concatenate, so the result is correct whether or not
# pdb_dir ends with a path separator.
pdb_path = os.path.join(pdb_dir, unzipped_pdb_path)
print(pdb_path)
gunzip(pf00186_path, pdb_path)

# Map the decompressed structure to Pfam and fetch the matching MSA.
pdb2msa(pdb_path)

/data/cresswellclayec/DCA_ER/biowulf/protein_data/pdb_data/pdb1zdr.ent
1zdr
Record id 1ZDR:A, chain A
['PDB:1ZDR', 'PDB:1ZDR']
Record id 1ZDR:B, chain B
['PDB:1ZDR', 'PDB:1ZDR']
Chain A polypeptide 0:  MISHIVAMDENRVIGKDNRLPWHLPADLAYFKRVTMGHAIVMGRKTFEAIGRPLPGRDNVVVTGNRSFRPEGCLVLHSLEEVKQWIASRADEVFIIGGAELFRATMPIVDRLYVTKIFASFPGDTFYPPISDDEWEIVSYTPGGKDEKNPYEHAFIIYERK


@> Submitted Pfam search for sequence "MISHIVAMDENRVIGK...".


{'PF00186': {'accession': 'PF00186.22', 'class': 'Domain', 'id': 'DHFR_1', 'locations': {'ali_end': '160', 'ali_start': '1', 'bitscore': '215.08', 'end': '160', 'cond_evalue': '2.3e-68', 'ind_evalue': '4.6e-64', 'evidence': 'hmmer v3.0', 'hmm_end': '160', 'hmm_start': '1', 'start': '1'}, 'type': 'Pfam-A'}}
        PDB ID Chain   accession   class      id  \
PF00186   1zdr     A  PF00186.22  Domain  DHFR_1   

                                                 locations    type  
PF00186  {'ali_end': '160', 'ali_start': '1', 'bitscore...  Pfam-A  
Chain B polypeptide 0:  MISHIVAMDENRVIGKDNRLPWHLPADLAYFKRVTMGHAIVMGRKTFEAIGRPLPGRDNVVVTGNRSFRPEGCLVLHSLEEVKQWIASRADEVFIIGGAELFRATMPIVDRLYVTKIFASFPGDTFYPPISDDEWEIVSYTPGGKDEKNPYEHAFIIYER


@> Submitted Pfam search for sequence "MISHIVAMDENRVIGK...".


{'PF00186': {'accession': 'PF00186.22', 'class': 'Domain', 'id': 'DHFR_1', 'locations': {'ali_end': '160', 'ali_start': '1', 'bitscore': '215.11', 'end': '160', 'cond_evalue': '2.3e-68', 'ind_evalue': '4.5e-64', 'evidence': 'hmmer v3.0', 'hmm_end': '160', 'hmm_start': '1', 'start': '1'}, 'type': 'Pfam-A'}}
        PDB ID Chain   accession   class      id  \
PF00186   1zdr     B  PF00186.22  Domain  DHFR_1   

                                                 locations    type  
PF00186  {'ali_end': '160', 'ali_start': '1', 'bitscore...  Pfam-A  
fetch PF00186.22 MSA


OSError: 'https://pfam.xfam.org/family/acc?id=PF00186.22' could not be opened for reading, invalid URL or no internet connection

In [None]:
# Repeat the PDB --> MSA lookup for a second structure of the family (1DR7).
pf00186_path = "/pdb/pdb/dr/pdb1dr7.ent.gz"

# gunzip() is reused from the previous cell, which defines it and imports
# gzip/shutil; redefining an identical copy here would only shadow it.

# Name of the decompressed file: the archive's base name without a trailing
# ".gz" (only the suffix is dropped, not every ".gz" occurrence in the name).
unzipped_pdb_path = os.path.basename(pf00186_path)
if unzipped_pdb_path.endswith(".gz"):
    unzipped_pdb_path = unzipped_pdb_path[:-len(".gz")]

# Join rather than concatenate, so the result is correct whether or not
# pdb_dir ends with a path separator.
pdb_path = os.path.join(pdb_dir, unzipped_pdb_path)
print(pdb_path)
gunzip(pf00186_path, pdb_path)
pdb2msa(pdb_path)