In [1]:
import os.path, sys

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing_msa2pdb
import ecc_tools as tools
from pathlib import Path
np.random.seed(1)

from Bio import SeqIO
from Bio.PDB import *
from scipy.spatial import distance_matrix
from Bio import pairwise2
#from Bio.SubsMat.MatrixInfo import blosum62
pdb_parser = Bio.PDB.PDBParser()

from prody import *



In [2]:
# # # load and convert pfam to numpy array.
# msa_file = '/home/ecresswell/Pfam-A.full/PF00118_full.txt'

# from ecc_tools import read_FASTA
# msa_fasta, msa = read_FASTA(msa_file)
# np.save('/home/ecresswell/Pfam-A.full/PF00118/msa.npy', msa)


In [3]:
create_new = True
printing = True
removing_cols = True


data_path = Path('/home/ecresswell/Pfam-A.full')
data_path = Path('/data/cresswellclayec/Pfam-A.full')

# Define data directories
DCA_ER_dir = '/home/ecresswell/DCA_ER' # Set DCA_ER directory
DCA_ER_dir = '/data/cresswellclayec/DCA_ER' # Set DCA_ER directory

biowulf_dir = '%s/biowulf_full' % DCA_ER_dir

out_dir = '%s/protein_data/di/' % biowulf_dir
processed_data_dir = "%s/protein_data/data_processing_output" % biowulf_dir
pdb_dir = '%s/protein_data/pdb_data/' % biowulf_dir

pdb_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"
# pdb_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"
pdb_path_1 = "/pdb/pdb/as/pdb1ass.ent.gz"
pdb_path_2 = "/pdb/pdb/a6/pdb1a6e.ent.gz"


pfam_dir = "/fdb/fastadb/pfam"

from data_processing import pdb2msa, data_processing_pdb2msa


import gzip, shutil
def gunzip(file_path, output_path):
    print('Unzipping %s to %s' % (file_path, output_path))
    with gzip.open(file_path,"rb") as f_in, open(output_path,"wb") as f_out:
        shutil.copyfileobj(f_in, f_out)




n_cpus = 6
print('\n\nUnzipping %s' % pdb_path)





Unzipping /pdb/pdb/zd/pdb1zdr.ent.gz


In [4]:

unzipped_pdb_filename = os.path.basename(pdb_path_1).replace(".gz", "")

pdb_out_path = "%s%s" % (pdb_dir, unzipped_pdb_filename)
print('Unzipping %s to %s' % (pdb_path_1, pdb_out_path))

gunzip(pdb_path_1, pdb_out_path)
print(pdb_out_path)
print(pdb_dir)
pdb2msa_results_1 = pdb2msa(pdb_out_path, pdb_dir, create_new=False)
print(pdb2msa_results_1)


if len(pdb2msa_results_1) > 1:
    fasta_file = pdb2msa_results_1[0]
    prody_df = pdb2msa_results_1[1]
else:
    prody_df_1 = pdb2msa_results_1[0]


print('\nPDB DF with associated Protein Families\n', prody_df_1.loc[:,  [column for column in prody_df_1.columns if column not in ['locations', 'PDB Sequence']]].head())
print("\n\nLooping through Prody Search DataFrame:", prody_df_1.head())
rows_to_drop = []
for ir, pdb2msa_row in enumerate(prody_df_1.iterrows()):
    print('\n\nGetting msa with following pdb2msa entry:\n', pdb2msa_row)
    #try:
    dp_result1 =  data_processing_pdb2msa(data_path, prody_df_1.iloc[pdb2msa_row[0]], gap_seqs=0.2, gap_cols=0.2, prob_low=0.004,
                               conserved_cols=0.8, printing=True, out_dir=processed_data_dir, pdb_dir=pdb_dir, letter_format=False,
                               remove_cols=True, create_new=True, n_cpu=min(2, n_cpus))
    if dp_result1 is not None:
        [s0, removed_cols, s_index, tpdb, pdb_s_index] = dp_result1
        break
    else:
        rows_to_drop.append(ir)
        continue
    #except Exception as e:
    #    print('row %d got exception: ' % ir , e)
    #    print('moving on.. ')
    #    pass


pdb_id_1 = pdb2msa_row[1]['PDB ID']
pfam_id_1 = pdb2msa_row[1]['Pfam']
# update Prody search DF (use same filename as pdb2msa() in data_processing
prody_df_1 = prody_df_1.drop(rows_to_drop)

s0_1 = np.load("%s/%s_%s_preproc_msa.npy" % (processed_data_dir, pfam_id_1, pdb_id_1))
s_index_1 = np.load("%s/%s_%s_preproc_sindex.npy" % (processed_data_dir, pfam_id_1, pdb_id_1))
removed_cols_1 = np.load("%s/%s_%s_removed_cols.npy" % (processed_data_dir, pfam_id_1, pdb_id_1))
ref_seq_1 = np.load("%s/%s_%s_preproc_refseq.npy" % (processed_data_dir, pfam_id_1, pdb_id_1))



if dp_result1 is None:
    print('None of the available prody pdb search found matching alignments... Exiting..')
    sys.exit()
print('Done Preprocessing Data.....')


Unzipping /pdb/pdb/as/pdb1ass.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1ass.ent
Unzipping /pdb/pdb/as/pdb1ass.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1ass.ent
/data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1ass.ent
/data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/
[   Unnamed: 0  Unnamed: 0.1 PDB ID Chain  Polypeptide Index     Pfam  \
0           0             0   1ass     A                  0  PF00118   

    accession   class          id    type  ... ali_end  ali_start  bitscore  \
0  PF00118.27  Domain  Cpn60_TCP1  Pfam-A  ...     152          2     178.4   

   end   cond_evalue    ind_evalue    evidence hmm_end  hmm_start  start  
0  152  1.200000e-56  2.300000e-52  hmmer v3.0     332        182      1  

[1 rows x 21 columns]]

PDB DF with associated Protein Families
    Unnamed: 0  Unnamed: 0.1 PDB ID Chain  Polypeptide Index     Pfam  \
0           0             0   1ass

S--GIVI-DKEKV-HSK--MPDVVK---NAKIALIDS---ALEIK--KTEIE--AKVQISDPSK-I----QD---FLNQ-ET--NTFKQMV--E---KIKKS-GANVVLCQKGIDDV-AQHY-LAK-EGIY-AVRRVKKSDMEKLAKATGAK--IVTDL---DDLTPSV--LGEAET--VEERKIGDDR-MTFVMGC---K-
   || | ||| | |    ||   |   ||||||      |||    ||  |  ||  |      |    |    ||   |   |    |   |   || || |||||||||||||  |||  ||  |||  ||||||||||||||||||    ||      |||||    ||||    |||||||||  |  |      | 
-IYGI-IVDKE-VVH--PGMP---KRVENAKIAL---LNLALE--VEKT--EFDAK--I-----NIETPEQ-MEAFL--KE-EEN----M-LREMVDKI-KSAGANVVLCQKGIDD-MAQH-FLA-NEGI-LAVRRVKKSDMEKLAKATG--GRIV---NNIDDLTP--EDLGEA--GLVEERKIGDD-KM--V---FIEKC
  Score=100

match upgrade at 27
27: pairwise score=100.000000

lengths:  151 151
Alignment(seqA='S--GIVI-DKEKV-HSK--MPDVVK---NAKIALIDS---ALEIK--KTEIE--AKVQISDPSK-I----QD---FLNQ-ET--NTFKQMV--E---KIKKS-GANVVLCQKGIDDV-AQHY-LAK-EGIY-AVRRVKKSDMEKLAKATGAK--IVTDL---DDLTPSV--LGEAET--VEERKIGDDR-MTFVMGC---K-', seqB='-IYGI-IVDKE-VVH--PGMP---KRVENAKIAL---LNLALE--VEKT--EFDAK--I-----NIETPEQ-MEAFL--KE-

found 6439 duplicates! (Removing...)
After removing bad sequences, tpdb is now  6242
1410
After removing bad sequences, tpdb is now  5824

After removing bad sequences...
tpdb (s_ipdb) is :  5824
(19800, 150)
found bad columns := [ 34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57 133 134 135]
found conserved columns (80% repetition):
 [  0 111 126 127]
We remove conserved and bad columns with, at the following indices (len 31):
 [  0 133 134 135 111  34  35  36  37  38  39  40  41  42  43  44  45  46
  47  48  49  50  51  52  53  54  55  56  57 126 127]
Removed Columns...
s now has shape:  (19800, 119)
s_index (length=119) = 
 [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 1

In [5]:
unzipped_pdb_filename = os.path.basename(pdb_path_2).replace(".gz", "")

pdb_out_path = "%s%s" % (pdb_dir, unzipped_pdb_filename)
print('Unzipping %s to %s' % (pdb_path_2, pdb_out_path))

gunzip(pdb_path_2, pdb_out_path)
print(pdb_out_path)
print(pdb_dir)
pdb2msa_results_2 = pdb2msa(pdb_out_path, pdb_dir, create_new=False)
print(pdb2msa_results_2)


if len(pdb2msa_results_2) > 1:
    fasta_file = pdb2msa_results_2[0]
    prody_df = pdb2msa_results_2[1]
else:
    prody_df_2 = pdb2msa_results_2[0]


print('\nPDB DF with associated Protein Families\n', prody_df_2.loc[:,  [column for column in prody_df_2.columns if column not in ['locations', 'PDB Sequence']]].head())
print("\n\nLooping through Prody Search DataFrame:", prody_df_2.head())
rows_to_drop = []
for ir, pdb2msa_row in enumerate(prody_df_2.iterrows()):
    print('\n\nGetting msa with following pdb2msa entry:\n', pdb2msa_row)
    #try:
    dp_result2 =  data_processing_pdb2msa(data_path, prody_df_1.iloc[pdb2msa_row[0]], gap_seqs=0.2, gap_cols=0.2, prob_low=0.004,
                               conserved_cols=0.8, printing=True, out_dir=processed_data_dir, pdb_dir=pdb_dir, letter_format=False,
                               remove_cols=True, create_new=True, n_cpu=min(2, n_cpus))
    if dp_result2 is not None:
        [s0, removed_cols, s_index, tpdb, pdb_s_index] = dp_result2
        break
    else:
        rows_to_drop.append(ir)
        continue
    #except Exception as e:
    #    print('row %d got exception: ' % ir , e)
    #    print('moving on.. ')
    #    pass


pdb_id_2 = pdb2msa_row[1]['PDB ID']
pfam_id_2 = pdb2msa_row[1]['Pfam']
# update Prody search DF (use same filename as pdb2msa() in data_processing
prody_df_2 = prody_df_2.drop(rows_to_drop)

s0_2 = np.load("%s/%s_%s_preproc_msa.npy" % (processed_data_dir, pfam_id_2, pdb_id_2))
s_index_2 = np.load("%s/%s_%s_preproc_sindex.npy" % (processed_data_dir, pfam_id_2, pdb_id_2))
removed_cols_2 = np.load("%s/%s_%s_removed_cols.npy" % (processed_data_dir, pfam_id_2, pdb_id_2))
ref_seq_2 = np.load("%s/%s_%s_preproc_refseq.npy" % (processed_data_dir, pfam_id_2, pdb_id_2))


if dp_result2 is None:
    print('None of the available prody pdb search found matching alignments... Exiting..')
    sys.exit()
print('Done Preprocessing Data.....')

Unzipping /pdb/pdb/a6/pdb1a6e.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1a6e.ent
Unzipping /pdb/pdb/a6/pdb1a6e.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1a6e.ent
/data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1a6e.ent
/data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/
[   Unnamed: 0  Unnamed: 0.1 PDB ID Chain  Polypeptide Index     Pfam  \
0           0             0   1a6e     A                  0  PF00118   
1           1             1   1a6e     B                  0  PF00118   

    accession   class          id    type  ... ali_end  ali_start  bitscore  \
0  PF00118.27  Domain  Cpn60_TCP1  Pfam-A  ...     503         18    615.03   
1  PF00118.27  Domain  Cpn60_TCP1  Pfam-A  ...     501         14    591.06   

   end    cond_evalue     ind_evalue    evidence hmm_end  hmm_start  start  
0  503  4.800000e-189  9.500000e-185  hmmer v3.0     487          1     18  
1  502  9.000000e

S--GIVI-DKEKV-HSK--MPDVVK---NAKIALIDS---ALEIK--KTEIE--AKVQISDPSK-I----QD---FLNQ-ET--NTFKQMV--E---KIKKS-GANVVLCQKGIDDV-AQHY-LAK-EGIY-AVRRVKKSDMEKLAKATGAK--IVTDL---DDLTPSV--LGEAET--VEERKIGDDR-MTFVMGC---K-
   || | ||| | |    ||   |   ||||||      |||    ||  |  ||  |      |    |    ||   |   |    |   |   || || |||||||||||||  |||  ||  |||  ||||||||||||||||||    ||      |||||    ||||    |||||||||  |  |      | 
-IYGI-IVDKE-VVH--PGMP---KRVENAKIAL---LNLALE--VEKT--EFDAK--I-----NIETPEQ-MEAFL--KE-EEN----M-LREMVDKI-KSAGANVVLCQKGIDD-MAQH-FLA-NEGI-LAVRRVKKSDMEKLAKATG--GRIV---NNIDDLTP--EDLGEA--GLVEERKIGDD-KM--V---FIEKC
  Score=100

match upgrade at 27
27: pairwise score=100.000000

lengths:  151 151
Alignment(seqA='S--GIVI-DKEKV-HSK--MPDVVK---NAKIALIDS---ALEIK--KTEIE--AKVQISDPSK-I----QD---FLNQ-ET--NTFKQMV--E---KIKKS-GANVVLCQKGIDDV-AQHY-LAK-EGIY-AVRRVKKSDMEKLAKATGAK--IVTDL---DDLTPSV--LGEAET--VEERKIGDDR-MTFVMGC---K-', seqB='-IYGI-IVDKE-VVH--PGMP---KRVENAKIAL---LNLALE--VEKT--EFDAK--I-----NIETPEQ-MEAFL--KE-

found 6439 duplicates! (Removing...)
After removing bad sequences, tpdb is now  6242
1410
After removing bad sequences, tpdb is now  5824

After removing bad sequences...
tpdb (s_ipdb) is :  5824
(19800, 150)
found bad columns := [ 34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57 133 134 135]
found conserved columns (80% repetition):
 [  0 111 126 127]
We remove conserved and bad columns with, at the following indices (len 31):
 [  0 133 134 135 111  34  35  36  37  38  39  40  41  42  43  44  45  46
  47  48  49  50  51  52  53  54  55  56  57 126 127]
Removed Columns...
s now has shape:  (19800, 119)
s_index (length=119) = 
 [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 1

In [6]:
# Load ER results for both pfam-pdb pairs 
# should all be the same
print('%s %s\n%s %s' % (pdb_id_1, pfam_id_1, pdb_id_2, pfam_id_2))
ER_di_1 = np.load("%s/%s_%s_ER_di.npy" % (out_dir, pdb_id_1, pfam_id_1))
ER_di_2 = np.load("%s/%s_%s_ER_di.npy" % (out_dir, pdb_id_2, pfam_id_2))
print(dp_result1[2], len(dp_result1[2]))
print(dp_result2[2], len(dp_result2[2]))
print(dp_result1[0][dp_result1[3]])
print(dp_result2[0][dp_result2[3]])

1ass PF00118
1a6e PF00118
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 112 113 114 115
 116 117 118 119 120 121 122 123 124 125 128 129 130 131 132 136 137 138
 139 140 141 142 143 144 145 146 147 148 149] 119
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 112 113 114 115
 116 117 118 119 120 121 122 123 124 125 128 129 130 131 132 136 137 138
 139 140 141 142 143 144 145 146 147 148 149] 11

### plot contact maps and auc curves to see which structure ER is capturing better

In [8]:
file_end = '.npy'
ER_fp_file = "%s/%s_%s_ER_fp%s" % (pdb_path, pdb_id_1, pfam_id_1, file_end)
ER_tp_file = "%s/%s_%s_ER_tp%s" % (pdb_path, pdb_id_1, pfam_id_1, file_end)
ER_fp_1 = np.load(ER_fp_file)
ER_tp_1 = np.load(ER_tp_file)
ER_fp_file = "%s/%s_%s_ER_fp%s" % (pdb_path, pdb_id_2, pfam_id_2, file_end)
ER_tp_file = "%s/%s_%s_ER_tp%s" % (pdb_path, pdb_id_2, pfam_id_2, file_end)
ER_fp_2 = np.load(ER_fp_file)
ER_tp_2 = np.load(ER_tp_file)


NotADirectoryError: [Errno 20] Not a directory: '/pdb/pdb/zd/pdb1zdr.ent.gz/1ass_PF00118_ER_fp.npy'