In [1]:
import os.path

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

# Biopython structure tooling and the stdlib warnings module (imported on one line).
import Bio.PDB, warnings
# Module-level PDBList and PDBParser instances, created once at import time
# (presumably shared by later cells to fetch and parse structures - TODO confirm).
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
# Suppress every BiopythonWarning for the remainder of the session.
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt
%matplotlib inline

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing
import ecc_tools as tools
from pathlib import Path
# Seed NumPy's legacy global random number generator with a fixed value.
np.random.seed(1)


def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return True when ``a`` is a square 2-D array equal to its transpose.

    Parameters
    ----------
    a : array_like
        Candidate matrix. Converted with ``np.asarray`` so nested lists work too.
    rtol, atol : float
        Relative / absolute tolerances forwarded to ``np.allclose``.

    Returns
    -------
    bool
        False for anything that is not a square 2-D array; otherwise the
        result of comparing ``a`` with ``a.T`` element-wise within tolerance.
    """
    a = np.asarray(a)
    # np.allclose broadcasts its operands, so without this guard a (1, n) array
    # with equal entries would compare "close" to its (n, 1) transpose, and other
    # non-square shapes would raise ValueError instead of answering the question.
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        return False
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [5]:
# --- Run configuration ---------------------------------------------------------
# Root of the Pfam alignment data, relative to the current working directory.
# (Location previously assigned here and then overridden: Path('/home', 'eclay', 'Pfam-A.full'))
data_path = Path('Pfam-A.full')

# Pfam family to analyse. Only one assignment is kept so the active family is
# unambiguous; families that were previously assigned here and then overridden:
#   'PF00023', 'PF00011', 'PF14525', 'PF03496', 'PF03678'
pfam_id = 'PF00186'

n_jobs = 6             # presumably the joblib worker count - TODO confirm where it is consumed
create_new = True      # forwarded to tools.find_best_pdb(..., create_new=create_new)
removing_cols = False  # not used in this cell

# Project root. Kept as a *string with a trailing slash* because the '%s...'
# templates below concatenate directly onto it.
# (Location previously assigned here and then overridden: Path('/home/evan/PycharmProjects/DCA_ER/'))
DCA_ER_dir = '/data/cresswellclayec/DCA_ER/'

# Per-family input files and the output directory, all derived from DCA_ER_dir
# so that changing the root in one place moves every path.
msa_npy_file = '%sPfam-A.full/%s/msa.npy' % (DCA_ER_dir, pfam_id)
msa_fa_file  = '%sPfam-A.full/%s/msa.fa' % (DCA_ER_dir, pfam_id)
pdb_ref_file = '%sPfam-A.full/%s/pdb_refs.npy' % (DCA_ER_dir, pfam_id)
out_dir = '%sprotein_data/di/' % DCA_ER_dir

# --- Load the PDB reference table for this family ------------------------------
# TODO: decide whether to keep one shared reference dataframe instead of loading
# an individual pdb_refs.npy file per family.
# NOTE: this file is resolved against data_path (relative), not against
# pdb_ref_file above (absolute under DCA_ER_dir).
individual_pdb_ref_file = Path(data_path, pfam_id, 'pdb_refs.npy')
pdb = np.load(individual_pdb_ref_file)
# os.path.join avoids the doubled '/' that '%s/...' produced with a root that
# already ends in a slash; the result is still a plain string.
processed_data_dir = os.path.join(DCA_ER_dir, 'protein_data', 'data_processing_output')

# The stored entries are byte strings (python 2 --> python 3); decode every
# element to str in one vectorized call, keeping the 2-D shape.
pdb = np.char.decode(pdb, 'UTF-8')


# Print number of pdb structures in Protein ID folder
npdb = pdb.shape[0]
print('number of pdb structures:',npdb)

# Create pandas dataframe for protein structure (one row per reference entry)
pdb_df = pd.DataFrame(pdb,columns = ['PF','seq','id','uniprot_start','uniprot_end',\
                                 'pdb_id','chain','pdb_start','pdb_end'])
print(pdb_df.head())


number of pdb structures: 372
        PF seq            id uniprot_start uniprot_end pdb_id chain pdb_start  \
0  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     B         1   
1  PF00186  69  Q5KZ26_GEOKA             1         160   1ZDR     A         1   
2  PF00186  83  Q81R22_BACAN             2         160   3JWK     B         2   
3  PF00186  83  Q81R22_BACAN             2         160   3S9U     B         2   
4  PF00186  83  Q81R22_BACAN             2         160   3FL9     H         2   

  pdb_end  
0     160  
1     160  
2     160  
3     160  
4     160  


In [6]:
# Row of the PDB reference table to report on, plus a verbosity flag.
ipdb, printing = 0, True
# Column 1 of the reference table is labelled 'seq' in pdb_df; shown as an int.
print('seq:', int(pdb[ipdb][1]))

# Wall-clock the search for the best-matching PDB structure of this family.
start_time = timeit.default_timer()
pdb_matches = tools.find_best_pdb(pfam_id, data_path, create_new=create_new)
run_time = timeit.default_timer() - start_time

print('finding best match takes %f seconds ' % (run_time,))

# # Load pydca trimmed data
# trimmed_data_outfile = '%sprotein_data/data_processing_output/MSA_%s_Trimmed.fa' % (DCA_ER_dir, pfam_id)
# trimmed_msa_file = Path(DCA_ER_dir, 'protein_data/data_processing_output/MSA_%s_Trimmed.fa' % pfam_id)
# s0_pydca = tools.read_FASTA(str(trimmed_msa_file), ref_index = int(pdb[ipdb,1]))

# Load csv of PDB-PFAM mapping.
#    downloaded from 
#    ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_pfam_mapping.csv.gz
#       

seq: 69
Finding best PDB match for (Searching 7750 sequences) PF00186
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
bad query
Raw-Query PDB dataframe gives 7714 matches... 

Sorted PDB matches (1122 matches): 
       MSA Index  PDB ID  Score  Identity        E-value  Bitscore  \
0          6896  3INV_1    1.0     1.000  7.482000e-138       432   
1          4761  3CL9_1    1.0     1.000  1.406000e-137       431   
2           973  3INV_1    1.0     0.956  1.697000e-130       411   
3          5557  1DR7_1    1.0     1.000  2.393000e-120       380   
4           250  3NXT_1    1.0     1.000  3.280000e-120       379   
...         ...     ...    ...       ...            ...       ...   
11

In [4]:
## pfam_id = 'PF03678'
# NOTE: this binds a second name to the same object as pdb_matches; it is not a copy.
pdb_match_temp = pdb_matches

# Reload the PDB reference table for the current pfam_id. This rebinds the
# globals pdb, npdb, pdb_df and processed_data_dir.
individual_pdb_ref_file = Path(data_path, pfam_id, 'pdb_refs.npy')
pdb = np.load(individual_pdb_ref_file)
# os.path.join yields a clean path whether or not DCA_ER_dir ends with '/'
# (the previous "%s/..." template produced a doubled slash when it does).
processed_data_dir = os.path.join(DCA_ER_dir, 'protein_data', 'data_processing_output')

# The stored entries are byte strings (python 2 --> python 3); decode every
# element to str in one vectorized call, keeping the 2-D shape.
pdb = np.char.decode(pdb, 'UTF-8')


# Print number of pdb structures in Protein ID folder
npdb = pdb.shape[0]
print('number of pdb structures:',npdb)

# Create pandas dataframe for protein structure (one row per reference entry)
pdb_df = pd.DataFrame(pdb,columns = ['PF','seq','id','uniprot_start','uniprot_end',\
                                 'pdb_id','chain','pdb_start','pdb_end'])
print(pdb_df.head())

number of pdb structures: 24
        PF seq            id uniprot_start uniprot_end pdb_id chain pdb_start  \
0  PF03678  12   CAPSH_ADE02           652         874   1P2Z     A       651   
1  PF03678  25  Q8UY79_9ADEN           617         839   2OBE     A       616   
2  PF03678  25  Q8UY79_9ADEN           617         839   2OBE     C       616   
3  PF03678  25  Q8UY79_9ADEN           617         839   2OBE     B       616   
4  PF03678  50   CAPSH_ADEG1           606         846   2INY     A       605   

  pdb_end  
0     873  
1     838  
2     838  
3     838  
4     845  
