In [1]:
import matplotlib.pyplot as plt
from matplotlib import patches
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
import sys, os, Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
data_dir = '../protein_pfam'
sys.path.append(data_dir)
from parse_pfam import parse_pfam
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)
%matplotlib inline

In [2]:
pfam, pdb_refs = parse_pfam(data_dir)
print 'total MSAs: %i, total PDB refs: %i' % (pfam.shape[0], pdb_refs.shape[0])

total MSAs: 17772, total PDB refs: 249704


In [3]:
# Cas 9 family accesion code
ac = 'PF00186'
# store the family of interest in the 'fam' variable
fam = pfam.loc[ac]
#print 'size rank: %i of %i' % (pfam['size'].rank(ascending=False)[fam.name].astype(int), pfam.shape[0])

In [4]:
fam

res          918
seq         7750
pdb_refs     382
Name: PF00186, dtype: int64

In [5]:
# local directory containing data for this MSA
fam_dir = os.path.join(data_dir, 'Pfam-A.full', fam.name)

# the residue symbols array that is the MSA
msa = np.load(os.path.join(fam_dir, 'msa.npy'))

In [6]:
msa.shape

(918, 7750)

In [7]:
pfam[pfam.index == ac]

Unnamed: 0,res,seq,pdb_refs
PF00186,918,7750,382


# find pdb reference:

In [8]:
refs = pdb_refs[pdb_refs.index.str.contains(fam.name)]
refs.head()

Unnamed: 0,seq,uniprot_id,uniprot_start,uniprot_end,pdb_id,chain,pdb_start,pdb_end
PF00186,68,Q5KZ26_GEOKA,1,160,1ZDR,B,1,160
PF00186,68,Q5KZ26_GEOKA,1,160,1ZDR,A,1,160
PF00186,82,Q81R22_BACAN,2,160,3JWK,B,2,160
PF00186,82,Q81R22_BACAN,2,160,3S9U,B,2,160
PF00186,82,Q81R22_BACAN,2,160,3FL9,H,2,160


In [9]:
n_refs = refs.shape[0]
print(n_refs)

372


In [10]:
for i in range(n_refs):
    ref = refs.iloc[i]
    #print(ref)

    # pdb sequence
    #seq = msa[:,ref.seq]
    seq = msa[:,ref.seq+1]  # change j-1 --> j
    #print(seq)
    
    gap_pos = seq == '-'
    seq_non_gap = seq[~gap_pos] 

    #print(seq_non_gap.shape)
    #print(seq_non_gap)
    
    pdb_file = pdb_list.retrieve_pdb_file(ref.pdb_id, pdir=fam_dir, file_format='pdb')
    chain = pdb_parser.get_structure(ref.pdb_id, pdb_file)[0][ref.chain]
    coords = np.array([a.get_coord() for a in chain.get_atoms()])
    #print(coords.shape)
    #print(coords)
    coords_cut = coords[ref.pdb_start-1:ref.pdb_end]
    #print(coords_cut.shape)
    
    print(seq_non_gap.shape[0]-coords_cut.shape[0])    

Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb1zdr.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb1zdr.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3jwk.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3s9u.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3fl9.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3fl8.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4elf.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4elf.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4elh.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4elf.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4elg.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3fl8.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3s9u.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3jw3.ent' 
0
Struct

0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4kbn.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nxx.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nxr.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb5hpb.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nu0.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb1dlr.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3gi2.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb1u71.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nxt.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nzd.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb4m6j.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3f91.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nxo.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3fs6.ent' 
0
Stru

0
Downloading PDB structure '1DRE'...
0
Downloading PDB structure '3VCO'...
0
Downloading PDB structure '3NRR'...
0
Downloading PDB structure '3I3R'...
0
Downloading PDB structure '3KJR'...
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3nrr.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3kjr.ent' 
0
Downloading PDB structure '3K2H'...
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3k2h.ent' 
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3i3r.ent' 
0
Downloading PDB structure '2ITH'...
0
Downloading PDB structure '2JYB'...
0
Downloading PDB structure '1VDR'...
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb1vdr.ent' 
0
Downloading PDB structure '4H98'...
0
Downloading PDB structure '3QLX'...
0
Downloading PDB structure '3CSE'...
0
Structure exists: '../protein_pfam/Pfam-A.full/PF00186/pdb3qlx.ent' 
0
Downloading PDB structure '3ROA'...
0
Downloading PDB structure '3EEJ'...
0
Downloading PDB structure '3RO9'...
0
Do