In [1]:
import pickle
import numpy as np
#import pandas as pd
import Bio
from Bio import AlignIO
import os, urllib, gzip, re

# Local working directory: downloads and every generated .npy file live here.
data_dir = 'test'

In [2]:
# Base URL of the Pfam release directory; file names from `files` are
# appended to it when downloading.
pfam_current_release = (
    'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release'
)

In [3]:
# Release files to fetch from `pfam_current_release`.
#   Pfam-A.full.gz : the full alignments of the curated families (~6GB)
#   pdbmap.gz      : mapping between PDB structures and Pfam domains (~2MB);
#                    currently disabled, add it to the list to fetch it too
files = ['Pfam-A.full.gz']

In [4]:
# download 'files' from 'pfam_current_release'
def downloadfiles():
    """Download every entry of `files` from `pfam_current_release` into `data_dir`.

    Reads the module-level names `files`, `pfam_current_release` and
    `data_dir`. Each file is stored under its own name inside `data_dir`,
    which is created first if it does not exist yet.

    Returns:
        None. Errors raised by the transfer propagate to the caller.
    """
    # `urlretrieve` moved to `urllib.request` in Python 3; fall back to the
    # Python 2 location so the notebook runs under either interpreter.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # urlretrieve does not create missing directories for the target path.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    for f in files:
        local = os.path.join(data_dir, f)
        # URLs always use '/', regardless of the local OS path separator,
        # so the remote location is not built with os.path.join.
        remote = pfam_current_release.rstrip('/') + '/' + f
        urlretrieve(remote, local)

In [5]:
# Build the table of Pfam families found in the alignment file.
# Each row is [accession, 0, 0, 0]; the three zero columns are placeholders
# for per-family counts that are filled in later.
pfam_file = os.path.join(data_dir, 'pfam.npy')

# 2018.12.03: test mode -- stop scanning once the line with this 0-based
# index has been handled. Set to None to scan the whole file.
max_lines = 50000

pfam = []
# Read raw bytes and compare against a bytes prefix: under Python 3 gzip
# yields bytes even for mode 'r', so comparing a line with a str literal
# would never match and the table would silently stay empty.
with gzip.open(os.path.join(data_dir, 'Pfam-A.full.gz'), 'rb') as f:
    for i, line in enumerate(f):
        if line.startswith(b'#=GF AC'):
            # "#=GF AC   PF10417.9" -> "PF10417": third whitespace-separated
            # field with the ".version" suffix dropped. Splitting on any
            # whitespace avoids depending on the exact column padding.
            ac = line.split()[2].split(b'.')[0]
            if not isinstance(ac, str):
                # Python 3: turn the bytes accession into a native string
                ac = ac.decode('ascii')
            pfam.append([ac, 0, 0, 0])

        if max_lines is not None and i == max_lines:
            break

# Mixing strings and ints makes numpy store every cell as a string.
pfam = np.array(pfam)
np.save(pfam_file, pfam)

In [6]:
# Quick sanity check of the family table: its dimensions first, then the rows.
for summary in (pfam.shape, pfam):
    print(summary)

(6, 4)
[['PF10417' '0' '0' '0']
 ['PF12574' '0' '0' '0']
 ['PF09847' '0' '0' '0']
 ['PF00244' '0' '0' '0']
 ['PF16998' '0' '0' '0']
 ['PF00389' '0' '0' '0']]


In [7]:
# Open the gzipped Stockholm file and hand it to Biopython. AlignIO.parse is
# lazy: alignments are only read when `alignments` is iterated, so the gzip
# handle is intentionally left open here.
alignments = AlignIO.parse(
    gzip.open(os.path.join(data_dir, 'Pfam-A.full.gz'), 'r'),
    'stockholm'
)

In [8]:
# For each pfam family / multiple sequence alignment (msa):
#   * write the residue matrix to <data_dir>/Pfam-A.full/<accession>/msa.npy
#   * write the PDB cross-references to .../pdb_refs.npy
#   * record (n_residue, n_sequence, n_pdb_ref) in columns 1-3 of `pfam`
# Row i of `pfam` is assumed to describe the i-th alignment yielded by
# `alignments` (both come from the same file, in file order).

# Test-mode cap: only the first `max_families` alignments are processed.
max_families = 1

for i, a in enumerate(alignments):
    if i >= max_families: break

    # local directory associated with pfam
    pfam_dir = os.path.join(data_dir, 'Pfam-A.full', pfam[i, 0])

    # Create the directory only when it is missing, so that a genuine failure
    # (e.g. permissions) is reported here instead of being silently swallowed.
    if not os.path.isdir(pfam_dir):
        os.makedirs(pfam_dir)

    # number of residues/sequences in alignment
    n_residue = a.get_alignment_length()
    n_sequence = len(a)

    #------------------------------------------------------------------
    # msa: residue symbols, one column per sequence (single-character cells)
    # pdb_refs: references to pdb
    msa = np.empty((n_residue, n_sequence), dtype=str)
    pdb_refs = []

    # for each sequence in alignment
    for j, seq in enumerate(a):
        # store residue symbols exactly as given (no case conversion);
        # going through str() avoids relying on how numpy interprets a
        # Biopython Seq object
        msa[:, j] = list(str(seq.seq))  # 2018.12.03: Tai

        # sequence id has the form "<uniprot name>/<start>-<end>"; split on
        # the last '/' first so a '-' inside the name cannot break unpacking
        uniprot_id, uniprot_range = seq.id.rsplit('/', 1)
        uniprot_start, uniprot_end = uniprot_range.split('-')

        # extract pdb refs if they are present
        refs = seq.dbxrefs
        if not refs:
            continue
        for ref in refs:
            # e.g. "PDB; 3VWU H; 237-255;" -> ['PDB', '3VWUH', '237-255', '']
            ref = ref.replace(' ', '').split(';')
            if ref[0] != 'PDB':
                continue

            # last character is the chain, everything before it the pdb id
            pdb_id, chain = ref[1][:-1], ref[1][-1]

            # keep only plain "<start>-<end>" ranges; anything else
            # (e.g. a negative residue number) is skipped
            pdb_start_end = ref[2].split('-')
            if len(pdb_start_end) != 2:
                continue
            pdb_start, pdb_end = pdb_start_end

            # NOTE(review): the sequence index is stored as j - 1 although
            # the msa column written above is j, so the first sequence gets
            # -1. This looks like an off-by-one; kept unchanged because
            # consumers of pdb_refs.npy may depend on it -- please confirm.
            pdb_refs.append([pfam[i, 0], j - 1, uniprot_id, uniprot_start,
                uniprot_end, pdb_id, chain, pdb_start, pdb_end])

    np.save(os.path.join(pfam_dir, 'msa.npy'), msa)
    np.save(os.path.join(pfam_dir, 'pdb_refs.npy'), pdb_refs)

    n_pdb_ref = len(pdb_refs)
    pfam[i, 1:] = n_residue, n_sequence, n_pdb_ref

# persist the family table with the updated counts
np.save(pfam_file, pfam)

In [9]:
# Show the counts left in the namespace by the alignment-processing loop:
# alignment length and number of sequences, then the number of PDB references.
# These are whatever the last processed family assigned; the cell fails on a
# fresh kernel unless that loop has been run first.
print(n_residue,n_sequence)
print(n_pdb_ref)
#print(a)

(113, 9746)
555


In [10]:
# Show the family table again: rows handled by the alignment loop now carry
# their counts in columns 1-3, the remaining rows still hold the '0' placeholders.
print(pfam)

[['PF10417' '113' '9746' '555']
 ['PF12574' '0' '0' '0']
 ['PF09847' '0' '0' '0']
 ['PF00244' '0' '0' '0']
 ['PF16998' '0' '0' '0']
 ['PF00389' '0' '0' '0']]


In [12]:
# Debug inspection: `ref` is the loop variable left behind by the
# alignment-processing loop, i.e. the last cross-reference after it was split
# into fields. It only exists once that loop has run.
ref

['PDB', '3VWUH', '237-255', '']

In [21]:
# Debug inspection of the leftover loop variable `a`. The processing loop
# fetches the next alignment before it breaks, so `a` can be the alignment
# after the last processed one rather than the one whose counts were stored.
print(a)

SingleLetterAlphabet() alignment with 16 rows and 332 columns
--------------------------------------------...--- A0A0F3R0S1_ORITS/31-131
-------------------------------------------m...--- A0A0F3QXX2_ORITS/1-133
--------------------------------------------...--- A0A0F3MK43_9RICK/404-636
sssvnqyehnpppvpkraeskqeatglksffkgmfskapeasta...vel Q1RJ37_RICBR/189-408
--------------------------------------------...--- A0A0F3MH25_ORITS/577-813
----efrdylnskqgrgqaelalqnsgvqaeiqridlegyrsvh...--- A0A2A5BBI9_9RICK/282-487
--------------------------------------------...--- H8K5G2_RICAG/113-350
--------------------------------------------...--- A0A0F3RQ37_ORITS/508-742
------------------------------------------el...--- SCA4_RICPR/102-337
---------rqeiiakqqttlknilaeanitaisvtnlainsqq...--- A0A261DCJ7_9RICK/496-744
-----------lttstteevvtdagdktkviikpntvntedhki...--- Q1RGU6_RICBR/7-173
-------------------------------------------m...--- A0A0F3PKL7_9RICK/1-218
--lnndpayseeakdqekfrqflanlnagerqglydkalsdeqf...--- 