# Download [RNA base triple database](http://rna.bgsu.edu/triples/triples.php)

In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import json
import wget
from itertools import product
from zipfile import ZipFile
from pdbfixer import PDBFixer
import warnings
from openbabel import openbabel
#import time

In [2]:
#openbabel.obErrorLog.SetOutputLevel(0)
#openbabel.obErrorLog.StopLogging()

In [3]:
#warnings.filterwarnings("ignore")
#sys.stderr = sys.__stderr__

In [4]:
# Base URL of the zip archives and the database release to fetch from it.
url = "http://rna.bgsu.edu/triples/zip"
release_version = "v1.4"

In [5]:
# Project root: the parent of the "notebooks" directory when the notebook is
# run from there, otherwise the current working directory itself.
# NOTE: "__file__" is a string literal, so abspath() resolves it against the
# current working directory and dirname() yields that directory.
base_path = os.path.dirname(os.path.abspath("__file__"))
if os.path.basename(base_path) == "notebooks":
    # str.strip('notebooks') would remove any of the characters n/o/t/e/b/k/s
    # from both ends of the path instead of the directory name, so step up to
    # the parent directory explicitly.
    base_path = os.path.dirname(base_path)
#output_path = os.path.join(base_path, "pdb", "triplebase")
output_path = os.path.join(base_path, "data", "triplebase")

In [6]:
# Start from a clean output directory: drop any previous download, then
# recreate the (possibly nested) directory.
if os.path.isdir(output_path):
    print(">remove directory: {}".format(output_path))
    shutil.rmtree(output_path)

os.makedirs(output_path, exist_ok=True)

>remove directory: /Users/takabak/work/rna_bgsu/data/triplebase


In [7]:
# Every ordered triple over the four-letter alphabet (4**3 = 64 tuples).
d = "GCAU"
arr = [*product(d, repeat=3)]

In [8]:
# Download and unpack one zip archive per three-letter sequence.
for a in arr:
    seq = ''.join(a)
    _output_path = os.path.join(output_path, seq)
    # URLs always use "/" as separator; os.path.join would insert "\" on
    # Windows and produce an invalid URL.
    _url = "/".join([url.rstrip("/"), release_version, seq + ".zip"])

    # `out` is the output directory; the unpack step below expects the
    # archive to end up at <output_path>/<seq>.zip.
    wget.download(_url, out=output_path, bar=None)
    shutil.unpack_archive('{}.zip'.format(_output_path), _output_path)

In [9]:
# Remove the downloaded archives now that they have been unpacked.
for a in arr:
    seq = ''.join(a)
    os.remove(os.path.join(output_path, "{}.zip".format(seq)))

### Check PDB files

In [10]:
def renumber_resid(f):
    """Rewrite the PDB file *f* in place with renumbered residue ids.

    Every ``ATOM`` record whose atom name is ``P`` starts a new residue: its
    residue id becomes ``rid + count``, where ``count`` is the number of ``P``
    atoms seen before it. The ``ATOM`` records that follow reuse that new id
    until the next ``P`` atom. ``ATOM`` records that appear before the first
    ``P`` atom keep their own residue id. Numeric chain ids are replaced with
    ``"X"``. All other lines are copied unchanged.

    The original file is kept as ``f + ".duplicateAtoms"``. After writing,
    the new file is loaded with PDBFixer and a message is printed if that
    load emits any warning.

    Parameters
    ----------
    f : str
        Path of the PDB file to rewrite.
    """
    new_lines = []
    count = 0
    # No renumbered id exists until the first P atom has been seen.
    new_rid = None
    with open(f, "r") as rf:
        for l in rf:
            if not l.startswith('ATOM'):
                new_lines.append(l)
                continue

            fields = l.split()
            aname = fields[2]

            # handle four digit resid fused with the chain id (e.g. C1983)
            x = fields[4]
            if len(x) == 5:
                chain, rid = x[:1], x[1:]
            else:
                chain, rid = fields[4], fields[5]

            # convert numeric chain to alphabet
            new_chain = "X" if chain.isnumeric() else chain

            # a P atom opens a new residue: shift its id by the running count
            if aname == 'P':
                new_rid = str(int(rid) + count)
                count += 1

            target_rid = rid if new_rid is None else new_rid
            new_lines.append(replace_string(l, chain, new_chain, rid, target_rid))

    # export pdb, keeping the original next to it
    shutil.move(f, f + ".duplicateAtoms")
    with open(f, "w") as wf:
        wf.writelines(new_lines)

    # check if new pdb passes PDBFixer
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("default")
        PDBFixer(filename=f)
        if len(w) != 0:
            print("{}: Error after renumbering pdb residues".format(os.path.basename(f)))

In [11]:
def replace_string(l, chain, new_chain, rid, new_rid):
    """Return line *l* with its chain/residue-id field replaced.

    The field searched for is *chain* immediately followed by *rid*
    right-justified to four characters; it is replaced by *new_chain*
    followed by *new_rid* right-justified to four characters. Padding both
    sides to the same width keeps the line length unchanged when *new_rid*
    has more digits than *rid* (e.g. 9 -> 10), so the fixed-width columns
    after the residue id are not shifted.

    Parameters
    ----------
    l : str
        One line of a PDB file.
    chain, new_chain : str
        Current and replacement chain identifiers.
    rid, new_rid : str or int
        Current and replacement residue ids.

    Returns
    -------
    str
        The line with every occurrence of the old field replaced; unchanged
        if the old field does not occur.
    """
    old_field = "{}{}".format(chain, str(rid).rjust(4))
    new_field = "{}{}".format(new_chain, str(new_rid).rjust(4))
    return l.replace(old_field, new_field)

In [12]:
# Collect every .pdb file located exactly one directory below output_path.
files = glob.glob(output_path + "/*/*.pdb")

In [13]:
# Load every PDB with PDBFixer; renumber residues in files whose load emits
# warnings, and set aside files that cannot be processed at all.
for f in files:
    basename = os.path.basename(f)

    try:
        # check converted pdb with PDBFixer, recording any warnings it emits
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("default")
            fixer = PDBFixer(filename=f)

        # a recorded warning is treated as a sign of duplicate residues
        if len(w) != 0:
            #print(">{}: renumber residues".format(basename))
            renumber_resid(f)

    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit stop the loop instead of renaming a file to "*.error".
        print("{}: Error".format(basename))
        shutil.move(f, f + ".error")

Triple_tHW_tHS_GCC.pdb: Error


### Check whether the modified files can be loaded with PDBFixer

In [14]:
# Path of the one file reported as "Error" by the check loop.
f = os.path.join(output_path, "GCC", "Triple_tHW_tHS_GCC.pdb")

In [15]:
# Display the path being checked.
f

'/Users/takabak/work/rna_bgsu/data/triplebase/GCC/Triple_tHW_tHS_GCC.pdb'

In [16]:
# Try loading the file with PDBFixer again.
# NOTE(review): the check loop renames failing files to "<name>.error", so
# this path presumably holds a manually corrected copy — confirm.
fixer = PDBFixer(filename=f)