In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import json
import wget
import time
import pandas as pd
import warnings

In [2]:
def make_dir(output_path):
    """
    Create `output_path` as a fresh, empty directory.

    If a directory already exists at that location it is deleted first,
    together with everything inside it, and a message is printed. Missing
    parent directories are created as needed.
    """
    stale = os.path.isdir(output_path)

    # Wipe leftovers from a previous run so the directory starts out empty.
    if stale:
        print(f">remove directory: {output_path}")
        shutil.rmtree(output_path)

    fresh_dir = pathlib.Path(output_path)
    fresh_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def _split_pair(value):
    """Return the first two comma-separated fields of `value` (IndexError if fewer than two)."""
    parts = value.split(',')
    return parts[0], parts[1]


def _unit_id(pdb, model, chain, base, number, symmetry):
    """
    Build a unit id of the form pdb|model|chain|base|number.

    The symmetry operator is appended (after four empty fields) unless it
    starts with "1_", in which case it is left out.
    See https://www.bgsu.edu/research/rna/help/rna-3d-hub-help/unit-ids.html
    """
    uid = '|'.join([pdb, model, chain, base, number]).replace(' ', '')
    if not symmetry.startswith("1_"):
        uid += "||||" + symmetry
    return uid


def download_cif(output_path, file, basepair_family, url):
    """
    Download RNA coordinates for every basepair listed in a catalog file.

    For each row whose sequence starts with a non-lowercase character, a
    pair of unit ids is built from the exemplar columns, the coordinates are
    downloaded to ``<output_path>/dump/BP_<basepair_family>_<sequence>.cif``
    and `extract_model` is called on the downloaded file. A failure for one
    row is reported as a warning and the loop continues with the next row.

    Parameters
    ----------
    output_path : str
        File path pointing to the output of cif files. The ``dump``
        sub-directory is expected to exist already.
    file : str
        Tab-separated catalog file in full path that stores information
        about the basepair family. Columns read: 'Sequence', 'Exemplar PDB',
        'Exemplar Chains', 'Exemplar Numbers', 'Exemplar Symmetry Operators'.
    basepair_family : str
        Basepair family name, used in the output file name
    url : str
        API url to download RNA coordinates; the two unit ids are appended
        to it, separated by a comma

    Returns
    -------
    None
    """

    # header=2: the column names are on the third non-blank line; the lines
    # above it are ignored. Everything is read as text.
    df = pd.read_csv(file, header=2, sep='\t', dtype='str')

    for _, row in df.iterrows():
        # sequence (RNA bases of the pair)
        seq = row['Sequence']
        s1, s2 = seq[0], seq[1]

        # Only consider sequences that start with an uppercase base. Base
        # combinations sharing the same structure start with a lowercase one.
        if s1.islower():
            continue
        s2 = s2.upper()   # force s2 to be upper case to avoid API error

        pdb = row['Exemplar PDB']

        # The model is fixed to "1"; the 'Exemplar Model' column is not used.
        model = "1"

        c1, c2 = _split_pair(row['Exemplar Chains'])       # chain
        n1, n2 = _split_pair(row['Exemplar Numbers'])      # residue number
        sym1, sym2 = (
            s.replace(' ', '')
            for s in _split_pair(row['Exemplar Symmetry Operators'])
        )

        uid1 = _unit_id(pdb, model, c1, s1, n1, sym1)
        uid2 = _unit_id(pdb, model, c2, s2, n2, sym2)

        filename = "BP_" + basepair_family + "_" + seq + ".cif"
        cif = os.path.join(output_path, "dump", filename)
        _url = url + '{},{}'.format(uid1, uid2)

        try:
            wget.download(_url, out=cif, bar=None)
            extract_model(output_path, cif)
            time.sleep(1)  # avoid api traffic
        except Exception as exc:
            # `Exception` (not a bare except) so that Ctrl-C / kernel
            # interrupt still stops the loop. The reason is reported because
            # this handler also covers failures inside extract_model.
            warnings.warn("Could not download {}: {!r}".format(_url, exc))

In [4]:
def extract_model(output_path, cif):
    """
    Extract the first model from the cif file.

    The downloaded cif file is described as containing two models: model 1
    is the RNA motif of interest and model 2 is the neighboring structures
    of model 1. Lines are copied in order until the second line whose first
    non-whitespace character is "#" is reached; that line and everything
    after it are dropped. Blank lines are kept as they are.

    Parameters
    ----------
    output_path : str
        Directory in which the extracted cif file is written. The file keeps
        the base name of `cif`.
    cif : str
        Downloaded cif file in full path

    Returns
    -------
    None
    """
    kept = []
    marker_count = 0
    with open(cif, "r") as f:
        for line in f:
            # lstrip() instead of split()[0]: same test for the first token,
            # but it does not fail on blank / whitespace-only lines.
            if line.lstrip().startswith("#"):
                marker_count += 1
                if marker_count == 2:
                    break
            kept.append(line)

    _cif = os.path.join(output_path, os.path.basename(cif))
    with open(_cif, "w") as wf:
        wf.writelines(kept)

In [5]:
if __name__ == "__main__":

    url = "http://rna.bgsu.edu/rna3dhub/rest/getCoordinates?coord="

    # The project root is the working directory, or its parent when the
    # notebook is run from inside "notebooks". (str.strip('notebooks')
    # removes a *set of characters* from both ends, not the suffix, so it is
    # not used here.)
    cwd = pathlib.Path.cwd()
    base_path = str(cwd.parent if cwd.name == "notebooks" else cwd)

    release_version = "bpcatalog"
    prefix = release_version + "_"
    # sorted: glob returns files in arbitrary order
    files = sorted(glob.glob(os.path.join(base_path, "data", prefix + "*.csv")))

    # download cif for each basepair family
    for file in files:
        # "<release_version>_<family>.csv" -> "<family>". Taken from the file
        # name only; str.strip('.csv') would also eat trailing c/s/v/. chars.
        basepair_family = pathlib.Path(file).stem[len(prefix):]

        # create new directory (any existing output for this family is removed)
        output_path = os.path.join(base_path, "data", release_version, basepair_family)
        make_dir(output_path)
        make_dir(os.path.join(output_path, "dump"))

        # download cif
        download_cif(output_path, file, basepair_family, url)

>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tWW
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cWH
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tWS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cSS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tHS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cHH
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tHH
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cHS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tSS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cWS
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/tWH
>remove directory: /Users/takabak/work/rna_bgsu/data/bpcatalog/cWW
