# Extract junction loops from [RNA representative structures](http://rna.bgsu.edu/rna3dhub/nrlist)
Junction loop motifs are not available from the [RNA 3D Motif Atlas](http://rna.bgsu.edu/rna3dhub/motifs), so this notebook downloads the non-redundant RNA representative structures and extracts the junction loops from them manually.

In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import json
import wget
import time
import warnings

In [2]:
def make_dir(output_path):
    """
    Create a fresh, empty directory

    A directory that already exists at `output_path` is deleted together with
    everything inside it before being created again. Missing parent
    directories are created as needed.

    Parameters
    ----------
    output_path : str
        Path of the directory to (re)create

    Returns
    -------
    """

    already_present = os.path.isdir(output_path)
    if already_present:
        # start from a clean slate so stale files never mix with new ones
        print(">remove directory: {}".format(output_path))
        shutil.rmtree(output_path)

    os.makedirs(output_path, exist_ok=True)

In [3]:
def download_csv(output_path, file, url_loop):
    """
    Download csv containing loop motif information

    One CSV is requested for every PDB entry listed in `file` and saved as
    ``<output_path>/dump/<pdb>_loop.csv``. Entries whose CSV is already on
    disk are skipped without contacting the server. A failed download only
    emits a warning so that the remaining entries are still processed.

    Parameters
    ----------
    output_path : str
        File path pointing to the output directory. Downloads are written to
        its "dump" subdirectory.
    file : str
        CSV file in full path that contains non-redundant representative RNA
        information. The PDB ID is read from the second column, taking the
        text before the first "|".
    url_loop : str
        URL to download RNA loop structures (e.g. hairpin, internal, and
        junction) using BGSU API (https://www.bgsu.edu/research/rna/APIs.html)

    Returns
    -------
    """

    # URLs always use "/" as separator, so do not build them with os.path.join
    base_url = url_loop.rstrip("/")

    with open(file, "r") as f:
        for line in f:
            fields = line.split(',')
            if len(fields) < 2:
                # blank or malformed row: there is no PDB column to read
                continue

            pdb = fields[1].split('|')[0].replace('"', '').strip()
            if not pdb:
                continue

            csv_path = os.path.join(output_path, "dump", pdb + "_loop.csv")

            # same pdb could be present with different chain id
            if os.path.isfile(csv_path):
                continue

            _url = base_url + "/" + pdb
            try:
                wget.download(_url, out=csv_path, bar=None)
            except Exception:
                # best effort: report and carry on, but let Ctrl-C through
                warnings.warn("Could not download {}\n{}".format(pdb, _url))

            time.sleep(1)  # avoid api traffic (only after a real request)

In [4]:
def loadfile(file):
    """
    Read a loop CSV file and collect the junction loop IDs it lists

    Only rows that begin with ``"J3_`` are kept. The ID is the first
    comma-separated field with its surrounding double quotes removed. A
    message is printed when the file contains no such row.

    Parameters
    ----------
    file : str
        CSV file in full path that stores loop IDs

    Returns
    -------
    loop_ids : list of str
        List of junction loop IDs, in file order
    """

    junction_prefix = '"J3_'

    with open(file, 'r') as handle:
        loop_ids = [
            row.partition(',')[0].strip('"')
            for row in handle
            if row.startswith(junction_prefix)
        ]

    if not loop_ids:
        print("no junction loops found: {}".format(os.path.basename(file)))

    return loop_ids

In [5]:
def download_cif(output_path, release_version, loop_ids, url_coord):
    """
    Download RNA coordinates using BGSU APIs (https://www.bgsu.edu/research/rna/APIs.html)

    Each loop is downloaded to ``<output_path>/dump/<loop_id>.cif`` and then
    passed to `extract_model`, which writes the trimmed file into
    `output_path`. Failures are reported as warnings and do not stop the
    processing of the remaining loops.

    Parameters
    ----------
    output_path : str
        Full path pointing to the output directory
    release_version : str
        Release version of Representative sets of RNA 3D structures (http://rna.bgsu.edu/rna3dhub/nrlist)
    loop_ids : list of str
        Junction loop ids (e.g. J3_6SVS_002)
    url_coord : str
        API url to download RNA coordinates; the loop id is appended to it

    Returns
    -------
    """

    for loop_id in loop_ids:
        _url = url_coord + loop_id
        cif = os.path.join(output_path, "dump", loop_id + ".cif")

        # check duplicate motif entry
        if os.path.exists(cif):
            # Downloading again would make wget store a renamed second copy
            # next to the existing file, so reuse what is already on disk.
            warnings.warn("{} already exists. Duplicate motif entry.".format(loop_id))
        else:
            try:
                wget.download(_url, out=cif, bar=None)
            except Exception:
                # best effort: report and carry on, but let Ctrl-C through
                warnings.warn("Could not download {}\n{}".format(loop_id, _url))
                continue

        # Kept separate from the download so that a parsing problem is not
        # misreported as a network failure.
        try:
            extract_model(output_path, release_version, cif)
        except Exception as err:
            warnings.warn("Could not extract model from {}\n{}".format(cif, err))

In [6]:
def extract_model(output_path, release_version, cif):
    """
    Extract the first model from the cif file. The downloaded cif file contains two models. Model 1 is the RNA motif of interest and model 2 is the neighboring structures of model 1.

    Every line up to, but not including, the second line whose first
    non-blank character is "#" is copied to a file with the same base name
    inside `output_path`. Blank lines are copied unchanged.

    Parameters
    ----------
    output_path : str
        Full path pointing to the output directory
    release_version : str
        Release version of Representative sets of RNA 3D structures (http://rna.bgsu.edu/rna3dhub/nrlist).
        Not used by this function.
    cif : str
        Downloaded cif file in full path

    Returns
    -------
    """
    first_model = []
    separators = 0
    with open(cif, "r") as f:
        for line in f:
            # lstrip() instead of split()[0]: an empty or whitespace-only line
            # has no first token and must not abort the extraction
            if line.lstrip().startswith("#"):
                separators += 1
                if separators == 2:
                    # second separator marks the end of model 1
                    break

            first_model.append(line)

    ofile = os.path.join(output_path, os.path.basename(cif))
    with open(ofile, "w") as wf:
        wf.writelines(first_model)

In [7]:
if __name__ == "__main__":
    # Driver: download the per-structure loop tables, then fetch and trim the
    # coordinates of every junction loop listed in them.

    url_coord = "http://rna.bgsu.edu/rna3dhub/rest/getCoordinates?coord="
    url_loop = "http://rna.bgsu.edu/rna3dhub/loops/download"

    # __file__ is not defined inside a notebook, so the project root is
    # located from the working directory. When run from "notebooks", the root
    # is its parent. str.strip('notebooks') is not used because it removes a
    # set of characters, not a suffix, and mangles other directory names.
    work_dir = os.getcwd()
    if os.path.basename(work_dir) == "notebooks":
        base_path = os.path.dirname(work_dir)
    else:
        base_path = work_dir
    release_version = "nrlist_3.233_2.5A"

    # create output directory (any previous content is removed)
    output_path = os.path.join(base_path, "data", release_version, "JunctionLoop")
    make_dir(output_path)
    make_dir(os.path.join(output_path, "dump"))

    # download junction loops via RNA representative structures
    print(">download csv")
    file = os.path.join(base_path, "data", release_version + ".csv")
    download_csv(output_path, file, url_loop)

    # extract loop junctions for each entry
    print(">download cif")
    files = glob.glob(os.path.join(output_path, "dump", "*_loop.csv"))
    for loop_csv in files:
        loop_ids = loadfile(loop_csv)
        download_cif(output_path, release_version, loop_ids, url_coord)

>download csv
>download cif
no junction loops found: 7OX9_loop.csv
no junction loops found: 4XNR_loop.csv
no junction loops found: 5HR7_loop.csv
no junction loops found: 5NDH_loop.csv
no junction loops found: 1ET4_loop.csv
no junction loops found: 4NGD_loop.csv
no junction loops found: 7BPV_loop.csv
no junction loops found: 7D7W_loop.csv
no junction loops found: 5EEU_loop.csv
no junction loops found: 6O5F_loop.csv
no junction loops found: 4QVD_loop.csv
no junction loops found: 3K64_loop.csv
no junction loops found: 6OZR_loop.csv
no junction loops found: 1UVL_loop.csv
no junction loops found: 1UVM_loop.csv
no junction loops found: 1IHA_loop.csv
no junction loops found: 7S3C_loop.csv
no junction loops found: 7ECN_loop.csv
no junction loops found: 5EV4_loop.csv
no junction loops found: 7S3B_loop.csv
no junction loops found: 7ECO_loop.csv
no junction loops found: 6F4H_loop.csv
no junction loops found: 3EQT_loop.csv
no junction loops found: 3HSB_loop.csv
no junction loops found: 3MJ0_loop.c