# Split hairpin, internal and junction loop motifs into single, double and triple bases that are consecutively connected

In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import re
from openbabel import openbabel
import warnings

In [2]:
def make_dir(output_path):
    """
    Create a fresh, empty output directory.

    If `output_path` already exists as a directory it is deleted together
    with its contents (a notice is printed), then the directory is created
    again, including any missing parent directories.

    Parameters
    ----------
    output_path : str
        Path of the directory to (re)create
    """
    already_there = os.path.isdir(output_path)
    if already_there:
        print(">remove directory")
        shutil.rmtree(output_path)

    fresh_dir = pathlib.Path(output_path)
    fresh_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def extract_rna(filename, s1, s2, h=None):
    """
    Extract RNA coordinates

    Parameters
    ----------
    filename : str
        Full path to cif file
    s1 : int
        First line index (0-based, counted over the whole file) to extract
    s2 : int
        Line index at which extraction stops (exclusive)
    h : list of str, optional
        Header lines to prepend. When omitted, the header is taken from
        `filename` itself: every line whose first whitespace-separated
        token does not start with "ATOM" (blank lines included).

    Returns
    -------
    arr : list of str
        RNA coordinates with header information
    """
    with open(filename, "r") as f:
        lines = f.readlines()

    if h is None:
        # Previously this function relied on a variable named `h` existing
        # outside of it; derive the header locally so the function works
        # on its own.
        h = []
        for line in lines:
            tokens = line.split()
            if not (tokens and tokens[0].startswith("ATOM")):
                h.append(line)

    return list(h) + lines[s1:s2]

In [4]:
def export_cif(arr, cif):
    """
    Write the given entries to a new cif file.

    Parameters
    ----------
    arr : list of str
        Entries to write, in order; each one is written as-is (no line
        separator is added)
    cif : str
        Path of the file to create or overwrite
    """
    with open(cif, "w") as handle:
        handle.writelines("{}".format(entry) for entry in arr)

In [5]:
def load_cif(file):
    """
    Find starting index position for each RNA residue.

    Parameters
    ----------
    file : str
        Cif file in full path

    Returns
    -------
    h : list of str
        Header information of cif file: every line whose first token does
        not start with "ATOM" (blank lines included)
    s : list of str
        One entry per ATOM line, in the format
        [chain ID/residue ID/residue name/insertion code/x/y/z], where
        x/y/z are the coordinates of the most recently seen P atom
        (empty strings if no P atom has been seen yet)
    s_idx : list of int
        Index into `s` of the first occurrence of each distinct entry,
        i.e. the starting position of each RNA residue
    """
    h, s, s_idx = [], [], []
    seen = set()

    # Coordinates of the last P atom. Initialised so that a file whose first
    # ATOM record is not a phosphorus (e.g. a residue without a phosphate)
    # no longer fails on an unbound variable.
    x_coord = y_coord = z_coord = ""

    with open(file, "r") as f:
        for l in f:
            _l = l.split()

            # Blank lines have no tokens; they are kept as header lines.
            if _l and _l[0].startswith("ATOM"):
                rname = _l[5]     # comp_id (RNA: GCAU)
                cid = _l[6]       # asym_id (chain id)
                rid = _l[8]       # seq_id  (residue id)
                ins_code = _l[9]  # PDB_ins_code (insertion code) --> required to solve same seq id with different insertion code issue
                if _l[2] == "P":  # required to solve same chain id and insertion code issue
                    # columns 10-12 are read as the x, y, z coordinates
                    x_coord, y_coord, z_coord = _l[10], _l[11], _l[12]

                key = "/".join((cid, rid, rname, ins_code, x_coord, y_coord, z_coord))
                if key not in seen:
                    # first occurrence of this residue key, element order is kept
                    seen.add(key)
                    s_idx.append(len(s))
                s.append(key)
            else:
                h.append(l)

    return h, s, s_idx

In [6]:
def single_base(output_path, release_version, file, h, s, s_idx):
    """
    Extract single base from CIF file

    For every residue, the residue's lines are extracted from `file`,
    written as a cif file to `<output_path>/dump` and converted to a pdb
    file in `<output_path>/singlebase`. Both directories must already exist.

    Parameters
    ----------
    output_path : str
        Full path pointing to the parent output directory
    release_version : str
        Release version of RNA 3D Motif Atlas (e.g. HairpinLoopMotifAtlasRelease3.57).
        Not used by this function.
    file : str
        CIF file in full path
    h : list of str
        Header information of cif file (only its length is used, as the
        line offset of the first atom record)
    s : list of str
        RNA information from the cif file, one entry per atom line, starting with
        [chain ID/residue ID/residue name/insertion code]
    s_idx : list of int
        Starting index position of each RNA residue

    Returns
    -------
    None
    """

    category = "singlebase"
    header_len = len(h)
    stem = os.path.basename(file).split('.cif')[0]

    for i, start in enumerate(s_idx):
        index_start = header_len + start

        # The residue ends where the next one starts; the last residue runs
        # to the end of the atom records. An explicit bounds check replaces
        # the former bare `except:`, which also hid unrelated errors.
        if i + 1 < len(s_idx):
            index_end = header_len + s_idx[i+1]
        else:
            index_end = header_len + len(s)
        arr = extract_rna(file, index_start, index_end)

        rname = s[start].split('/')[2]
        fname = stem + "_" + rname + "_" + str(i)
        cif = os.path.join(output_path, "dump", fname + ".cif")
        pdb = os.path.join(output_path, category, fname + ".pdb")

        export_cif(arr, cif)
        cif_to_pdb(cif, pdb)

In [7]:
def double_base(output_path, release_version, file, h, s, s_idx):
    """
    Extract double base from CIF file

    For every pair of neighbouring residues on the same chain, the lines of
    both residues are extracted from `file`, written as a cif file to
    `<output_path>/dump` and converted to a pdb file in
    `<output_path>/doublebase`. Both directories must already exist.
    Pairs spanning two chains are reported and skipped.

    Parameters
    ----------
    output_path : str
        Full path pointing to the parent output directory
    release_version : str
        Release version of RNA 3D Motif Atlas (e.g. HairpinLoopMotifAtlasRelease3.57).
        Not used by this function.
    file : str
        CIF file in full path
    h : list of str
        Header information of cif file (only its length is used, as the
        line offset of the first atom record)
    s : list of str
        RNA information from the cif file, one entry per atom line, starting with
        [chain ID/residue ID/residue name/insertion code]
    s_idx : list of int
        Starting index position of each RNA residue

    Returns
    -------
    None
    """

    category = "doublebase"
    header_len = len(h)
    stem = os.path.basename(file).split('.cif')[0]

    for i in range(len(s_idx)-1):
        fields_1 = s[s_idx[i]].split('/')
        fields_2 = s[s_idx[i+1]].split('/')
        chain_1, chain_2 = fields_1[0], fields_2[0]
        rid_1, rid_2 = int(fields_1[1]), int(fields_2[1])

        # bases from loop motifs are assumed consecutive, so only the chain
        # is compared (residue numbers are not required to differ by one)
        if chain_1 == chain_2:
            index_start = header_len + s_idx[i]

            # The pair ends where the residue after it starts; the last pair
            # runs to the end of the atom records. An explicit bounds check
            # replaces the former bare `except:`.
            if i + 2 < len(s_idx):
                index_end = header_len + s_idx[i+2]
            else:
                index_end = header_len + len(s)
            arr = extract_rna(file, index_start, index_end)

            rname_1, rname_2 = fields_1[2], fields_2[2]
            fname = stem + "_" + rname_1 + rname_2 + "_" + str(i)
            cif = os.path.join(output_path, "dump", fname + ".cif")
            pdb = os.path.join(output_path, category, fname + ".pdb")

            export_cif(arr, cif)
            cif_to_pdb(cif, pdb)
        else:
            print("Do not export. Disconnection found for {}: {}({}) - {}({})".format(os.path.basename(file), rid_1, chain_1, rid_2, chain_2))

In [8]:
def triple_base(output_path, release_version, file, h, s, s_idx):
    """
    Extract triple base from CIF file

    For every run of three neighbouring residues on the same chain, the
    lines of all three residues are extracted from `file`, written as a cif
    file to `<output_path>/dump` and converted to a pdb file in
    `<output_path>/triplebase`. Both directories must already exist.
    Triples spanning more than one chain are reported and skipped.

    Parameters
    ----------
    output_path : str
        Full path pointing to the parent output directory
    release_version : str
        Release version of RNA 3D Motif Atlas (e.g. HairpinLoopMotifAtlasRelease3.57).
        Not used by this function.
    file : str
        CIF file in full path
    h : list of str
        Header information of cif file (only its length is used, as the
        line offset of the first atom record)
    s : list of str
        RNA information from the cif file, one entry per atom line, starting with
        [chain ID/residue ID/residue name/insertion code]
    s_idx : list of int
        Starting index position of each RNA residue

    Returns
    -------
    None
    """

    category = "triplebase"
    header_len = len(h)
    stem = os.path.basename(file).split('.cif')[0]

    for i in range(len(s_idx)-2):
        fields_1 = s[s_idx[i]].split('/')
        fields_2 = s[s_idx[i+1]].split('/')
        fields_3 = s[s_idx[i+2]].split('/')
        chain_1, chain_2, chain_3 = fields_1[0], fields_2[0], fields_3[0]
        rid_1, rid_2, rid_3 = int(fields_1[1]), int(fields_2[1]), int(fields_3[1])

        # bases from loop motifs are assumed consecutive, so only the chain
        # is compared (residue numbers are not required to differ by one)
        if chain_1 == chain_2 == chain_3:
            index_start = header_len + s_idx[i]

            # The triple ends where the residue after it starts; the last
            # triple runs to the end of the atom records. An explicit bounds
            # check replaces the former bare `except:`.
            if i + 3 < len(s_idx):
                index_end = header_len + s_idx[i+3]
            else:
                index_end = header_len + len(s)
            arr = extract_rna(file, index_start, index_end)

            rname_1, rname_2, rname_3 = fields_1[2], fields_2[2], fields_3[2]
            fname = stem + "_" + rname_1 + rname_2 + rname_3 + "_" + str(i)
            cif = os.path.join(output_path, "dump", fname + ".cif")
            pdb = os.path.join(output_path, category, fname + ".pdb")

            export_cif(arr, cif)
            cif_to_pdb(cif, pdb)
        else:
            print("Do not export. Disconnection found for {}: {}({}) - {}({}) - {}({})".format(os.path.basename(file), rid_1, chain_1, rid_2, chain_2, rid_3, chain_3))

In [9]:
def cif_to_pdb(cif, pdb):
    """
    Convert cif format to pdb format using OpenBabel

    Hydrogens are removed from the molecule before it is written.

    Parameters
    ----------
    cif : str
        Path of the cif file to read
    pdb : str
        Path of the pdb file to write
    """
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("cif", "pdb")

    mol = openbabel.OBMol()
    # Open Babel will uncompress automatically.
    # NOTE(review): ReadFile is expected to return a false value when the
    # file cannot be read — confirm against the installed Open Babel version.
    if not obConversion.ReadFile(mol, cif):
        # Keep the best-effort behaviour (the pdb is still written) but make
        # the failure visible instead of silently producing an empty file.
        warnings.warn("Open Babel could not read {}".format(cif))

    mol.DeleteHydrogens()

    obConversion.WriteFile(mol, pdb)

In [10]:
if __name__ == "__main__":

    # Project root: the parent of the working directory when this is run from
    # the "notebooks" folder, otherwise the working directory itself.
    # (str.strip('notebooks') removes any of those *characters* from both
    # ends of the path rather than the folder name, which mangles paths
    # such as ".../project" into ".../projec".)
    cwd = os.getcwd()
    base_path = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd
    release_versions = [ "HairpinLoopMotifAtlasRelease3.57", "InternalLoopMotifAtlasRelease3.57", "nrlist_3.233_2.5A/JunctionLoop" ]

    for release_version in release_versions:
        print(">{}".format(release_version))

        # create output directory (any existing output for this release is removed)
        output_path = os.path.join(base_path, "pdb", "motif", release_version)
        make_dir(output_path)
        make_dir(os.path.join(output_path, "dump"))
        make_dir(os.path.join(output_path, "singlebase"))
        make_dir(os.path.join(output_path, "doublebase"))
        make_dir(os.path.join(output_path, "triplebase"))

        # extract bases and save as pdb
        files = glob.glob(os.path.join(base_path, "data", release_version, "*.cif"))
        for file in files:
            h, s, s_idx = load_cif(file)
            single_base(output_path, release_version, file, h, s, s_idx)
            double_base(output_path, release_version, file, h, s, s_idx)
            triple_base(output_path, release_version, file, h, s, s_idx)

>HairpinLoopMotifAtlasRelease3.57
>remove directory
>InternalLoopMotifAtlasRelease3.57
>remove directory
Disconnection found for IL_52729.1.cif: 16(C) - 11(D)
Disconnection found for IL_52729.1.cif: 15(C) - 16(C) - 11(D)
Disconnection found for IL_52729.1.cif: 16(C) - 11(D) - 12(D)
Disconnection found for IL_39900.1.cif: 32(A) - 1(B)
Disconnection found for IL_39900.1.cif: 12(A) - 32(A) - 1(B)
Disconnection found for IL_66450.1.cif: 13(A) - 38(B)
Disconnection found for IL_66450.1.cif: 12(A) - 13(A) - 38(B)
Disconnection found for IL_66450.1.cif: 13(A) - 38(B) - 39(B)
Disconnection found for IL_55884.1.cif: 9(C) - 12(D)
Disconnection found for IL_55884.1.cif: 8(C) - 9(C) - 12(D)
Disconnection found for IL_55884.1.cif: 9(C) - 12(D) - 13(D)
Disconnection found for IL_80007.15.cif: 10(A) - 4(B)
Disconnection found for IL_80007.15.cif: 9(A) - 10(A) - 4(B)
Disconnection found for IL_80007.15.cif: 10(A) - 4(B) - 5(B)
Disconnection found for IL_68506.1.cif: 27(C) - 35(D)
Disconnection found f