In [1]:
import pandas as pd
import numpy as np
import forgi
import subprocess
import json
from tqdm.auto import tqdm

In [2]:
# Training records: the file is read as JSON Lines (one JSON object per line).
df_train = pd.read_json('../data/train.json', lines=True)

In [3]:
# Test records: the file is read as JSON Lines (one JSON object per line).
df_test = pd.read_json('../data/test.json', lines=True)

In [4]:
# Stack train and test rows so every sequence goes through the same pipeline.
# The original row labels of each frame are kept as-is.
df = pd.concat([df_train, df_test], axis=0)

In [5]:
def predict_struct(sequence, tmp_dir='./tmp', contrafold_bin='../contrafold/src/contrafold'):
    """Run CONTRAfold on one sequence and return its structure and pairing matrix.

    The sequence is written to ``<tmp_dir>/input.fasta``; the ``contrafold
    predict`` command is then asked to write a parenthesis file
    (``seq.parens``) and a posteriors file (``seq.post``) into the same
    directory, and both are parsed.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to fold.
    tmp_dir : str or path-like, optional
        Scratch directory for the input/output files. Created if missing.
        Defaults to ``./tmp``, the location used previously.
    contrafold_bin : str or path-like, optional
        Path to the ``contrafold`` executable.

    Returns
    -------
    structure : str
        The fourth line of the parenthesis file, stripped of spaces and
        newlines; it has the same length as ``sequence``.
    bpp : numpy.ndarray
        Symmetric ``(len(sequence), len(sequence))`` float array holding the
        probabilities listed in the posteriors file (zero where none is listed).

    Raises
    ------
    AssertionError
        If the executable cannot be started or exits with a non-zero code
        (the message carries its stderr), if the structure length differs
        from the sequence length, or if a posteriors line names a nucleotide
        that disagrees with ``sequence``. ``AssertionError`` is kept as the
        failure type for backward compatibility, but it is raised explicitly
        so the checks survive ``python -O``.
    """
    # Local import: the notebook only imports Path in a later cell.
    from pathlib import Path

    work_dir = Path(tmp_dir)
    work_dir.mkdir(parents=True, exist_ok=True)
    fasta_path = work_dir / 'input.fasta'
    parens_path = work_dir / 'seq.parens'
    post_path = work_dir / 'seq.post'

    # Drop outputs left by a previous call so that a run which silently
    # writes nothing can never be mistaken for a fresh prediction.
    for stale in (parens_path, post_path):
        if stale.exists():
            stale.unlink()

    with open(fasta_path, 'w') as f:
        f.write('>0\n' + sequence)

    # Argument list instead of a shell string: same argv, no shell involved.
    # NOTE(review): 0.001 is passed through as the --posteriors argument,
    # presumably the minimum probability that gets reported - confirm in the
    # CONTRAfold manual.
    command = [
        contrafold_bin, 'predict', str(fasta_path),
        '--parens', str(parens_path),
        '--posteriors', '0.001', str(post_path),
    ]
    try:
        cp = subprocess.run(command, capture_output=True)
    except OSError as exc:
        raise AssertionError('could not run contrafold (%s): %s' % (contrafold_bin, exc)) from exc
    if cp.returncode != 0:
        raise AssertionError(
            'contrafold exited with code %d: %s'
            % (cp.returncode, cp.stderr.decode(errors='replace').strip())
        )

    with open(parens_path, 'r') as f:
        parens = list(f)
    # The dot-bracket string is read from the fourth line of the file.
    structure = parens[3].strip(' \n')
    if len(structure) != len(sequence):
        raise AssertionError(
            'structure length %d does not match sequence length %d'
            % (len(structure), len(sequence))
        )

    bpp = np.zeros((len(sequence), len(sequence)))
    with open(post_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank or trailing empty lines.
                continue
            # Line layout: <1-based index> <nucleotide> [<1-based partner>:<prob> ...]
            i, nuc, *pss = fields
            i = int(i) - 1
            if sequence[i] != nuc:
                raise AssertionError(
                    'posteriors line %d names %r but the sequence has %r'
                    % (i + 1, nuc, sequence[i])
                )
            for ps in pss:
                j, p = ps.split(':')
                j = int(j) - 1
                p = float(p)
                # Store both orientations so the matrix is symmetric.
                bpp[i, j] = p
                bpp[j, i] = p
    return structure, bpp

In [6]:
def create_cg(sequence, structure):
    """Load a sequence/structure pair through forgi and return the single result.

    The two strings are written, newline-separated, to ``./tmp/tmp_seq.dbn``
    and that file is handed to ``forgi.load_rna``. Exactly one loaded object
    is expected; anything else trips the assertion.
    """
    dbn_path = './tmp/tmp_seq.dbn'
    with open(dbn_path, 'w') as dbn_file:
        dbn_file.write(sequence + '\n' + structure)

    loaded = forgi.load_rna(dbn_path)
    assert len(loaded) == 1
    (graph,) = loaded
    return graph

In [7]:
def cg_to_d(cg):
    """Flatten a graph object into plain ``nodes`` / ``edges`` containers.

    ``cg.defines`` maps each element name to a flat list of 0, 2 or 4
    coordinates, read here as consecutive ``(start, end)`` pairs. Each pair is
    stored as ``(start - 1, end)`` so it can be passed straight to ``range``.
    Any other list length is printed and rejected with an assertion.

    ``cg.edges`` maps a name to its neighbours; every ``(name, neighbour)``
    combination becomes one entry of the edge list.

    Returns a dict ``{"nodes": {name: [(start, stop), ...]}, "edges": [(a, b), ...]}``.
    """
    node_spans = {}
    for name, items in cg.defines.items():
        count = len(items)
        if count not in (0, 2, 4):
            print(name, items)
            assert False
        # Walk the flat coordinate list two entries at a time.
        pairs = [items[k:k + 2] for k in range(0, count, 2)]
        node_spans[name] = [(first - 1, last) for first, last in pairs]

    edge_list = [
        (source, target)
        for source, targets in cg.edges.items()
        for target in targets
    ]
    return {"nodes": node_spans, "edges": edge_list}

In [8]:
from pathlib import Path

In [9]:
# Directory that receives one ``<id>.npy`` base-pair-probability matrix per sequence.
bpp_base = Path('../data/contrafold_bpp')
# np.save does not create missing parent directories; create the folder up
# front so the long prediction loop cannot fail on its first save.
bpp_base.mkdir(parents=True, exist_ok=True)

In [11]:
# Fold every sequence: keep the structure string and flattened graph in
# memory (keyed by record id) and write each pairing matrix to its own .npy file.
structures = {}
for id_, sequence in tqdm(df[['id', 'sequence']].values):
    structure, bpp = predict_struct(sequence)
    cg = create_cg(sequence, structure)
    d = {'structure': structure, 'cg_graph': cg_to_d(cg)}
    structures[id_] = d
    np.save(bpp_base / f'{id_}.npy', bpp)

HBox(children=(FloatProgress(value=0.0, max=6034.0), HTML(value='')))




In [19]:
# Derive a per-position label for every record: each position covered by a
# node's span takes the first character of that node's name; positions no
# span covers keep the placeholder 'o'.
for record in structures.values():
    labels = ['o'] * len(record['structure'])
    for node_name, spans in record['cg_graph']['nodes'].items():
        for start, stop in spans:
            for position in range(start, stop):
                labels[position] = node_name[0]
    record['loop_type'] = labels

In [22]:
# Persist structures, graphs and loop types for all records as one JSON document.
with open('../data/contrafold_structures.json', 'w') as f:
    f.write(json.dumps(structures))

In [23]:
# Distinct loop-type labels that occur anywhere in the data set.
set().union(*(record['loop_type'] for record in structures.values()))

{'f', 'h', 'i', 'm', 's', 't'}

In [16]:

# Spot check: first row of the matrix saved for one record.
np.load(bpp_base / 'id_00073f8be.npy')[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00639796,
       0.        , 0.        , 0.0024079 , 0.00167993, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00492364, 0.        , 0.        ,
       0.        , 0.        , 0.00306728, 0.        , 0.        ,
       0.        , 0.00162223, 0.00207258, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00106618, 0.        , 0.        , 0.        ,
       0.00189671, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00127373, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [15]:
# List the saved .npy files (long format, human-readable sizes, including dot entries).
!ls -lah ../data/contrafold_bpp/

total 672M
drwxrwxr-x  2 stepan stepan 248K Oct  6 14:25 .
drwxrwxr-x 11 stepan stepan 4,0K Oct  6 14:39 ..
-rw-rw-r--  1 stepan stepan  90K Oct  6 14:22 id_00073f8be.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_000ae4237.npy
-rw-rw-r--  1 stepan stepan  90K Oct  6 14:22 id_00131c573.npy
-rw-rw-r--  1 stepan stepan  90K Oct  6 14:22 id_00181fd34.npy
-rw-rw-r--  1 stepan stepan  90K Oct  6 14:20 id_001f94081.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_0020473f7.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_002852873.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_0031191b7.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_003ab2445.npy
-rw-rw-r--  1 stepan stepan  90K Oct  6 14:20 id_0049f53ba.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_0051b1d76.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_00583aef6.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_006a0ab6e.npy
-rw-rw-r--  1 stepan stepan 133K Oct  6 14:22 id_006af2226.npy
-rw-rw-r--