In [4]:
!mkdir ./tmpa

In [173]:
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import forgi.visual.mplotlib as fvm
import pandas as pd
plt.rcParams["figure.figsize"] = (20,20)
import json

import forgi
# cg = forgi.load_rna("examples/input/1y26.fx", allow_many=False)
# fvm.plot_rna(cg, text_kwargs={"fontweight":"black"}, lighten=0.7,
#              backbone_kwargs={"linewidth":3})
# plt.show()
from scipy.linalg import block_diag
%matplotlib inline

In [7]:
# Load the training set; lines=True reads the file as one JSON object per line.
df = pd.read_json("../data/train.json",lines=True)

In [16]:
def create_cg(sequence, structure):
    """Build a forgi coarse-grained graph for one RNA.

    The sequence and its structure string are written, one per line, to the
    scratch file ``./tmp/tmp_seq.dbn`` (overwritten on every call) and read
    back with ``forgi.load_rna``.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence (first line of the scratch file).
    structure : str
        Structure string (second line of the scratch file).

    Returns
    -------
    The single object contained in the result of ``forgi.load_rna``.

    Raises
    ------
    AssertionError
        If ``forgi.load_rna`` does not return exactly one object.
    """
    import os

    scratch_path = './tmp/tmp_seq.dbn'
    # The scratch directory may not exist on a fresh run; without it open()
    # raises FileNotFoundError before forgi is ever called.
    os.makedirs(os.path.dirname(scratch_path), exist_ok=True)
    with open(scratch_path, 'w') as f:
        f.write('\n'.join([sequence, structure]))
    loaded = forgi.load_rna(scratch_path)
    assert len(loaded) == 1
    cg, = loaded
    return cg

In [17]:
# Build one coarse-grained graph per training sample, keyed by sample id.
# tqdm only adds a progress bar around the iteration.
cgs = {}
for id_, sequence, structure in tqdm(df[['id', 'sequence', 'structure']].values):
    cgs[id_] = create_cg(sequence, structure)

HBox(children=(FloatProgress(value=0.0, max=2400.0), HTML(value='')))




In [25]:
# Predicted loop-type string of the first training sample.
df.iloc[0].predicted_loop_type

'EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHHHSSSSSSIISSIIIIISSXXXXSSSSSSSHHHHSSSSSSSEEEEEEEEEEEEEEEEEEEEE'

In [26]:
# Structure string of the first training sample.
df.iloc[0].structure

'.....((((((.......)))).)).((.....((..((((((....))))))..)).....))....(((((((....))))))).....................'

In [43]:
# Take the first graph that has at least one element whose name starts with 'i'.
cg = [
    graph
    for graph in cgs.values()
    if any(element_name[0] == 'i' for element_name in graph.defines)
][0]

In [44]:
# Inspect the element definitions of the selected graph.
cg.defines

{'f0': [1, 5],
 't0': [87, 107],
 's0': [6, 7, 24, 25],
 's1': [8, 11, 19, 22],
 's2': [27, 28, 63, 64],
 's3': [34, 35, 56, 57],
 's4': [38, 43, 48, 53],
 's5': [69, 75, 80, 86],
 'i0': [23, 23],
 'i1': [29, 33, 58, 62],
 'i2': [36, 37, 54, 55],
 'm0': [26, 26],
 'm1': [65, 68],
 'h0': [12, 18],
 'h1': [44, 47],
 'h2': [76, 79]}

```
    fiveprime: The unpaired nucleotides at the 5’ end of a molecule/ chain. Name always starts with ‘f’ (e.g. ‘f0’).
    threeprime: The unpaired nucleotides at the 3’ end of a molecule/ chain. Name always starts with ‘t’ (e.g. ‘t0’).

    stem: Regions of contiguous canonical Watson-Crick base-paired nucleotides. By default, stems have at least 2 consecutive basepairs.
        Always start with ‘s’ (e.g., ‘s0’, ‘s1’, ‘s2’, …)

    interior loop: Bulged out nucleotides and interior loops. An interior loop can contain unpaired bases on either strand or on both strands, flanked by stems on either side.
        Always start with ‘i’ (‘i0’, ‘i1’, ‘i2’,…)

    multiloop segment: Single-stranded regions between two stems. Always start with ‘m’. (‘m0’, ‘m1’, ‘m2’…)
        In the current version of forgi, pseudo-knots and exterior loop segments between stems are treated as multiloop segments.

    hairpin loop: Always starts with ‘h’.
```

In [65]:
# Element definitions of the selected graph, displayed again for reference.
cg.defines

{'f0': [1, 5],
 't0': [87, 107],
 's0': [6, 7, 24, 25],
 's1': [8, 11, 19, 22],
 's2': [27, 28, 63, 64],
 's3': [34, 35, 56, 57],
 's4': [38, 43, 48, 53],
 's5': [69, 75, 80, 86],
 'i0': [23, 23],
 'i1': [29, 33, 58, 62],
 'i2': [36, 37, 54, 55],
 'm0': [26, 26],
 'm1': [65, 68],
 'h0': [12, 18],
 'h1': [44, 47],
 'h2': [76, 79]}

In [79]:
def cg_to_d(cg):
    """Convert a coarse-grained graph into plain dicts and lists.

    Parameters
    ----------
    cg : object
        Must expose ``defines`` (mapping element name -> flat list of
        positions with length 0, 2 or 4) and ``edges`` (mapping element
        name -> iterable of neighbouring element names).

    Returns
    -------
    dict
        ``{"nodes": {name: [(start, stop), ...]}, "edges": [(a, b), ...]}``.
        Each consecutive ``[first, last]`` pair of ``defines`` becomes
        ``(first - 1, last)``; assuming the input positions are 1-based and
        inclusive (TODO confirm against forgi), the result is a 0-based
        half-open span usable with ``range``.

    Raises
    ------
    ValueError
        If an element is defined by a number of positions other than
        0, 2 or 4.
    """
    nodes = {}
    for name, items in cg.defines.items():
        if len(items) not in (0, 2, 4):
            # Same diagnostic line the notebook printed before failing.
            print(name, items)
            # Raise explicitly instead of `assert False`: assertions are
            # removed under `python -O`, which would let the malformed
            # element fall through to an unrelated TypeError.
            raise ValueError(
                "element %r has %d positions, expected 0, 2 or 4"
                % (name, len(items))
            )
        # Walk the flat list two entries at a time: [a, b, c, d] -> (a-1, b), (c-1, d).
        nodes[name] = [
            (items[i] - 1, items[i + 1]) for i in range(0, len(items), 2)
        ]
    edges = [
        (edge_start, edge_end)
        for edge_start, edge_ends in cg.edges.items()
        for edge_end in edge_ends
    ]
    return {"nodes": nodes, "edges": edges}

In [80]:
# Convert every coarse-grained graph to its plain-dict form. Conversion is
# best-effort: a failing sample is reported and left out of the result.
cg_train_graphs = {}
for id_, cg in cgs.items():
    try:
        cg_train_graphs[id_] = cg_to_d(cg)
    except Exception as e:
        # Print the reason along with the id; the id alone does not say why
        # the sample was dropped.
        print(id_, repr(e))

In [84]:
!mkdir ../data/cg_graphs

In [88]:
# Converted node spans for one sample.
cg_train_graphs['id_0049f53ba']['nodes']

{'f0': [(0, 5)],
 't0': [(86, 107)],
 's0': [(5, 18), (43, 56)],
 's1': [(18, 28), (32, 42)],
 's2': [(58, 61), (64, 67)],
 's3': [(68, 75), (79, 86)],
 'i0': [(42, 43)],
 'm0': [(56, 58)],
 'm1': [(67, 68)],
 'h0': [(28, 32)],
 'h1': [(61, 64)],
 'h2': [(75, 79)]}

In [91]:
 # Position of each element-type letter within "ftsimh".
 {letter: position for position, letter in enumerate("ftsimh")}

{'f': 0, 't': 1, 's': 2, 'i': 3, 'm': 4, 'h': 5}

In [98]:
import numpy as np

In [170]:
def create_bungle_features(cg_nodes):
    """Build the feature matrix for the coarse-grained nodes.

    Each row is a one-hot encoding of the element type, taken from the first
    character of the node name (column order ``f, t, s, i, m, h``), followed
    by one extra column holding the summed ``stop - start`` length of the
    node's segments.

    Parameters
    ----------
    cg_nodes : dict
        Node name -> list of ``(start, stop)`` segments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(cg_nodes), 7)`` with rows in the iteration
        order of ``cg_nodes``.
    """
    kind_to_column = {'f': 0, 't': 1, 's': 2, 'i': 3, 'm': 4, 'h': 5}
    size_column = len(kind_to_column)
    table = np.zeros((len(cg_nodes), size_column + 1))
    row = 0
    for label, spans in cg_nodes.items():
        table[row, kind_to_column[label[0]]] = 1
        table[row, size_column] = sum(span[1] - span[0] for span in spans)
        row += 1
    return table

In [100]:
def seq2nodes(sequence, loop_type):
    """One-hot encode every nucleotide together with its loop type.

    Parameters
    ----------
    sequence : str
        Nucleotides drawn from ``A, G, U, C``.
    loop_type : str
        Loop-type letters drawn from ``E, H, M, I, X, S, B``; paired
        position by position with ``sequence``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequence), 11)``: columns 0-3 encode the base
        (order A, G, U, C), columns 4-10 encode the loop type
        (order E, H, M, I, X, S, B).

    Raises
    ------
    KeyError
        If a character outside the alphabets above is encountered.
    """
    base_column = {'A': 0, 'G': 1, 'U': 2, 'C': 3}
    loop_column = {'E': 0, 'H': 1, 'M': 2, 'I': 3, 'X': 4, 'S': 5, 'B': 6}
    n_bases = 4
    encoded = np.zeros((len(sequence), n_bases + len(loop_column)))
    for position, (base, loop) in enumerate(zip(sequence, loop_type)):
        encoded[position, base_column[base]] = 1
        # Loop-type columns sit directly after the four base columns.
        encoded[position, n_bases + loop_column[loop]] = 1
    return encoded

In [103]:
# Pull id, sequence and predicted loop type of the first training sample.
id_, sequence, loop_type = df.iloc[0][['id', 'sequence', 'predicted_loop_type']].values

In [124]:
# Per-nucleotide one-hot features (base + loop type) for that sample.
x = seq2nodes(sequence, loop_type)

In [107]:
# Coarse-grained node spans of the same sample.
cg_nodes = cg_train_graphs[id_]['nodes']

In [131]:
# Full converted graph (nodes and edges) of the same sample.
cg_graph = cg_train_graphs[id_]

In [121]:
# Call the builder by its defined name. No function called `bungle_features`
# is defined in the visible notebook (the name is later bound to an array), so
# the previous call cannot run on a fresh kernel.
cg_features = create_bungle_features(cg_nodes)

In [123]:
# Display the coarse-grained feature matrix.
cg_features

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  5.],
       [ 0.,  1.,  0.,  0.,  0.,  0., 21.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  4.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  8.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  4.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  4.],
       [ 0.,  0.,  1.,  0.,  0.,  0., 12.],
       [ 0.,  0.,  1.,  0.,  0.,  0., 14.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0.,  0., 10.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  4.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  4.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  7.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  4.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  4.]])

In [126]:
x[0]

array([0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [171]:
def add_bungle_nodes(x, cg_graph):
    """Stack nucleotide and coarse-grained node features into one matrix.

    A column of ones is appended to both the nucleotide features ``x`` and
    the coarse-grained features built by ``create_bungle_features``; the two
    matrices are then placed on the diagonal of a block matrix, so each node
    kind occupies its own rows and its own columns.

    Parameters
    ----------
    x : numpy.ndarray
        2-D nucleotide feature matrix, one row per nucleotide.
    cg_graph : dict
        Converted graph; only ``cg_graph['nodes']`` is read.

    Returns
    -------
    features : numpy.ndarray
        Block-diagonal matrix with the nucleotide rows first.
    node2idx : dict
        Coarse-grained node name -> row index in ``features``, counting on
        from ``x.shape[0]`` in the iteration order of the nodes.
    """
    def _with_ones_column(matrix):
        # Append a constant column of ones on the right.
        return np.concatenate([matrix, np.ones((matrix.shape[0], 1))], axis=1)

    nucleotide_block = _with_ones_column(x)
    coarse_nodes = cg_graph['nodes']
    first_coarse_row = nucleotide_block.shape[0]
    node2idx = {
        name: first_coarse_row + offset
        for offset, name in enumerate(coarse_nodes)
    }
    coarse_block = _with_ones_column(create_bungle_features(coarse_nodes))
    return block_diag(nucleotide_block, coarse_block), node2idx

In [159]:
# Combined feature matrix plus the name -> row-index map of the coarse-grained nodes.
bungle_features, node2idx = add_bungle_nodes(x, cg_graph)

In [174]:
# NOTE(review): same call and arguments as the `bungle_features, node2idx`
# assignment, stored under *_2 names — looks like a leftover re-run check;
# confirm nothing uses these before removing.
bungle_features_2, node2idx_2 = add_bungle_nodes(x, cg_graph)

In [187]:
# Row index assigned to each coarse-grained node.
node2idx

{'f0': 107,
 't0': 108,
 's0': 109,
 's1': 110,
 's2': 111,
 's3': 112,
 's4': 113,
 's5': 114,
 'i0': 115,
 'i1': 116,
 'i2': 117,
 'm0': 118,
 'm1': 119,
 'h0': 120,
 'h1': 121,
 'h2': 122}

In [202]:
def get_couples(structure):
    """
    Pair every closing parenthesis with its matching opening one.

    The structure is scanned once, left to right, keeping a stack of the
    opening parentheses that are still unmatched; each closing parenthesis is
    paired with the most recent one. Every pair is emitted in both
    directions, ``(open, close)`` followed by ``(close, open)``, in the order
    in which the closing parentheses appear.

    Parameters
    ----------
    structure : str
        Dot-bracket string; characters other than ``(`` and ``)`` are ignored.

    Returns
    -------
    list of tuple
        ``2 * number_of_pairs`` index tuples.

    Raises
    ------
    AssertionError
        If the numbers of ``(`` and ``)`` differ.
    ValueError
        If a ``)`` appears with no unmatched ``(`` before it. The previous
        implementation either reused a stale index (silently wrong pairs) or
        failed with UnboundLocalError in that case.
    """
    n_opened = sum(1 for char in structure if char == "(")
    n_closed = sum(1 for char in structure if char == ")")
    assert n_opened == n_closed

    pending = []  # indices of "(" not matched yet, innermost last
    couples = []
    for idx, char in enumerate(structure):
        if char == "(":
            pending.append(idx)
        elif char == ")":
            if not pending:
                raise ValueError(
                    "closing parenthesis at index %d has no matching opening one" % idx
                )
            open_idx = pending.pop()
            couples.append((open_idx, idx))
            couples.append((idx, open_idx))

    # Equal counts plus no early closer means every opener was consumed.
    return couples


def build_matrix(couples, size):
    """Build a dense ``size x size`` adjacency matrix.

    Entries directly above and below the diagonal are 1 (neighbouring
    positions are linked); every ``(i, j)`` in ``couples`` is then set to 2
    in both directions, overriding a neighbour link if they coincide.

    Parameters
    ----------
    couples : iterable of (int, int)
        Index pairs to mark with 2.
    size : int
        Number of rows and columns.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(size, size)``.
    """
    # Super- and sub-diagonal of ones: links between neighbouring positions.
    adjacency = np.eye(size, k=1) + np.eye(size, k=-1)

    for first, second in couples:
        adjacency[first, second] = adjacency[second, first] = 2

    return adjacency


In [207]:
def seq2edge_index(structure):
    """Build the nucleotide-level edge list of a dot-bracket structure.

    Parameters
    ----------
    structure : str
        Dot-bracket string.

    Returns
    -------
    edge_index : numpy.ndarray
        Integer array of shape ``(2, n_edges)``; row 0 holds the source and
        row 1 the target index. Columns are ordered: sorted base pairs from
        ``get_couples`` (which lists each pair in both directions), then the
        links ``i -> i + 1``, then the links ``i + 1 -> i``.
    edges_type : numpy.ndarray
        One label per edge: 1 for base pairs, 2 for neighbour links.
    """
    pairs = sorted(set(get_couples(structure)))
    # reshape(-1, 2) keeps the array two-dimensional when the structure has
    # no base pairs; a bare np.array([]) is 1-D and cannot be concatenated
    # along axis 1 with the neighbour links.
    couples = np.array(pairs, dtype=int).reshape(-1, 2).T
    n_positions = len(structure)
    neig = np.array([np.arange(0, n_positions - 1), np.arange(1, n_positions)])
    neig2 = neig[::-1, ::]  # same links with source and target swapped
    edge_index = np.concatenate([couples, neig, neig2], axis=1)
    edges_type = np.array([1] * couples.shape[1] + [2] * neig.shape[1] * 2)

    return edge_index, edges_type

def edge_index2features(edge_index, edges_type, node_features):
    """Turn edge labels and endpoints into a 4-column feature matrix.

    Columns, in order: edge type is 1, edge type is 2, target index equals
    source index + 1, source index equals target index + 1.

    Parameters
    ----------
    edge_index : numpy.ndarray
        Array of shape ``(2, n_edges)``; row 0 sources, row 1 targets.
    edges_type : numpy.ndarray
        One label (1 or 2) per edge.
    node_features : any
        Not used by this function.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_edges, 4)``.
    """
    sources, targets = edge_index[0,], edge_index[1,]

    type_onehot = np.zeros((edge_index.shape[1], 2))
    for column, label in enumerate((1, 2)):
        type_onehot[:, column] = (edges_type == label).astype(int)

    steps_forward = (targets - sources == 1).astype(int)
    steps_backward = (sources - targets == 1).astype(int)
    direction = np.stack([steps_forward, steps_backward]).T

    return np.concatenate([type_onehot, direction], axis=-1)

def seq2edges(structure, node_features):
    """Return the nucleotide-level edge list and its edge features.

    Chains ``seq2edge_index`` and ``edge_index2features``.

    Parameters
    ----------
    structure : str
        Dot-bracket string.
    node_features : any
        Forwarded unchanged to ``edge_index2features``.

    Returns
    -------
    tuple
        ``(edge_index, edge_features)``.
    """
    index, kinds = seq2edge_index(structure)
    return index, edge_index2features(index, kinds, node_features)

def cg2edges(cg_graph, node2idx):
    """Build the edges that involve coarse-grained nodes.

    For every index covered by a node's ``(start, stop)`` segments two edges
    are emitted: node -> index with feature ``[1, 0, 0]`` and index -> node
    with feature ``[0, 1, 0]``. Every entry of ``cg_graph['edges']`` then
    yields one node -> node edge with feature ``[0, 0, 1]``.

    Parameters
    ----------
    cg_graph : dict
        Has ``'nodes'`` (name -> list of ``(start, stop)`` segments) and
        ``'edges'`` (iterable of ``(name, name)`` pairs).
    node2idx : dict
        Node name -> index used for that node in the edge list.

    Returns
    -------
    indexes : numpy.ndarray
        Edge endpoints, transposed so that row 0 holds sources and row 1
        targets.
    features : numpy.ndarray
        One 3-element feature row per edge.
    """
    endpoints = []
    kinds = []
    for label, spans in cg_graph['nodes'].items():
        coarse = node2idx[label]
        for span in spans:
            for member in range(*span):
                endpoints += [(coarse, member), (member, coarse)]
                kinds += [[1, 0, 0], [0, 1, 0]]
    for left, right in cg_graph['edges']:
        endpoints.append((node2idx[left], node2idx[right]))
        kinds.append([0, 0, 1])
    return np.array(endpoints).T, np.array(kinds)

def create_edges(structure, node_features, cg_graph, node2idx=None):
    """Build the full edge list and edge features of one sample.

    Nucleotide-level edges (``seq2edges``) come first, followed by the edges
    that involve coarse-grained nodes (``cg2edges``). The two groups keep
    their own feature columns (block-diagonal layout); the feature rows of
    each edge's source and target node are appended after them.

    Parameters
    ----------
    structure : str
        Dot-bracket string of the sample.
    node_features : numpy.ndarray
        Node feature matrix, indexed by row with the edge endpoints.
    cg_graph : dict
        Converted coarse-grained graph with ``'nodes'`` and ``'edges'``.
    node2idx : dict, optional
        Coarse-grained node name -> row index in ``node_features``. When
        omitted, the nodes are numbered from ``len(structure)`` in their
        iteration order, the same numbering ``add_bungle_nodes`` produces
        for a feature matrix with one row per structure position.

    Returns
    -------
    edge_index : numpy.ndarray
        Array of shape ``(2, n_edges)``.
    edge_features : numpy.ndarray
        One row per edge.
    """
    if node2idx is None:
        # Derive the mapping from this sample's own graph rather than reading
        # a notebook-global `node2idx`, which belongs to whichever sample was
        # processed last.
        node2idx = {
            name: index
            for index, name in enumerate(cg_graph['nodes'], start=len(structure))
        }
    edge_index_nuc, edge_features_nuc = seq2edges(structure, node_features)
    edge_index_bungle, edge_features_bungle = cg2edges(cg_graph, node2idx)
    edge_index = np.concatenate([edge_index_nuc, edge_index_bungle], axis=1)
    edge_from = node_features[edge_index[0,]]
    edge_to = node_features[edge_index[1,]]
    edge_features = block_diag(edge_features_nuc, edge_features_bungle)
    edge_features = np.concatenate([edge_features, edge_from, edge_to], axis=1)
    return edge_index, edge_features