In [1]:
import numpy as np

def parse_dbn(dbn_file):
    """
    Read a single sequence/structure record from a dot-bracket (DBN) file.

    Only the first three lines of the file are used: the first line is
    skipped (presumably a header/title line), the second is returned as the
    sequence and the third as the dot-bracket structure. Any further lines
    are ignored.

    Parameters:
    ----------
    dbn_file : str
        Path of the DBN file to read.

    Returns:
    -------
    (sequence, structure) : tuple of str
        The second and third lines of the file, with leading and trailing
        whitespace removed.

    Raises:
    ------
    ValueError
        If the file contains fewer than three lines.
    """
    # TODO: rewrite this method to handle multiple sequences and structures
    with open(dbn_file, 'r') as handle:
        rows = list(handle)

    # Check if the file is in the correct format
    if len(rows) < 3:
        raise ValueError('The DBN file does not contain enough lines to return sequence and structure')

    # Line 1 is skipped; line 2 is the sequence, line 3 the dot-bracket notation
    _, raw_sequence, raw_structure = rows[:3]
    return raw_sequence.strip(), raw_structure.strip()

def construct_secondary_structure_matrix(sequence, structure):
    """
    Constructs a secondary structure matrix for an RNA sequence based on its dot-bracket notation.

    The secondary structure matrix is an N x N matrix (where N is the length of the sequence) that
    represents base pairings in the secondary structure. A '1' in position (i, j) of the matrix
    indicates that the nucleotide at position i is paired with the nucleotide at position j,
    and the matrix is symmetric since RNA base pairings are bidirectional.

    Brackets are matched with a stack, so each ')' is paired with the most recent unmatched '('.
    The matching is lenient: a ')' with no open '(' before it is ignored, a '(' that is never
    closed produces no entry, and every character other than '(' and ')' is treated as unpaired.

    Parameters:
    ----------
    sequence : str
        The RNA sequence. Only its length is used, to determine the matrix size.
    structure : str
        Dot-bracket notation representing the RNA secondary structure. '(' represents a nucleotide
        that is base-paired with a later nucleotide, and ')' represents a nucleotide that is paired
        with a preceding nucleotide. Dots '.' represent unpaired nucleotides.

    Returns:
    -------
    matrix : np.ndarray
        An N x N integer matrix where N is the length of the sequence, with 1s indicating base
        pairings and 0s elsewhere.

    Raises:
    ------
    IndexError
        If `structure` is longer than `sequence` and a paired position falls outside the
        N x N matrix.

    Example:
    -------
    sequence = "GCAU"
    structure = "(..)"
    matrix = construct_secondary_structure_matrix(sequence, structure)
    # Positions 0 and 3 are paired, so matrix will be:
    # array([[0, 0, 0, 1],
    #        [0, 0, 0, 0],
    #        [0, 0, 0, 0],
    #        [1, 0, 0, 0]])
    """
    N = len(sequence)
    matrix = np.zeros((N, N), dtype=int)

    # Stack of indices of '(' characters still waiting for their closing partner
    open_positions = []

    for i, char in enumerate(structure):
        if char == '(':
            open_positions.append(i)
        elif char == ')' and open_positions:
            # Innermost open bracket pairs with this closing bracket
            j = open_positions.pop()
            matrix[i, j] = matrix[j, i] = 1  # Symmetric pairing
    return matrix

# Example driver: read one record from a DBN file, build its pairing matrix,
# and print it. The statements run in order; each depends on the previous one.
dbn_file = 'example.dbn'  # Replace with actual DBN file path
# parse_dbn returns the stripped 2nd and 3rd lines of the file as (sequence, structure)
sequence, structure = parse_dbn(dbn_file)
# N x N symmetric 0/1 matrix, where N = len(sequence)
matrix = construct_secondary_structure_matrix(sequence, structure)

# Display the pairing matrix
print(matrix)

[[0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0]]
