# Generate Dataframe from RDAT

This notebook reads the RDAT files in `../data/rdats`, which contain reactivity data for a given sequence library,
extracts the relevant fields for each sequence, and stores them in a pandas DataFrame.

In [None]:
# Add the parent directory to the module search path; presumably this is what
# lets the openknotscore import below resolve — TODO confirm repository layout
import sys
sys.path.append('..')

import os
import pandas as pd
from openknotscore.utils import load_rdat, get_global_blank_out

# Trick to display dataframes with scrollbars: inject a CSS rule that sets
# display:flex on elements with the .jp-OutputArea-output class
from IPython.display import display, HTML
display(HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

In [None]:
# Extract per-sequence data from every .rdat file in ../data/rdats into one dataframe

# Columns extracted from the RDAT, plus the per-file blank-out values
columns = ["id","title","author","sequence","reads","signal_to_noise","snr","warning","reactivity","errors","blank_out5","blank_out3"]

# Start from an empty dataframe so `df` is still defined (with the right
# columns) when no sequences are found
df = pd.DataFrame(columns=columns)

# Prefix of each 'Eterna' annotation mapped to the field it fills
eternaPrefixes = {'id:': 'id', 'author:': 'author', 'design_name:': 'title'}

# Rows are collected across ALL files; creating this list inside the file loop
# would discard every file except the last one processed
seqList = []

for file in os.listdir('../data/rdats'):
    if not file.endswith('.rdat'):
        continue
    rdat, constructName = load_rdat(f'../data/rdats/{file}')
    construct = rdat.constructs[constructName]

    BLANK_OUT5, BLANK_OUT3 = get_global_blank_out(construct)

    # Loop through all sequences in the RDAT, extract relevant data, and add to the dataframe
    for sequence in construct.data:
        annotations = sequence.annotations

        # Grab annotations
        eterna = {'id': None, 'author': None, 'title': None}
        for annot in annotations.get('Eterna', []):
            for prefix, field in eternaPrefixes.items():
                if annot.startswith(prefix):
                    # Slice the prefix off. str.lstrip(prefix) would strip any
                    # leading characters found in the prefix, which also eats
                    # the start of the value (e.g. 'author:rhiju' -> 'iju')
                    eterna[field] = annot[len(prefix):]

        # `seq_id` rather than `id`, so the builtin is not shadowed for the
        # rest of the notebook
        seq_id = eterna['id']
        author = eterna['author']
        title = eterna['title']

        # Sequences without Eterna annotations fall back to the 'name'
        # annotation for the title and get a synthetic ID derived from it
        if title is None:
            title = annotations.get('name')[0]
        if seq_id is None:
            seq_id = f'EXTERNAL:{title}'

        seq = annotations.get('sequence')[0]
        reads = None
        if 'reads' in annotations:
            reads = int(annotations.get('reads')[0])
        signal_to_noise = annotations.get('signal_to_noise')[0]
        # The numeric value is the part after the first ':' of the annotation
        snr = float(signal_to_noise.split(":")[1])
        # When the annotation is missing, the '-' default indexes to '-'
        warning = annotations.get('warning', '-')[0]

        # Get reactivity data and errors
        reactivity = sequence.values
        errors = sequence.errors

        # Create a single-row dataframe for this sequence; title and author are
        # included so the extracted values are not silently dropped
        row = pd.DataFrame(
            data = [[seq_id,title,author,seq,reads,signal_to_noise,snr,warning,reactivity,errors,BLANK_OUT5,BLANK_OUT3]],
            columns=columns
        )

        # Queue the row for the final concatenation
        seqList.append(row)

# pd.concat raises on an empty list, so keep the empty dataframe in that case
if seqList:
    df = pd.concat(seqList, ignore_index=True)

In [None]:
# Use the ID as the index and order the rows by ID, descending
df = df.set_index('id').sort_values('id', ascending=False)

# Save the dataframe so that later steps can load it
# The README covers why this format was chosen

# df.to_csv('../data/data_rdatOnly.csv')
df.to_pickle('../data/data_rdatOnly.pkl')

In [None]:
# OPTIONAL
# Sequences with poor data quality carry a badQuality warning
# Dropping them here is a way to save compute in later steps

goodQuality = df[df['warning'].ne('badQuality')]

# goodQuality.to_csv('../data/data_highQuality.csv')
goodQuality.to_pickle('../data/data_highQuality.pkl')