# Generate DataFrame from RDAT

This notebook reads the RDAT files containing reactivity data for a given sequence library,
extracts the relevant fields for each sequence, and stores them in a pandas DataFrame.

In [None]:
import os
import pandas as pd
import rdat_kit

# Trick to display dataframes with scrollbars: inject a CSS rule into the
# notebook that sets `display:flex` on `.jp-OutputArea-output` elements
from IPython.display import display, HTML
display(HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

In [None]:
# Build a single dataframe with one row per sequence, gathered from every
# .rdat file found in the RDAT directory.

# Change your construct name to match your RDAT
constructName = 'DasLabBigLib_OneMil_OpenKnot_Round_2_test'

rdatDir = '../data/rdats'

# Columns extracted from the RDAT, in the order they appear in the dataframe
columns = ["id","title","author","sequence","reads","signal_to_noise","snr","warning","reactivity","errors"]


def _annotation_value(entry):
    """Return the text after the first ':' of a 'label:value' annotation entry.

    Splitting only once keeps values that themselves contain ':' (for example
    a design title) intact. Raises IndexError if the entry contains no ':'.
    """
    return entry.split(":", 1)[1]


# Rows are accumulated across ALL files; creating this list inside the file
# loop would discard every file except the last one processed.
records = []

# Sorted so that the row order does not depend on the filesystem's listing order
for file in sorted(os.listdir(rdatDir)):
    if not file.endswith('.rdat'):
        continue

    rdat = rdat_kit.RDATFile()
    # Use a context manager so the file handle is closed once parsing is done
    with open(f'{rdatDir}/{file}', 'r') as handle:
        rdat.load(handle)

    # Raises KeyError if this file does not contain the construct named above
    rdatSequences = rdat.constructs[constructName].data

    # Loop through all sequences in the RDAT and extract the relevant data
    for sequence in rdatSequences:
        annotations = sequence.annotations

        # The 'Eterna' entries are read positionally: id, title, author
        eterna = annotations['Eterna']
        signal_to_noise = annotations['signal_to_noise'][0]

        records.append({
            "id": _annotation_value(eterna[0]),
            "title": _annotation_value(eterna[1]),
            "author": _annotation_value(eterna[2]),
            "sequence": annotations['sequence'][0],
            "reads": int(annotations['reads'][0]),
            # Full annotation text, kept alongside the numeric part in `snr`
            "signal_to_noise": signal_to_noise,
            "snr": float(_annotation_value(signal_to_noise)),
            # Sequences without a warning annotation get the placeholder '-'
            "warning": annotations.get('warning', ['-'])[0],
            # Reactivity data and errors
            "reactivity": sequence.values,
            "errors": sequence.errors,
        })

# Build the dataframe in one step; passing `columns` fixes the column order
# and still yields an (empty) dataframe when no sequences were found.
df = pd.DataFrame(records, columns=columns)

In [None]:
# Use the sequence ID as the index and sort the rows by ID in descending order
df = df.set_index('id').sort_values('id', ascending=False)

# Save the dataframe so that later steps can load it
# (the README explains the choice of format)

# df.to_csv('../data/data_rdatOnly.csv')
df.to_pickle('../data/data_rdatOnly.pkl')

In [None]:
# OPTIONAL
# Sequences with poor data quality carry a badQuality warning.
# Dropping them here is a way to save compute in later steps.

isBadQuality = df['warning'] == 'badQuality'
goodQuality = df.loc[~isBadQuality]

# goodQuality.to_csv('../data/data_highQuality.csv')
goodQuality.to_pickle('../data/data_highQuality.pkl')