# Generate RDATs for upload to Eterna

This notebook takes an input dataframe of sequences with predictions and scoring metrics and generates a collection of RDAT files to be uploaded to Eterna for score distribution.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
from openknotscore.utils import load_rdat

## Load Data

In [None]:
# Source RDAT: ask which file under data/rdats to read, then parse it.
# load_rdat hands back the parsed RDAT together with its construct name.
rdat_prompt = "Name of the source RDAT file in the data/rdats folder"
rdat_name = input(rdat_prompt)
rdat_path = f'../data/rdats/{rdat_name}'
rdat, constructName = load_rdat(rdat_path)

# Processed dataframe whose scores and predictions are written into the RDATs
processed_path = "../data/data_processed.pkl"
data = pd.read_pickle(processed_path)

# Job name: used below as the filename prefix for every RDAT this notebook saves
job_prompt = "Job Name: Where the subsets are stored in SCRATCH (the same as the job name specified in the split date notebook)"
job = input(job_prompt)

## Add OKS and best predictions to the RDAT

In [None]:
# Map each processed sequence to the position of its first row in the dataframe.
# Building this once replaces a full dataframe scan per RDAT sequence, and picking
# the first position keeps the lookup well-defined when a sequence is duplicated
# (squeeze() on several matching rows returns a DataFrame, which breaks the
# float formatting below).
firstRowBySequence = {}
for position, processedSeq in enumerate(data['sequence']):
    firstRowBySequence.setdefault(processedSeq, position)

# Walk the sequences present in the original RDAT and attach their scores
for sequence in rdat.constructs[constructName].data:
    seq = sequence.annotations["sequence"][0]
    annotationList = sequence.annotations.setdefault('Eterna', [])

    # There may be no processed data (OKS, predictions) associated with the sequence
    # if the sequence had low-quality or missing reactivity data, so we skip those rows
    position = firstRowBySequence.get(seq)
    if position is None:
        print("No processed data")
        continue
    row = data.iloc[position]

    # Add annotations with the processed data to the RDAT
    annotationList.append(f"score:openknot_score:{row['ensemble_OKS']:.6f}")
    annotationList.append(f"score:eterna_classic_score:{row['ensemble_ECS']:.6f}")
    annotationList.append(f"score:crossed_pair_quality_score:{row['ensemble_CPQ']:.6f}")
    annotationList.append(f"best_fit:tags:{','.join(row['ensemble_tags'])}")
    annotationList.append(f"best_fit:structures:{','.join(row['ensemble_structures'])}")
    annotationList.append(f"best_fit:eterna_classic_scores:{','.join([f'{v:.6f}' for v in row['ensemble_structures_ecs']])}")

    print(annotationList)

# Save the RDAT with updated annotations
rdat.save(f'../data/{job}_processed.rdat')

## Split into multiple RDATs for upload (optional)

In [None]:
import math
import os

# How many sequences to store in each output RDAT. Eterna has size limits on uploaded files,
# so size this as necessary to stay under 15MB per RDAT while limiting total rdats created.
outputSeq = 2000

inputFile = f"../data/{job}_processed.rdat"
outputDir = "../data/upload_rdats"

# Count the sequences in the RDAT itself rather than in the processed dataframe:
# the dataframe can omit sequences that had no usable reactivity data, so sizing
# the split from it could drop the trailing sequences of the RDAT (or emit empty files).
rdat, constructName = load_rdat(inputFile)
assert len(rdat.constructs) == 1
constructName = list(rdat.constructs.keys())[0]
totalRdats = math.ceil(len(rdat.constructs[constructName].data) / outputSeq)

# Make sure the destination folder exists before the first save
os.makedirs(outputDir, exist_ok=True)

for i in range(totalRdats):
    # Reset the RDAT object, since the previous iteration truncated it in place
    rdat, constructName = load_rdat(inputFile)
    assert len(rdat.constructs) == 1
    constructName = list(rdat.constructs.keys())[0]

    # Use a dedicated name here so the processed dataframe bound to `data`
    # is not overwritten and earlier cells remain re-runnable.
    rdatRows = rdat.constructs[constructName].data
    chunk = rdatRows[i*outputSeq:(i+1)*outputSeq]

    # Subset the RDAT data for saving, keeping values and errors aligned with the rows
    rdat.constructs[constructName].data = chunk
    rdat.values[constructName] = [row.values for row in chunk]
    rdat.errors[constructName] = [row.errors for row in chunk]

    rdat.save(f"{outputDir}/{job}-{i}.rdat")