# Notebook to extract sequences in IDT format for input geometries and intended radii

Note that this notebook looks up the precomputed sequences whose calibrated radii best fit the requested input radii. The predicted calibrated radius of each selected design is given in the name of its output file.

## Setting up environment

In [2]:
import ast
import csv
import os
import warnings

import pandas as pd

# Destination folder for the generated CSV files (read by create_csv_files_for_each_design)
output_folder = "./example_extracted_sequences"

## Prepare "get_sequences()" function

#### Read in unique staple datasets

In [3]:
# Locations of the precomputed unique-staple tables, one per design type
# NOTE(review): "seqeunces" is spelled this way in the path; presumably it matches
# the directory name on disk -- confirm before correcting it.
file_container = "../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/container_seq_unique_8T.csv"
file_tube = "../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/tube_seq_unique_8T.csv"

# Load each table and tag every row with the design type it was read from
df_container = pd.read_csv(file_container).assign(source='container')
df_tube = pd.read_csv(file_tube).assign(source='tube')

# Stack both tables into a single frame with a fresh running index
df_combined = pd.concat((df_container, df_tube), ignore_index=True)

# Function to parse range values in the dataframe
def parse_range(value):
    """
    Convert a radius entry to a single float.

    Accepts plain numbers (float, int, numpy scalars), numeric strings such as
    "79.0", and range strings such as "139.0 - 145.0", for which the mean of
    the two bounds is returned.

    Args:
    value: The raw cell value to convert.

    Returns:
    float or None: The numeric value (range midpoint for ranges), or None when
    the input is missing (None / NaN).

    Raises:
    ValueError: If a string is neither a number nor a "low - high" range.
    """
    if pd.isna(value):
        return None

    # Anything that is not text (int, float, numpy scalar, ...) is already a number.
    # Checking only for `float` would send integers into the string handling below.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()

    # Try the plain-number reading first so that values containing a minus sign
    # that is not a range separator ("-5", "1e-5") are not split apart.
    try:
        return float(text)
    except ValueError:
        if "-" not in text:
            raise

    # Range "low - high": average the first two bounds.
    low, high = text.split("-")[:2]
    return (float(low) + float(high)) / 2

# Derive a numeric "<name>_avg" column from each calibrated-radius column
averaged_columns = {
    'r_container_calibrated_avg': 'r_container_calibrated',
    'r_tube_calibrated_avg': 'r_tube_calibrated',
}
for averaged_name, raw_name in averaged_columns.items():
    df_combined[averaged_name] = df_combined[raw_name].apply(parse_range)

# Show two random rows as a quick sanity check of the result
df_combined.sample(2)

Unnamed: 0,r_naive,r_container_calibrated,circum_naive,number of monomers,Modified_Lines,source,r_tube_naive,r_tube_calibrated,r_container_calibrated_avg,r_tube_calibrated_avg
14,66,79.0,414.69,66.14,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,container,,,79.0,
36,115 - 120,139.0 - 145.0,738.275,209.66,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,container,,,142.0,


## Define extraction and formatting functions

In [4]:
def find_closest_rows(df, container=None, tube=None):
    """
    Find, for every requested radius, the row whose averaged calibrated radius
    is closest to it.

    Container targets are matched against "r_container_calibrated_avg" and tube
    targets against "r_tube_calibrated_avg". Rows with a missing value in the
    relevant column are never selected. The input dataframe is left untouched;
    the distance columns are computed on an internal copy.

    Args:
    df (DataFrame): The dataframe to search in.
    container (list, optional): Target radii to match in "r_container_calibrated_avg".
    tube (list, optional): Target radii to match in "r_tube_calibrated_avg".

    Returns:
    DataFrame: One row per requested radius (container targets first, then tube
    targets), in request order. The rows keep their original index labels and
    carry the helper columns "diff_container" / "diff_tube" that were present
    at the time the row was selected.

    Raises:
    ValueError: If a radius is requested for a column that holds no usable value.
    """
    # Work on a copy so the helper columns do not leak into the caller's frame;
    # otherwise the returned columns would depend on earlier calls.
    work = df.copy()

    requests = (
        ("r_container_calibrated_avg", "diff_container", () if container is None else container),
        ("r_tube_calibrated_avg", "diff_tube", () if tube is None else tube),
    )

    closest_rows = []
    for radius_column, diff_column, targets in requests:
        for number in targets:
            # to_numeric turns an all-None (object dtype) column into NaN floats
            # so that the subtraction below is always well defined.
            work[diff_column] = (pd.to_numeric(work[radius_column]) - number).abs()

            if work[diff_column].isna().all():
                raise ValueError(
                    f"No usable values in column '{radius_column}' to match radius {number}"
                )

            # idxmin ignores NaN, i.e. rows that belong to the other design type
            closest_rows.append(work.loc[work[diff_column].idxmin()])

    return pd.DataFrame(closest_rows)

    
def create_csv_files_for_each_design(df, folder=None):
    """
    Write one plate-layout CSV file per row of the dataframe.

    Every file starts with the header "Well Position, Name, Sequence" followed by
    one line per entry of the row's "Modified_Lines" dictionary. Entries are
    assigned to the wells A1, A2, ..., H12 in that order. The file name and the
    sequence names contain the design type and its averaged calibrated radius.

    Args:
    df (DataFrame): Rows to export. Needs the columns "source", "Modified_Lines"
        and, depending on the source, "r_container_calibrated_avg" or
        "r_tube_calibrated_avg".
    folder (str, optional): Folder the CSV files are written to. Defaults to the
        notebook-level `output_folder`. The folder is created if it is missing.

    Raises:
    ValueError: If a row's "source" is neither "container" nor "tube".
    """
    target_dir = output_folder if folder is None else folder
    os.makedirs(target_dir, exist_ok=True)

    # Column holding the averaged calibrated radius for each design type
    radius_columns = {
        "container": "r_container_calibrated_avg",
        "tube": "r_tube_calibrated_avg",
    }

    # Generate well names (A1, A2, ..., H12)
    wells = [f"{chr(letter)}{number}" for letter in range(65, 73) for number in range(1, 13)]

    # Read everything from the iterated row itself: looking values up by index
    # label returns a Series (not a scalar) when the same row was selected twice.
    for _, row in df.iterrows():
        design_type = row["source"]
        if design_type not in radius_columns:
            raise ValueError(f"Unknown design type in column 'source': {design_type!r}")
        mean_cal_radii = row[radius_columns[design_type]]

        # "Modified_Lines" is stored as the text of a dict literal. literal_eval
        # only accepts Python literals, so unlike eval() it cannot execute code
        # that may be embedded in the CSV file.
        modified_lines = row["Modified_Lines"]
        if isinstance(modified_lines, str):
            modified_lines = ast.literal_eval(modified_lines)

        if len(modified_lines) > len(wells):
            # Keep the one-plate-per-file layout, but do not drop data silently
            warnings.warn(
                f"{design_type} design (r_cal {mean_cal_radii} nm) has {len(modified_lines)} "
                f"sequences but only {len(wells)} wells are available; "
                f"the remaining sequences are not written.",
                stacklevel=2,
            )

        file_name = os.path.join(
            target_dir, f"{design_type}_closest_r_cal_{mean_cal_radii}_nm.csv"
        )
        name_prefix = f"{design_type}_r_cal_{mean_cal_radii}_nm_"

        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Well Position", "Name", "Sequence"])  # Write header

            # zip stops at the shorter input: either all sequences are written
            # or all wells are used
            for well, (key, sequence) in zip(wells, modified_lines.items()):
                writer.writerow([well, f"{name_prefix}{key}", sequence])

    return

def get_sequences(container=[], tube=[]):
    """
    Select the designs closest to the requested radii and write them to CSV files.

    The rows are looked up in the notebook-level `df_combined` via
    find_closest_rows and handed to create_csv_files_for_each_design.

    Args:
    container (list): Target radii passed on as the `container` argument of find_closest_rows.
    tube (list): Target radii passed on as the `tube` argument of find_closest_rows.
    """
    create_csv_files_for_each_design(
        find_closest_rows(df_combined, container=container, tube=tube)
    )

## Getting formatted sequences for wanted radii


In [6]:
#### Example use ####
# Use the following to get .csv files for the containers with calibrated radii closest to 60 and 300 nm
# and for the tubes with calibrated radii closest to 50 and 80 nm.
# The csv files are automatically saved at the path given when setting up the environment above
get_sequences(container=[60,300], tube=[50,80])
