# Computation of connector strand sets for an assembly type with a given curvature

## Setting up the Python environment

In [1]:
#### Imports ####
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from scipy.optimize import minimize

#### Timestamp of this run, formatted as YYMMDD_HHMM ####
now = datetime.now()
formatted_time = f"{now:%y%m%d_%H%M}"

## Defining model parameters

In [2]:
#### Monomer dimensions ####
from numpy import intersect1d


# Monomer geometry (nm, nm, nm**2)
BOTTOM_DIAMETER = 28.5
DIPID_HEIGHT = 18
BOTTOM_AREA = np.pi * (BOTTOM_DIAMETER / 2) ** 2

#### Physical parameters ####
# Length per nucleotide of single-stranded DNA (nm)
# https://www.cell.com/fulltext/S0006-3495(04)74308-8
SS_NT_LENGTH = 0.63
# Length per nucleotide of double-stranded DNA (nm)
DS_NT_LENGTH = 0.34
# ssDNA length scale (nm) used as Lp in the wlc() formula
# https://www.cell.com/fulltext/S0006-3495(04)74308-8
ssDNA_Lp = 1.5
# MgCl2 concentration (M)
MGCL2_CONC = 0.018

#### Parameters used for the linker database population ####
# Number of T nucleotides in the fixed spacer (nt)
NT_T_SPACER = 8
# Shortest hybridising sequence (nt), specifically GCGC
NT_OF_SHORTEST_HYBR_SEQ = 4
# Naive radii to tabulate (nm): 30, 31, ..., 499
CONSIDERED_RADII_RANGE = np.arange(30, 500)

#### Setting the assembly type ####
# Either "container" or "tube"; used only for plotting purposes
design_type = "tube"

#### Linear calibration curves: cal(x) = slope * x + intercept ####
container_cal_params = dict(slope=1.3200143558251145, intercept=-6.705205195244417)
tube_cal_params = dict(slope=1.4592586901710962, intercept=-39.655924854948424)

# Naive tube radius (nm) at which the tube calibration line intersects y = x
tube_cal_threshold = 86.34768

def container_calibration(r_naive):
    """Apply the linear container calibration to a naive radius.

    :param r_naive: naive radius to calibrate.
    :return: slope * r_naive + intercept, with both coefficients taken
             from ``container_cal_params``.
    """
    slope = container_cal_params["slope"]
    intercept = container_cal_params["intercept"]
    return slope * r_naive + intercept

def tube_calibration(r_naive):
    """Apply the piecewise tube calibration to a naive tube radius.

    Below ``tube_cal_threshold`` the radius is returned unchanged; at or above
    it the linear fit from ``tube_cal_params`` is applied.

    :param r_naive: naive tube radius (scalar).
    :return: the calibrated radius. A NaN input yields NaN.
    """
    if r_naive < tube_cal_threshold:
        return r_naive
    # Plain fall-through instead of a second `>=` comparison: NaN fails both
    # comparisons, so the former if/elif silently returned None for it.
    return tube_cal_params["slope"] * r_naive + tube_cal_params["intercept"]


## Creating dna_connector sequences and calculating their lengths

In [3]:
#### Defining the adapted WLC function used to calculate linker lengths ####
def wlc(_T_spacer_, _T_cone_spacer_, _sticky_strand_, persistence_length=None):
    """Estimate the length of a connector from its three segment lengths.

    The two single-stranded spacers are summed to a contour length ``L`` and
    converted with ``sqrt(2 * Lp**2 * (L/Lp - 1 + exp(-L/Lp)))``; half of the
    sticky-strand length is then added.

    :param _T_spacer_: contour length of the fixed T spacer.
    :param _T_cone_spacer_: contour length of the variable T spacer.
    :param _sticky_strand_: length of the sticky strand; half of it is added.
    :param persistence_length: value used for ``Lp``. Defaults to the
                               module-level ``ssDNA_Lp`` when omitted.
    :return: the spacer length plus ``0.5 * _sticky_strand_``; only the
             sticky-strand half when ``L`` is zero.
    """
    Lp = ssDNA_Lp if persistence_length is None else persistence_length
    sticky_half = 0.5 * _sticky_strand_

    L = _T_spacer_ + _T_cone_spacer_
    if L == 0:
        return 0 + sticky_half
    try:
        return np.sqrt(2 * Lp**2  * (L/Lp-1 + np.exp(-L/Lp))) + sticky_half
    except ArithmeticError:
        # Best-effort fallback kept for numeric failures only (division by a
        # zero Lp, overflow, numpy floating-point errors when configured to
        # raise). The former bare `except:` also hid programming errors and
        # KeyboardInterrupt.
        return 0 + sticky_half

# Nucleotide counts to enumerate: 0-59 nt of variable T spacer and
# sticky strands of 4, 6 and 8 nt
nt_T_cone_spacers = np.arange(60)
nt_sticky_strands = np.arange(4, 9, 2)

# Matching contour lengths (nm) for each nucleotide count
T_spacer = NT_T_SPACER * SS_NT_LENGTH
T_cone_spacers = nt_T_cone_spacers * SS_NT_LENGTH
sticky_strands = nt_sticky_strands * DS_NT_LENGTH

# One record per (variable spacer, sticky strand) combination
rows = [
    {
        "nt_T_spacer": NT_T_SPACER,
        "nt_T_cone_spacer": nt_cone,
        "nt_sticky_strand": nt_sticky,
        "T_spacer": T_spacer,
        "T_cone_spacer": cone_length,
        "sticky_strand": sticky_length,
    }
    for nt_cone, cone_length in zip(nt_T_cone_spacers, T_cone_spacers)
    for nt_sticky, sticky_length in zip(nt_sticky_strands, sticky_strands)
]

# Build the table in a single step from the collected records
df = pd.DataFrame(rows)

# WLC length of every combination
df["wlc_length [nm]"] = [
    wlc(spacer, cone, sticky)
    for spacer, cone, sticky in zip(df["T_spacer"], df["T_cone_spacer"], df["sticky_strand"])
]

# Placeholder sequence: T's for both spacers followed by X's for the sticky strand
df["raw_sequence"] = [
    ('T' * int(n_spacer)) + ('T' * int(n_cone)) + ('X' * int(n_sticky))
    for n_spacer, n_cone, n_sticky in zip(df["nt_T_spacer"], df["nt_T_cone_spacer"], df["nt_sticky_strand"])
]

# Persist the table as CSV
df.to_csv(f"../data/computed_dna_spacer_lengths/sequence_lengths_{NT_T_SPACER}T.csv", index=False)

# Show three randomly sampled rows of the table
print(df.sample(3))



     nt_T_spacer  nt_T_cone_spacer  nt_sticky_strand  T_spacer  T_cone_spacer  \
171            8                57                 4      5.04          35.91   
87             8                29                 4      5.04          18.27   
144            8                48                 4      5.04          30.24   

     sticky_strand  wlc_length [nm]  \
171           1.36        11.558879   
87            1.36         8.768881   
144           1.36        10.746777   

                                          raw_sequence  
171  TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...  
87           TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTXXXX  
144  TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...  


## Creating a second dataframe, containing the required linker lengths for a wide range of curvatures

In [4]:
#### The shortest connector (fixed T spacer, no variable spacer, shortest sticky
#### strand) sets the minimal length for strands to be considered.
MIN_STAPLE_LENGTH = wlc(
    NT_T_SPACER * SS_NT_LENGTH,
    0,
    NT_OF_SHORTEST_HYBR_SEQ * DS_NT_LENGTH,
)
# Bottom diameter (nm) enlarged by the minimal connector length; depends on
# NT_T_SPACER and the shortest sticky strand, which has no variable spacer.
BOTTOM_DIAMETER_CONE = MIN_STAPLE_LENGTH + BOTTOM_DIAMETER

def aspect_ratio(r_naive):
    """Return the ratio ``(r_naive + DIPID_HEIGHT) / r_naive``.

    :param r_naive: naive radius (scalar or numpy array).
    :return: the ratio, with the same shape as ``r_naive``.
    """
    outer_radius = r_naive + DIPID_HEIGHT
    return outer_radius / r_naive

def upper_width(aspect_ratio):
    """Scale ``BOTTOM_DIAMETER_CONE`` by the given aspect ratio.

    :param aspect_ratio: ratio (scalar or array) as produced by ``aspect_ratio()``.
    :return: ``BOTTOM_DIAMETER_CONE * aspect_ratio``.
    """
    scaled_width = BOTTOM_DIAMETER_CONE * aspect_ratio
    return scaled_width

def calc_monomer_number(r_sphere, r_monomer):
    """Return the surface area of a sphere divided by the area of one disc.

    :param r_sphere: sphere radius (scalar or numpy array).
    :param r_monomer: radius of the disc representing one monomer.
    :return: ``4*pi*r_sphere**2 / (pi*r_monomer**2)``.
    """
    sphere_surface = 4 * np.pi * r_sphere ** 2
    monomer_footprint = np.pi * r_monomer ** 2
    return sphere_surface / monomer_footprint
    
# Radii to evaluate
r_naive = CONSIDERED_RADII_RANGE

# Circumference belonging to each radius
c = 2 * np.pi * r_naive

# Derived geometry for each radius
aspect_ratios = aspect_ratio(r_naive)
upper_widths = upper_width(aspect_ratios)
number_of_monomers = calc_monomer_number(r_naive, BOTTOM_DIAMETER_CONE / 2)

# Collect everything in one table, rounded to two decimal places
geometric_constraints = pd.DataFrame(
    {
        'r_naive': r_naive,
        'circum_naive': c,
        'number of monomers': number_of_monomers,
        'Aspect Ratio': aspect_ratios,
        'Upper Width': upper_widths,
    }
).round(2)

# Radius increase: half the difference between the (rounded) upper width and the bottom diameter
geometric_constraints['increase radius'] = (geometric_constraints['Upper Width'] - BOTTOM_DIAMETER_CONE) / 2

# Five equally spaced target lengths from the minimal staple length up to
# minimal length + radius increase, rounded to two decimals
geometric_constraints['arm_lengths'] = geometric_constraints['increase radius'].apply(
    lambda increase: (np.linspace(MIN_STAPLE_LENGTH, MIN_STAPLE_LENGTH + increase, 5)).round(2).tolist()
)

## Fitting sequences with pre-computed lengths to the geometrically required lengths

In [5]:
#### Match the geometric target lengths against the pre-computed connector lengths

# Work on a copy so that geometric_constraints keeps only its own columns
matched_df = geometric_constraints.copy()
# Sticky-strand length (nt) used at each of the five arm positions, as
# determined by the sequences.
# TODO: turn into a named constant or derive it from the dataframe; the values
# are specific to the tube and vesicle designs.
nt_sticky_strand_values = [4,6,6,8,8]

# Defining the function to find the best matching sequence
def find_closest_index(arm_lengths, target_index, sticky_strand_value, lengths_df=None):
    """Find the pre-computed connector whose WLC length is closest to a target.

    :param arm_lengths: sequence of target lengths.
    :param target_index: position in ``arm_lengths`` of the target to match.
    :param sticky_strand_value: only rows whose ``nt_sticky_strand`` equals
                                this value are considered.
    :param lengths_df: table with ``nt_sticky_strand`` and ``wlc_length [nm]``
                       columns. Defaults to the module-level ``df``.
    :return: tuple ``(index_label, residual_squared)`` where ``index_label`` is
             the label of the closest row (the first one on ties) and
             ``residual_squared`` is the squared difference to the target.
    :raises IndexError: if no row has the requested sticky-strand length.
    """
    table = df if lengths_df is None else lengths_df

    # Candidate lengths restricted to the requested sticky-strand length
    candidates = table.loc[table['nt_sticky_strand'] == sticky_strand_value, 'wlc_length [nm]']
    if candidates.empty:
        raise IndexError(f"no connector with nt_sticky_strand == {sticky_strand_value}")

    # The minimiser of (target - x)**2 is the target itself, so the nearest
    # tabulated length is looked up directly, without a numerical optimiser.
    target = arm_lengths[target_index]
    closest_label = (candidates - target).abs().idxmin()

    residual_squared = (target - candidates.loc[closest_label]) ** 2
    return closest_label, residual_squared



fitting_sequences = []  # per design: matched WLC lengths
set_raw_sequences = []  # per design: matched placeholder sequences
set_rss = []            # per design: squared residuals of the matches

# For every design, match each target arm length to its closest connector
for arm_lengths in matched_df['arm_lengths']:
    design_lengths, design_sequences, design_rss = [], [], []
    for position in range(len(arm_lengths)):
        label, residual_squared = find_closest_index(
            arm_lengths, position, nt_sticky_strand_values[position]
        )
        # `label` is an index label of df, so the matched row is read from df directly
        design_lengths.append(df.loc[label, 'wlc_length [nm]'])
        design_sequences.append(df.loc[label, 'raw_sequence'])
        design_rss.append(residual_squared)
    fitting_sequences.append(design_lengths)
    set_raw_sequences.append(design_sequences)
    set_rss.append(design_rss)

# Store the matches; the fitted lengths are kept rounded to two digits
matched_df['fitting_sequences'] = [
    [round(value, 2) for value in lengths] for lengths in fitting_sequences
]
matched_df['set_rss'] = set_rss
matched_df['set_raw_sequences'] = set_raw_sequences

matched_df.sample(3)

Unnamed: 0,r_naive,circum_naive,number of monomers,Aspect Ratio,Upper Width,increase radius,arm_lengths,fitting_sequences,set_rss,set_raw_sequences
214,244,1533.1,903.92,1.07,34.86,1.198636,"[3.96, 4.26, 4.56, 4.86, 5.16]","[3.96, 4.3, 4.57, 4.91, 5.16]","[7.446591256961168e-06, 0.0018257541358919769,...","[TTTTTTTTXXXX, TTTTTTTTXXXXXX, TTTTTTTTTXXXXXX..."
338,368,2312.21,2056.11,1.05,34.05,0.793636,"[3.96, 4.16, 4.36, 4.56, 4.76]","[3.96, 4.3, 4.3, 4.64, 4.64]","[7.446591256961168e-06, 0.020371522997479408, ...","[TTTTTTTTXXXX, TTTTTTTTXXXXXX, TTTTTTTTXXXXXX,..."
14,44,276.46,29.39,1.41,45.74,6.638636,"[3.96, 5.62, 7.28, 8.94, 10.6]","[3.96, 5.71, 7.26, 8.97, 10.65]","[7.446591256961168e-06, 0.007654762034055714, ...","[TTTTTTTTXXXX, TTTTTTTTTTTTTTXXXXXX, TTTTTTTTT..."


## Computing a collection of fitting quality measures

In [6]:
# Mean squared deviation (MSD) between target and fitted lengths
def compute_msd(arm_lengths, fitting_sequences):
    """Return the mean of the squared pairwise differences of two sequences.

    Pairs are formed with ``zip``, so surplus elements of the longer
    sequence are ignored.
    """
    squared_deviations = []
    for target, fitted in zip(arm_lengths, fitting_sequences):
        squared_deviations.append((target - fitted)**2)
    return np.mean(squared_deviations)

# Pairwise mismatch in percent, rounded to two digits
def compute_mismatch(arm_lengths, fitting_sequences):
    """Return, per pair, ``100 * |x - y| / max(x, y)`` rounded to two digits.

    Pairs are formed with ``zip``; the result is a list with one percentage
    per pair.
    """
    mismatches = []
    for target, fitted in zip(arm_lengths, fitting_sequences):
        relative_gap = 100 * abs(target - fitted) / max(target, fitted)
        mismatches.append(round(relative_gap, 2))
    return mismatches

# Mean squared deviation of every design
matched_df['msd'] = pd.Series(
    [compute_msd(targets, fitted) for targets, fitted in zip(matched_df['arm_lengths'], matched_df['fitting_sequences'])],
    index=matched_df.index,
)
# Pairwise mismatch (one list of percentages per design)
matched_df['pairwise mismatch [%]'] = pd.Series(
    [compute_mismatch(targets, fitted) for targets, fitted in zip(matched_df['arm_lengths'], matched_df['fitting_sequences'])],
    index=matched_df.index,
)
# Sum of each design's pairwise mismatches
matched_df['sum pairwise mismatch'] = matched_df['pairwise mismatch [%]'].map(sum)

def compute_sum_differences(arm_lengths, fitting_sequences):
    """Return the absolute value of the summed signed pairwise differences.

    Differences of opposite sign cancel before the absolute value is taken.
    """
    signed_total = sum(target - fitted for target, fitted in zip(arm_lengths, fitting_sequences))
    return abs(signed_total)

# Absolute summed difference per design, rounded to two digits
matched_df['sum_differences'] = pd.Series(
    [compute_sum_differences(targets, fitted) for targets, fitted in zip(matched_df['arm_lengths'], matched_df['fitting_sequences'])],
    index=matched_df.index,
).round(2)

matched_df.sample(3)

Unnamed: 0,r_naive,circum_naive,number of monomers,Aspect Ratio,Upper Width,increase radius,arm_lengths,fitting_sequences,set_rss,set_raw_sequences,msd,pairwise mismatch [%],sum pairwise mismatch,sum_differences
363,393,2469.29,2344.96,1.05,33.95,0.743636,"[3.96, 4.15, 4.33, 4.52, 4.71]","[3.96, 4.3, 4.3, 4.64, 4.64]","[7.446591256961168e-06, 0.02332609988363809, 0...","[TTTTTTTTXXXX, TTTTTTTTXXXXXX, TTTTTTTTXXXXXX,...",0.00854,"[0.0, 3.49, 0.69, 2.59, 1.49]",8.26,0.17
144,174,1093.27,459.67,1.1,35.82,1.678636,"[3.96, 4.38, 4.8, 5.22, 5.64]","[3.96, 4.3, 4.82, 5.16, 5.63]","[7.446591256961168e-06, 0.005970831501986954, ...","[TTTTTTTTXXXX, TTTTTTTTXXXXXX, TTTTTTTTTTXXXXX...",0.0021,"[0.0, 1.83, 0.41, 1.15, 0.18]",3.57,0.13
51,81,508.94,99.61,1.22,39.68,3.608636,"[3.96, 4.86, 5.77, 6.67, 7.57]","[3.96, 4.82, 5.71, 6.62, 7.6]","[7.446591256961168e-06, 0.0013239349243987877,...","[TTTTTTTTXXXX, TTTTTTTTTTXXXXXX, TTTTTTTTTTTTT...",0.00172,"[0.0, 0.82, 1.04, 0.75, 0.39]",3.0,0.12


## Filtering the sequence sets for unique sets

### Defining the staple sequences and shared functions

In [7]:
# Base staple sequences, keyed by staple position/name as used in the scadnano file.
# Keys have the form "<group>.<number>" with groups 2, 4, 6, 8, 10 and numbers 1-6.
# The append_computed_*_link_seq functions work on copies of this dict and
# append a connector sequence to the selected entries.
template_lines = {
        "2.1":"ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGAATCCCTCC",
        "2.2":"GCTGCGGCGAGCGGAAGCATATTATCCGCTCACAATGCGATTATT",
        "2.3":"ACACGACTTATCGCTGTTCCGGGCGTTTCCCCCTGGGGTAATACG",
        "2.4":"AGCGATCTGTCTAAAGATCCTGCAGCAGATTACGCGTAACAGGAG",
        "2.5":"GCATAATTCTCTTGCAACGTTAGCTAGAGTAAGTAGCTGACTCTC",
        "2.6":"GCGCACATTTCCCCATCTTTTGATGTAACCCACTCGATGCTTTCT",
        "4.1":"TCATAGCTGTTTCTAACGCCAGGGCGATCGGTGCGACGTGGCGGG",
        "4.2":"AACCCGACAGGACCCACAGAGCTCACTGACTCGCTAAGCCTGGGG",
        "4.3":"GGTAGCGGTGGTTGAGCGAGTAACTATCGTCTTGACGCTTACCGA",
        "4.4":"ATCCAGTCTATTATGTAGATCCAATGCTTAATCAGTTTCTACGCT",
        "4.5":"ATCTTACCGCTGTCTGGTGAAGTGTTATCACTCATTTGCTACACC",
        "4.6":"TCGAGGTGCCGTACAAAAGATATTTAGAAAAATAACCAGCGTTGG",
        "6.1":"CCATTCAGGCTGCAAGGGAAGGAACCATCACCCTAAGAGATAGGT",
        "6.2":"CGTATTGGGCGCTAATGAGTGCTGGAGAATTCGCGCTCCCAGTCG",
        "6.3":"CAGCCCGACCGCTTGTCCGCCAAAAATCGACGCTCAGATAACGTG",
        "6.4":"TATGAGTAAACTTACGCTCAGAGCTCTTGATCCGGCGGCGGTGTT",
        "6.5":"ATCGTTGTCAGAATGGTGTCAAGAAGTGGTCCTGCAATACGGGTA",
        "6.6":"CTCATGAGCGGATGAGCAAAAAAACGTTCTTCGGGGACCAAGTCG",
        "8.1":"CGACGGCCAGTGATTGTAAAAATGCGCCGCTACAGAAGGAGCGCG",
        "8.2":"GGCTCCGCCCCCCGAACATGATCGGCCAACGCGCGTCACATTAAA",
        "8.3":"CAGTTACCTTCGGGTTCTTGAAGCTGGGCTGTGTGCTTCGGGATA",
        "8.4":"ACCAGCCAGCCGGACCATCTAAATGAAGTTTTAAAAAAACTCAGC",
        "8.5":"GAACTTTAAAAGTAGAATAGAAAAAAGCGGTTAGCCGTTTGGTAA",
        "8.6":"TCTATCAGGGCGATGTTGTTTATTATTGAAGCATTGGCAAAATCA",
        "10.1":"GCGTAACCACCACGGGCGCTGTGGACTCCAACGTCAGGAACAATT",
        "10.2":"ACCTGTCGTGCCATGCGCTCAATTCGCGCTTGGCGTCCAGTGAGC",
        "10.3":"TCAGTTCGGTGTACGCTTTCTAAAAGGCCGCGTTGCAAGGCCAAA",
        "10.4":"TCTTCACCTAGATGGATTTTGCAGTATTTGGTATCTGGCCTAATC",
        "10.5":"GAGTTACATGATCCATTCAGCGCTCACCGGCTCCAGGTGCTGCGA",
        "10.6":"GAATACTCATACTAAAAGGGACAATACGGGATAATAGGCGACCGC",
}

# (placeholder, replacement) pairs for the container design, one per arm
# position. replace_substrings() swaps the X placeholder of the sequence at the
# same position for the replacement; each replacement starts with a space,
# which separates it from the preceding T spacer.
container_sticky_seq = [
    ("XXXX", " GCGC"),
    ("XXXXXX", " TAGCTA"),
    ("XXXXXX", " GCATGC"),
    ("XXXXXXXX", " ATGATCAT"),
    ("XXXXXXXX", " ATTATAAT")
]

# (placeholder, replacement) pairs for the tube design, one per arm position.
# Positions 1 and 3 carry Y dummies because append_computed_tube_link_seq()
# only reads positions 0, 2 and 4.
tube_sticky_seq = [
    ("XXXX", " GCGC"),
    ("XXXXXX", " YYYYYY"), # not used in design
    ("XXXXXX", " TAGCTA"), # differs for each face, corrected later
    ("XXXXXXXX", " YYYYYYYY"), # not used in design
    ("XXXXXXXX", " ATGATCAT") # differs for each face, corrected later
]


def append_string_to_values(data, append_info):
    """
    Extend selected dictionary values in place with a suffix.

    :param data: The dictionary to modify; it is changed in place.
    :param append_info: A ``(suffix, keys)`` tuple: the string to append and
                        the keys whose values receive it. Keys that are not
                        present in ``data`` are skipped.
    """
    suffix, target_keys = append_info

    for target_key in target_keys:
        if target_key not in data:
            continue
        data[target_key] += suffix


## Visualizing fitted data (can take some time for many designs)

In [8]:
# # Number of rows in the DataFrame
# num_rows = filtered_df.shape[0]

# # Create subplots
# fig, axes = plt.subplots(num_rows, 1, figsize=(8, 6*num_rows), sharex=True)

# # Convert axes to list if num_rows is 1
# if num_rows == 1:
#     axes = [axes]

# # Loop through each row and plot the points
# for i, (arm, fit, r_naive, circum_naive, pm, spm, number_of_monomers) in enumerate(zip(filtered_df["arm_lengths"], filtered_df["fitting_sequences"], 
#                                                        filtered_df["r_naive"], filtered_df["circum_naive"], 
#                                                        filtered_df["pairwise mismatch [%]"], filtered_df["sum pairwise mismatch"],
#                                                        filtered_df["number of monomers"])):

#     # Create an array of 5 equidistant points between 0 and 18
#     y_values = np.linspace(0, DIPID_HEIGHT, 5)
    
#     # Plot arm_lengths points in black with connecting lines
#     axes[i].scatter(arm, y_values, color='black', label='arm_lengths')
    
#     # Shade the area to the LEFT of the arm_lengths line
#     axes[i].fill_betweenx(y_values, 0, arm, color='black', alpha=0.3)
    
#     # Plot fitting_sequences points in blue with connecting lines
#     axes[i].scatter(fit, y_values, color='blue', label='fitting_sequences')
    
#     # Shade the area to the LEFT of the fitting_sequences line
#     axes[i].fill_betweenx(y_values, 0, fit, color='blue', alpha=0.3)
    
#     axes[i].set_title(f"Radius: {r_naive} (nm)")
#     axes[i].set_xlabel("increased dipid radius (nm)")
#     axes[i].set_ylabel("arbitrary position")
#     axes[i].legend()
    
#     # Adding the information box
#     info_text = f"Circumference: {np.round(circum_naive/1000,2)}µm\nNumber of monomers: {np.round(number_of_monomers,1)}\nPairwise mismatch : {np.round(pm,1)}%\nSum pairwise mismatches: {np.round(spm,1)}%"
#     axes[i].text(0.5, 0.05, info_text, transform=axes[i].transAxes, fontsize=10,
#                  verticalalignment='bottom', bbox=dict(boxstyle='round,pad=0.5', facecolor='white', edgecolor='blue'))


# # Form the desired filename
# fig.savefig(f"../data/computed_dna_spacer_lengths/{design_type}_{NT_T_SPACER}T_fit_plot.jpg")
# plt.tight_layout()
# plt.show()


## Based on the staple sets for a given naive curvature, we now create the complete staple sequences

### Inserting the used sticky_sequences

In [9]:
# Independent copies of the matched table, one per assembly type
# (DataFrame.copy() is a deep copy by default)
container_df = matched_df.copy()
tube_df = matched_df.copy()

## substitute dummy sticky seq with assembly specific sequences

def replace_substrings(row_sequences, replacement_sequences):
    """Apply the position-matched ``(old, new)`` replacement to each sequence.

    The sequence at position ``i`` gets ``str.replace`` called with the pair
    ``replacement_sequences[i]``. A new list is returned; the input list is
    not modified.
    """
    return [
        sequence.replace(*replacement_sequences[position])
        for position, sequence in enumerate(row_sequences)
    ]

# Swap the X placeholders for the assembly-specific sticky sequences.
# A stray mid-cell `tube_df.sample(3)` was removed here: its result was
# discarded, so it displayed nothing.
container_df['set_raw_sequences'] = container_df['set_raw_sequences'].apply(lambda x: replace_substrings(x, container_sticky_seq))
tube_df['set_raw_sequences'] = tube_df['set_raw_sequences'].apply(lambda x: replace_substrings(x, tube_sticky_seq))

tube_df.sample(3)

Unnamed: 0,r_naive,circum_naive,number of monomers,Aspect Ratio,Upper Width,increase radius,arm_lengths,fitting_sequences,set_rss,set_raw_sequences,msd,pairwise mismatch [%],sum pairwise mismatch,sum_differences
202,232,1457.7,817.2,1.08,34.98,1.258636,"[3.96, 4.28, 4.59, 4.91, 5.22]","[3.96, 4.3, 4.57, 4.91, 5.16]","[7.446591256961168e-06, 0.0005166003635744493,...","[TTTTTTTT GCGC, TTTTTTTT YYYYYY, TTTTTTTTT TAG...",0.00088,"[0.0, 0.47, 0.44, 0.0, 1.15]",2.06,0.06
281,311,1954.07,1468.49,1.06,34.34,0.938636,"[3.96, 4.2, 4.43, 4.67, 4.9]","[3.96, 4.3, 4.3, 4.64, 4.91]","[7.446591256961168e-06, 0.010553215452844416, ...","[TTTTTTTT GCGC, TTTTTTTT YYYYYY, TTTTTTTT TAGC...",0.00558,"[0.0, 2.33, 2.93, 0.64, 0.2]",6.1,0.05
453,483,3034.78,3541.96,1.04,33.67,0.603636,"[3.96, 4.11, 4.26, 4.42, 4.57]","[3.96, 4.3, 4.3, 4.64, 4.64]","[7.446591256961168e-06, 0.03714440742827307, 0...","[TTTTTTTT GCGC, TTTTTTTT YYYYYY, TTTTTTTT TAGC...",0.0182,"[0.0, 4.42, 0.93, 4.74, 1.51]",11.6,0.52


### Appending the computed linker sequences to the right staples

#### container

In [10]:
def append_computed_container_link_seq(sequences, template_lines):
    """Build the container staple set for one set of connector sequences.

    Works on a copy of ``template_lines``. The connector at position ``i`` of
    ``sequences`` is appended, preceded by a space, to all six staples of
    group 2, 4, 6, 8 and 10 respectively.

    :param sequences: the five connector sequences of one design.
    :param template_lines: dict of base staple sequences; left unmodified.
    :return: a new dict with the extended staple sequences.
    """
    container_lines = template_lines.copy()
    for position, group in enumerate((2, 4, 6, 8, 10)):
        staple_keys = [f"{group}.{number}" for number in range(1, 7)]
        append_string_to_values(container_lines, (" " + sequences[position], staple_keys))
    return container_lines

# Build the full container staple set of every design
container_df['Modified_Lines'] = container_df['set_raw_sequences'].apply(
    append_computed_container_link_seq, args=(template_lines,)
)

#### tubes

In [11]:
def append_computed_tube_link_seq(sequences, template_lines):
    """Build the tube staple set for one set of connector sequences.

    Works on a copy of ``template_lines``. Only the connectors at positions
    0, 2 and 4 of ``sequences`` are used. For groups 6 and 10 a second variant
    is derived by replacing the trailing 6 or 8 characters with a different
    sticky sequence. Staples x.1 and x.4 of groups 2, 6 and 10 (the planar
    faces) receive fixed connectors.

    :param sequences: the five connector sequences of one design.
    :param template_lines: dict of base staple sequences; left unmodified.
    :return: a new dict with the extended staple sequences.
    """
    tube_lines = template_lines.copy()
    first_arm, third_arm, fifth_arm = sequences[0], sequences[2], sequences[4]

    appendix_plan = [
        (" " + first_arm, ['2.2', '2.3', '2.5', '2.6']),
        (" " + third_arm, ['6.2', '6.5']),
        (" " + third_arm[:-6] + "GCATGC", ['6.3', '6.6']),
        (" " + fifth_arm, ['10.2', '10.5']),
        (" " + fifth_arm[:-8] + "ATTATAAT", ['10.3', '10.6']),
        # Planar faces
        (" TTTTTTTT TGTATACA", ['2.1', '2.4']),
        (" TTTTTTTT ATCTAGAT", ['6.1', '6.4']),
        (" TTTTTTTT TCTATAGA", ['10.1', '10.4']),
    ]
    for plan_entry in appendix_plan:
        append_string_to_values(tube_lines, plan_entry)
    return tube_lines

# Build the full tube staple set of every design
tube_df['Modified_Lines'] = tube_df['set_raw_sequences'].apply(
    append_computed_tube_link_seq, args=(template_lines,)
)

### Compute calibrated radii, tube radii and export dataframe as CSV

#### Container

In [12]:
# Calibrated container radius, rounded to whole numbers
container_df['r_container_calibrated'] = container_df["r_naive"].apply(container_calibration).round(0)
# Export the complete container table
container_df.to_csv(f"../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/container_seq_{NT_T_SPACER}T.csv", index=False)
container_df.sample(3)

Unnamed: 0,r_naive,circum_naive,number of monomers,Aspect Ratio,Upper Width,increase radius,arm_lengths,fitting_sequences,set_rss,set_raw_sequences,msd,pairwise mismatch [%],sum pairwise mismatch,sum_differences,Modified_Lines,r_container_calibrated
399,429,2695.49,2794.25,1.04,33.82,0.678636,"[3.96, 4.13, 4.3, 4.47, 4.64]","[3.96, 4.3, 4.3, 4.64, 4.64]","[7.446591256961168e-06, 0.029835253655955733, ...","[TTTTTTTT GCGC, TTTTTTTT TAGCTA, TTTTTTTT GCAT...",0.01156,"[0.0, 3.95, 0.0, 3.66, 0.0]",7.61,0.34,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,560.0
31,61,383.27,56.49,1.3,42.04,4.788636,"[3.96, 5.16, 6.36, 7.55, 8.75]","[3.96, 5.06, 6.28, 7.6, 8.71]","[7.446591256961168e-06, 0.009687258972627955, ...","[TTTTTTTT GCGC, TTTTTTTTTTT TAGCTA, TTTTTTTTTT...",0.0041,"[0.0, 1.94, 1.26, 0.66, 0.46]",4.32,0.17,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,74.0
369,399,2506.99,2417.11,1.05,33.93,0.733636,"[3.96, 4.15, 4.33, 4.51, 4.7]","[3.96, 4.3, 4.3, 4.64, 4.64]","[7.446591256961168e-06, 0.02332609988363809, 0...","[TTTTTTTT GCGC, TTTTTTTT TAGCTA, TTTTTTTT GCAT...",0.00878,"[0.0, 3.49, 0.69, 2.8, 1.28]",8.26,0.19,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,520.0


#### Tubes

In [13]:
# Naive tube radius: r_naive scaled by cos(30 deg), rounded to whole numbers
tube_df["r_tube_naive"] = np.round(tube_df['r_naive'] * np.cos(np.radians(30)),0)
# Calibrated tube radius, rounded to whole numbers
tube_df['r_tube_calibrated'] = np.round(tube_df["r_tube_naive"].apply(tube_calibration),0)
# Export the complete tube table. This previously wrote container_df into the
# tube CSV, so the tube sequences and tube radii never reached the file.
tube_df.to_csv(f"../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/tube_seq_{NT_T_SPACER}T.csv", index=False)
tube_df.sample(3)

Unnamed: 0,r_naive,circum_naive,number of monomers,Aspect Ratio,Upper Width,increase radius,arm_lengths,fitting_sequences,set_rss,set_raw_sequences,msd,pairwise mismatch [%],sum pairwise mismatch,sum_differences,Modified_Lines,r_tube_naive,r_tube_calibrated
266,296,1859.82,1330.25,1.06,34.44,0.988636,"[3.96, 4.21, 4.46, 4.7, 4.95]","[3.96, 4.3, 4.57, 4.64, 4.91]","[7.446591256961168e-06, 0.008598638566685711, ...","[TTTTTTTT GCGC, TTTTTTTT YYYYYY, TTTTTTTTT TAG...",0.00508,"[0.0, 2.09, 2.41, 1.28, 0.81]",6.59,0.1,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,256.0,334.0
3,33,207.35,16.53,1.55,50.17,8.853636,"[3.96, 6.18, 8.39, 10.6, 12.82]","[3.96, 6.09, 8.37, 10.65, 12.41]","[7.446591256961168e-06, 0.007396789479681445, ...","[TTTTTTTT GCGC, TTTTTTTTTTTTTTTT YYYYYY, TTTTT...",0.03582,"[0.0, 1.46, 0.24, 0.47, 3.2]",5.37,0.47,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,29.0,29.0
9,39,245.04,23.09,1.46,47.45,7.493636,"[3.96, 5.84, 7.71, 9.58, 11.46]","[3.96, 5.9, 7.7, 9.56, 11.43]","[7.446591256961168e-06, 0.004159234877682909, ...","[TTTTTTTT GCGC, TTTTTTTTTTTTTTT YYYYYY, TTTTTT...",0.001,"[0.0, 1.02, 0.13, 0.21, 0.26]",1.62,0.0,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...,34.0,34.0


### Filtering unique staple sets

#### Container

In [14]:
def radius_range(x):
    """Summarise a group of radii as text.

    :param x: pandas Series holding the radii of one group.
    :return: ``"<min> - <max>"`` when the group has more than one entry,
             otherwise the single value formatted as a string.
    """
    if len(x) <= 1:
        return f"{x.iloc[0]}"
    return f"{x.min()} - {x.max()}"
    
    
# Convert 'set_raw_sequences' column to string
container_df_collapsed = container_df.copy()
container_df_collapsed['modified_lines_str'] = container_df['Modified_Lines'].apply(lambda x: ' '.join(x.values()))

# Use the aggregation function in the groupby operation
container_df_collapsed = container_df_collapsed.groupby('modified_lines_str').agg({
    'r_naive': radius_range,
    'r_container_calibrated' : radius_range,
    'circum_naive' : 'mean',
    'number of monomers' : 'mean',
    'Modified_Lines' : 'first'
}).reset_index()

# Drop the temporary 'modified_lines_str' column and recreate the 'modified_lines' column from the string
container_df_collapsed['modified_lines_str'] = container_df_collapsed['modified_lines_str'].apply(lambda x: x.split())
container_df_collapsed.drop(columns='modified_lines_str', inplace=True)
container_df_collapsed = container_df_collapsed.sort_values(by='circum_naive', ascending=True).reset_index(drop=True)

container_df_collapsed.to_csv(f"../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/container_seq_unique_{NT_T_SPACER}T.csv", index=False)
container_df_collapsed.tail(3)

Unnamed: 0,r_naive,r_container_calibrated,circum_naive,number of monomers,Modified_Lines
71,273 - 308,354.0 - 400.0,1825.265556,1282.913333,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...
72,309 - 360,401.0 - 468.0,2101.725385,1702.219423,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...
73,361 - 499,470.0 - 652.0,2701.769712,2831.731727,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...


#### tubes

In [15]:
# Convert 'set_raw_sequences' column to string
tube_df_collapsed = tube_df.copy()
tube_df_collapsed['modified_lines_str'] = tube_df['Modified_Lines'].apply(lambda x: ' '.join(x.values()))

# Use the aggregation function in the groupby operation
tube_df_collapsed = tube_df_collapsed.groupby('modified_lines_str').agg({
    'r_naive': radius_range,
    'r_tube_naive' : radius_range,
    'r_tube_calibrated' : radius_range,
    'circum_naive' : 'mean',
    'number of monomers' : 'mean',
    'Modified_Lines' : 'first'
}).reset_index()

# Drop the temporary 'modified_lines_str' column and recreate the 'modified_lines' column from the string
tube_df_collapsed['modified_lines_str'] = tube_df_collapsed['modified_lines_str'].apply(lambda x: x.split())
tube_df_collapsed.drop(columns='modified_lines_str', inplace=True)
tube_df_collapsed = tube_df_collapsed.sort_values(by='circum_naive', ascending=True).reset_index(drop=True)


tube_df_collapsed.to_csv(f"../origami/DNA_sequences/complete_lists_of_precomputed_seqeunces/tube_seq_unique_{NT_T_SPACER}T.csv", index=False)
tube_df_collapsed.tail(3)

Unnamed: 0,r_naive,r_tube_naive,r_tube_calibrated,circum_naive,number of monomers,Modified_Lines
59,273 - 308,236.0 - 267.0,305.0 - 350.0,1825.265556,1282.913333,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...
60,309 - 360,268.0 - 312.0,351.0 - 416.0,2101.725385,1702.219423,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...
61,361 - 499,313.0 - 432.0,417.0 - 591.0,2701.769712,2831.731727,{'2.1': 'ACGCCAGCTGGCGGGGGAAAGGAACCCTAAAGGGAGA...
