## Setting up environment and constants

In [2]:
import numpy as numpy
import pandas as pd
import csv
from collections import Counter

## Per-base molecular weights (Mw), all in g/mol
# Source table:
# https://www.thermofisher.com/de/de/home/references/ambion-tech-support/rna-tools-and-calculators/dna-and-rna-molecular-weights-and-conversions.html
# Order of the values below: A, C, T, G (same numbers as the lookup table
# used inside count_bases_and_calculate_mw).
amp, cmp, tmp, gmp = 313.2, 289.2, 304.2, 329.2

## Computing the molecular weight (Mw) of an XXL Dipid

In [5]:
def count_bases_and_calculate_mw(file_path, is_csv=True):
    """Count the A/C/G/T bases in a sequence file and sum their molecular weights.

    Counting is case-insensitive; every character that is not A, C, G or T
    is ignored.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    is_csv : bool, default True
        If True, the file is parsed as CSV: the first row is skipped as a
        header and the sequence is taken from the second column of every
        remaining row (rows with fewer than two columns are skipped).
        If False, the file is read as plain text: the first line is skipped
        and ONLY the second line is used as the sequence; any further lines
        are not read.

    Returns
    -------
    tuple
        ``(base_counts, total_mw)`` where ``base_counts`` is a ``Counter``
        keyed by base letter and ``total_mw`` is the summed weight in g/mol
        (``0`` when no bases were found, e.g. for an empty file).
    """
    # Molecular weight per base in g/mol.
    mw_dict = {'A': 313.2, 'C': 289.2, 'T': 304.2, 'G': 329.2}
    base_counts = Counter()

    with open(file_path, newline='') as file:
        if is_csv:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header row (no error if the file is empty)
            for row in reader:
                if len(row) > 1:
                    sequence = row[1].upper()
                    base_counts.update(base for base in sequence if base in mw_dict)
        else:
            # Skip the first line. A default is passed so that an empty file
            # yields empty counts instead of raising StopIteration, matching
            # the CSV branch above.
            next(file, None)
            sequence = file.readline().strip().upper()
            base_counts.update(base for base in sequence if base in mw_dict)

    # Sum weight * count over the bases in first-seen order.
    total_mw = sum(mw_dict[base] * count for base, count in base_counts.items())

    return base_counts, total_mw

# Input sequence files: two CSV tables plus the scaffold as plain text
csv_file_paths = [
    '../origami/DNA_sequences/sequences_used_in_this_study/base_monomer_staple.csv',
    '../data/computing_Mw_and_monomer_numbers_from_collapsed_TEM_XXL_container_areas/container_8T_XXL_2.csv',
]
text_file_path = '../origami/DNA_sequences/sequences_used_in_this_study/scaffold_sequence_p2873.txt'

# Running sum of the molecular weights of all files (g/mol)
total_mw = 0

# CSV files first: report each file, then add it to the running sum
for csv_file_path in csv_file_paths:
    csv_base_counts, csv_total_mw = count_bases_and_calculate_mw(csv_file_path, is_csv=True)
    print(f"Counts for {csv_file_path}: {csv_base_counts}, Total MW: {csv_total_mw} g/mol")
    total_mw += csv_total_mw

# Then the plain-text scaffold sequence
text_base_counts, text_total_mw = count_bases_and_calculate_mw(text_file_path, is_csv=False)
print(f"Counts for {text_file_path}: {text_base_counts}, Total MW: {text_total_mw} g/mol")
total_mw += text_total_mw

# Last expression of the cell: display the combined weight
total_mw  # g/mol

Counts for ../origami/DNA_sequences/sequences_used_in_this_study/base_monomer_staple.csv: Counter({'G': 1081, 'C': 1074, 'A': 787, 'T': 771}), Total MW: 1147492.5999999999 g/mol
Counts for ../data/computing_Mw_and_monomer_numbers_from_collapsed_TEM_XXL_container_areas/container_8T_XXL_2.csv: Counter({'T': 614, 'A': 393, 'G': 368, 'C': 356}), Total MW: 533967.2 g/mol
Counts for ../origami/DNA_sequences/sequences_used_in_this_study/scaffold_sequence_p2873.txt: Counter({'T': 737, 'C': 726, 'G': 708, 'A': 702}), Total MW: 887094.6 g/mol


2568554.4

In [7]:
import pandas as pd

def process_with_pandas(input_file_path, output_file_path, total_mw_placeholder):
    """Derive monomer counts and molecular weights from a table of measured areas.

    Reads ``input_file_path`` as CSV, appends the derived columns listed
    below, writes the result to ``output_file_path`` (without the index) and
    returns the DataFrame.

    Parameters
    ----------
    input_file_path : str
        CSV file whose SECOND column (selected by position) holds the
        measured area values.
    output_file_path : str
        Destination CSV for the table with the added columns.
    total_mw_placeholder : float
        Molecular weight of a single monomer in g/mol. This argument is what
        the ``Mw`` column is scaled by; the function no longer reads a global
        ``total_mw`` variable.

    Returns
    -------
    pandas.DataFrame
        The input table plus the columns ``density_fraction_corrected``,
        ``monomer_area``, ``monomer_count``, ``Mw`` and ``Mw_Giga_dalton``.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file_path)

    # NOTE(review): 0.9069 is presumably the hexagonal close-packing fraction
    # pi / (2 * sqrt(3)) — confirm.
    df['density_fraction_corrected'] = df.iloc[:, 1] * 0.9069  # second column = measured area

    # NOTE(review): the factor 2 presumably accounts for two layers of a
    # collapsed container — confirm.
    df['monomer_area'] = df['density_fraction_corrected'] * 2

    # Area of a single monomer in nm**2, extracted from an image by averaging
    # over multiple monomers in an assembly of an XXL vesicle.
    # Alternative that is not used: circular area np.pi * (29.5 / 2)**2.
    monomer_area_unit = 887
    df['monomer_count'] = df['monomer_area'] / monomer_area_unit

    # Use the weight passed by the caller instead of hidden notebook state.
    df['Mw'] = df['monomer_count'] * total_mw_placeholder  # g/mol
    df["Mw_Giga_dalton"] = df['Mw'] * 10**-9

    # Write the DataFrame with the new columns to a new CSV file
    df.to_csv(output_file_path, index=False)
    return df

# Measured-area table (input) and destination for the enriched table (output)
input_file_path = "../data/computing_Mw_and_monomer_numbers_from_collapsed_TEM_XXL_container_areas/measured_areas.csv"
output_file_path = '../data/computing_Mw_and_monomer_numbers_from_collapsed_TEM_XXL_container_areas/pandas_processed_measured_areas.csv'

# Add the derived columns, scaled by the combined monomer weight `total_mw`
# computed earlier in the notebook, and save the result.
df = process_with_pandas(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    total_mw_placeholder=total_mw,
)

# Last expression of the cell: display the resulting table
df


Unnamed: 0,container_id,area (nm**2),Unnamed: 2,density_fraction_corrected,monomer_area,monomer_count,Mw,Mw_Giga_dalton
0,XXL1,11697980.0,,10608900.0,21217800.0,23920.859931,61442030000.0,61.44203
1,XXL2_top_image,3958867.0,,3590297.0,7180593.0,8095.370116,20793400000.0,20.793399
2,XXL3,4424583.0,,4012655.0,8025309.0,9047.699348,23239510000.0,23.239508
3,XXL4,4777405.0,,4332628.0,8665257.0,9769.173354,25092650000.0,25.092653
4,XL1_top_image,2793135.0,,2533094.0,5066189.0,5711.5997,14670550000.0,14.670555
5,L1_top_image,720571.7,,653486.5,1306973.0,1473.475733,3784703000.0,3.784703
