<a href="https://colab.research.google.com/github/eoinleen/biochem-general/blob/main/FIDA_analysis_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Make Google Drive visible inside the Colab runtime
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Drive folder that is scanned for .txt files below — edit this before running
directory = '/content/drive/MyDrive/your_folder_name'

# Helper to parse concentration from sample name like '[0 nM] A1-15-07-2025-12-15.txt'
def parse_concentration(sample_name):
    """Return the concentration in µM encoded in the first [...] of *sample_name*.

    The bracket is expected to hold a number followed by a unit, with or
    without whitespace between them (e.g. "[0 nM]", "[250nM]", "[2 µM]").
    Units are matched case-insensitively; pM, nM, µM/uM, mM and M are
    converted to µM. Any other unit is returned unconverted.

    Args:
        sample_name: File or sample name containing a "[value unit]" label.

    Returns:
        The concentration as a float in µM, or None when there is no
        bracket, the bracket is not "<number> <unit>", or the number cannot
        be parsed.
    """
    match = re.search(r'\[(.*?)\]', sample_name)
    if not match:
        return None

    parts = match.group(1).split()
    if len(parts) == 1:
        # Number and unit written without a separating space, e.g. "250nM".
        glued = re.fullmatch(
            r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)([^\d\s]+)', parts[0]
        )
        if glued:
            parts = list(glued.groups())
    if len(parts) != 2:
        return None

    raw_value, unit = parts
    try:
        val = float(raw_value)
    except ValueError:
        # Non-numeric label such as "[buffer only]": report "no concentration"
        # so the caller can skip the file instead of crashing.
        return None

    # Fold both Unicode micro characters (U+00B5, U+03BC) onto ASCII 'u'.
    unit_key = unit.lower().replace('\u00b5', 'u').replace('\u03bc', 'u')

    # (multiplier, divisor) to reach µM; integers keep the arithmetic exact
    # (nM is still computed as val / 1000).
    to_micromolar = {
        'pm': (1, 1_000_000),
        'nm': (1, 1000),
        'um': (1, 1),
        'mm': (1000, 1),
        'm': (1_000_000, 1),
    }
    # Unknown units fall through unconverted.
    multiplier, divisor = to_micromolar.get(unit_key, (1, 1))
    return val * multiplier / divisor

# Function to extract Rh from a file
def extract_Rh(filepath):
    """Return the mean of the 'Hydrodynamic radius 1 (nm)' column of a file.

    The file is read as tab-separated text whose first non-blank line is the
    header. Values that cannot be converted to numbers are ignored.

    Args:
        filepath: Path to the tab-separated file.

    Returns:
        The mean of the numeric values in the column, or None when the
        column holds no numeric values.

    Raises:
        ValueError: If the file has no 'Hydrodynamic radius 1 (nm)' column
            (including a completely empty file).
    """
    rh_column = 'Hydrodynamic radius 1 (nm)'
    not_found = f"'Hydrodynamic radius 1 (nm)' column not found in {filepath}"

    try:
        # Let pandas parse the header itself. index_col=False stops it from
        # promoting the first column to an index (which shifts every column
        # by one) when lines end with a trailing tab.
        df = pd.read_csv(filepath, sep='\t', index_col=False)
    except pd.errors.EmptyDataError:
        raise ValueError(not_found) from None

    # Tolerate stray whitespace around the header text.
    matched = next(
        (col for col in df.columns if str(col).strip() == rh_column), None
    )
    if matched is None:
        raise ValueError(not_found)

    # Non-numeric cells become NaN and are dropped before averaging.
    numeric_rh = pd.to_numeric(df[matched], errors='coerce').dropna()
    if numeric_rh.empty:
        return None

    return numeric_rh.mean()

# Parallel accumulators: index i of each list describes the same file
concentrations = []
rh_values = []
file_names = []

# Visit the .txt files in name order; keep only files that yield both a
# concentration (from the file name) and an Rh (from the file contents)
txt_files = [entry for entry in sorted(os.listdir(directory)) if entry.endswith('.txt')]
for file in txt_files:
    conc = parse_concentration(file)
    if conc is None:
        print(f"Skipping {file} — no concentration found")
        continue

    filepath = os.path.join(directory, file)
    rh = extract_Rh(filepath)
    if rh is None:
        print(f"Skipping {file} — no Rh found")
        continue

    file_names.append(file)
    concentrations.append(conc)
    rh_values.append(rh)

# Build the output table with one row per file, pairing that file's
# concentration with its own Rh value. The three lists are filled in lockstep,
# so row i always describes file i. (Assigning each file's Rh as a scalar
# column instead would repeat it down every row and detach it from its
# concentration.)
df_output = pd.DataFrame({
    'Sample': [fn.replace('.txt', '') for fn in file_names],
    'Concentration (µM)': concentrations,
    'Rh (nm)': rh_values,
})

# Now plot protein concentration vs Rh
plt.figure(figsize=(8,6))

# A pure log axis cannot place non-positive values, so a 0-concentration
# sample (e.g. '[0 nM] ...') would disappear from the plot while still being
# listed in the legend. When such samples exist, use a symlog axis that is
# linear between 0 and the smallest positive concentration and logarithmic
# above it. With no positive concentrations at all, the axis stays linear.
positive_concs = [c for c in concentrations if c > 0]
if positive_concs:
    if len(positive_concs) == len(concentrations):
        plt.xscale('log')
    else:
        plt.xscale('symlog', linthresh=min(positive_concs))

plt.xlabel('Protein Concentration (µM)')
plt.ylabel('Hydrodynamic Radius (nm)')
plt.title('Protein Concentration vs Hydrodynamic Radius')

# Plot each file's point separately so every file gets its own legend entry
for conc, rh, fn in zip(concentrations, rh_values, file_names):
    plt.scatter(conc, rh, label=fn.replace('.txt',''), alpha=0.7)

# Legend sits outside the axes on the right so it does not cover the points
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
plt.tight_layout()
plt.show()

# Write the combined table as CSV (relative path, so it lands in the current
# working directory) without the DataFrame's row index
output_csv = 'combined_Rh_vs_concentration.csv'
df_output.to_csv(path_or_buf=output_csv, index=False)
print(f"Combined data saved to {output_csv}")