In [None]:
import pandas as pd
import numpy as np
import os
import re
import csv
import glob
import statsmodels.formula.api as smf

# FA01
Fitness assays of populations in YPD.

Going from `t1_data.txt, t2_data.txt` to `FA01_parsed_fitness_data.txt`.

Needs `plate_layout_FA01.txt`.

In [None]:
## t1/2_data.txt -> t1/2f_parsed.txt

def transform_data(input_file, output_file):
    """
    Collapse a per-gate count export into one row per .fcs file.

    The input is read as a headerless, tab-separated table with the columns
    Name, Statistic and Cells. Rows whose Name ends in ".fcs", "/cells" and
    "/cells/mCherry" are inner-joined on the leading "<...>.fcs" part of Name.

    Args:
        input_file (str): Path to the tab-separated export (e.g., t1_data.txt).
        output_file (str): Path of the headerless, tab-separated file to write.
            Columns are: base name, total count, cells count, mCherry count.
    """
    raw = pd.read_csv(input_file, sep="\t", names=["Name", "Statistic", "Cells"])

    # Everything up to and including ".fcs" is the join key shared by the three rows
    raw["BaseName"] = raw["Name"].str.extract(r"(^.*\.fcs)")

    def counts_for(suffix):
        # Rows of a single gate, reduced to the join key and its count
        return raw.loc[raw["Name"].str.endswith(suffix), ["BaseName", "Cells"]]

    per_file = (
        counts_for(".fcs")
        .merge(counts_for("/cells"), on="BaseName", suffixes=("_total", "_cells"))
        .merge(counts_for("/cells/mCherry"), on="BaseName")
        .rename(columns={"Cells": "Cells_mCherry"})
    )

    export_columns = ["BaseName", "Cells_total", "Cells_cells", "Cells_mCherry"]
    per_file[export_columns].to_csv(output_file, sep="\t", index=False, header=False)

    print(f"Transformed data saved to {output_file}")

# Process t1_data.txt and t2_data.txt
for raw_name, parsed_name in (
    ("t1_data.txt", "t1f_parsed.txt"),
    ("t2_data.txt", "t2f_parsed.txt"),
):
    transform_data(raw_name, parsed_name)

Transformed data saved to t1f_parsed.txt
Transformed data saved to t2f_parsed.txt


In [None]:
## t1/2f_parsed.txt -> winfo_t1/2f_parsed.txt

def transform_parsed_to_winfo(parsed_file, plate_layout_file, output_file):
    """
    Transform parsed file to winfo format using plate layout with row and well repetition.

    Every cell of the plate layout is expanded to a 2x2 block (each well twice
    within a row, each row twice) and the flattened result is assigned, in
    order, to the rows of the parsed file as `sample.id`. A `fact` label
    (A/B/C/D) is derived from the row letter of each specimen name and the
    position of the row in the file.

    Args:
        parsed_file (str): Path to the parsed file (e.g., t1f_parsed.txt).
        plate_layout_file (str): Path to the plate_layout.txt file.
        output_file (str): Path to save the transformed winfo file.

    Raises:
        ValueError: If the number of parsed rows is not a whole multiple of the
            expanded plate layout, or if a specimen name carries a row letter
            outside A-P.
    """
    # Load parsed data
    parsed_data = pd.read_csv(parsed_file, sep="\t", header=None, names=["V1", "V2", "V3", "V4"])

    # Load plate layout
    plate_layout = pd.read_csv(plate_layout_file, sep="\t", header=None)

    # Expand the layout: each layout row is emitted twice, and within a row
    # each well is emitted twice, then everything is flattened row by row
    sample_ids = [
        well
        for layout_row in plate_layout.values
        for _ in range(2)
        for well in layout_row
        for _ in range(2)
    ]

    # The expanded layout is tiled to cover the parsed rows; anything other than
    # a whole number of copies means the two files do not describe the same plates
    n_rows, n_ids = len(parsed_data), len(sample_ids)
    if n_ids == 0 or n_rows % n_ids != 0:
        raise ValueError(
            f"{parsed_file} has {n_rows} rows, which is not a whole multiple of the "
            f"{n_ids} wells described by {plate_layout_file}"
        )
    parsed_data["sample.id"] = sample_ids * (n_rows // n_ids)

    # Generate the 'fact' column to match winfo files - not sure how this was actually generated?
    # Odd plate rows (A, C, ...) alternate A/B and even plate rows (B, D, ...) alternate C/D,
    # keyed on the parity of the row's position in the file.
    odd_rows = set("ACEGIKMO")
    even_rows = set("BDFHJLNP")
    facts = []
    for position, v1 in enumerate(parsed_data["V1"]):
        # Extract the row letter from the specimen name (e.g., "A", "B", etc.)
        row_letter = v1.split("_")[2][0]
        first_of_pair = position % 2 == 0

        if row_letter in odd_rows:
            fact = "A" if first_of_pair else "B"
        elif row_letter in even_rows:
            fact = "C" if first_of_pair else "D"
        else:
            # Previously the last row's value was silently reused here
            raise ValueError(f"Unexpected row letter {row_letter!r} in specimen name {v1!r}")
        facts.append(fact)
    parsed_data["fact"] = facts

    # Save the transformed data to output file
    parsed_data.to_csv(output_file, sep="\t", index=False)
    print(f"Winfo file saved to {output_file}")

# Build the winfo files for both time points from the FA01 plate layout
for parsed_name, winfo_name in (
    ("t1f_parsed.txt", "winfo_t1f_parsed.txt"),
    ("t2f_parsed.txt", "winfo_t2f_parsed.txt"),
):
    transform_parsed_to_winfo(parsed_name, "plate_layout_FA01.txt", winfo_name)

Winfo file saved to winfo_t1f_parsed.txt
Winfo file saved to winfo_t2f_parsed.txt


In [None]:
## renaming winfo_t1/2f_parsed.txt -> FA01_YPD_t1/2.txt

def copy_files():
    """
    Copy the two winfo files to their FA01_YPD_* names, byte for byte.

    The originals are left in place. A source file that does not exist is
    reported on stdout and skipped.
    """
    # Source name -> destination name
    file_mapping = (
        ("winfo_t1f_parsed.txt", "FA01_YPD_t1.txt"),
        ("winfo_t2f_parsed.txt", "FA01_YPD_t2.txt"),
    )

    for old_name, new_name in file_mapping:
        if not os.path.exists(old_name):
            print(f"File {old_name} not found.")
            continue

        with open(old_name, 'rb') as src_file, open(new_name, 'wb') as dest_file:
            dest_file.write(src_file.read())
        print(f"Copied {old_name} to {new_name}")

# Run the copy when this cell is executed as the main program
# (the guard keeps it from running if the code is imported as a module).
if __name__ == "__main__":
    copy_files()

Copied winfo_t1f_parsed.txt to FA01_YPD_t1.txt
Copied winfo_t2f_parsed.txt to FA01_YPD_t2.txt


In [None]:
## FA01_YPD_t1/2.txt -> FA01_parsed_fitness_data.txt and FA01_parsed_fitness_data_2N.txt

def fit_est(f1, f0, t):
    """
    Per-unit-time change in log-odds between two frequencies.

    Computes (1 / t) * (logit(f1) - logit(f0)), where logit(f) = log(f / (1 - f)).

    Args:
        f1: Frequency at the later time point (scalar or array-like).
        f0: Frequency at the earlier time point (scalar or array-like).
        t: Elapsed time separating the two measurements.

    Returns:
        The fitness estimate, with the same shape as the inputs.
    """
    logit_late = np.log(f1 / (1 - f1))
    logit_early = np.log(f0 / (1 - f0))
    return (1 / t) * (logit_late - logit_early)

# Helper function to safely split sample_id
def safe_split(sample_id, index):
    """
    Return the `index`-th "_"-separated piece of `sample_id`.

    Yields the string "NA" when the id has too few pieces for that index.
    """
    pieces = sample_id.split("_")
    if len(pieces) <= index:
        return "NA"
    return pieces[index]

# Load data (tab-separated, with a header row)
YPD_t1 = pd.read_csv("FA01_YPD_t1.txt", sep="\t")
YPD_t2 = pd.read_csv("FA01_YPD_t2.txt", sep="\t")

# Merge data: left-join the first four t2 columns onto t1 by file name (V1)
YPD = pd.merge(YPD_t1, YPD_t2.iloc[:, :4], on="V1", how="left")
# The fourth "_"-separated token of the file name, minus ".fcs", is used as the well id
YPD["well_id"] = YPD["V1"].apply(lambda x: x.split("_")[3].replace(".fcs", ""))
YPD = YPD.drop(columns=["V1"])
# Positional rename: relies on the column order left by the merge above.
# Assumes the inputs have the columns written by transform_parsed_to_winfo
# (V1-V4, sample.id, fact), so that "fact" ends up named "plate" - verify if inputs change.
YPD.columns = ["tot_t1", "cells_t1", "ref_t1", "sample_id", "plate", "tot_t2", "cells_t2", "ref_t2", "well_id"]

# Add additional metadata with error handling
# (safe_split yields "NA" when sample_id has too few "_"-separated parts)
YPD["sex_cycle"] = YPD["sample_id"].apply(lambda x: safe_split(x, 1))
YPD["kk_well_id"] = YPD["sample_id"].apply(lambda x: safe_split(x, 2))

# Flag regime
# NOTE(review): str.contains treats the pattern as a regex, so the "." in
# "a1.1_" / "2N.1_" / "2N.3_" matches any character - confirm this is intended.
YPD["regime"] = np.where(YPD["sample_id"].str.contains("a1.1_|2N.1_"), "asexual", "NA")
YPD.loc[YPD["sample_id"].str.contains("a3_|2N.3_"), "regime"] = "sexual"

# Flag ancestor
YPD["anc"] = np.where(YPD["sample_id"].str.contains("aAnc|dipAnc"), 1, 0)

# Flag diploids
YPD["diploid"] = np.where(YPD["sample_id"].str.startswith("2N") | YPD["sample_id"].str.startswith("dip"), 1, 0)

# Flag reference
YPD["ref"] = np.where(YPD["sample_id"].str.endswith("FR"), 1, 0)

# Add assay
YPD["assay"] = "FA01"

# Estimate fitness from the (cells - ref) / cells fraction at each time point
YPD["s_hat"] = fit_est(
    f1=(YPD["cells_t2"] - YPD["ref_t2"]) / YPD["cells_t2"],
    f0=(YPD["cells_t1"] - YPD["ref_t1"]) / YPD["cells_t1"],
    t=10,
)
YPD["s_hat"] = YPD["s_hat"].astype(object)  # Convert to object for mixed values
# Reference-only wells get the string "NA" instead of an estimate
YPD.loc[YPD["ref"] == 1, "s_hat"] = "NA"

# Add environment and replicate
YPD["env"] = "YPD"
YPD["rep"] = YPD["plate"]
YPD["plate"] = "YPD_" + YPD["plate"]

# Replace blanks with NAs (missing values become the string "NA")
YPD = YPD.fillna("NA")

# Export haploid data
relcols = ["kk_well_id", "assay", "regime", "plate", "env", "rep", "anc", "ref", "s_hat"]
YPD_haps = YPD[YPD["diploid"] == 0][relcols]

# Remove outlier samples: drop rows without an estimate, then keep s_hat < 0.1
# NOTE(review): the 0.1 cutoff is not named or explained anywhere in this notebook.
YPD_haps = YPD_haps[YPD_haps["s_hat"].astype(str) != "NA"]
YPD_haps = YPD_haps[YPD_haps["s_hat"].astype(float) < 0.1]

# Add kk_pop_id with NA for missing values
# (the right-hand Series is aligned on the index, so only the masked rows are filled;
# ancestors and rows without a regime keep "NA")
YPD_haps["kk_pop_id"] = "NA"
YPD_haps.loc[
    (YPD_haps["regime"] == "sexual") & (YPD_haps["anc"] == 0), "kk_pop_id"
] = YPD_haps["kk_well_id"].apply(lambda x: f"a3_{x}")
YPD_haps.loc[
    (YPD_haps["regime"] == "asexual") & (YPD_haps["anc"] == 0), "kk_pop_id"
] = YPD_haps["kk_well_id"].apply(lambda x: f"a1.1_{x}")

# Define the custom header (column names pre-wrapped in double quotes,
# in the same order as the YPD_haps columns)
header = [
    '"kk_well_id"',
    '"assay"',
    '"regime"',
    '"plate"',
    '"env"',
    '"rep"',
    '"anc"',
    '"ref"',
    '"s_hat"',
    '"kk_pop_id"',
]

# Format the output to ensure correct quoting
def format_row(row):
    """
    Render the values of one record as CSV cell strings.

    String values are wrapped in double quotes, except the string "NA", which
    is left bare. Every other value is converted with str().
    """
    cells = []
    for val in row:
        if isinstance(val, str) and val != "NA":
            cells.append(f'"{val}"')
        else:
            cells.append(str(val))
    return cells

# Save haploid data with the specified header
with open("FA01_parsed_fitness_data.txt", "w") as f:
    # Header line first, then one comma-joined line per haploid record
    f.write(",".join(header) + "\n")
    f.writelines(",".join(format_row(row)) + "\n" for _, row in YPD_haps.iterrows())

print("Haploid data saved to FA01_parsed_fitness_data.txt")

## DIPLOIDS - diploid data but this won't be used for any subsequent analyses for this project

# Diploid rows keep every column of YPD, so the quoted header is derived from the frame itself
diploid_data = YPD[YPD["diploid"] == 1]
diploid_header = [f'"{col}"' for col in diploid_data.columns]

# Same quoting rules as the haploid export
with open("FA01_parsed_fitness_data_2N.txt", "w") as f:
    f.write(",".join(diploid_header) + "\n")
    for _, row in diploid_data.iterrows():
        f.write(",".join(format_row(row)) + "\n")

print("Diploid data saved to FA01_parsed_fitness_data_2N.txt")

Haploid data saved to FA01_parsed_fitness_data.txt
Diploid data saved to FA01_parsed_fitness_data_2N.txt


# FA02
Fitness assays of populations in SC 30C, SC 37C, SC +NaCl, SC pH 7.3, SC -P.

Going from `1AB2AB_t1.txt, 1AB2AB_t2.txt, 3AB4AB_t1.txt, 3AB4AB_t2.txt, 5AB6AB_t1.txt, 5AB6AB_t2.txt` to `FA02_parsed_fitness_data.txt`.

Needs `plate_layout_FA02.txt`.

In [None]:
## 1AB2AB_t1/2.txt -> parsed_1AB2AB_t1/2.txt etc

# Raw FA02 exports, one file per plate pair (1AB2AB, 3AB4AB, 5AB6AB) and time point (t1, t2)
files_to_process = [
    "1AB2AB_t1.txt",
    "1AB2AB_t2.txt",
    "3AB4AB_t1.txt",
    "3AB4AB_t2.txt",
    "5AB6AB_t1.txt",
    "5AB6AB_t2.txt"
]

def parse_file(input_file):
    """
    Pair each total-count line with its mCherry line and write `parsed_<name>`.

    Lines containing "/mCherry" are treated as reference lines; any other line
    containing ".fcs" is treated as a total line. The two groups are paired by
    position. For each pair the first two whitespace-separated fields of the
    total line and the third field of the reference line are written as one
    tab-separated row. Pairs with too few fields are skipped.

    The output is written next to the input, with "parsed_" prefixed to the
    file name (not to the whole path), so directory-qualified inputs work.

    Args:
        input_file (str): Path to the raw export.

    Returns:
        None. If the numbers of total and reference lines differ, a message is
        printed and no output file is written.
    """
    tot_lines = []
    ref_lines = []

    # Read and classify lines
    with open(input_file, 'r') as f:
        for line in f:
            if '/mCherry' in line:
                ref_lines.append(line.strip())
            elif '.fcs' in line:
                tot_lines.append(line.strip())

    # Positional pairing only makes sense when both groups have the same length
    if len(tot_lines) != len(ref_lines):
        print(f"Mismatch in line counts! Total: {len(tot_lines)}, Reference: {len(ref_lines)}")
        return

    # Prefix the file name only, so "some/dir/x.txt" becomes "some/dir/parsed_x.txt"
    folder, base_name = os.path.split(input_file)
    parsed_file = os.path.join(folder, f"parsed_{base_name}")

    # Combine data and write parsed output
    with open(parsed_file, 'w') as out:
        for tot, ref in zip(tot_lines, ref_lines):
            tot_parts = re.split(r'\s+', tot)
            ref_parts = re.split(r'\s+', ref)

            if len(tot_parts) >= 2 and len(ref_parts) >= 3:
                out.write(f"{tot_parts[0]}\t{tot_parts[1]}\t{ref_parts[2]}\n")

    print(f"Finished processing {input_file}, output saved to {parsed_file}.")

# Process specified files
for file_name in files_to_process:
    # Missing exports are reported and skipped rather than aborting the run
    if not os.path.exists(file_name):
        print(f"File {file_name} not found, skipping.")
        continue
    parse_file(file_name)

Finished processing 1AB2AB_t1.txt, output saved to parsed_1AB2AB_t1.txt.
Finished processing 1AB2AB_t2.txt, output saved to parsed_1AB2AB_t2.txt.
Finished processing 3AB4AB_t1.txt, output saved to parsed_3AB4AB_t1.txt.
Finished processing 3AB4AB_t2.txt, output saved to parsed_3AB4AB_t2.txt.
Finished processing 5AB6AB_t1.txt, output saved to parsed_5AB6AB_t1.txt.
Finished processing 5AB6AB_t2.txt, output saved to parsed_5AB6AB_t2.txt.


In [None]:
## parsed_1AB2AB_t1/2.txt -> final_parsed_1AB2AB_t1/2.txt etc

def add_missing_well_ids(parsed_data):
    """
    Pad the parsed data with placeholder rows for wells that are absent.

    The expected wells are rows A-P crossed with columns 01-24 (A01 ... P24).
    Any expected well not found in the `well_id` column is appended with all
    other fields set to pd.NA, and the result is sorted by `well_id`.

    Args:
        parsed_data (pd.DataFrame): The DataFrame containing the parsed data with assigned well_ids.

    Returns:
        pd.DataFrame: The padded DataFrame, sorted by well_id with a fresh 0..n-1 index.
    """
    plate_wells = [
        f"{row_letter}{column:02d}"
        for row_letter in "ABCDEFGHIJKLMNOP"
        for column in range(1, 25)
    ]

    # Wells the plate should have but the data does not mention
    absent = list(set(plate_wells) - set(parsed_data["well_id"].tolist()))

    # One placeholder row per absent well; every non-id field is NA
    placeholder_columns = ("sample_id", "fact", "name_long", "cells", "ref")
    missing_data = pd.DataFrame(
        {"well_id": absent, **{name: [pd.NA] * len(absent) for name in placeholder_columns}}
    )

    padded = pd.concat([parsed_data, missing_data])
    return padded.sort_values("well_id").reset_index(drop=True)

def extract_fact_prefix(filename):
    """
    Map a plate-pair tag found in the filename to its fact prefix.

    Args:
        filename (str): The input filename.

    Returns:
        int: 1 for "1AB2AB", 3 for "3AB4AB", 5 for "5AB6AB".

    Raises:
        ValueError: If none of the known tags occurs in the filename.
    """
    # Checked in this order; the first tag present wins
    known_tags = (("1AB2AB", 1), ("3AB4AB", 3), ("5AB6AB", 5))
    for tag, prefix in known_tags:
        if tag in filename:
            return prefix
    raise ValueError(f"Unexpected filename format: {filename}")

def transform_parsed_to_final(parsed_file, plate_layout_file, output_file):
    """
    Annotate a parsed FA02 file with well, sample and plate ("fact") labels.

    Steps: extract the well id from each file name, pad the table to all
    expected wells, assign sample ids from the plate layout (each layout cell
    expanded to a 2x2 block, tiled and truncated to the table length), then
    derive `fact` from the well's row letter and the row's position.

    Args:
        parsed_file (str): Path to the parsed file.
        plate_layout_file (str): Path to the plate_layout.txt file.
        output_file (str): Path to save the transformed final parsed file.
    """
    parsed_data = pd.read_csv(parsed_file, sep="\t", header=None, names=["name_long", "cells", "ref"])

    # Well id is the token after the third "_" (e.g., "Specimen_001_A1_A01.fcs" -> "A01")
    parsed_data["well_id"] = parsed_data["name_long"].str.extract(r"(?:.*?_){3}([^_.]+)")
    parsed_data = add_missing_well_ids(parsed_data)

    plate_layout = pd.read_csv(plate_layout_file, sep="\t", header=None)

    # Each layout row is emitted twice; within a row each well is emitted twice
    layout_ids = [
        well
        for layout_row in plate_layout.values
        for _ in range(2)
        for well in layout_row
        for _ in range(2)
    ]

    # Tile one copy more than needed, then cut back to the table length
    n_wells = len(parsed_data)
    copies = n_wells // len(layout_ids) + 1
    parsed_data["sample_id"] = np.tile(layout_ids, copies)[:n_wells]

    fact_prefix = extract_fact_prefix(parsed_file)

    def fact_for(position, well):
        # Rows without a well id carry no fact
        if pd.isna(well):
            return pd.NA
        # Listed row letters belong to the first plate of the pair, all others to the second;
        # even positions are replicate A, odd positions replicate B
        if well[0] in "ACEGIKMOQ":
            return f"{fact_prefix}_A" if position % 2 == 0 else f"{fact_prefix}_B"
        return f"{fact_prefix + 1}_A" if position % 2 == 0 else f"{fact_prefix + 1}_B"

    parsed_data["fact"] = [
        fact_for(position, well) for position, well in enumerate(parsed_data["well_id"])
    ]

    column_order = ["well_id", "sample_id", "fact", "name_long", "cells", "ref"]
    final_parsed = parsed_data[column_order]

    final_parsed.to_csv(output_file, sep="\t", index=False, na_rep="NA")
    print(f"Final parsed file saved to {output_file}")

# List of parsed files to process
# Parsed files written by parse_file, one per plate pair and time point
parsed_files = [
    "parsed_1AB2AB_t1.txt",
    "parsed_1AB2AB_t2.txt",
    "parsed_3AB4AB_t1.txt",
    "parsed_3AB4AB_t2.txt",
    "parsed_5AB6AB_t1.txt",
    "parsed_5AB6AB_t2.txt"
]

# Plate layout file (the same layout is applied to every parsed file)
plate_layout_file = "plate_layout_FA02.txt"

# Loop over the files and process each; outputs are named "final_<parsed file name>"
for parsed_file in parsed_files:
    output_file = f"final_{parsed_file}"  # Output directly to the current directory
    transform_parsed_to_final(parsed_file, plate_layout_file, output_file)

Final parsed file saved to final_parsed_1AB2AB_t1.txt
Final parsed file saved to final_parsed_1AB2AB_t2.txt
Final parsed file saved to final_parsed_3AB4AB_t1.txt
Final parsed file saved to final_parsed_3AB4AB_t2.txt
Final parsed file saved to final_parsed_5AB6AB_t1.txt
Final parsed file saved to final_parsed_5AB6AB_t2.txt


In [None]:
## renaming final_parsed_xxxxxx_t1/2.txt -> FA02_xxxxxx_t1/2.txt

# List of files to copy
files_to_copy = [
    "final_parsed_1AB2AB_t1.txt",
    "final_parsed_1AB2AB_t2.txt",
    "final_parsed_3AB4AB_t1.txt",
    "final_parsed_3AB4AB_t2.txt",
    "final_parsed_5AB6AB_t1.txt",
    "final_parsed_5AB6AB_t2.txt"
]

# Copy each file byte for byte under its FA02_ name; the originals are kept.
# A missing source file raises FileNotFoundError and stops the loop.
for file in files_to_copy:
    new_file_name = file.replace("final_parsed_", "FA02_")
    with open(file, "rb") as src, open(new_file_name, "wb") as dest:
        dest.write(src.read())
    print(f"Copied {file} to {new_file_name}")

Copied final_parsed_1AB2AB_t1.txt to FA02_1AB2AB_t1.txt
Copied final_parsed_1AB2AB_t2.txt to FA02_1AB2AB_t2.txt
Copied final_parsed_3AB4AB_t1.txt to FA02_3AB4AB_t1.txt
Copied final_parsed_3AB4AB_t2.txt to FA02_3AB4AB_t2.txt
Copied final_parsed_5AB6AB_t1.txt to FA02_5AB6AB_t1.txt
Copied final_parsed_5AB6AB_t2.txt to FA02_5AB6AB_t2.txt


In [None]:
## FA02_xxxxxx_t1/2.txt -> FA02_parsed_fitness_data.txt

# Fitness estimation function
def fit_est(f1, f0, t):
    """
    Per-unit-time change in log-odds between two frequencies.

    Computes (1 / t) * (logit(f1) - logit(f0)), where logit(f) = log(f / (1 - f)).

    Args:
        f1: Frequency at the later time point (scalar or array-like).
        f0: Frequency at the earlier time point (scalar or array-like).
        t: Elapsed time separating the two measurements.

    Returns:
        The fitness estimate, with the same shape as the inputs.
    """
    logit_late = np.log(f1 / (1 - f1))
    logit_early = np.log(f0 / (1 - f0))
    return (1 / t) * (logit_late - logit_early)

# Define file paths for time 1 and time 2 (glob patterns, relative to the working directory)
path_t1 = "FA02*t1.txt"
path_t2 = "FA02*t2.txt"

# Read all time 1 and time 2 files (sorted so both lists follow the same file order)
files_t1 = sorted(glob.glob(path_t1))
files_t2 = sorted(glob.glob(path_t2))

data_t1 = [pd.read_csv(file, sep="\t") for file in files_t1]
data_t2 = [pd.read_csv(file, sep="\t") for file in files_t2]

# Combine all files into single DataFrames
t1_combined = pd.concat(data_t1, ignore_index=True)
t2_combined = pd.concat(data_t2, ignore_index=True)

# Add generation as a factor column
# NOTE(review): both 't' columns are dropped again right after the merge below.
t1_combined['t'] = 1
t2_combined['t'] = 10

# Merge time 1 and time 2 data on common columns (inner join; other columns get _t1/_t2 suffixes)
data_merged = pd.merge(t1_combined, t2_combined, on=['well_id', 'sample_id', 'fact'], suffixes=('_t1', '_t2'))

# Drop unnecessary columns (the long file names and the 't' markers)
data_merged = data_merged.drop(columns=[col for col in data_merged if 'name_long' in col or col in ['t_t1', 't_t2']])

# Rename columns (positional: relies on exactly these seven columns remaining, in this order)
data_merged.columns = ['well_id', 'sample_id', 'fact', 'cells_t1', 'ref_t1', 'cells_t2', 'ref_t2']

# Filter out rows where sample_id is 'BLANK'
data_filtered = data_merged[data_merged['sample_id'] != 'BLANK'].copy()

# Add environment and replicate columns (the two "_"-separated halves of fact, e.g. "1_A")
data_filtered.loc[:, 'env'] = data_filtered['fact'].str.split('_').str[0]
data_filtered.loc[:, 'rep'] = data_filtered['fact'].str.split('_').str[1]

# Add regime column
# NOTE(review): str.contains treats 'a1.1' as a regex, so the "." matches any character.
data_filtered.loc[:, 'regime'] = pd.NA
data_filtered.loc[data_filtered['sample_id'].str.contains('a1.1', na=False), 'regime'] = 'asexual'
data_filtered.loc[data_filtered['sample_id'].str.contains('a3', na=False), 'regime'] = 'sexual'

# Flag ancestors, blanks, and references
# NOTE(review): unlike the regime tests above, the 'MJM64' and 'REF_ALONE' tests do not
# pass na=False, so a missing sample_id would end up flagged as both ancestor and
# reference - confirm whether missing ids can occur here.
# NOTE(review): BLANK rows were already removed above, so 'blank' is always 0.
data_filtered.loc[:, 'anc'] = np.where(data_filtered['sample_id'].str.contains('MJM64'), 1, 0)
data_filtered.loc[:, 'blank'] = np.where(data_filtered['sample_id'] == 'BLANK', 1, 0)
data_filtered.loc[:, 'ref'] = np.where(data_filtered['sample_id'].str.contains('REF_ALONE'), 1, 0)

# Add kk_well_id column (second "_"-separated part of sample_id; blanked for ancestors and references)
data_filtered.loc[:, 'kk_well_id'] = data_filtered['sample_id'].str.split('_').str[1]
data_filtered.loc[(data_filtered['anc'] == 1) | (data_filtered['ref'] == 1), 'kk_well_id'] = np.nan

# Replace environment mapping using .loc (plate number -> environment name)
environment_mapping = {
    '1': 'FLC4', '2': 'SC30C', '3': 'SC37C', '4': 'lowP', '5': 'SC_pH7.3', '6': 'SC_0.2M_NaCl'
}
data_filtered.loc[:, 'env'] = data_filtered['env'].replace(environment_mapping)

# Set plate and assay columns using .loc
#data_filtered.loc[:, 'plate'] = data_filtered['env'] + '_' + data_filtered['rep']
data_filtered.loc[:, 'assay'] = 'FA02'

# Rename 'fact' column without inplace modification (the original fact label serves as the plate id)
data_filtered = data_filtered.rename(columns={'fact': 'plate'})

# Estimate fitness from the (cells - ref) / cells fraction at each time point
data_filtered.loc[:, 's_hat'] = fit_est(
    f0=(data_filtered['cells_t1'] - data_filtered['ref_t1']) / data_filtered['cells_t1'],
    f1=(data_filtered['cells_t2'] - data_filtered['ref_t2']) / data_filtered['cells_t2'],
    t=10
)

# Reference-only wells get no estimate
data_filtered.loc[data_filtered['ref'] == 1, 's_hat'] = np.nan

# Assign 'kk_pop_id' with .loc (only evolved populations get an id; ancestors and references stay None)
data_filtered.loc[:, 'kk_pop_id'] = None
data_filtered.loc[(data_filtered['regime'] == 'sexual') & (data_filtered['anc'] == 0) & (data_filtered['ref'] == 0), 'kk_pop_id'] = \
    'a3_' + data_filtered['kk_well_id']
data_filtered.loc[(data_filtered['regime'] == 'asexual') & (data_filtered['anc'] == 0) & (data_filtered['ref'] == 0), 'kk_pop_id'] = \
    'a1.1_' + data_filtered['kk_well_id']

# Relevant columns for export (same set and order as the FA01 export)
columns_to_export = ['kk_well_id', 'assay', 'regime', 'plate', 'env', 'rep', 'anc', 'ref', 's_hat', 'kk_pop_id']
data_export = data_filtered[columns_to_export]

def format_row(row):
    """
    Render the values of one record as CSV cell strings.

    - Missing values (per pd.isnull) and the literal string "nan" become NA.
    - Any other string is wrapped in double quotes (this includes the string "NA").
    - Every other value is converted with str().
    """
    cells = []
    for val in row:
        if pd.isnull(val) or val == "nan":
            cells.append("NA")
        elif isinstance(val, str):
            cells.append(f'"{val}"')
        else:
            cells.append(str(val))
    return cells

# Save the data to a .txt file with correct formatting
with open("FA02_parsed_fitness_data.txt", "w") as f:
    # Quoted column names first, then one comma-joined line per record
    f.write(",".join(f'"{col}"' for col in columns_to_export) + "\n")
    f.writelines(",".join(format_row(row)) + "\n" for _, row in data_export.iterrows())

print("Data saved to FA02_parsed_fitness_data.txt")

Data saved to FA02_parsed_fitness_data.txt


# Combining Fitness Data

Going from `FA01_parsed_fitness_data.txt` and `FA02_parsed_fitness_data.txt` to `parsed_pop_fitness_data.csv`.

In [None]:
# --- load + combine (like R: YPD <- FA01, AWAY <- FA02, PD <- rbind) ---
fa01 = pd.read_csv("FA01_parsed_fitness_data.txt")
fa02 = pd.read_csv("FA02_parsed_fitness_data.txt")

# R: filter(env != 'FLC4', ref == 0), applied straight after stacking the two assays
PD = (
    pd.concat([fa01, fa02], ignore_index=True)
    .loc[lambda stacked: (stacked["env"] != "FLC4") & (stacked["ref"] == 0)]
    .copy()
)

def plate_effects_ml(df, fixed_formula):
    """
    Fit a mixed model for s_hat and return the estimated per-plate effects.

    Intended to mimic glmmTMB(..., family=gaussian) with the random intercepts
      (1|kk_pop_id) + (1|plate)
    by putting every row in one group and declaring kk_pop_id and plate as
    variance components. Fitted by ML (reml=False) to match the glmmTMB default.

    Args:
        df (pd.DataFrame): Data with s_hat, regime, kk_pop_id and plate columns;
            rows missing any of these are dropped before fitting.
        fixed_formula (str): Right-hand side of the fixed-effects formula.

    Returns:
        dict: Plate label -> estimated random effect for that plate.
    """
    model_data = df.dropna(subset=["s_hat", "regime", "kk_pop_id", "plate"]).copy()
    model_data["all"] = 1  # single grouping level; use variance components for kk_pop_id and plate

    model = smf.mixedlm(
        f"s_hat ~ {fixed_formula}",
        model_data,
        groups=model_data["all"],
        vc_formula={"kk": "0 + C(kk_pop_id)", "plate": "0 + C(plate)"},
        re_formula="0",
    )
    fitted = model.fit(reml=False, method="lbfgs", maxiter=500, disp=False)

    # Effects of the only group, whose label is the constant 1 assigned above
    group_effects = fitted.random_effects[1]

    plate_effects = {}
    for label, effect in group_effects.items():
        if not label.startswith("plate"):
            continue
        # keys look like: "plate[C(plate)[YPD_A]]" -> "YPD_A"
        plate_effects[label.split("[")[-1].rstrip("]")] = effect
    return plate_effects

# --- estimate plate effects per assay (exactly like the Rmd) ---
PD01 = PD[PD["assay"] == "FA01"]
PD02 = PD[PD["assay"] == "FA02"]

# R: FA01 model: s_hat ~ regime + (1|kk_pop_id) + (1|plate)
plate1 = plate_effects_ml(PD01, "C(regime)")

# R: FA02 model: s_hat ~ regime*env + (1|kk_pop_id) + (1|plate)
plate2 = plate_effects_ml(PD02, "C(regime)*C(env)")

# R: PD2$s_hat_adj <- PD2$s_hat - PD2$plate_mean
# Plates absent from both dictionaries map to NaN, which propagates into s_hat_adj.
PD["plate_mean"] = PD["plate"].map({**plate1, **plate2})
PD["s_hat_adj"] = PD["s_hat"] - PD["plate_mean"]

# R: MATa <- filter(PD2, anc==1) %>% group_by(env) %>% summarise(mu_s = mean(s_hat_adj), ...)
# Per-environment mean and standard deviation of the ancestor's adjusted fitness
MATa = (
    PD[PD["anc"] == 1]
    .groupby("env", as_index=False)["s_hat_adj"]
    .agg(mu_s="mean", sd_s="std")
)

# R: PD3 <- merge(PD2, MATa, by='env'); PD3$fitness_gain <- PD3$s_hat_adj - PD3$mu_s
# Left join keeps every row of PD; fitness_gain is measured against the ancestor mean of the same env
PD = PD.merge(MATa, on="env", how="left", sort=False)
PD["fitness_gain"] = PD["s_hat_adj"] - PD["mu_s"]

# R: PD4 <- filter(PD3, anc==0, ref==0) %>% group_by(kk_pop_id, env) %>% summarise(mean/sd of fitness_gain)
out = (
    PD[(PD["anc"] == 0) & (PD["ref"] == 0)]
    .groupby(["kk_pop_id", "env"], as_index=False)["fitness_gain"]
    .agg(fitness_gain_mu_pop="mean", fitness_gain_sd_pop="std")
)

# One row per (population, environment) with the mean and sd of its fitness gain
out.to_csv("parsed_pop_fitness_data.csv", index=False)

