In [None]:
import pandas as pd
import numpy as np
import os
import re
import csv
import math
import glob
import statsmodels.formula.api as smf

# FA03
Fitness assays of clones in YPD, SC 30C, SC 37C, SC +NaCl, SC pH 7.3, SC -P.

Going from `FA03_1AB2AB_T1.txt, FA03_1AB2AB_T2.txt, FA03_3AB4AB_T1.txt, FA03_3AB4AB_T2.txt, FA03_5AB6AB_T1.txt, FA03_5AB6AB_T2.txt` to `parsed_clone_fitness_data_FA03.csv` (via the intermediate file `FA03_parsed_fitness_data.txt`).

Needs `plate_layout_FA03.txt` as well as `FA01_parsed_fitness_data.txt` and `FA02_parsed_fitness_data.txt` (outputs of `population_fitness_processing.ipynb`).


In [None]:
## FA03_*_T1/T2.txt -> FA03_parsed_fitness_data.txt  (FA03 plate layout aware)

# Fitness estimation function (same form as FA01/FA02)
def fit_est(f1, f0, t):
    """Estimate fitness as the change in log-odds between two frequencies per unit of t.

    f1 : frequency at the later time point (scalar or array-like).
    f0 : frequency at the earlier time point (scalar or array-like).
    t  : divisor applied to the log-odds difference.

    Both frequencies are clipped to [1e-9, 1 - 1e-9] so the logit stays finite
    when a frequency is exactly 0 or 1.
    """
    tiny = 1e-9
    lower, upper = tiny, 1 - tiny
    start = np.clip(f0, lower, upper)
    end = np.clip(f1, lower, upper)
    logit_end = np.log(end / (1 - end))
    logit_start = np.log(start / (1 - start))
    return (1 / t) * (logit_end - logit_start)

# ---- input patterns ----
path_t1 = "FA03_*_T1.txt"
path_t2 = "FA03_*_T2.txt"
plate_layout_path = "plate_layout_FA03.txt"  # FA03-specific layout (different from FA01/FA02)

# ---- load all T1/T2 files ----
files_t1 = sorted(glob.glob(path_t1))
files_t2 = sorted(glob.glob(path_t2))

# Fail early with a readable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate".
if not files_t1 or not files_t2:
    raise FileNotFoundError(
        f"Expected files matching {path_t1!r} and {path_t2!r} in {os.getcwd()!r}; "
        f"found {len(files_t1)} T1 and {len(files_t2)} T2 file(s)."
    )

# Each file is read as a tab-separated table.
data_t1 = [pd.read_csv(file, sep="\t") for file in files_t1]
data_t2 = [pd.read_csv(file, sep="\t") for file in files_t2]

# Stack the per-file tables of each time point into one frame.
t1_combined = pd.concat(data_t1, ignore_index=True)
t2_combined = pd.concat(data_t2, ignore_index=True)

# Merge time 1 and time 2 data on common columns (inner join, so rows present
# at only one time point are not kept). Non-key columns get _t1/_t2 suffixes.
data_merged = pd.merge(
    t1_combined, t2_combined,
    on=['well_id', 'sample_id', 'fact'],
    suffixes=('_t1', '_t2')
)

# Drop extra columns (name_long)
drop_cols = [c for c in data_merged.columns if 'name_long' in c]
data_merged = data_merged.drop(columns=drop_cols)

# Rename columns to explicit T1/T2 names (FA03 uses 'dark' not 'ref').
# The rename is positional, so check the column count first and report the
# actual columns if the inputs do not have the expected shape.
merged_column_names = [
    'well_id', 'sample_id', 'fact',
    'cells_t1', 'dark_t1',
    'cells_t2', 'dark_t2'
]
if len(data_merged.columns) != len(merged_column_names):
    raise ValueError(
        f"Expected {len(merged_column_names)} columns after merging T1/T2, "
        f"got {list(data_merged.columns)}"
    )
data_merged.columns = merged_column_names

# Filter out blanks (keep REF and ancestors)
data_filtered = data_merged[data_merged['sample_id'] != 'BLANK'].copy()

# --- FA03 plate layout handling ---
# FA03 has a different layout; use it to (a) validate sample_id vs expected, (b) fill any missing IDs.
def read_plate_layout(layout_file):
    """Read a plate layout file into a dict mapping well_id -> expected sample_id.

    The file is read as whitespace-separated text. Blank lines are skipped, and
    the first eight non-blank lines are taken as plate rows 'A'..'H'. Entries
    within a row are numbered from 1 and zero-padded to two digits, giving
    keys such as 'A01' or 'H12'. Any lines after the eighth are ignored.

    Rows may differ in length: each row is numbered independently, so a row
    longer than the first one no longer raises IndexError.

    Raises
    ------
    ValueError
        If the file contains no non-blank lines.
    """
    with open(layout_file, 'r') as fh:
        lines = [line.strip() for line in fh if line.strip()]
    if not lines:
        raise ValueError(f"Plate layout file {layout_file!r} contains no rows")

    mapping = {}
    # zip stops at the shorter input, which limits the layout to rows A-H.
    for row_letter, line in zip("ABCDEFGH", lines):
        # str.split() with no argument splits on any run of whitespace (incl. tabs).
        for col_number, entry in enumerate(line.split(), start=1):
            mapping[f"{row_letter}{col_number:02d}"] = entry
    return mapping

layout_map = read_plate_layout(plate_layout_path)

# Expected sample_id for each row, looked up from the layout by well_id
# (NaN where the well_id is not a key of layout_map).
data_filtered.loc[:, 'layout_sample_id'] = data_filtered['well_id'].map(layout_map)

# Rows whose sample_id is missing in the input files; these are filled from the layout below.
sample_id_missing = data_filtered['sample_id'].isna()

# If sample_id disagrees with the layout, keep the file's sample_id but flag the row.
# Missing sample_ids are excluded here: NaN != <layout value> is always True, so
# without the explicit exclusion every filled-in row would be flagged as a mismatch.
data_filtered.loc[:, 'layout_mismatch'] = (
    ~sample_id_missing &
    data_filtered['layout_sample_id'].notna() &
    (data_filtered['sample_id'] != data_filtered['layout_sample_id'])
)

# If sample_id is missing, fill it from the layout.
data_filtered.loc[sample_id_missing, 'sample_id'] = data_filtered.loc[sample_id_missing, 'layout_sample_id']
# --- normalize sample_id to match prior parsing ---
def normalize_sample_id(s):
    """Return a canonical text form of a sample ID.

    The value is converted with str() and stripped of surrounding whitespace.
    'REF' and anything starting with 'MJM' (e.g. MJM##_MATa / MJM##_MATalpha)
    are returned unchanged. For every other ID, 'A1.1' becomes 'a1.1',
    '_A3_' becomes '_a3_', the second '_'-separated field is lower-cased and
    the third is upper-cased.
    """
    token = str(s).strip()
    if token == "REF" or token.startswith("MJM"):
        return token

    # Bring the A1.1 / A3 tokens into a consistent case before splitting.
    token = re.sub(r'_(A3)_', r'_a3_', token.replace("A1.1", "a1.1"))

    fields = token.split("_")
    # Field 1 -> lower case, field 2 -> upper case (when those fields exist).
    for position, recase in ((1, str.lower), (2, str.upper)):
        if position < len(fields):
            fields[position] = recase(fields[position])
    return "_".join(fields)

# Normalise every sample_id so the string matching below sees one consistent format.
data_filtered.loc[:, 'sample_id'] = data_filtered['sample_id'].map(normalize_sample_id)

# Add environment and replicate columns (first and second '_'-separated fields of 'fact')
data_filtered.loc[:, 'env'] = data_filtered['fact'].str.split('_').str[0]
data_filtered.loc[:, 'rep'] = data_filtered['fact'].str.split('_').str[1]

# Add regime column.
# regex=False makes the match literal: with the default regex=True the '.' in
# 'a1.1' is a wildcard and would also match IDs such as 'a1x1'.
# The 'a3' assignment runs second, so it wins if an ID contains both tokens.
data_filtered.loc[:, 'regime'] = pd.NA
data_filtered.loc[data_filtered['sample_id'].str.contains('a1.1', regex=False, na=False), 'regime'] = 'asexual'
data_filtered.loc[data_filtered['sample_id'].str.contains('a3', regex=False, na=False), 'regime'] = 'sexual'

# Flag ancestors and references (FA03 ancestors contain 'MJM'; REF is exactly 'REF').
# Only IDs of the form MJM<digits>_MATa count as ancestors; MJM<digits>_MATalpha does not match.
data_filtered.loc[:, 'anc'] = np.where(data_filtered['sample_id'].str.match(r'^MJM\d+_MATa$', na=False), 1, 0)
data_filtered.loc[:, 'ref'] = np.where(data_filtered['sample_id'] == 'REF', 1, 0)

# Add kk_well_id column (FA03 sample_id pattern: <batch>_<regime>_<WELL>_<clone>).
# IDs with fewer than three fields get NaN; ancestors and REF are blanked explicitly.
data_filtered.loc[:, 'kk_well_id'] = data_filtered['sample_id'].str.split('_').str[2]
data_filtered.loc[(data_filtered['anc'] == 1) | (data_filtered['ref'] == 1), 'kk_well_id'] = np.nan

# Replace environment mapping (match prior FA03 parsing)
# Keys are the env tokens taken from the first field of 'fact'; values are the
# environment labels written to the output. Series.replace leaves any value
# that is not a key of this dict unchanged.
environment_mapping = {
    'YPD': 'YPD',
    'SC': 'SC30C',
    'SC37': 'SC37C',
    'lowP': 'lowP',
    'pH7.3': 'SC_pH7.3',
    'NaCl': 'SC_0.2M_NaCl'
}
data_filtered.loc[:, 'env'] = data_filtered['env'].replace(environment_mapping)

# Set assay and plate columns (match FA02 style)
data_filtered.loc[:, 'assay'] = 'FA03'
# From here on the original 'fact' column is only available under the name 'plate'.
data_filtered = data_filtered.rename(columns={'fact': 'plate'})

# compute s_hat using dark/cells fractions (FA03)
# f0 / f1 are the dark fraction of all counted cells at T1 / T2; fit_est clips
# them away from 0 and 1 before taking log-odds.
# NOTE(review): t=10 is presumably the number of generations between T1 and T2 -- confirm.
data_filtered.loc[:, 's_hat'] = fit_est(
    f0=(data_filtered['dark_t1']) / data_filtered['cells_t1'],
    f1=(data_filtered['dark_t2']) / data_filtered['cells_t2'],
    t=10
)
# Keep a copy that still holds the REF rows' values; 's_hat' itself is blanked
# for REF rows on the next line.
data_filtered.loc[:, 's_hat_for_analysis'] = data_filtered['s_hat'].copy()
data_filtered.loc[data_filtered['ref'] == 1, 's_hat'] = np.nan

# Assign kk_pop_id (match FA02 logic, but FA03 well ID is in kk_well_id)
# Order matters: the column starts as None, evolved clones get
# '<regime token>_<kk_well_id>', then REF rows and ancestor rows overwrite
# whatever was set before. Rows matching none of the rules keep None.
data_filtered.loc[:, 'kk_pop_id'] = None
data_filtered.loc[(data_filtered['regime'] == 'sexual') & (data_filtered['anc'] == 0) & (data_filtered['ref'] == 0), 'kk_pop_id'] = \
    'a3_' + data_filtered['kk_well_id']
data_filtered.loc[(data_filtered['regime'] == 'asexual') & (data_filtered['anc'] == 0) & (data_filtered['ref'] == 0), 'kk_pop_id'] = \
    'a1.1_' + data_filtered['kk_well_id']
data_filtered.loc[(data_filtered['ref'] == 1), 'kk_pop_id'] = 'REF'
# Ancestors use their own sample_id as the population ID.
data_filtered.loc[(data_filtered['anc'] == 1), 'kk_pop_id'] = data_filtered.loc[(data_filtered['anc'] == 1), 'sample_id']

# Ensure MATalpha control (e.g., MJM36_MATalpha) gets a stable kk_pop_id (it has regime = NA)
# These rows are not flagged as ancestors (the 'anc' pattern requires the ID to
# end in 'MATa'), so they are handled separately and also use sample_id.
mask_matalpha = (
    data_filtered['sample_id'].str.match(r'^MJM\d+_MATalpha$', na=False) &
    (data_filtered['anc'] == 0) &
    (data_filtered['ref'] == 0)
)
data_filtered.loc[mask_matalpha, 'kk_pop_id'] = data_filtered.loc[mask_matalpha, 'sample_id']

# Relevant columns for export, in output order.
# Note: the layout QC fields ('layout_sample_id', 'layout_mismatch') and
# 's_hat_for_analysis' stay on data_filtered only; they are NOT exported.
columns_to_export = [
    'kk_well_id','sample_id','assay','regime','plate','env','rep','anc','ref','s_hat','kk_pop_id'
]
data_export = data_filtered[columns_to_export]

def format_row(row):
    """Render one row as a list of text cells in the FAxx_parsed_fitness_data.txt style.

    Missing values (anything pd.isnull reports, or the literal string "nan")
    become NA, other strings are wrapped in double quotes, and every remaining
    value is converted with str().
    """
    def render(cell):
        if pd.isnull(cell) or cell == "nan":
            return "NA"
        return f'"{cell}"' if isinstance(cell, str) else str(cell)

    return [render(cell) for cell in row]

# Write the export table as comma-separated text: a quoted header line first,
# then one line per row as rendered by format_row.
out_txt = "FA03_parsed_fitness_data.txt"
header_line = ",".join(f'"{col}"' for col in columns_to_export)
with open(out_txt, "w") as f:
    f.write(header_line + "\n")
    # Rows are streamed to the file one at a time, in data_export order.
    f.writelines(
        ",".join(format_row(record)) + "\n"
        for _, record in data_export.iterrows()
    )

print(f"Data saved to {out_txt}")
#print(f"Layout mismatches flagged: {int(data_export['layout_mismatch'].sum())}")

Data saved to FA03_parsed_fitness_data.txt


In [None]:
## FA03_parsed_fitness_data.txt -> parsed_clone_fitness_data_FA03.csv

# ---------- helper: extract plate BLUPs (like your plate_effects_ml) ----------
def plate_effects_ml(df, fixed_formula):
    """Fit a mixed model by maximum likelihood and return the estimated plate effects.

    df            : frame with at least the columns s_hat, regime, kk_pop_id and
                    plate, plus whatever `fixed_formula` references. Rows with a
                    missing value in any of those four columns are dropped first.
    fixed_formula : right-hand side of the fixed-effects formula, e.g. "C(regime)".

    Returns a dict mapping each plate label (the text after the last '[' of the
    random-effect name, with trailing ']' removed) to its estimated effect.
    """
    model_df = df.dropna(subset=["s_hat", "regime", "kk_pop_id", "plate"]).copy()
    # Every row goes into one group (value 1) so that kk_pop_id and plate are
    # both fitted as variance components of that single group.
    model_df["all"] = 1
    variance_components = {"kk": "0 + C(kk_pop_id)", "plate": "0 + C(plate)"}
    model = smf.mixedlm(
        f"s_hat ~ {fixed_formula}",
        model_df,
        groups=model_df["all"],
        vc_formula=variance_components,
        re_formula="0",
    )
    fitted = model.fit(reml=False, method="lbfgs", maxiter=500, disp=False)

    # random_effects is keyed by group label; the only group here is 1.
    plate_effects = {}
    for effect_name, effect in fitted.random_effects[1].items():
        if effect_name.startswith("plate"):
            plate_effects[effect_name.split("[")[-1].rstrip("]")] = effect
    return plate_effects

# ---------- 1) compute mu_s(env) from FA01 + FA02 (SP does this) ----------
fa01 = pd.read_csv("FA01_parsed_fitness_data.txt")
fa02 = pd.read_csv("FA02_parsed_fitness_data.txt")
PD = pd.concat([fa01, fa02], ignore_index=True)

# SP: filter(env != 'FLC4', ref == 0)
PD = PD[(PD["env"] != "FLC4") & (PD["ref"] == 0)].copy()

PD01 = PD[PD["assay"] == "FA01"].copy()
PD02 = PD[PD["assay"] == "FA02"].copy()

# Plate effects are estimated separately per assay, with a different
# fixed-effects structure for each.
plate1 = plate_effects_ml(PD01, "C(regime)")
plate2 = plate_effects_ml(PD02, "C(regime)*C(env)")

# Apply each assay's plate effects only to that assay's rows. Merging both
# dicts into one lookup would let an FA02 plate silently overwrite an FA01
# plate that happens to carry the same label.
# PD01/PD02 keep PD's index, so the .loc assignments align row by row.
PD["plate_mean"] = np.nan
PD.loc[PD["assay"] == "FA01", "plate_mean"] = PD01["plate"].map(plate1)
PD.loc[PD["assay"] == "FA02", "plate_mean"] = PD02["plate"].map(plate2)
PD["s_hat_adj"] = PD["s_hat"] - PD["plate_mean"]

# SP: MATa baseline by env
# mu_s = mean plate-adjusted s_hat of the ancestor rows (anc == 1) in each env.
MATa = (
    PD[PD["anc"] == 1]
    .groupby("env", as_index=False)["s_hat_adj"]
    .agg(mu_s="mean")
)

# ---------- 2) load FA03 parsed fitness, merge mu_s, fit FA03 plate model ----------
CD = pd.read_csv("FA03_parsed_fitness_data.txt", quotechar='"')

# merge mu_s onto FA03 (CD2 in Rmd); envs without a baseline get mu_s = NaN
CD2 = CD.merge(MATa, on="env", how="left", sort=False)

# Fit FA03 plate effects like R: s_hat ~ regime*env + (1|kk_pop_id) + (1|plate)
# plate_effects_ml drops rows with missing s_hat/regime/kk_pop_id/plate before
# fitting. Calling the helper (instead of repeating the model code inline) also
# avoids binding a module-level variable named `re`, which would overwrite the
# imported `re` module and break re-running the parsing cell.
plate_effect_FA03 = plate_effects_ml(CD2, "C(regime)*C(env)")

# ---------- 3) compute unaggregated clone fitness_gain (CD3 in Rmd) ----------
CD3 = CD2.copy()
# Plates with no entry in plate_effect_FA03 map to NaN, which propagates into fitness_gain.
CD3["plate_means"] = CD3["plate"].map(plate_effect_FA03)
# fitness_gain = s_hat minus the plate effect minus the per-env baseline mu_s.
CD3["fitness_gain"] = CD3["s_hat"] - CD3["plate_means"] - CD3["mu_s"]

# # write unaggregated (matches Rmd intent)
# CD3.to_csv("parsed_unagg_clone_fitness_data_FA03.csv", index=False)

# ---------- 4) aggregate to final clone CSV (matches fitness_assay_processing.R) ----------
# Only evolved clones are aggregated: ancestor and REF rows are removed.
CD4 = CD3[(CD3["anc"] == 0) & (CD3["ref"] == 0)].copy()

# Capture first-appearance order of each clone group (R-like group order)
GROUP_KEY = ["kk_pop_id", "sample_id", "env"]
group_order = CD4[GROUP_KEY].drop_duplicates(keep="first").reset_index(drop=True)
group_order["__order"] = np.arange(len(group_order))

# Per-group mean and sample standard deviation (ddof=1) of fitness_gain,
# ignoring NaN values. A group with a single non-NaN value gets NaN for the SD.
# NOTE(review): groupby is called without dropna=False, so under pandas'
# default rows whose key contains a missing value (e.g. kk_pop_id) form no group.
clone_summary = (
    CD4.groupby(["kk_pop_id", "sample_id", "env"], as_index=False, sort=False)["fitness_gain"]
    .agg(
        fitness_gain_avg=lambda x: np.nanmean(x),
        fitness_gain_sd=lambda x: np.nanstd(x, ddof=1),
    )
)

# attach regime/anc/ref like the R aggregation does (first())
meta = CD4.groupby(["kk_pop_id", "sample_id", "env"], as_index=False, sort=False).agg(
    regime=("regime", "first"),
    anc=("anc", "first"),
    ref=("ref", "first"),
)
clone_summary = clone_summary.merge(meta, on=["kk_pop_id", "sample_id", "env"], how="left")

# Use the nullable string dtype, trim whitespace and turn empty strings into <NA>.
clone_summary["regime"] = (clone_summary["regime"].astype("string").str.strip().replace("", pd.NA))

# Apply the R-like group order
# (mergesort is stable, so rows with equal __order keep their current relative order)
clone_summary = (
    clone_summary
    .merge(group_order, on=GROUP_KEY, how="left")
    .sort_values("__order", kind="mergesort")
    .drop(columns="__order")
)

# reorder columns like SP clone output
clone_summary = clone_summary[
    ["sample_id","env","regime","anc","ref","kk_pop_id","fitness_gain_avg","fitness_gain_sd"]
]

# Sort environments in this fixed order rather than alphabetically.
# Any env value not listed here becomes NaN when converted to the Categorical.
env_order = ["lowP", "SC_0.2M_NaCl", "SC_pH7.3", "SC30C", "SC37C", "YPD"]
clone_summary["env"] = pd.Categorical(clone_summary["env"], categories=env_order, ordered=True)
# Final row order: by sample_id, then env; the stable sort keeps the earlier
# group order for rows that tie on both.
clone_summary = clone_summary.sort_values(["sample_id", "env"], kind="mergesort")

# Number rows 1..n and write that index as the first (unnamed) CSV column;
# missing values are written as NA.
clone_summary.index = np.arange(1, len(clone_summary) + 1)
clone_summary.to_csv("parsed_clone_fitness_data_FA03.csv", index=True, na_rep="NA")
print("Wrote parsed_clone_fitness_data_FA03.csv")



Wrote parsed_clone_fitness_data_FA03.csv
