In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler


1. DATA PROCESSING

In [None]:
# Read the three CSV exports used by this notebook in one pass.
absorb, cat, gain_70 = (
    pd.read_csv(csv_name) for csv_name in ("abs.csv", "cat.csv", "gain_70.csv")
)

# Quick previews; because they are not the last expression of the cell they
# are evaluated but not displayed.
absorb.head()
cat.head()
gain_70.head()

# Seed a frame with one row per column label of the absorbance table,
# skipping its first two and last three columns.
new_rows = absorb.columns[2:-3]
df = pd.DataFrame({"well": new_rows})
'''
OD_init: Initial optical density (OD) measurement at time 0 at 600nm
OD_1h: OD measurement at 1 hour (lag phase)
OD_2h: OD measurement at 2 hours (lag phase)
OD_3h: OD measurement at 3 hours (early log phase)
OD_4h: OD measurement at 4 hours (early log phase)
growth_rate: Slope of the OD curve at 4 hours modelled with log scale
lag: Duration of the lag phase (time until OD starts to increase)
AUC_4h: Area under the OD curve from 0 to 4 hours
curvature_4h: Curvature of the OD curve at 4 hours : slope of (0-2)hours - slope of (2-4)hours
outcome: final fluorescence /final OD (protein/cell) at ~45 hrs
'''
# Process the absorbance data into per-well, blank-corrected, smoothed OD
# values sampled near 1, 2, 3 and 4 hours.

# Inputs taken straight from the plate map: media_type, OD_init.
# NOTE(review): df_merged is rebuilt from `cat` in step 9 of this cell, so this
# preliminary merge only matters if df_merged is inspected before then.
df_merged = df.merge(cat[['well',"OD_init","media_type","function"]], on='well', how='left')

# 1) long format: one row per (timepoint, well); only labels A1..H12 are kept
well_cols = absorb.columns[absorb.columns.str.match(r"^[A-H](?:[1-9]|1[0-2])$")]
long = absorb[["Time", *well_cols]].melt(id_vars="Time", var_name="well", value_name="OD")

# Coerce readings to numeric BEFORE any aggregation. A single non-numeric cell
# makes the melted column object-dtype, and the blank mean in step 4 would then
# fail (or be computed on strings) if the coercion only happened afterwards.
long["OD"] = pd.to_numeric(long["OD"], errors="coerce")

# 2) time in hours
long["time_h"] = pd.to_timedelta(long["Time"]).dt.total_seconds() / 3600

# 3) attach metadata; normalise `function` so the label comparisons below are
#    insensitive to case and surrounding whitespace
long = long.merge(cat[["well", "media_type", "function"]], on="well", how="left")
long["function"] = long["function"].astype(str).str.lower().str.strip()

# 4) blank OD per (media_type, time_h): mean over all wells labelled "blank"
blank_ref = (
    long[long["function"].eq("blank")]
    .groupby(["media_type", "time_h"], as_index=False)["OD"]
    .mean()
    .rename(columns={"OD": "OD_blank"})
)

long = long.merge(blank_ref, on=["media_type", "time_h"], how="left")

# 5) blank subtraction first (exclude standards, which keep their raw OD).
#    Rows whose media_type has no blank at that timepoint end up NaN.
long["OD_bs"] = long["OD"]
mask = ~long["function"].eq("standard")
long.loc[mask, "OD_bs"] = long.loc[mask, "OD"] - long.loc[mask, "OD_blank"]

# 6) smooth corrected OD per well before extracting OD_1h to OD_4h.
#    Centered 5-point rolling mean; min_periods=1 keeps the edge points.
long = long.sort_values(["well", "time_h"])
long["OD_smooth"] = long.groupby("well")["OD_bs"].transform(
    lambda s: s.rolling(window=5, center=True, min_periods=1).mean()
)

# 7) OD closest to target hours from smoothed values.
#    idxmin returns index labels, which are still unique after the sort above,
#    so .loc picks exactly one row per well.
targets = [1.0, 2.0, 3.0, 4.0]
od_tables = []
for t in targets:
    idx = long.groupby("well")["time_h"].apply(lambda s: (s - t).abs().idxmin())
    od_t = long.loc[idx, ["well", "OD_smooth"]].rename(columns={"OD_smooth": f"OD_{int(t)}h"})
    od_tables.append(od_t)

# 8) specific growth rate = slope of time_h vs log(OD_smooth)
def calc_mu(g, *, t_min=None, t_max=None):
    """Return the slope of log(OD_smooth) against time_h for one well.

    Parameters
    ----------
    g : DataFrame
        Rows for a single well; must contain the columns ``time_h`` and
        ``OD_smooth``. Values that cannot be parsed as numbers are ignored.
    t_min, t_max : float, optional
        Inclusive bounds on ``time_h`` for the points used in the fit. With
        both left as ``None`` (the default) every usable point is fitted.

    Returns
    -------
    float
        Least-squares slope of ``log(OD_smooth)`` on ``time_h``, or ``nan``
        when fewer than two distinct timepoints remain after filtering.
    """
    x = pd.to_numeric(g["time_h"], errors="coerce").to_numpy(dtype=float, na_value=np.nan)
    y = pd.to_numeric(g["OD_smooth"], errors="coerce").to_numpy(dtype=float, na_value=np.nan)

    # log() needs strictly positive, finite OD; isfinite also rejects NaN.
    valid = np.isfinite(x) & np.isfinite(y) & (y > 0)
    if t_min is not None:
        valid &= x >= t_min
    if t_max is not None:
        valid &= x <= t_max

    x_fit = x[valid]
    # A line needs two distinct x values; otherwise the fit is rank-deficient
    # and the returned "slope" would be meaningless.
    if np.unique(x_fit).size < 2:
        return np.nan
    return np.polyfit(x_fit, np.log(y[valid]), 1)[0]

# One growth-rate value per well.
# Only the two columns calc_mu reads are selected before apply. This keeps the
# grouping column out of the frame passed to the function (applying over it is
# deprecated in recent pandas). The `well` column is then rebuilt from the
# group index, so it is present regardless of how a given pandas version
# treats as_index=False together with apply.
mu = (
    long.groupby("well")[["time_h", "OD_smooth"]]
    .apply(calc_mu)
    .rename("specific_growth_rate")
    .reset_index()
)

# 9) final well-level table: plate-map metadata, then OD_1h..OD_4h, then the
#    growth rate, all left-joined on well so every plate-map row is kept
df_merged = cat.copy()
for od_t in od_tables:
    df_merged = df_merged.merge(od_t, on="well", how="left")
df_merged = df_merged.merge(mu, on="well", how="left")
df_merged.head()




Unnamed: 0,well,OD_init,media_type,function,OD_1h
0,H11,22.5,YPG,trial_1,0.304
1,G2,22.5,YPG,trial_2,0.3
2,D3,22.5,YPG,trial_3,0.319
3,F6,22.5,YPD,trial_1,0.283
4,D8,22.5,YPD,trial_2,0.295
