In [28]:
from pathlib import Path
import re, unicodedata
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupShuffleSplit, train_test_split
import joblib

# -------- canonical name normalizer (same as EDA) ----------
def canon(name: str) -> str:
    """Return a canonical snake_case identifier for a raw column header.

    Steps, in order: NFKC-normalize the text, remove zero-width / no-break
    spaces, trim and lower-case, map the micro sign and Greek mu to ``u``,
    collapse the known spellings of the photon-flux unit into
    ``umol_m2_s1``, and finally turn every run of characters outside
    ``[a-z0-9]`` into a single underscore (leading/trailing ones removed).

    Parameters
    ----------
    name : str
        Raw header; any other object is converted with ``str()`` first.

    Returns
    -------
    str
        The normalized name (may be empty if nothing alphanumeric remains).
    """
    text = unicodedata.normalize("NFKC", str(name))
    # NFKC already rewrites U+00A0 as a plain space, so in practice only the
    # zero-width space is removed here; both removals are kept to mirror EDA.
    for invisible in ("\u200b", "\xa0"):
        text = text.replace(invisible, "")
    text = text.strip().lower()
    text = text.translate(str.maketrans({"µ": "u", "μ": "u"}))
    # Unit spellings must be unified before punctuation is squashed, otherwise
    # "m-2" would come out as "m_2" instead of "m2".
    for spelling in ("umol m-2 s-1", "umol/m2/s", "umol m^-2 s^-1"):
        text = text.replace(spelling, "umol_m2_s1")
    text = re.sub(r"[^a-z0-9]+", "_", text)
    return re.sub(r"_+", "_", text).strip("_")

# -------- paths ----------
# The project root is taken as the parent of the current working directory,
# so every path below depends on where the kernel was started.
ROOT = Path.cwd().parents[0]
DATA = ROOT.joinpath("data", "processed")
MODELS = ROOT.joinpath("models")  # optional

# Cleaned dataset written by the EDA notebook
IN_CSV = DATA.joinpath("microalgae_dataset_clean.csv")
# Artifacts (arrays, targets, fitted pipeline) are stored next to the data
ART_DIR = DATA

# Name of the regression target column, same as in EDA
TARGET = "co2_fixation_g_l_day"


In [29]:
# Read the cleaned EDA export: semicolon-separated, and "utf-8-sig" strips a
# leading byte-order mark if the file has one.
df = pd.read_csv(IN_CSV, sep=";", encoding="utf-8-sig")
df.columns = [canon(c) for c in df.columns]

# Fail fast on a malformed load instead of hitting an opaque error later:
# - two different raw headers can canonicalize to the same name, and duplicate
#   labels make df[TARGET] return a DataFrame instead of a Series;
# - a wrong separator collapses the file into one column, so TARGET is absent.
dup_cols = df.columns[df.columns.duplicated()].unique().tolist()
if dup_cols:
    raise ValueError(f"duplicate column names after canon(): {dup_cols}")
if TARGET not in df.columns:
    raise KeyError(
        f"target column {TARGET!r} not found in {IN_CSV.name}; "
        f"columns: {df.columns.tolist()}"
    )

print(df.shape)
df.head()


(300, 12)


Unnamed: 0,sample_id,day_index,air_temp_c,light_intensity_umol_m2_s1,ph_before_injection,ph_after_injection,do_mg_l,salinity_psu,optical_density,dry_weight_g_l,growth_rate,co2_fixation_g_l_day
0,c1m1,1.0,28.2,108.01,8.11,8.02,3.86,33.0,0.15,0.0,0.0,0.0
1,c1m1,2.0,28.0,104.65,8.14,8.07,4.0,31.0,0.166,7e-07,1e-06,1e-06
2,c1m1,3.0,26.5,106.83,8.21,8.01,4.3,35.0,0.168,2.18e-05,2.1e-05,3.9e-05
3,c1m1,4.0,25.7,100.97,8.25,8.08,4.26,29.0,0.195,0.0003134,0.000292,0.000534
4,c1m1,5.0,25.4,104.97,8.25,8.07,4.04,30.0,0.1859,0.0025101,0.002197,0.00402


In [30]:
# Identifier-style column names that must never be used as model inputs
drop_like = {"sample_id", "id", "row_id"}

# Every column pandas parsed with a numeric dtype
num_cols = df.select_dtypes(include="number").columns.tolist()

# Automatic feature list: numeric columns minus the target and the id-like names
features_auto = [
    col for col in num_cols
    if not (col == TARGET or col in drop_like)
]

# To pin an explicit feature list instead, uncomment and edit:
# WANTED = [
#    "day_index", "air_temp_c", "light_intensity_umol_m2_s1",
#    "ph_before_injection", "ph_after_injection", "do_mg_l",
#    "salinity_psu", "optical_density", "dry_weight_g_l", "growth_rate"
# ]
# features = [f for f in WANTED if f in df.columns]

features = features_auto

print("Target:", TARGET)
print("Features:", features)


Target: co2_fixation_g_l_day
Features: ['day_index', 'air_temp_c', 'light_intensity_umol_m2_s1', 'ph_before_injection', 'ph_after_injection', 'do_mg_l', 'salinity_psu', 'optical_density', 'dry_weight_g_l', 'growth_rate']


In [35]:
TARGET = "co2_fixation_g_l_day"

# Rows without a target value cannot be used for training or evaluation, so
# they are removed BEFORE the split; the counts are printed for the record.
print("rows before dropna target:", len(df))
df = df.loc[df[TARGET].notna()].copy()
print("rows after dropna target:", len(df))

rows before dropna target: 300
rows after dropna target: 296


In [36]:
# Regression target as floats
y = df[TARGET].astype(float)

# Both branches produce positional row indices (hence .iloc further down).
if "sample_id" not in df.columns:
    # No grouping column available: plain shuffled 80/20 split
    train_idx, test_idx = train_test_split(
        np.arange(len(df)), test_size=0.2, random_state=42, shuffle=True
    )
else:
    # Group-aware split: all rows sharing a sample_id land on the same side
    groups = df["sample_id"].astype(str)
    splitter = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
    train_idx, test_idx = next(splitter.split(df, y, groups=groups))

# Slice features and target with the same indices so rows stay aligned
X_train_df, X_test_df = (
    df.iloc[rows][features].copy() for rows in (train_idx, test_idx)
)
y_train, y_test = (y.iloc[rows].copy() for rows in (train_idx, test_idx))

X_train_df.shape, X_test_df.shape, y_train.shape, y_test.shape


((232, 10), (64, 10), (232,), (64,))

In [37]:
# Numeric branch: fill missing values with the column median, then standardize
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Every selected feature is numeric, so one branch is enough; add further
# (name, transformer, columns) entries here if categorical columns appear.
preprocess = ColumnTransformer(
    [("num", numeric_pipe, features)],
    remainder="drop",
)

# Statistics (medians, means, scales) are learned from the training rows only
# and then re-used unchanged on the test rows.
X_train_proc = preprocess.fit_transform(X_train_df)
X_test_proc = preprocess.transform(X_test_df)

X_train_proc.shape, X_test_proc.shape


((232, 10), (64, 10))

In [38]:
# Make sure the artifact folder exists (no error if it is already there)
ART_DIR.mkdir(exist_ok=True, parents=True)

# Processed feature matrices
np.save(ART_DIR.joinpath("X_train_proc.npy"), X_train_proc)
np.save(ART_DIR.joinpath("X_test_proc.npy"), X_test_proc)

# Targets, written without the index so rows line up positionally with the matrices
pd.Series(y_train).to_csv(ART_DIR.joinpath("y_train.csv"), index=False)
pd.Series(y_test).to_csv(ART_DIR.joinpath("y_test.csv"), index=False)

# Fitted preprocessing pipeline
joblib.dump(preprocess, ART_DIR.joinpath("preprocess_pipeline.joblib"))

# Feature names, one per line, in the exact order the pipeline expects
pd.Series(features).to_csv(
    ART_DIR.joinpath("features_used.csv"), header=False, index=False
)

print("saved to:", ART_DIR)


saved to: D:\6 Project\ZeroAlgae\Research\carbon-fixation-ml-starter\data\processed


In [39]:
# Reload every artifact from disk and verify the round-trip against the
# in-memory objects, so a missing or corrupted file fails loudly here.
Xtr = np.load(ART_DIR / "X_train_proc.npy")
Xte = np.load(ART_DIR / "X_test_proc.npy")
ytr = pd.read_csv(ART_DIR / "y_train.csv").squeeze("columns").values
yte = pd.read_csv(ART_DIR / "y_test.csv").squeeze("columns").values
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
pipe = joblib.load(ART_DIR / "preprocess_pipeline.joblib")
feat = pd.read_csv(ART_DIR / "features_used.csv", header=None).iloc[:,0].tolist()

# Feature matrices must come back unchanged
assert Xtr.shape == X_train_proc.shape and np.allclose(Xtr, X_train_proc, equal_nan=True), \
    "X_train_proc.npy does not match the in-memory training matrix"
assert Xte.shape == X_test_proc.shape and np.allclose(Xte, X_test_proc, equal_nan=True), \
    "X_test_proc.npy does not match the in-memory test matrix"

# Targets go through CSV text, so compare with a float tolerance
assert ytr.shape == (len(y_train),) and np.allclose(ytr, y_train.to_numpy(), equal_nan=True), \
    "y_train.csv does not match y_train"
assert yte.shape == (len(y_test),) and np.allclose(yte, y_test.to_numpy(), equal_nan=True), \
    "y_test.csv does not match y_test"

# Rows of X and y are paired by position only
assert len(Xtr) == len(ytr) and len(Xte) == len(yte), "X / y row counts differ"

# Feature order is part of the pipeline contract
assert feat == list(features), "features_used.csv does not match the feature list"

# The reloaded pipeline must reproduce the saved test matrix
assert np.allclose(pipe.transform(X_test_df), Xte, equal_nan=True), \
    "reloaded pipeline output differs from X_test_proc.npy"

print("Xtr/Xte:", Xtr.shape, Xte.shape)
print("ytr/yte:", ytr.shape, yte.shape)
print("pipe ok:", type(pipe).__name__, "features:", len(feat))


Xtr/Xte: (232, 10) (64, 10)
ytr/yte: (232,) (64,)
pipe ok: ColumnTransformer features: 10
