In [1]:
import pandas as pd
from google.colab import files
from sklearn.preprocessing import StandardScaler

def standardize_train_and_test(train_csv, test_csv, train_out_csv=None, test_out_csv=None):
    """
    Standardize features in training and test datasets.

    The column layout is taken from the training CSV: its first column is the
    ID, its last two columns are taxa and label, and every column in between
    is a feature. A StandardScaler is fit on the training features only and
    then applied to both the training and the test features.

    Test columns are looked up by the training column *names*, so the test CSV
    may list its columns in a different order or carry extra columns; both
    outputs use the training column order (ID, features, taxa, label).

    Parameters:
        train_csv (str): Path to training CSV file.
        test_csv (str): Path to test CSV file.
        train_out_csv (str, optional): Path to save standardized training CSV.
        test_out_csv (str, optional): Path to save standardized test CSV.

    Returns:
        df_train_std (pd.DataFrame): Standardized training DataFrame.
        df_test_std (pd.DataFrame): Standardized test DataFrame.

    Raises:
        ValueError: If the training CSV has no feature columns, or the test
            CSV lacks one or more of the training feature columns.
        KeyError: If the test CSV lacks the ID, taxa or label column.
    """
    # Load data
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    # Column roles are defined by position in the training file
    id_col = df_train.columns[0]
    taxa_col = df_train.columns[-2]
    label_col = df_train.columns[-1]
    feature_cols = list(df_train.columns[1:-2])

    if not feature_cols:
        raise ValueError(
            "Training data has no feature columns between the ID column "
            "and the taxa/label columns"
        )

    # Select test features by name rather than by position, so a reordered
    # test file cannot silently feed the wrong columns to the scaler.
    missing = [col for col in feature_cols if col not in df_test.columns]
    if missing:
        raise ValueError(f"❌ Test data is missing features: {missing}")

    # Fit on training features only; the test set reuses the fitted statistics
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(df_train[feature_cols]),
        columns=feature_cols,
        index=df_train.index,
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(df_test[feature_cols]),
        columns=feature_cols,
        index=df_test.index,
    )

    # Recombine as ID + scaled features + taxa + label. The scaled frames
    # share the index of their source frame, so concat aligns row-for-row.
    df_train_std = pd.concat(
        [df_train[[id_col]], X_train_scaled, df_train[[taxa_col, label_col]]],
        axis=1,
    )
    df_test_std = pd.concat(
        [df_test[[id_col]], X_test_scaled, df_test[[taxa_col, label_col]]],
        axis=1,
    )

    # Save if paths provided
    if train_out_csv:
        df_train_std.to_csv(train_out_csv, index=False)
        print(f"✅ Saved standardized training data to {train_out_csv}")
    if test_out_csv:
        df_test_std.to_csv(test_out_csv, index=False)
        print(f"✅ Saved standardized test data to {test_out_csv}")

    return df_train_std, df_test_std

In [108]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_protist_with_training_scaler(train_csv, protist_csv, protist_out_csv=None):
    """
    Standardize protist dataset using scaler fitted on training data.

    The training CSV defines the layout: its first column is the ID and its
    last two columns (taxa, label) are excluded; every column in between is a
    feature. A StandardScaler is fit on those training features and applied to
    the same-named columns of the protist CSV.

    Parameters:
        train_csv (str): Path to the training CSV (with taxa + label) holding
            the *raw, unscaled* feature values. Passing an already
            standardized training file makes the fitted scaler close to the
            identity (mean ~0, scale ~1), so the protist features would come
            back essentially unscaled; a UserWarning is emitted in that case.
        protist_csv (str): Path to protist dataset CSV (same feature format).
        protist_out_csv (str, optional): Path to save standardized protist CSV.

    Returns:
        pd.DataFrame: Standardized protist DataFrame (with ID + scaled features).

    Raises:
        ValueError: If the protist CSV lacks one or more training features.
        KeyError: If the protist CSV lacks the training ID column.
    """
    import warnings

    # Load data
    df_train = pd.read_csv(train_csv)
    df_protist = pd.read_csv(protist_csv)

    # Identify columns
    id_col = df_train.columns[0]
    feature_cols = df_train.columns[1:-2]  # Exclude ID, taxa, label

    # Sanity check
    missing = [col for col in feature_cols if col not in df_protist.columns]
    if missing:
        raise ValueError(f"❌ Protist data is missing features: {missing}")

    # Fit scaler on training features
    scaler = StandardScaler()
    scaler.fit(df_train[feature_cols])

    # A scaler fit on already standardized data learns mean ~0 and scale ~1
    # for every feature, which would leave the protist data effectively raw.
    tolerance = 1e-6
    if (abs(scaler.mean_).max() < tolerance
            and abs(scaler.scale_ - 1.0).max() < tolerance):
        warnings.warn(
            "Training features look already standardized (mean ~0, std ~1); "
            "the protist data will be left essentially unscaled. "
            "Pass the raw training CSV instead.",
            UserWarning,
            stacklevel=2,
        )

    # Standardize protist features (selected by name, in training order)
    protist_ids = df_protist[id_col]
    X_protist = df_protist[feature_cols]
    X_protist_scaled = pd.DataFrame(scaler.transform(X_protist), columns=feature_cols)

    # Combine ID + scaled features
    df_protist_std = pd.concat([protist_ids.reset_index(drop=True),
                                 X_protist_scaled.reset_index(drop=True)], axis=1)

    # Save output
    if protist_out_csv:
        df_protist_std.to_csv(protist_out_csv, index=False)
        print(f"✅ Saved standardized protist data to {protist_out_csv}")

    return df_protist_std
