"""
GKM Analytics — Lab Data Cleaning Pipeline
------------------------------------------
Steps:
  1. Load CSV
  2. Inspect the raw data
  3. Remove duplicates
  4. Standardize categorical columns
  5. Handle missing values
  6. Flag and handle outliers
  7. Save clean dataset
"""

In [14]:
import pandas as pd
import numpy as np

1. Load CSV

In [20]:
def load_data(filepath: str) -> pd.DataFrame:
    df = pd.read_csv(filepath)
    print(f"\n{'='*55}")
    print(f"  LOADED: {filepath}")
    print(f"  Rows: {len(df)} | Columns: {df.shape[1]}")
    print(f"{'='*55}")
    return df

# Now CALL the function
df = load_data("lab_data_raw")

# Now preview
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'lab_data_raw'

In [16]:
# ── 2. INSPECT ─────────────────────────────────────────────────────────────────

def inspect(df: pd.DataFrame) -> None:
    print("\n── Column Types ──")
    print(df.dtypes.to_string())

    print("\n── Missing Values ──")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        print("  None found.")
    else:
        for col, count in missing.items():
            print(f"  {col}: {count} missing ({count/len(df)*100:.1f}%)")

    print("\n── Duplicate Rows ──")
    dupes = df.duplicated().sum()
    print(f"  {dupes} duplicate rows found")

    print("\n── Numeric Summary ──")
    print(df.describe().round(2).to_string())


# ── 3. REMOVE DUPLICATES ───────────────────────────────────────────────────────

def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    before = len(df)
    df = df.drop_duplicates()
    removed = before - len(df)
    print(f"\n── Duplicates: removed {removed} rows → {len(df)} remaining")
    return df


# ── 4. STANDARDIZE CATEGORICALS ───────────────────────────────────────────────

def standardize_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase + strip whitespace on string columns."""
    cat_cols = ["catalyst_type", "solvent"]
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].str.strip().str.lower()
    
    # Show unique values after standardization
    for col in cat_cols:
        if col in df.columns:
            print(f"\n── {col} unique values: {sorted(df[col].dropna().unique())}")
    
    return df


# ── 5. HANDLE MISSING VALUES ───────────────────────────────────────────────────

def handle_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strategy:
      - Numeric columns → impute with median (robust to outliers)
      - Categorical columns → impute with mode
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Exclude outcome/target from imputation
    numeric_cols = [c for c in numeric_cols if c not in ["outcome"]]

    cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
    cat_cols = [c for c in cat_cols if c not in ["experiment_id"]]

    print("\n── Missing Value Imputation ──")
    for col in numeric_cols:
        n_missing = df[col].isnull().sum()
        if n_missing > 0:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"  {col}: filled {n_missing} nulls with median ({median_val:.2f})")

    for col in cat_cols:
        n_missing = df[col].isnull().sum()
        if n_missing > 0:
            mode_val = df[col].mode()[0]
            df[col] = df[col].fillna(mode_val)
            print(f"  {col}: filled {n_missing} nulls with mode ('{mode_val}')")

    return df


# ── 6. FLAG AND HANDLE OUTLIERS ────────────────────────────────────────────────

def handle_outliers(df: pd.DataFrame, z_threshold: float = 3.0) -> pd.DataFrame:
    """
    Flag outliers using Z-score. 
    Values beyond z_threshold standard deviations are capped (winsorized)
    rather than dropped — preserving row count while reducing noise.
    """
    numeric_cols = ["temperature_C", "pressure_bar", "concentration_mM",
                    "reaction_time_hr", "pH", "yield_pct"]
    numeric_cols = [c for c in numeric_cols if c in df.columns]

    print("\n── Outlier Detection (Z-score, threshold=3.0) ──")
    df["outlier_flag"] = False

    for col in numeric_cols:
        mean = df[col].mean()
        std = df[col].std()
        z_scores = (df[col] - mean) / std
        outliers = z_scores.abs() > z_threshold
        n_outliers = outliers.sum()

        if n_outliers > 0:
            # Cap to 3 std from mean
            lower = mean - z_threshold * std
            upper = mean + z_threshold * std
            df.loc[outliers, "outlier_flag"] = True
            df[col] = df[col].clip(lower=lower, upper=upper)
            print(f"  {col}: {n_outliers} outlier(s) capped to [{lower:.2f}, {upper:.2f}]")

    print(f"  Total rows flagged: {df['outlier_flag'].sum()}")
    return df


# ── 7. SAVE ────────────────────────────────────────────────────────────────────

def save_clean(df: pd.DataFrame, filepath: str) -> None:
    df.to_csv(filepath, index=False)
    print(f"\n── Saved clean data → {filepath}")
    print(f"   Final shape: {df.shape[0]} rows × {df.shape[1]} columns")


# ── MAIN PIPELINE ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    df = load_data("lab_data_raw.csv")
    inspect(df)
    df = remove_duplicates(df)
    df = standardize_categoricals(df)
    df = handle_missing(df)
    df = handle_outliers(df)
    save_clean(df, "lab_data_clean.csv")

    print("\n✓ Pipeline complete. Ready for feature engineering + modeling.\n")

FileNotFoundError: [Errno 2] No such file or directory: 'lab_data_raw.csv'