In [1]:
%%writefile extract_band_stats.py
# src/features/extract_band_stats.py
import pandas as pd
import numpy as np
import logging
from pathlib import Path
from tqdm import tqdm
from joblib import Parallel, delayed
from scipy.stats import kurtosis, skew

# CONFIG
BANDS = [f"B{i}" for i in range(1, 13)]
N_JOBS = 4

def process_npy_file(row):
    try:
        file_path = Path(row["sentinel_path"])
        ID = row["ID"]

        if not file_path.exists():
            logging.warning(f"[MISSING] {file_path}")
            return None

        arr = np.load(file_path)  # (H, W, 12)
        if arr.shape[-1] != 12:
            logging.warning(f"[SKIPPED] {ID}: got {arr.shape}")
            return None

        H, W = arr.shape[:2]
        flat_pixels = arr.reshape(-1, 12)
        valid = ~np.all(flat_pixels == 0, axis=1)
        flat_pixels = flat_pixels[valid]

        if flat_pixels.shape[0] == 0:
            logging.warning(f"[EMPTY] {ID}")
            return None

        df = pd.DataFrame(flat_pixels, columns=BANDS)
        df["ID"] = ID
        return df
    except Exception as e:
        logging.error(f"[ERROR] {file_path}: {e}")
        return None

def summarize_id_stats(group_tuple):
    ID, df = group_tuple
    stats = {"ID": ID}
    for band in BANDS:
        vals = df[band].values
        finite = vals[np.isfinite(vals)]
        if finite.size == 0: continue
        stats.update({
            f"{band}_mean": np.mean(finite),
            f"{band}_median": np.median(finite),
            f"{band}_min": np.min(finite),
            f"{band}_max": np.max(finite),
            f"{band}_std": np.std(finite),
            f"{band}_kurtosis": kurtosis(finite),
            f"{band}_skew": skew(finite),
            f"{band}_q25": np.percentile(finite, 25),
            f"{band}_q75": np.percentile(finite, 75),
        })
    return stats

def extract_features(train_df, test_df, output_dir="output"):
    Path(output_dir).mkdir(exist_ok=True, parents=True)
    logging.info("=== Extracting Sentinel-2 features ===")

    df_all = pd.concat([train_df.assign(split='train'), test_df.assign(split='test')])

    with Parallel(n_jobs=N_JOBS) as parallel:
        results = parallel(delayed(process_npy_file)(row) for _, row in tqdm(df_all.iterrows(), total=len(df_all)))

    valid_results = [r for r in results if r is not None]
    if not valid_results:
        raise RuntimeError("No valid .npy files processed.")

    pxdf = pd.concat(valid_results, ignore_index=True)
    logging.info(f"Pixel data shape: {pxdf.shape}")

    id_groups = pxdf.groupby("ID")
    with Parallel(n_jobs=N_JOBS) as parallel:
        stats_list = parallel(delayed(summarize_id_stats)(group) for group in tqdm(id_groups, total=len(id_groups)))

    agg_df = pd.DataFrame(stats_list)
    agg_df.to_csv(Path(output_dir) / "agg_df.csv", index=False)

    train_features = train_df.merge(agg_df, on="ID", how="left")
    test_features = test_df.merge(agg_df, on="ID", how="left")
    train_features.to_csv(Path(output_dir) / "train_features.csv", index=False)
    test_features.to_csv(Path(output_dir) / "test_features.csv", index=False)

    return train_features, test_features


Writing extract_band_stats.py


In [2]:
%%writefile feature_engineering_lightgbm.py
import pandas as pd
import numpy as np
# === Feature Engineering ===
def add_band_indices(df):
    df['B4_B3_ratio'] = df['B4_mean'] / (df['B3_mean'] + 1e-6)
    df['B3_B2_ratio'] = df['B3_mean'] / (df['B2_mean'] + 1e-6)
    df['B2_B1_ratio'] = df['B2_mean'] / (df['B1_mean'] + 1e-6)
    df['B4_B2_ratio'] = df['B4_mean'] / (df['B2_mean'] + 1e-6)
    df['NDI_B4_B3'] = (df['B4_mean'] - df['B3_mean']) / (df['B4_mean'] + df['B3_mean'] + 1e-6)
    df['NDI_B3_B2'] = (df['B3_mean'] - df['B2_mean']) / (df['B3_mean'] + df['B2_mean'] + 1e-6)
    df['NDI_B2_B1'] = (df['B2_mean'] - df['B1_mean']) / (df['B2_mean'] + df['B1_mean'] + 1e-6)
    df['SAR_diff'] = df['B8_mean'] - df['B5_mean']
    return df

def add_stat_features(df):
    for b in range(1, 13):
        df[f'B{b}_range'] = df[f'B{b}_max'] - df[f'B{b}_min']
        df[f'B{b}_iqr'] = df[f'B{b}_q75'] - df[f'B{b}_q25']
        df[f'B{b}_cv'] = df[f'B{b}_std'] / (df[f'B{b}_mean'] + 1e-6)
    return df

def add_texture_features(df):
    for b in range(1, 13):
        df[f'B{b}_texture'] = np.abs(df[f'B{b}_skew']) + np.abs(df[f'B{b}_kurtosis'])
    return df

def add_aggregations(df):
    band_means = [f'B{i}_mean' for i in range(1,13)]
    band_stds = [f'B{i}_std' for i in range(1,13)]
    band_ranges = [f'B{i}_max' for i in range(1,13)]
    df['mean_of_means'] = df[band_means].mean(axis=1)
    df['std_of_means'] = df[band_means].std(axis=1)
    df['sum_of_ranges'] = df[band_ranges].sum(axis=1)
    df['std_of_stds'] = df[band_stds].std(axis=1)
    return df

def engineer_features(df):
    df = add_band_indices(df)
    df = add_stat_features(df)
    df = add_texture_features(df)
    df = add_aggregations(df)
    return df

Writing feature_engineering_lightgbm.py


In [3]:
%%writefile train_lightgbm.py
# src/models/train_lightgbm.py
import argparse
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from feature_engineering_lightgbm import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils import class_weight
from pathlib import Path

# === LightGBM Training ===
def train_lgb(train_path, model_dir, n_splits=10):
    df = pd.read_csv(train_path)
    df = engineer_features(df)
    X = df.drop(columns=['ID', 'sentinel_path','label'])
    y = df['label']

    classes = np.unique(y)
    class_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y)
    sample_weights = class_weights[y]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    models, f1_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        sw_train = sample_weights[train_idx]

        train_data = lgb.Dataset(X_train, label=y_train, weight=sw_train)
        val_data = lgb.Dataset(X_val, label=y_val)

        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'random_state': 42,
            'early_stopping_rounds':100,
        }

        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[val_data],
        )

        y_proba = model.predict(X_val, num_iteration=model.best_iteration)
        y_pred = (y_proba > 0.5).astype(int)
        fold_f1 = f1_score(y_val, y_pred)
        f1_scores.append(fold_f1)
        print(f"Fold {fold+1} F1: {fold_f1:.4f}")

        Path(model_dir).mkdir(parents=True, exist_ok=True)
        joblib.dump(model, f"{model_dir}/lgb_fold{fold}.pkl")
        models.append(model)

    print(f"Mean F1: {np.mean(f1_scores):.4f}")
    return models

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, default="output/train_features.csv")
    parser.add_argument("--model_dir", type=str, default="models/lightgbm")
    args = parser.parse_args()
    train_lgb(args.train_path, args.model_dir)

Writing train_lightgbm.py


In [4]:
%%writefile infer_lightgbm.py
# src/models/infer_lightgbm.py
from feature_engineering_lightgbm import *
import argparse
import pandas as pd
import numpy as np
import joblib

def predict_lgb(test_path, model_dir, output_path, threshold=0.55):
    df = pd.read_csv(test_path)
    df = engineer_features(df)
    X = df.drop(columns=['ID', 'sentinel_path'])
    ids = df['ID']
    
    models = [joblib.load(f"{model_dir}/lgb_fold{i}.pkl") for i in range(10)]
    
    probs = np.zeros(X.shape[0])
    for model in models:
        probs += model.predict(X, num_iteration=model.best_iteration)
    probs /= len(models)
    preds = (probs > threshold).astype(int)
    
    pd.DataFrame({"ID": ids, "Probs": probs}).to_csv(output_path.replace(".csv", "_probs.csv"), index=False)
    pd.DataFrame({"ID": ids, "Target": preds}).to_csv(output_path, index=False)
    print(f"Saved predictions to {output_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_path", type=str, required=True)
    parser.add_argument("--model_dir", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()
    predict_lgb(args.test_path, args.model_dir, args.output_path)

Writing infer_lightgbm.py


In [5]:
%%writefile main.py
# main.py
import argparse
import os
from pathlib import Path

# Import the pipeline functions
from extract_band_stats import extract_features
from train_lightgbm import train_lgb
from infer_lightgbm import predict_lgb
import pandas as pd

def main(mode, train_csv, test_csv, train_sentinel_dir, test_sentinel_dir, output_dir, model_dir, submission_path):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    Path(model_dir).mkdir(parents=True, exist_ok=True)

    if mode in ["preprocess", "all"]:
        print("=== STEP 1: Feature Extraction ===")
        train_df = pd.read_csv(train_csv)
        test_df = pd.read_csv(test_csv)
        train_df['sentinel_path'] = train_df['ID'].apply(lambda id: os.path.join(train_sentinel_dir, f"{id}.npy"))
        test_df['sentinel_path'] = test_df['ID'].apply(lambda id: os.path.join(test_sentinel_dir, f"{id}.npy"))
        extract_features(train_df, test_df, output_dir=output_dir)

    if mode in ["train", "all"]:
        print("=== STEP 2: Training LightGBM ===")
        train_lgb(os.path.join(output_dir, "train_features.csv"), model_dir=model_dir)

    if mode in ["inference", "all"]:
        print("=== STEP 3: Inference ===")
        predict_lgb(
            test_path=os.path.join(output_dir, "test_features.csv"),
            model_dir=model_dir,
            output_path=submission_path
        )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, choices=["preprocess", "train", "inference", "all"], required=True,
                        help="Which step to run: preprocess | train | inference | all")
    parser.add_argument("--train_csv", type=str, default="/kaggle/input/slideandseekclasificationlandslidedetectiondataset/Train.csv")
    parser.add_argument("--test_csv", type=str, default="/kaggle/input/slideandseekclasificationlandslidedetectiondataset/Test.csv")
    parser.add_argument("--train_sentinel_dir", type=str, default="/kaggle/input/slideandseekclasificationlandslidedetectiondataset/train_data/train_data/")
    parser.add_argument("--test_sentinel_dir", type=str, default="/kaggle/input/slideandseekclasificationlandslidedetectiondataset/test_data/test_data/")
    parser.add_argument("--output_dir", type=str, default="output")
    parser.add_argument("--model_dir", type=str, default="models/lightgbm")
    parser.add_argument("--submission_path", type=str, default="lgbm_submission.csv")
    args = parser.parse_args()

    main(
        mode=args.mode,
        train_csv=args.train_csv,
        test_csv=args.test_csv,
        train_sentinel_dir=args.train_sentinel_dir,
        test_sentinel_dir=args.test_sentinel_dir,
        output_dir=args.output_dir,
        model_dir=args.model_dir,
        submission_path=args.submission_path
    )

Writing main.py


In [6]:
!python main.py --mode all

=== STEP 1: Feature Extraction ===
100%|████████████████████████████████████| 12545/12545 [00:58<00:00, 216.05it/s]
100%|█████████████████████████████████████| 12544/12544 [02:45<00:00, 75.58it/s]
=== STEP 2: Training LightGBM ===
Fold 1 F1: 0.8359
Fold 2 F1: 0.8915
Fold 3 F1: 0.8471
Fold 4 F1: 0.8175
Fold 5 F1: 0.8852
Fold 6 F1: 0.8775
Fold 7 F1: 0.8539
Fold 8 F1: 0.8968
Fold 9 F1: 0.8400
Fold 10 F1: 0.8492
Mean F1: 0.8595
=== STEP 3: Inference ===
Saved predictions to lgbm_submission.csv
