Importing Necessary Library:

In [1]:
import os
import cv2
import numpy as np
import joblib
import time
from tqdm import tqdm
import rasterio
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, precision_score,
    recall_score, f1_score
)
from skopt import BayesSearchCV
from skopt.space import Integer, Real

In [36]:
# --------------------- Step 1: GUI Directory Selection ---------------------
IMAGE_DIR = "E:/Faruq/Business/DEEPRESENSE/Experimental/binary classification/water body/Dataset/True Color dataset/Sentinel Data/10m resolution/image_dataset"
LABEL_DIR = "E:/Faruq/Business/DEEPRESENSE/Experimental/binary classification/water body/Dataset/True Color dataset/Sentinel Data/10m resolution/label_dataset"
MODEL_PATH = "E:/Faruq/Business/DEEPRESENSE/Experimental/binary classification/water body/Model/XGBoost Algorithm/xgboost_model_10m_new.pkl"

# --------------------- Start Timer ---------------------
start_time = time.time()

In [32]:
# --------------------- Step 2: Load and Preprocess Data ---------------------
print("📂 Loading and preprocessing data...")
image_files = sorted(os.listdir(IMAGE_DIR))
label_files = sorted(os.listdir(LABEL_DIR))
features, labels = [], []

with tqdm(zip(image_files, label_files), total=len(image_files), desc="Loading Files") as pbar:
    for img_file, lbl_file in pbar:
        img_path = os.path.join(IMAGE_DIR, img_file)
        lbl_path = os.path.join(LABEL_DIR, lbl_file)

        # Remove coordinates if present (GeoTIFF → array only)
        try:
            with rasterio.open(img_path) as src:
                img = src.read([1, 2, 3])  # Read first 3 bands
                img = np.transpose(img, (1, 2, 0))  # Convert to HWC
        except:
            img = cv2.imread(img_path)

        label = cv2.imread(lbl_path, cv2.IMREAD_UNCHANGED)

        if img is None or label is None:
            pbar.write(f"⚠️ Skipping {img_file}/{lbl_file}")
            continue

        if img.shape[:2] != label.shape[:2]:
            label = cv2.resize(label, (img.shape[1], img.shape[0]),
                               interpolation=cv2.INTER_NEAREST)

        features.append(img.reshape(-1, 3))
        labels.append(label.flatten())

X, y = np.concatenate(features), np.concatenate(labels)
print("\n🖼️ First 5 feature rows (X):")
print(X[:5])

print("\n🏷️ First 5 label values (y):")
print(y[:5])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

📂 Loading and preprocessing data...


Loading Files: 100%|██████████| 25/25 [00:00<00:00, 82.72it/s]



🖼️ First 5 feature rows (X):
[[24 52 23]
 [28 59 23]
 [33 72 30]
 [28 64 25]
 [28 59 22]]

🏷️ First 5 label values (y):
[0. 0. 0. 0. 0.]


In [34]:
# --------------------- Step 3: Problem Type Detection ---------------------
n_classes = len(np.unique(y))
is_multiclass = n_classes > 2
params = {
    'eval_metric': 'mlogloss' if is_multiclass else 'logloss',
    'objective': 'multi:softmax' if is_multiclass else 'binary:logistic'
}
if is_multiclass:
    params['num_class'] = str(n_classes)

print("Number of Class:",n_classes)


Number of Class: 2


In [35]:
# --------------------- Step 4: Hyperparameter Search ---------------------
opt = BayesSearchCV(
    XGBClassifier(**params, n_jobs=-1, random_state=42),
    {
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.01, 0.3, 'log-uniform'),
        'max_depth': Integer(3, 12),
        'reg_lambda': Real(0.1, 10.0, 'log-uniform'),
        'reg_alpha': Real(0.1, 10.0, 'log-uniform'),
        'subsample': Real(0.6, 1.0),
        'colsample_bytree': Real(0.6, 1.0)
    },
    n_iter=50,
    cv=3,
    scoring='f1',
    # refit=True,
    # random_state=42,
    # return_train_score=True,
    n_jobs=1
)

print(f"🔍 Training {'multi-class' if is_multiclass else 'binary'} model...")
with tqdm(total=opt.n_iter, desc="Hyperparameter Search") as pbar:
    opt.fit(X_train, y_train,
            verbose=False,
            eval_set=[(X_test, y_test)],
            callback=[lambda _: pbar.update(1)])
    pbar.set_postfix({"Best F1": f"{opt.best_score_:.4f}"})

🔍 Training binary model...


Hyperparameter Search:   2%|▏         | 1/50 [11:25<9:19:52, 685.55s/it, Best F1=0.9248]


In [37]:
# --------------------- Step 5: Save & Evaluate ---------------------
joblib.dump(opt.best_estimator_, MODEL_PATH)
y_pred = opt.best_estimator_.predict(X_test)

# Final metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\n🏆 Best Parameters: {opt.best_params_}")
print("\n📊 Model Performance:")
print(f"✅ Accuracy:  {accuracy:.4f}")
print(f"🎯 Precision: {precision:.4f}")
print(f"🔍 Recall:    {recall:.4f}")
print(f"💯 F1 Score:  {f1:.4f}")

print("\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_pred))

print("\n🧩 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"\n🕒 Total Runtime: {time.time() - start_time:.2f} seconds")


🏆 Best Parameters: OrderedDict([('colsample_bytree', 0.6736823539472551), ('learning_rate', 0.011874924390037553), ('max_depth', 5), ('n_estimators', 186), ('reg_alpha', 1.7526102286773533), ('reg_lambda', 0.12427736115210138), ('subsample', 0.7156126116743664)])

📊 Model Performance:
✅ Accuracy:  0.9439
🎯 Precision: 0.9438
🔍 Recall:    0.9439
💯 F1 Score:  0.9437

📋 Detailed Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96   3098059
         1.0       0.94      0.91      0.92   1901941

    accuracy                           0.94   5000000
   macro avg       0.94      0.94      0.94   5000000
weighted avg       0.94      0.94      0.94   5000000


🧩 Confusion Matrix:
[[2990940  107119]
 [ 173425 1728516]]

🕒 Total Runtime: 21.12 seconds
