# In [None]:
"""
XGBoost Classification Model
============================

This script was developed as part of an undergraduate thesis project.

The objective of this work is to apply XGBoost for classification tasks
with proper data preprocessing, feature selection, class imbalance handling,
and hyperparameter tuning.

Notes:
- Dataset paths are placeholders.
- Original datasets are confidential and not included.
- This code is shared for academic and portfolio purposes only.
"""

# =========================
# IMPORT LIBRARIES
# =========================
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import (
    StratifiedShuffleSplit,
    RandomizedSearchCV
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


# =========================
# STRATIFIED SPLIT FUNCTION
# =========================
def stratified_split(X, y, test_size=0.2, random_state=42):
    """
    Split features and labels into train and test partitions using a
    single stratified shuffle split, so class distribution stays
    consistent between the two partitions.

    ``X`` and ``y`` are indexed positionally through ``.iloc``, so both
    must be pandas objects.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )

    # Exactly one split is configured, so take the first (and only)
    # pair of positional index arrays from the generator.
    train_rows, test_rows = next(splitter.split(X, y))

    return (
        X.iloc[train_rows],
        X.iloc[test_rows],
        y.iloc[train_rows],
        y.iloc[test_rows],
    )


# =========================
# LOAD DATA (PLACEHOLDER)
# =========================
# NOTE:
# Replace this path with your own dataset location.
# Original dataset is confidential and not included.
DATA_PATH = "data/dataset.xlsx"

# Read the spreadsheet and discard the timestamp columns in one step.
# errors="ignore" keeps the drop from failing when a column is absent.
data = pd.read_excel(DATA_PATH).drop(
    columns=["Measurement Date", "Measurement Time"],
    errors="ignore",
)


# =========================
# LABEL ENCODING
# =========================
# Encode the raw "label" column as integer class ids. The fitted encoder
# is kept at module level so class names can be looked up later.
label_encoder = LabelEncoder()
data = data.assign(
    label_encoded=label_encoder.fit_transform(data["label"])
)


# =========================
# FEATURE AND TARGET COLUMNS
# =========================
# Columns used as model inputs, in the order they are fed to the model.
FEATURE_COLUMNS = [
    "Irradiance", "Temperature Thermocouple 2",
    "Pmax", "Vmpp", "Impp", "Voc", "Isc",
]

# Target vector (integer-encoded labels) and feature matrix.
y = data["label_encoded"]
X = data.loc[:, FEATURE_COLUMNS]


# =========================
# TRAIN-TEST SPLIT
# =========================
# Hold out 20% of the rows for testing, stratified on the encoded label.
X_train, X_test, y_train, y_test = stratified_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Total samples       : {len(data)}")
print(f"Training samples    : {len(X_train)}")
print(f"Testing samples     : {len(X_test)}")

# Baseline = share of the most frequent class in the test split, i.e. the
# accuracy of always predicting that class.
class_shares = y_test.value_counts(normalize=True)
baseline_accuracy = class_shares.max()
print(f"Baseline Accuracy   : {baseline_accuracy:.4f}")


# =========================
# SCALING
# =========================
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# =========================
# FEATURE SELECTION (ANOVA)
# =========================
# ANOVA F-test scorer fitted on the training split. With k="all" no
# feature is discarded, so the transforms pass every column through.
selector = SelectKBest(score_func=f_classif, k="all").fit(
    X_train_scaled, y_train
)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)


# =========================
# HANDLE CLASS IMBALANCE
# =========================
# One weight per training row, using scikit-learn's "balanced" class
# weighting computed from the training labels.
sample_weights = compute_sample_weight("balanced", y_train)


# =========================
# XGBOOST + RANDOM SEARCH
# =========================
# Discrete candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200],
    max_depth=[3, 4, 5],
    learning_rate=[0.001, 0.005, 0.01],
    subsample=[0.6, 0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    min_child_weight=[2, 3, 4],
    gamma=[0, 0.1],
    reg_alpha=[0, 3],    # L1 regularization
    reg_lambda=[0, 5],   # L2 regularization
)

# Base estimator; the search clones it for every sampled configuration.
xgb_model = xgb.XGBClassifier(tree_method="hist", n_jobs=-1, random_state=42)

# 10 sampled configurations x 3-fold CV, ranked by weighted F1.
random_search = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=10,
    scoring="f1_weighted",
    cv=3,
    verbose=1,
    random_state=42,
)

# sample_weight is forwarded to the estimator's fit call.
random_search.fit(X_train_selected, y_train, sample_weight=sample_weights)

# Best configuration, as exposed by the fitted search object.
best_model = random_search.best_estimator_


# =========================
# EVALUATION (TEST SET)
# =========================
# Predict on the held-out test split and score the predictions.
y_pred = best_model.predict(X_test_selected)

f1 = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)

# Report the winning hyperparameters followed by the test metrics.
print("\nBest Parameters:")
print(random_search.best_params_)
print(f"Test Accuracy : {accuracy:.4f}")
print(f"Test F1 Score : {f1:.4f}")


# =========================
# FEATURE IMPORTANCE
# =========================
# Pair each feature name that passed the selector with the importance
# score reported by the fitted model.
feature_names = selector.get_feature_names_out(FEATURE_COLUMNS)
importance = best_model.feature_importances_

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, importance)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("XGBoost Feature Importance")
fig.tight_layout()
plt.show()


# =========================
# CONFUSION MATRIX (TEST)
# =========================
# Pin the matrix to the full set of encoded class ids. Without an explicit
# `labels` argument, confusion_matrix only covers classes that occur in
# y_test or y_pred, so a class missing from both would shrink the matrix
# and misalign it with the tick labels taken from label_encoder.classes_.
class_ids = list(range(len(label_encoder.classes_)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

# Annotated heatmap: rows are the actual classes, columns the predicted
# classes, both labelled with the original (un-encoded) class names.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    cbar=False
)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Test Dataset")
plt.tight_layout()
plt.show()

print("\nProgram finished successfully.")
