# Model Comparison: Random Forest vs XGBoost

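This notebook compares Random Forest and XGBoost classification
models developed for the undergraduate thesis project. Both models
are trained on the same stratified 80/20 train/test split and are
compared by accuracy, weighted F1 score, confusion matrices, and
feature importances.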


In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Load dataset (local, not included in repository).
# The location can be overridden with the DATASET_PATH environment variable so
# the notebook can run on another machine without editing this cell; the
# fallback is the original local path.
file_path = os.environ.get(
    'DATASET_PATH',
    r'C:\Users\lenovo\Downloads\Data\combined_with_labels.xlsx'
)
if not os.path.isfile(file_path):
    # pd.read_excel would raise FileNotFoundError as well; fail earlier with a
    # message that tells the reader how to point the notebook at the data.
    raise FileNotFoundError(
        f"Dataset not found at {file_path!r}. Set the DATASET_PATH "
        "environment variable to the location of combined_with_labels.xlsx."
    )
data = pd.read_excel(file_path)

# Timestamp columns are not used as features; errors='ignore' keeps this a
# no-op when the columns are absent.
data = data.drop(
    columns=['Measurement Date', 'Measurement Time'],
    errors='ignore'
)

# Columns used as model inputs.
FEATURE_COLUMNS = [
    'Irradiance',
    'Temperature Thermocouple 2',
    'Pmax',
    'Vmpp',
    'Impp',
    'Voc',
    'Isc'
]

# Report every missing column at once instead of failing on the first lookup.
missing_columns = [
    column for column in FEATURE_COLUMNS + ['label']
    if column not in data.columns
]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Encode the class labels as integers 0..n_classes-1; label_encoder.classes_
# keeps the original names for plotting.
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

X = data[FEATURE_COLUMNS]
y = data['label_encoded']

# Stratified 80/20 split with a fixed seed so both models are trained and
# evaluated on exactly the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# --- Random Forest -----------------------------------------------------------
# 200 trees with no depth limit; seeded for repeatable results.
rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# --- XGBoost -----------------------------------------------------------------
# 200 boosting rounds of depth-4 trees with row and column subsampling.
# NOTE(review): 'multi:softmax' is a multi-class objective; this presumably
# assumes `label` has more than two classes. Confirm against the dataset.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 4,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

# Score both models on the same held-out test split.
predictions = {
    'Random Forest': rf_pred,
    'XGBoost': xgb_pred,
}

results = pd.DataFrame({
    'Model': list(predictions),
    'Accuracy': [
        accuracy_score(y_test, pred) for pred in predictions.values()
    ],
    # Weighted F1 averages the per-class F1 scores weighted by class support.
    'F1 Score (Weighted)': [
        f1_score(y_test, pred, average='weighted')
        for pred in predictions.values()
    ]
})

# A bare `results` expression is only rendered when it is the last statement
# of a cell; more code follows here, so display the table explicitly.
# `display` is provided by IPython/Jupyter without an import.
display(results)

def _plot_confusion(ax, y_true, y_pred, class_names, title, cmap):
    """Draw an annotated confusion-matrix heatmap for one model.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    y_true, y_pred : integer-encoded true and predicted labels.
    class_names : original class names, ordered by their integer code.
    title : title shown above the heatmap.
    cmap : name of the colormap passed to seaborn.
    """
    # Pin the label order to every encoded class so the matrix always has one
    # row/column per tick label, even if a class is missing from this split.
    class_ids = list(range(len(class_names)))
    sns.heatmap(
        confusion_matrix(y_true, y_pred, labels=class_ids),
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
        cbar=False
    )
    ax.set_title(title)
    # scikit-learn puts true classes on rows and predicted classes on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')


# Side-by-side confusion matrices for the two models.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

_plot_confusion(
    axes[0], y_test, rf_pred, label_encoder.classes_,
    title='Random Forest', cmap='Blues'
)
_plot_confusion(
    axes[1], y_test, xgb_pred, label_encoder.classes_,
    title='XGBoost', cmap='Greens'
)

fig.tight_layout()
plt.show()

# Per-feature importance scores reported by each fitted model.
rf_importance = rf_model.feature_importances_
xgb_importance = xgb_model.feature_importances_

# One row per input feature, one column per model.
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Random Forest': rf_importance,
    'XGBoost': xgb_importance
})

# Grouped bar chart, drawn through the explicit Axes interface.
importance_ax = importance_df.set_index('Feature').plot(
    kind='bar',
    figsize=(10, 6)
)
importance_ax.set_title('Feature Importance Comparison')
importance_ax.set_ylabel('Importance Score')
# Tilt the feature names so they stay readable.
plt.setp(importance_ax.get_xticklabels(), rotation=45)
importance_ax.figure.tight_layout()
plt.show()



## Conclusion

Both Random Forest and XGBoost demonstrate strong performance
on the classification task, as measured by the accuracy and
weighted F1 scores computed above on the held-out test split.

XGBoost shows improved performance on complex patterns,
while Random Forest provides more interpretable feature importance.
