In [None]:
# ==================================================
# Random Forest Classification
# Undergraduate Thesis
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# --------------------------------------------------
# Load Dataset
# NOTE: The dataset lives on the local machine and is
#       not shipped with this repository because it
#       is confidential
# --------------------------------------------------
file_path = r'C:\Users\lenovo\Downloads\Data\combined_with_labels.xlsx'

# Read the workbook and discard the two timestamp columns in one step;
# errors='ignore' keeps this working when either column is absent.
timestamp_columns = ['Measurement Date', 'Measurement Time']
data = pd.read_excel(file_path).drop(columns=timestamp_columns, errors='ignore')

# Map the values in 'label' to integer codes. The fitted encoder is kept
# so the codes can be translated back to the original class names later.
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Columns used as model inputs, and the encoded target
feature_columns = [
    'Irradiance',
    'Temperature Thermocouple 2',
    'Pmax',
    'Vmpp',
    'Impp',
    'Voc',
    'Isc',
]
X = data[feature_columns]
y = data['label_encoded']

# --------------------------------------------------
# Train-Test Split
# --------------------------------------------------
# Hold out 20% of the rows for testing. Stratifying on y asks the splitter
# to preserve the class proportions in both parts, and the fixed seed makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Report how many rows ended up in each part
n_total, n_train, n_test = len(data), len(X_train), len(X_test)
print(f'Original Dataset: {n_total} samples')
print(f'Training Set: {n_train} samples')
print(f'Testing Set: {n_test} samples')

# --------------------------------------------------
# Hyperparameter Tuning (Randomized Search)
# --------------------------------------------------
# Candidate values for each random-forest hyperparameter. The search samples
# combinations from these lists instead of trying every one of them.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
    bootstrap=[True, False],
    max_features=['sqrt', 'log2', None],
    criterion=['gini', 'entropy'],
)

# Base estimator; the seed keeps each fitted forest reproducible
rf_model = RandomForestClassifier(random_state=42)

# 50 sampled combinations, each scored by 3-fold cross-validation on the
# training set with the weighted F1 score. n_jobs=-1 requests all available
# processors for the search.
random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    n_iter=50,
    cv=3,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)
print("RandomizedSearchCV complete.")

# Estimator built with the best-scoring combination found by the search
best_model = random_search.best_estimator_

# --------------------------------------------------
# Evaluation on Test Set
# --------------------------------------------------
# Predict the held-out rows with the tuned model and summarise the result
# with overall accuracy and the class-frequency-weighted F1 score.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Best Parameters: {random_search.best_params_}')
print(f'Accuracy on test set: {accuracy:.4f}')
print(f'F1 Score on test set: {f1:.4f}')

# Confusion Matrix
# Request one row/column per encoded class explicitly. Without `labels`,
# confusion_matrix only covers classes that occur in y_test or y_pred, so a
# class missing from both would make the matrix smaller than the list of
# tick labels below and the axes would be mislabelled.
class_ids = np.arange(len(label_encoder.classes_))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    cbar=False
)
plt.title('Confusion Matrix - Test Set')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=0, fontsize=8)
plt.yticks(fontsize=8)
plt.show()

# --------------------------------------------------
# Feature Importance
# --------------------------------------------------
# Importance score of each input column as reported by the tuned forest,
# plotted from the most to the least important feature.
importances = best_model.feature_importances_
features = X.columns
indices = np.flip(np.argsort(importances))  # positions in descending order

bar_positions = range(len(importances))
sorted_importances = importances[indices]
sorted_feature_names = features[indices]

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, sorted_importances)
plt.title('Feature Importance - Random Forest')
# Column names are long, so rotate them to keep the ticks readable
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.tight_layout()
plt.show()

# --------------------------------------------------
# Evaluation on Validation Dataset
# --------------------------------------------------
# Score the tuned model on a separate workbook that played no part in
# training or in the hyperparameter search.
new_data_path = r'C:\Users\lenovo\Downloads\Data\validation_data.xlsx'
new_data = pd.read_excel(new_data_path)

new_data = new_data.drop(
    columns=['Measurement Date', 'Measurement Time'],
    errors='ignore'
)

# IMPORTANT: use the SAME label encoder, so that every class name maps to
# the integer code the model was trained on (transform only, never re-fit).
new_data['label_encoded'] = label_encoder.transform(new_data['label'])

# Reuse the training feature columns instead of repeating the list, so the
# validation inputs always have the same columns in the same order as the
# data the model was fitted on.
X_new = new_data[list(X.columns)]
y_new = new_data['label_encoded']

y_new_pred = best_model.predict(X_new)

accuracy_new = accuracy_score(y_new, y_new_pred)
f1_new = f1_score(y_new, y_new_pred, average='weighted')

print(f'Accuracy on validation data: {accuracy_new:.4f}')
print(f'F1 Score on validation data: {f1_new:.4f}')

# Request one row/column per encoded class explicitly. The validation file
# is not guaranteed to contain every class; without `labels` the matrix
# would then be smaller than the list of tick labels below and the axes
# would be mislabelled.
class_ids_new = np.arange(len(label_encoder.classes_))
conf_matrix_new = confusion_matrix(y_new, y_new_pred, labels=class_ids_new)

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_new,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Greens',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    cbar=False
)
plt.title('Confusion Matrix - Validation Dataset')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=0, fontsize=8)
plt.yticks(fontsize=8)
plt.show()

print("Program complete.")
