In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
import joblib
import warnings
from sklearn.linear_model import Ridge

warnings.filterwarnings("ignore")

# Load data
clean_data = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Systematic_Experiments\experiment_batch_data.csv")
test_data = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Literature_Data\WMR\WMR_Z.csv", delimiter=";")
print(test_data.columns.tolist())

# Remove coal samples
clean_data = clean_data[clean_data['fuel_category_coal'] != 1]
clean_data = clean_data.drop('fuel_category_coal', axis=1)

# Train-test split: cellulose samples as test set
train_data = clean_data

# Extract features and target
X_train = train_data.drop(columns=['sample', 'fuel_type', 'devol_yield'])
y_train = train_data['devol_yield']
X_test = test_data.drop(columns=['sample', 'fuel_type', 'devol_yield'])
y_test = test_data['devol_yield']


# Imputation
imputer = KNNImputer(n_neighbors=3)
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Define base models
mlp_model = MLPRegressor(max_iter=2000, random_state=42)
lgb_model = lgb.LGBMRegressor(random_state=42)

# Hyperparameter search for MLP
param_dist_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (128, 64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.01, 0.05, 0.1],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1]
}
random_search_mlp = RandomizedSearchCV(mlp_model, param_distributions=param_dist_mlp, n_iter=20, cv=5, n_jobs=-1, random_state=42, scoring='r2')
random_search_mlp.fit(X_train_scaled, y_train)
best_mlp = random_search_mlp.best_estimator_

# Hyperparameter search for LightGBM
param_dist_lgb = {
    'n_estimators': [200, 500, 700],
    'learning_rate': [0.05, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'max_depth': [5, 10, -1],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 0.5, 1.0]
}
random_search_lgb = RandomizedSearchCV(lgb_model, param_distributions=param_dist_lgb, n_iter=200, cv=5, n_jobs=-1, random_state=42, scoring='r2')
random_search_lgb.fit(X_train_scaled, y_train)
best_lgb = random_search_lgb.best_estimator_

# Define the Stacking Regressor
stacking_model = StackingRegressor(
    estimators=[('MLP', best_mlp), ('LGB', best_lgb)],
    final_estimator=Ridge(alpha=0.1)
)

# Train the stacking model
stacking_model.fit(X_train_scaled, y_train)

# Evaluate on cellulose test set
y_pred_stack = stacking_model.predict(X_test_scaled)
r2_stack = r2_score(y_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
mae_stack = mean_absolute_error(y_test, y_pred_stack)

print(f"\nStacking Model Evaluation on Cellulose Test Set:")
print(f"R² Score: {r2_stack:.4f}")
print(f"RMSE: {rmse_stack:.4f}")
print(f"MAE: {mae_stack:.4f}")

# Save the stacking model and pipelines
joblib.dump(stacking_model, r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Models\stacking_model.pkl")
joblib.dump(imputer, r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Models\knn_imputer_stacking.pkl")
joblib.dump(scaler, r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Models\scaler_stacking.pkl")

print("\nStacking model and preprocessing pipeline saved.")


['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate', 'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc', 'lhv', 'fuel_type', 'fuel_category_biomass', 'fuel_category_mix', 'fuel_category_plastic', 'devol_yield']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 1770, number of used features: 20
[LightGBM] [Info] Start training from score 54.088495
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 1770, number of used features: 20
[LightGBM] [Info] Start training from score 54.088495
[LightGBM]