# In [2]:
import pandas as pd
import numpy as np
import pickle
import tkinter as tk
from tkinter import messagebox, filedialog
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import lightgbm as lgb
import optuna

# Load data and preprocess
def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path`` and build the modelling inputs.

    Args:
        file_path: Path of a CSV file containing the eight feature columns
            listed below and a ``UCS`` target column. A ``sample_id``
            column, if present, is discarded.

    Returns:
        A 4-tuple ``(X_scaled, y_log, scaler, poly)`` where

        * ``X_scaled`` is a NumPy array holding the robust-scaled features
          followed by the degree-2 polynomial terms computed from the
          *unscaled* features,
        * ``y_log`` is ``log1p`` of the ``UCS`` column,
        * ``scaler`` is the fitted ``RobustScaler``,
        * ``poly`` is the fitted ``PolynomialFeatures`` transformer.
    """
    frame = pd.read_csv(file_path)

    # The identifier column carries no predictive information.
    if 'sample_id' in frame.columns:
        frame = frame.drop(columns=['sample_id'])

    feature_names = [
        'curing_days', 'cement', 'flyash', 'water',
        'sa', 'viscosity', 'max_airt', 'max_var',
    ]
    features = frame[feature_names]

    # Model the target on a log scale; callers undo this with expm1.
    target_log = np.log1p(frame["UCS"])

    scaler = RobustScaler()
    scaled_features = scaler.fit_transform(features)

    poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    expanded = poly.fit_transform(features)

    # Skip the leading columns of the expansion: with include_bias=False
    # they are the plain degree-1 features, already present in scaled form.
    higher_order_terms = expanded[:, features.shape[1]:]
    design_matrix = np.concatenate([scaled_features, higher_order_terms], axis=1)

    return design_matrix, target_log, scaler, poly

# Define the objective function for Optuna
def objective(trial, X_scaled, y_log):
    """Optuna objective: mean 5-fold cross-validated RMSE of a LightGBM model.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        X_scaled: 2-D feature matrix (array-like), one row per sample.
        y_log: Target values on the log1p scale (array-like or Series),
            aligned row-for-row with ``X_scaled``.

    Returns:
        The RMSE, measured on the log1p scale, averaged over the five
        validation folds. Lower is better.
    """
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        # suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True samples the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # KFold yields positional indices. Converting to arrays guarantees
    # positional selection even when y_log is a pandas Series whose index
    # labels do not coincide with row positions.
    X_values = np.asarray(X_scaled)
    y_values = np.asarray(y_log)

    # Fixed seed so every trial is scored on the same splits.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_list = []

    for train_index, valid_index in kf.split(X_values):
        X_train, X_valid = X_values[train_index], X_values[valid_index]
        y_train, y_valid = y_values[train_index], y_values[valid_index]

        model = lgb.LGBMRegressor(**param, verbose=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse_list.append(np.sqrt(mean_squared_error(y_valid, y_pred)))

    return np.mean(rmse_list)

# Run Optuna optimization
def optimize_model(X_scaled, y_log, n_trials=100):
    """Search LightGBM hyperparameters with Optuna and return the best set.

    Args:
        X_scaled: Feature matrix forwarded to ``objective``.
        y_log: Log-scale target forwarded to ``objective``.
        n_trials: Number of Optuna trials to run. Defaults to 100, the
            value that was previously hard-coded.

    Returns:
        ``study.best_params``: a dict mapping each sampled hyperparameter
        name to the value from the trial with the lowest objective.
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(
        lambda trial: objective(trial, X_scaled, y_log),
        n_trials=n_trials,
    )
    return study.best_params

# Save model, scaler, and polynomial features
def save_model_and_scaler(model, scaler, poly):
    """Pickle the three fitted objects to fixed file names.

    The files ``lightgbm_model.pkl``, ``scaler.pkl`` and ``poly.pkl`` are
    written, in that order, relative to the current working directory.

    Args:
        model: Object stored in ``lightgbm_model.pkl``.
        scaler: Object stored in ``scaler.pkl``.
        poly: Object stored in ``poly.pkl``.
    """
    artifacts = (
        ('lightgbm_model.pkl', model),
        ('scaler.pkl', scaler),
        ('poly.pkl', poly),
    )
    for filename, artifact in artifacts:
        with open(filename, 'wb') as handle:
            pickle.dump(artifact, handle)

# Save predictions and metrics to Excel
def save_predictions_to_excel(y_valid, y_pred, metrics):
    """Write actual/predicted values and summary metrics to an Excel file.

    Args:
        y_valid: Sequence of actual target values.
        y_pred: Sequence of predicted values, aligned with ``y_valid``.
        metrics: Mapping of metric name to value. Each entry becomes a
            column whose value is repeated on every row.

    The workbook is written to ``predictions_and_metrics.xlsx`` in the
    current working directory, after which an information dialog is shown.
    """
    output_file = 'predictions_and_metrics.xlsx'

    # assign() adds one column per metric, equivalent to setting
    # frame[name] = value for each entry in turn.
    results = pd.DataFrame({'Actual': y_valid, 'Predicted': y_pred}).assign(**metrics)

    results.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"Predictions and metrics saved to {output_file}")

# GUI Application
class ModelApp:
    """Tkinter front end: load a CSV, tune LightGBM with Optuna, save results.

    The window offers two actions. "Load Data" reads and preprocesses a CSV
    file; "Optimize Model" (enabled once data is loaded) runs the
    hyperparameter search, evaluates the best configuration with 5-fold
    cross-validation, and writes the fitted objects and an Excel report to
    the current working directory.
    """

    def __init__(self, master):
        """Build the widgets inside ``master`` (the Tk root or a container)."""
        self.master = master
        master.title("UCS Prediction using LightGBM")

        # Developer information
        self.dev_label = tk.Label(master, text="Developer: Arienkhe Endurance Osemudiamhen")
        self.dev_label.pack()
        self.dev_id_label = tk.Label(master, text="Student_ID: FS22020004E")
        self.dev_id_label.pack()
        self.dev_email_label = tk.Label(master, text="Email: endurance@cumt.edu.cn")
        self.dev_email_label.pack()

        self.label = tk.Label(master, text="Load CSV Data:")
        self.label.pack()

        self.load_data_button = tk.Button(master, text="Load Data", command=self.load_data)
        self.load_data_button.pack()

        # Stays disabled until a data set has been loaded successfully.
        self.optimize_button = tk.Button(master, text="Optimize Model", command=self.optimize_model, state=tk.DISABLED)
        self.optimize_button.pack()

        # Status line updated by the two actions.
        self.result_label = tk.Label(master, text="")
        self.result_label.pack()

        # (X_scaled, y_log, scaler, poly) as returned by
        # load_and_preprocess_data, or None before the first load.
        self.data = None
        self.scaler = None
        self.poly = None
        self.model = None

    def load_data(self):
        """Ask for a CSV file and preprocess it.

        On success the result is stored in ``self.data`` and the optimize
        button is enabled. On failure an error dialog is shown and any
        previously loaded data is left untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
        if not file_path:
            return  # Dialog was cancelled.

        try:
            loaded = load_and_preprocess_data(file_path)
        except (OSError, KeyError, ValueError, TypeError) as exc:
            # Unreadable file, missing column or non-numeric content.
            # Without this the exception would only reach Tk's callback
            # handler and the user would see nothing in the window.
            self.result_label.config(text="Failed to load data.")
            messagebox.showerror("Error", f"Could not load data:\n{exc}")
            return

        self.data = loaded
        self.result_label.config(text="Data loaded and preprocessed!")
        self.optimize_button.config(state=tk.NORMAL)

    def optimize_model(self):
        """Tune, cross-validate, refit and persist the LightGBM model.

        Steps:
            1. Run the Optuna search for the best hyperparameters.
            2. Evaluate them with 5-fold cross-validation, reporting RMSE,
               R² and MAE on the original (un-logged) target scale.
            3. Refit the model on all rows and pickle it together with the
               scaler and polynomial transformer.
            4. Write the out-of-fold predictions and averaged metrics to
               Excel. Rows appear in fold order, not input order.
        """
        if self.data is None:
            messagebox.showwarning("No data", "Please load a CSV file first.")
            return

        X_scaled, y_log, self.scaler, self.poly = self.data

        # KFold yields positional indices; use plain arrays so selection is
        # positional regardless of any pandas index on y_log.
        X_all = np.asarray(X_scaled)
        y_all = np.asarray(y_log)

        # The search below blocks the event loop, so flush a status message
        # to the window first.
        self.result_label.config(text="Optimizing model, please wait...")
        self.master.update_idletasks()

        best_params = optimize_model(X_scaled, y_log)

        # verbose=-1 matches the setting used during the search.
        self.model = lgb.LGBMRegressor(**best_params, verbose=-1)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        metrics_list = []
        all_y_valid = []
        all_y_pred = []

        for train_index, valid_index in kf.split(X_all):
            self.model.fit(X_all[train_index], y_all[train_index])

            # Reverse the log1p transformation once per fold.
            actual = np.expm1(y_all[valid_index])
            predicted = np.expm1(self.model.predict(X_all[valid_index]))

            all_y_valid.extend(actual)
            all_y_pred.extend(predicted)

            metrics_list.append({
                'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
                'R²': r2_score(actual, predicted),
                'MAE': mean_absolute_error(actual, predicted),
            })

        # Average each metric across the folds.
        avg_metrics = {
            name: np.mean([fold[name] for fold in metrics_list])
            for name in ('RMSE', 'R²', 'MAE')
        }

        # After the loop self.model holds the fit from the last fold only.
        # Refit on every row so the persisted model uses all the data.
        self.model.fit(X_all, y_all)

        # Save the final model and scaler
        save_model_and_scaler(self.model, self.scaler, self.poly)

        # Save predictions and metrics to Excel
        save_predictions_to_excel(all_y_valid, all_y_pred, avg_metrics)

        self.result_label.config(text="Model optimized and results saved!")

# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # Create the top-level Tk window and attach the application to it.
    root = tk.Tk()
    app = ModelApp(root)
    # Hand control to Tk's event loop; returns when the window is closed.
    root.mainloop()