In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import numpy as np
import pandas as pd
import pickle

# Load data
data = pd.read_csv('data.csv')  # Replace with your file path

# Features and target
X = data[['curing_days', 'cement', 'flyash', 'water', 'sa', 'viscosity', 'max_airt', 'max_var']]
y = data["UCS"]

# Apply log transformation to the target variable
y_log = np.log1p(y)  # Log transform target variable

# Feature Engineering - Optional: Only use interactions for selected features
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X[['cement', 'flyash', 'water', 'sa', 'viscosity']])  # Interaction terms for selected features
X = np.concatenate([X, X_poly[:, len(X.columns):]], axis=1)  # Add interaction terms to data

# Robust scaling
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_log, test_size=0.2, random_state=42)

# Objective function for Optuna
def objective(trial):
    param = {
        "max_depth": trial.suggest_int("max_depth", 5, 15),  # Lower max_depth
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),  # More conservative learning rate
        "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),  # More boosting rounds
        "gamma": trial.suggest_float("gamma", 0, 0.5),  # More regularization
        "subsample": trial.suggest_float("subsample", 0.7, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    # Define pipeline
    pipeline = Pipeline([
        ('xgb', XGBRegressor(**param))
    ])

    # Perform KFold cross-validation
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    score = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="neg_mean_squared_error")
    return -score.mean()

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=300, show_progress_bar=True)  # Increase trials

# Retrieve best parameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
final_pipeline = Pipeline([
    ('xgb', XGBRegressor(**best_params))
])

final_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = final_pipeline.predict(X_test)

# Reverse the log transformation
y_pred_original = np.expm1(y_pred)  # Convert predictions back to original scale
y_test_original = np.expm1(y_test)  # Convert test values back to original scale

# Evaluate the model
r2 = r2_score(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
mae = mean_absolute_error(y_test_original, y_pred_original)

# Print evaluation metrics
print(f"Test R² Score: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")


[I 2024-12-03 21:29:02,905] A new study created in memory with name: no-name-9f346ad2-1cd9-47eb-8a0c-c2587c806b2c


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2024-12-03 21:29:12,760] Trial 0 finished with value: 0.04895955777626775 and parameters: {'max_depth': 8, 'learning_rate': 0.07898693785011869, 'n_estimators': 2501, 'gamma': 0.3630336872296933, 'subsample': 0.9494339351206216, 'colsample_bytree': 0.898118538126138, 'reg_alpha': 0.03498512185155933, 'reg_lambda': 0.41515390361631355, 'min_child_weight': 10}. Best is trial 0 with value: 0.04895955777626775.
[I 2024-12-03 21:29:26,697] Trial 1 finished with value: 0.03797286881228254 and parameters: {'max_depth': 15, 'learning_rate': 0.066402290308752, 'n_estimators': 2830, 'gamma': 0.029030322741688486, 'subsample': 0.959471561763183, 'colsample_bytree': 0.9412891485489545, 'reg_alpha': 0.40294127617070014, 'reg_lambda': 0.4077939444345267, 'min_child_weight': 7}. Best is trial 1 with value: 0.03797286881228254.
[I 2024-12-03 21:29:38,062] Trial 2 finished with value: 0.033974469752955455 and parameters: {'max_depth': 14, 'learning_rate': 0.014786084672569297, 'n_estimators': 1868, 