# 1. Random forest (Rừng ngẫu nhiên)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import joblib
from mealpy import FloatVar, GA

# 1. Đọc và chia bộ dữ liệu
data = pd.read_csv('data of CFDST.csv', index_col='No')
input_data = data.iloc[:, :-1]
output_data = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.15, random_state=42)

# Normalize dữ liệu:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def objective_func(x):
    n_estimators = int(x[0])
    max_depth = int(x[1])
    min_samples_split = int(x[2])

    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores_rmse = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error')
    return -cv_scores_rmse.mean()

lb = [10, 1, 2]
ub = [200, 10, 10]
problem_dict = {
    "obj_func": objective_func,
    "bounds": FloatVar(lb=lb, ub=ub),
    "minmax": "min",
    "save_population": True
}
term_dict = {"max_early_stop": 50}

optimizer = GA.BaseGA(epoch=100, pop_size=10)
optimizer.solve(problem_dict, termination=term_dict)
print(optimizer.g_best.solution)

# Cài đặt các tham số sau khi đã tối ưu
optimal_model = RandomForestRegressor(
    n_estimators=int(optimizer.g_best.solution[0]),
    max_depth=int(optimizer.g_best.solution[1]),
    min_samples_split=int(optimizer.g_best.solution[2]),
    random_state=42
)
optimal_model.fit(X_train, y_train)

joblib.dump(scaler, 'scaler_rf')
joblib.dump(optimal_model, 'optimal_model_rf')

print(optimizer.g_best.target.fitness)
best_solution = np.array(optimizer.g_best.solution)
np.savetxt('best_solution.csv', best_solution, delimiter=',', comments='')
print(best_solution)

# Load lại scale và model đã huấn luyện
scaler = joblib.load('scaler_rf')
model = joblib.load('optimal_model_rf')

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Đánh giá bằng metric
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
print("Train:")
print("Root Mean Squared Error:", rmse_train)
print("Mean Absolute Error:", mae_train)
print("R-squared:", r2_train)

rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print("Test:")
print("Root Mean Squared Error:", rmse_test)
print("Mean Absolute Error:", mae_test)
print("R-squared:", r2_test)


2025/02/11 05:28:17 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: Solving single objective optimization problem.
2025/02/11 05:28:21 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 1, Current best: 241.39418649143664, Global best: 241.39418649143664, Runtime: 2.07153 seconds
2025/02/11 05:28:24 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 2, Current best: 249.89339736242042, Global best: 241.39418649143664, Runtime: 3.27778 seconds
2025/02/11 05:28:28 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 3, Current best: 249.89339736242042, Global best: 241.39418649143664, Runtime: 3.57678 seconds
2025/02/11 05:28:32 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 4, Current best: 249.89339736242042, Global best: 241.39418649143664, Runtime: 3.59138 seconds
2025/02/11 05:28:35 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 5, Current best: 249.89339736242042, Global best: 241.3941864

[119.5621895    9.67088787   2.4780528 ]
235.15633128107166
[119.5621895    9.67088787   2.4780528 ]
Train:
Root Mean Squared Error: 87.74899104262254
Mean Absolute Error: 63.2750944108856
R-squared: 0.9940857688374537
Test:
Root Mean Squared Error: 230.9108071322845
Mean Absolute Error: 144.06823727381393
R-squared: 0.9663115514992408




# 2. XGBoost

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import joblib
from mealpy import FloatVar, GA

# 1. Đọc và chia bộ dữ liệu
data = pd.read_csv('data of CFDST.csv', index_col='No')
input_data = data.iloc[:, :-1]
output_data = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.15, random_state=42)

# Normalize dữ liệu:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def objective_func(x):
    n_estimators = int(x[0])
    max_depth = int(x[1])
    learning_rate = x[2]

    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores_rmse = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error')
    return -cv_scores_rmse.mean()

lb = [10, 1, 0.01]
ub = [200, 10, 0.3]
problem_dict = {
    "obj_func": objective_func,
    "bounds": FloatVar(lb=lb, ub=ub),
    "minmax": "min",
    "save_population": True
}
term_dict = {"max_early_stop": 50}

optimizer = GA.BaseGA(epoch=100, pop_size=10)
optimizer.solve(problem_dict, termination=term_dict)
print(optimizer.g_best.solution)

# Cài đặt các tham số sau khi đã tối ưu
optimal_model = XGBRegressor(
    n_estimators=int(optimizer.g_best.solution[0]),
    max_depth=int(optimizer.g_best.solution[1]),
    learning_rate=optimizer.g_best.solution[2],
    random_state=42
)
optimal_model.fit(X_train, y_train)

joblib.dump(scaler, 'scaler_xgb')
joblib.dump(optimal_model, 'optimal_model_xgb')

print(optimizer.g_best.target.fitness)
best_solution = np.array(optimizer.g_best.solution)
np.savetxt('best_solution_xgb.csv', best_solution, delimiter=',', comments='')
print(best_solution)

# Load lại scale và model đã huấn luyện
scaler = joblib.load('scaler_xgb')
model = joblib.load('optimal_model_xgb')

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Đánh giá bằng metric
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
print("Train:")
print("Root Mean Squared Error:", rmse_train)
print("Mean Absolute Error:", mae_train)
print("R-squared:", r2_train)

rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print("Test:")
print("Root Mean Squared Error:", rmse_test)
print("Mean Absolute Error:", mae_test)
print("R-squared:", r2_test)


2025/02/11 05:35:51 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: Solving single objective optimization problem.
2025/02/11 05:35:53 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 1, Current best: 247.25978860476658, Global best: 247.25978860476658, Runtime: 0.98943 seconds
2025/02/11 05:35:54 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 2, Current best: 247.25978860476658, Global best: 247.25978860476658, Runtime: 1.33501 seconds
2025/02/11 05:35:56 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 3, Current best: 246.074623359427, Global best: 246.074623359427, Runtime: 2.11029 seconds
2025/02/11 05:35:59 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 4, Current best: 245.8901945790128, Global best: 245.8901945790128, Runtime: 2.36084 seconds
2025/02/11 05:36:01 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 5, Current best: 245.8901945790128, Global best: 245.8901945790128,

[156.04188685   3.36112334   0.2614724 ]
232.3307631902103
[156.04188685   3.36112334   0.2614724 ]
Train:
Root Mean Squared Error: 31.74821682724147
Mean Absolute Error: 14.312013692652926
R-squared: 0.9992258008189888
Test:
Root Mean Squared Error: 230.41028699730444
Mean Absolute Error: 119.00275395320013
R-squared: 0.9664574387560276




# 3. Gradient Boosting tree

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import joblib
from mealpy import FloatVar, GA

# 1. Đọc và chia bộ dữ liệu
data = pd.read_csv('data of CFDST.csv', index_col='No')
input_data = data.iloc[:, :-1]
output_data = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.15, random_state=42)

# Normalize dữ liệu:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def objective_func(x):
    n_estimators = int(x[0])
    max_depth = int(x[1])
    learning_rate = x[2]

    model = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores_rmse = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error')
    return -cv_scores_rmse.mean()

lb = [10, 1, 0.01]
ub = [200, 10, 0.3]
problem_dict = {
    "obj_func": objective_func,
    "bounds": FloatVar(lb=lb, ub=ub),
    "minmax": "min",
    "save_population": True
}
term_dict = {"max_early_stop": 50}

optimizer = GA.BaseGA(epoch=100, pop_size=10)
optimizer.solve(problem_dict, termination=term_dict)
print(optimizer.g_best.solution)

# Cài đặt các tham số sau khi đã tối ưu
optimal_model = GradientBoostingRegressor(
    n_estimators=int(optimizer.g_best.solution[0]),
    max_depth=int(optimizer.g_best.solution[1]),
    learning_rate=optimizer.g_best.solution[2],
    random_state=42
)
optimal_model.fit(X_train, y_train)

joblib.dump(scaler, 'scaler_gbt')
joblib.dump(optimal_model, 'optimal_model_gbt')

print(optimizer.g_best.target.fitness)
best_solution = np.array(optimizer.g_best.solution)
np.savetxt('best_solution.csv', best_solution, delimiter=',', comments='')
print(best_solution)

# Load lại scale và model đã huấn luyện
scaler = joblib.load('scaler_gbt')
model = joblib.load('optimal_model_gbt')

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Đánh giá bằng metric
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
print("Train:")
print("Root Mean Squared Error:", rmse_train)
print("Mean Absolute Error:", mae_train)
print("R-squared:", r2_train)

rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
print("Test:")
print("Root Mean Squared Error:", rmse_test)
print("Mean Absolute Error:", mae_test)
print("R-squared:", r2_test)


2025/02/11 05:43:11 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: Solving single objective optimization problem.
2025/02/11 05:43:15 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 1, Current best: 195.48210914110877, Global best: 195.48210914110877, Runtime: 1.61218 seconds
2025/02/11 05:43:17 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 2, Current best: 195.48210914110877, Global best: 195.48210914110877, Runtime: 1.51176 seconds
2025/02/11 05:43:18 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 3, Current best: 195.48210914110877, Global best: 195.48210914110877, Runtime: 1.51766 seconds
2025/02/11 05:43:20 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 4, Current best: 195.48210914110877, Global best: 195.48210914110877, Runtime: 1.61098 seconds
2025/02/11 05:43:21 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 5, Current best: 195.48210914110877, Global best: 195.4821091

[1.99989774e+02 4.88685303e+00 1.78047811e-01]
193.49759792054573
[1.99989774e+02 4.88685303e+00 1.78047811e-01]
Train:
Root Mean Squared Error: 30.14493940507195
Mean Absolute Error: 8.177540817132838
R-squared: 0.9993020201862193
Test:
Root Mean Squared Error: 297.4005064891345
Mean Absolute Error: 141.68818448832755
R-squared: 0.9441174906737522


