In [65]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import numpy as np
from os import system
from numpy import inf
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
from tqdm.notebook import tqdm as tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, KFold
import pickle
from pymoo.algorithms.soo.nonconvex.pso import PSO, PSOAnimation
from pymoo.optimize import minimize
from pymoo.factory import get_termination
from pymoo.core.callback import Callback

from pymoo.core.mixed import MixedVariableGA
from pymoo.core.variable import Real, Integer
from pymoo.optimize import minimize

from pymoo.util.display.column import Column
from pymoo.util.display.output import Output

from pymoo.core.problem import ElementwiseProblem
from pymoo.core.variable import Real, Integer, Choice, Binary
import time

In [66]:
df = pd.read_csv('../preprocessing/processed_data/wine.csv')
y_wine = df[['quality']]
x_wine = df.drop(columns='quality')
x_wine = x_wine.drop(columns='Unnamed: 0')

In [67]:
ITERATIONS = 32
POPULATION = 32

In [68]:
class MyOutput(Output):

    def __init__(self):
        super().__init__()
        global pbar 
        pbar = tqdm(total=ITERATIONS)
        self.score = Column("score", width=13)
        self.Parameters = Column("Parameters", width=35)
        self.columns += [self.score, self.Parameters]

    def update(self, algorithm):
        super().update(algorithm)
        self.score.set(-np.min(algorithm.pop.get("F")))
        #self.Parameters.set(algorithm.pop.get("X")[0])
        pbar.update(1)
        if pbar.n == ITERATIONS: pbar.close()

# Genetic Algorithm

In [69]:
class MixedVariableProblem(ElementwiseProblem):
    def __init__(self, **kwargs):
        #params reference 1
        xgb_params = {
        'n_estimators' : Integer(bounds=(3, 10000)), #high num
        'learning_rate' : Real(bounds=(0.001, 0.3)),
        'max_depth' : Integer(bounds=(3, 15)),
        'subsample' : Real(bounds=(0.05, 1.0)),
        'colsample_bytree' : Real(bounds=(0.2, 1.0)),
        'gamma'            : Real(bounds=(0, 10)),
        'min_child_weight' : Real(bounds=(0, 10)),
        'reg_lambda'       : Real(bounds=(0, 1)), #l2 lambda
        'reg_alpha'        : Real(bounds=(0, 1)), #L1 alpha
        }
        super().__init__(vars=xgb_params, n_obj=1, **kwargs)

    def _evaluate(self, X, out, *args, **kwargs):
        n_estimators = X['n_estimators']
        learning_rate = X['learning_rate']
        max_depth = X['max_depth']
        subsample = X['subsample']
        colsample_bytree = X['colsample_bytree']
        gamma = X['gamma']
        min_child_weight = X['min_child_weight']
        reg_lambda = X['reg_lambda']
        reg_alpha = X['reg_alpha']
        
        model_xgboost = xgb.XGBClassifier(
            n_estimators = n_estimators,
            learning_rate = learning_rate,
            max_depth = max_depth,
            subsample = subsample,
            colsample_bytree = colsample_bytree,
            gamma = gamma,
            min_child_weight = min_child_weight,
            reg_lambda = reg_lambda,
            reg_alpha = reg_alpha,
            n_jobs = -1
            )
        
        kfold = KFold(n_splits = 3, shuffle = True)
        
        scores = cross_val_score(model_xgboost,  x_wine, y_wine, cv = kfold, n_jobs=-1, scoring='f1_weighted')  
        result = scores.mean()     
        if result == np.nan:
            result = 0
        #print(result)
        #return result
        out["F"] = -result

In [70]:
def run_accuracy_ga(ITERATIONS=32, POPULATION=32):
    problem = MixedVariableProblem()

    algorithm = MixedVariableGA(pop_size=POPULATION)

    term = get_termination("n_gen", ITERATIONS)

    start_time = time.time()

    res = minimize(problem,
                   algorithm,
                   termination=term,
                   verbose=True,
                   output=MyOutput(),
                   # seed=1,
                   )

    end_time = time.time()

    execution_time = end_time - start_time
    #print(f"Execution time: {execution_time:.2f} seconds")
    #print("Best solution found: \nX = %s\nF = %s" % (res.X, res.F))
    params, score = res.X, res.F[0]
    return params, -score, execution_time


In [71]:
def full_run_ga(iterations):
    filename = './XGBoost_wine_data.csv'
    try:
        XGBoost_data = pd.read_csv(filename)
    except:
        open(filename, "a")
        XGBoost_data = pd.DataFrame(columns=['Algorithm', 'Model', 'F1', 'Time', 'params'])

    
    for i in tqdm(range(iterations)):
        params, score, execution_time = run_accuracy_ga()
        
        temp = pd.DataFrame({'Algorithm' : ['GA'], 
                             'Model' : ['XGBoost'], 
                            'F1' : [score], 
                            'Time' : [execution_time],
                            'params' : [params]
                            })
        
        XGBoost_data = pd.concat([XGBoost_data, temp], ignore_index=True)
        XGBoost_data[['Algorithm', 'Model', 'F1', 'Time', 'params']].to_csv(filename, index=False)

# PSO

https://pymoo.org/algorithms/soo/pso.html?highlight=pso

In [96]:
n_estimators_min = 3, 
n_estimators_max = 10000
learning_rate_min = 0.001, 
learning_rate_max = 0.3
max_depth_min = 3, 
max_depth_max = 15
subsample_min = 0.05, 
subsample_max = 1.0
colsample_bytree_min = 0.2, 
colsample_bytree_max = 1.0
gamma_min = 0, 
gamma_max = 10
min_child_weight_min = 0, 
min_child_weight_max = 10
reg_lambda_min = 0, 
reg_lambda_max = 1
reg_alpha_min = 0, 
reg_alpha_max = 1

In [97]:
latex_formula = r"\[ \text{{scaled\_value}} = \text{{min\_val}} + (\text{{max\_val}} - \text{{min\_val}}) \cdot \frac{x}{1000} \]"

# Print the LaTeX formula in Markdown format
print(f"```markdown\n{latex_formula}\n```")

```markdown
\[ \text{{scaled\_value}} = \text{{min\_val}} + (\text{{max\_val}} - \text{{min\_val}}) \cdot \frac{x}{1000} \]
```


In [98]:
def search_range(x, max_val, min_val):
    # Ensure that x is within the range [0, 1000]
    #x = max(0, min(1000, x))
    
    # Scale x to the range [min_val, max_val]
    scaled_value = min_val + (max_val - min_val) * (x / 1000)
    
    return scaled_value
# Example usage
result = search_range(499, 1.0, 0.2)
print(result)

0.5992


In [101]:
def round_to_integers(positions, lower_bound, upper_bound):
    return np.clip(np.round(positions), lower_bound, upper_bound).astype(int)

In [102]:
import numpy as np
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.core.problem import Problem
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.operators.repair.rounding import RoundingRepair
from pymoo.operators.sampling.rnd import IntegerRandomSampling
from pymoo.optimize import minimize

class PSO_Problem(Problem):

    def __init__(self):
        super().__init__(n_var=9, n_obj=1, n_constr=1, 
                         #xl=[n_estimators_min, 0, max_depth_min, 0, 0, 0, 0, 0, 0],
                        xl=[0, learning_rate_min, 0, subsample_min, colsample_bytree_min, gamma_min, min_child_weight_min, reg_lambda_min, reg_alpha_min], 
                        xu=[1, learning_rate_max, 1, subsample_max, colsample_bytree_max, gamma_max, min_child_weight_max, reg_lambda_max, reg_alpha_max], 
                         #xu=[n_estimators_max, 1000, max_depth_max, 1000, 1000, 1000, 1000, 1000, 1000], 
                         vtype=int)

    def _evaluate(self, X, out, *args, **kwargs):
        n_estimators = round_to_integers(X[0] * n_estimators_max, n_estimators_min, n_estimators_max)
        learning_rate = X[1] 
        max_depth = round_to_integers(X[2] * max_depth_max, max_depth_min, max_depth_max)
        subsample = X[3]
        colsample_bytree = X[4]
        gamma = X[5]
        min_child_weight = X[6]
        reg_lambda = X[7]
        reg_alpha = X[8]
        
        print('n_estimators', n_estimators)
        print('learning_rate', learning_rate)
        print('max_depth', max_depth)
        print('subsample', subsample)
        print('colsample_bytree', colsample_bytree)
        print('gamma', gamma)
        print('min_child_weight', min_child_weight)
        print('reg_lambda', reg_lambda)
        print('reg_alpha', reg_alpha)
        
        model_xgboost = xgb.XGBClassifier(
            n_estimators = n_estimators,
            learning_rate = learning_rate,
            max_depth = max_depth,
            subsample = subsample,
            colsample_bytree = colsample_bytree,
            gamma = gamma,
            min_child_weight = min_child_weight,
            reg_lambda = reg_lambda,
            reg_alpha = reg_alpha,
            n_jobs = -1
            )
        
        kfold = KFold(n_splits = 3, shuffle = True)
        
        scores = cross_val_score(model_xgboost,  x_wine, y_wine, cv = kfold, n_jobs=-1, scoring='f1_weighted')  
        result = scores.mean()     
        if result == np.nan:
            result = 0
        #print(result)
        #return result
        out["F"] = -result

problem = PSO_Problem()

algorithm = PSO(pop_size=32, w=0.9, c1=2.0, c2=2.0, adaptive=True, initial_velocity='random', max_velocity_rate=0.20, pertube_best=True)

res = minimize(problem,
               algorithm,
               seed=1,
               verbose=False)

print("Best solution found: %s" % res.X)
print("Function value: %s" % res.F)
print("Constraint violation: %s" % res.CV)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9,) + inhomogeneous part.