In [218]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import numpy as np

In [219]:
# Load the wine dataset from the preprocessing output folder.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../preprocessing/processed_data/wine.csv')

In [220]:
# Show the column labels to identify the feature columns and the target column.
df.columns

Index(['Unnamed: 0', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [221]:
# Target: the 'quality' column, kept as a single-column DataFrame.
y_wine = df.loc[:, ['quality']]

# Features: every remaining column except 'Unnamed: 0'
# (presumably an index column written out by an earlier to_csv -- TODO confirm).
x_wine = df.drop(columns=['quality', 'Unnamed: 0'])

In [222]:
# List the distinct label values present in the target column.
np.unique(y_wine.values)

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [223]:
from numpy import inf
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
from tqdm.notebook import tqdm as tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, KFold
import pickle
from pymoo.algorithms.soo.nonconvex.pso import PSO, PSOAnimation
from pymoo.optimize import minimize
from pymoo.factory import get_termination
from pymoo.core.callback import Callback

import numpy as np
def special_floor(x):
    """Round ``x`` to the nearest integer, substituting 1 when that integer is 0.

    Despite the name, the value is rounded with ``np.round`` (not floored)
    before being converted to ``int``.

    Args:
        x: A real number.

    Returns:
        int: The rounded value, or 1 if the rounded value is 0.
    """
    rounded = int(np.round(x))
    return 1 if rounded == 0 else rounded

# Default search budget: number of generations and individuals per generation.
ITERATIONS = 32
POPULATION = 32

# NOTE(review): DIMENSIONS is not referenced anywhere else in the visible
# notebook (the problem below is defined with 11 variables).
DIMENSIONS = 16

# Search-space bounds, one (min, max) pair per XGBoost hyperparameter.
# The *_min values feed the lower-bound vector and the *_max values the
# upper-bound vector of the optimisation problem.
n_estimators_min, n_estimators_max = 10, 1000
learning_rate_min, learning_rate_max = 0.0001, 0.6
subsample_min, subsample_max = 0.6, 1.0
colsample_bytree_min, colsample_bytree_max = 0.6, 1.0
gamma_min, gamma_max = 0, 0.5
max_depth_min, max_depth_max = 3, 10
min_child_weight_min, min_child_weight_max = 1, 10
reg_alpha_min, reg_alpha_max = 0, 0.1
reg_lambda_min, reg_lambda_max = 0, 1
scale_pos_weight_min, scale_pos_weight_max = 1, 10
base_score_min, base_score_max = 0, 1

import numpy as np
from pymoo.core.problem import ElementwiseProblem

class OptimizeWithF1(ElementwiseProblem):
    """Single-objective pymoo problem that tunes XGBoost hyperparameters.

    A candidate ``x`` is a vector of 11 values, in this order:

        0 n_estimators      (rounded to int)
        1 learning_rate
        2 subsample
        3 colsample_bytree
        4 gamma
        5 max_depth         (rounded via ``special_floor``)
        6 min_child_weight  (rounded to int)
        7 reg_alpha
        8 reg_lambda
        9 scale_pos_weight  (truncated with ``int``, not rounded)
        10 base_score

    The objective is the negated mean weighted-F1 from a shuffled 3-fold
    cross-validation on the module-level ``x_wine`` / ``y_wine``, so
    minimising it maximises F1.
    """

    def __init__(self):
        lower_bounds = np.array([
            n_estimators_min,
            learning_rate_min,
            subsample_min,
            colsample_bytree_min,
            gamma_min,
            max_depth_min,
            min_child_weight_min,
            reg_alpha_min,
            reg_lambda_min,
            scale_pos_weight_min,
            base_score_min,
        ])
        upper_bounds = np.array([
            n_estimators_max,
            learning_rate_max,
            subsample_max,
            colsample_bytree_max,
            gamma_max,
            max_depth_max,
            min_child_weight_max,
            reg_alpha_max,
            reg_lambda_max,
            scale_pos_weight_max,
            base_score_max,
        ])

        # No constraint count is passed: _evaluate only ever writes out['F'],
        # so the previously declared 11 constraints were never evaluated.
        # The box bounds xl/xu are the only restrictions on the search space.
        #
        # NOTE(review): `types` is forwarded unchanged as an extra keyword;
        # it does not look like a documented Problem argument (pymoo documents
        # `vtype`) -- confirm whether it has any effect.
        super().__init__(n_var=11,
                         n_obj=1,
                         types=np.array([int, float, float, float, float, int, int, float, float, float, float]),
                         xl=lower_bounds,
                         xu=upper_bounds)

    def _evaluate(self, x, out, *args, **kwargs):
        """Score one hyperparameter vector and store the objective in ``out['F']``.

        Args:
            x: Sequence of 11 hyperparameter values (order in the class docstring).
            out: Result dictionary supplied by pymoo; ``out['F']`` is written.
        """
        model_xgboost = xgb.XGBClassifier(
            n_estimators=int(np.round(x[0])),
            learning_rate=x[1],
            subsample=x[2],
            colsample_bytree=x[3],
            gamma=x[4],
            max_depth=special_floor(x[5]),
            min_child_weight=int(np.round(x[6])),
            reg_alpha=x[7],
            reg_lambda=x[8],
            scale_pos_weight=int(x[9]),
            base_score=x[10],
            n_jobs=-1,
        )

        # NOTE(review): shuffle=True with no random_state means the folds
        # differ on every evaluation, so the objective is noisy.
        kfold = KFold(n_splits=3, shuffle=True)

        scores = cross_val_score(model_xgboost, x_wine, y_wine, cv=kfold,
                                 scoring='f1_weighted', n_jobs=-1)
        result = scores.mean()

        # cross_val_score reports NaN for a fold whose fit failed, which makes
        # the mean NaN.  Score such candidates as F1 = 0 (the worst possible
        # value) so a NaN never reaches the optimiser's comparisons.
        if not np.isfinite(result):
            result = 0.0

        # pymoo minimises, so negate the score.
        out['F'] = -1 * result

# Shared problem instance passed to minimize() by the run_f1_* helpers.
problemF1 = OptimizeWithF1()

from pymoo.util.display.column import Column
from pymoo.util.display.output import Output

class MyOutput(Output):
    """pymoo display that adds a best-score column and drives a tqdm progress bar.

    The progress bar is stored in the module-level global ``pbar`` and is
    recreated every time a ``MyOutput`` is constructed.
    """

    def __init__(self):
        super().__init__()
        global pbar
        # Start with the module-level default; update() corrects the total on
        # the first generation if the run uses a different generation budget.
        pbar = tqdm(total=ITERATIONS)
        self.score = Column("score", width=13)
        # The Parameters column is registered but never set, so it renders as "-".
        self.Parameters = Column("Parameters", width=35)
        self.columns += [self.score, self.Parameters]

    def update(self, algorithm):
        """Refresh the score column and advance the progress bar by one generation.

        Args:
            algorithm: The running pymoo algorithm; its current population's
                ``F`` values are read to report the best (negated back) score.
        """
        super().update(algorithm)
        self.score.set(-np.min(algorithm.pop.get("F")))

        # The bar used to be sized and closed against the module constant
        # ITERATIONS even when the run was given another generation count
        # (e.g. 10), so it showed "x/32" and was never closed.  On the first
        # update, take the real budget from the termination criterion when it
        # exposes a finite ``n_max_gen``.
        if pbar.n == 0:
            n_max_gen = getattr(getattr(algorithm, "termination", None), "n_max_gen", None)
            if (isinstance(n_max_gen, (int, float))
                    and np.isfinite(n_max_gen)
                    and int(n_max_gen) != pbar.total):
                pbar.reset(total=int(n_max_gen))

        pbar.update(1)
        if pbar.total is not None and pbar.n >= pbar.total:
            pbar.close()
        
import numpy as np

# Module-level lower (xl) and upper (xu) bound vectors, built from one
# (min, max) pair per hyperparameter.  zip() regroups the pairs into a tuple of
# all minimums followed by a tuple of all maximums, in the same variable order
# used by OptimizeWithF1.
xl, xu = (
    np.array(side)
    for side in zip(
        (n_estimators_min, n_estimators_max),
        (learning_rate_min, learning_rate_max),
        (subsample_min, subsample_max),
        (colsample_bytree_min, colsample_bytree_max),
        (gamma_min, gamma_max),
        (max_depth_min, max_depth_max),
        (min_child_weight_min, min_child_weight_max),
        (reg_alpha_min, reg_alpha_max),
        (reg_lambda_min, reg_lambda_max),
        (scale_pos_weight_min, scale_pos_weight_max),
        (base_score_min, base_score_max),
    )
)

def PSO_Optimize_F1(values):
    """Objective function: negated mean weighted-F1 of an XGBoost configuration.

    Args:
        values: Indexable whose first element is the 11-value hyperparameter
            vector (n_estimators, learning_rate, subsample, colsample_bytree,
            gamma, max_depth, min_child_weight, reg_alpha, reg_lambda,
            scale_pos_weight, base_score).

    Returns:
        The negated mean weighted-F1 over a shuffled 10-fold cross-validation
        on the module-level ``x_wine`` / ``y_wine``; 0 when the mean is NaN.
    """
    x = values[0]
    model_xgboost = xgb.XGBClassifier(
        n_estimators=int(np.round(x[0])),
        learning_rate=x[1],
        subsample=x[2],
        colsample_bytree=x[3],
        gamma=x[4],
        max_depth=special_floor(x[5]),
        min_child_weight=int(np.round(x[6])),
        reg_alpha=x[7],
        reg_lambda=x[8],
        scale_pos_weight=int(x[9]),
        base_score=x[10],
        n_jobs=-1,
    )

    kfold = KFold(n_splits=10, shuffle=True)

    scores = cross_val_score(model_xgboost, x_wine, y_wine, cv=kfold,
                             n_jobs=-1, scoring='f1_weighted')
    result = scores.mean()

    # NaN never compares equal to anything (including itself), so the previous
    # `result == np.nan` check could not fire; np.isnan is the correct test.
    if np.isnan(result):
        result = 0
    print(result)
    return -result

from pymoo.algorithms.soo.nonconvex.pso import PSO

# Number of CPU cores intended for the search.
# NOTE(review): n_cores is not referenced anywhere else in the visible notebook;
# the estimators and cross_val_score calls use n_jobs=-1 instead.
n_cores = 16

def run_f1_pso(ITERATIONS=2, POPULATION=3):
    """Run a PSO search over ``problemF1`` and return the best final individual.

    Args:
        ITERATIONS: Number of generations before termination.
        POPULATION: Swarm size.

    Returns:
        tuple: ``(score_best_individual, parameters_best_individual, res)``
        where the score is the objective row (negated F1) of the best
        individual in the final population, the parameters are its ``X``
        vector, and ``res`` is the pymoo result object.

    Raises:
        ValueError: If every objective value in the final population is NaN.
    """
    algorithm = PSO(pop_size=POPULATION, c1=1.5, c2=1.5, w=0.5)

    term = get_termination("n_gen", ITERATIONS)

    # NOTE(review): `vtype` is passed through to minimize() as an extra
    # keyword; confirm that it actually influences the search.
    res = minimize(problemF1,
                   algorithm,
                   save_history=False,
                   verbose=True,
                   vtype=['int', 'float', 'float', 'float', 'float', 'float', 'int', 'float', 'float', 'int', 'float'],
                   output=MyOutput(),
                   termination=term)

    final_F = res.pop.get('F')
    final_X = res.pop.get('X')

    # np.nanargmin skips NaN objectives (produced when a CV fit fails).  The
    # former `np.where(F == np.min(F))[0][0]` raised IndexError in that case,
    # because np.min returns NaN and NaN equals nothing.  With a single
    # objective column the flat index is the row index.
    index_best_individual = np.nanargmin(final_F)
    score_best_individual = final_F[index_best_individual]
    parameters_best_individual = final_X[index_best_individual]

    return score_best_individual, parameters_best_individual, res


In [224]:
#score_best_individual, parameters_best_individual, res = run_f1_pso()

In [225]:
from pymoo.algorithms.soo.nonconvex.ga import GA
def run_f1_ga(ITERATIONS = 32, POPULATION = 32):
    """Run a GA search over ``problemF1``, passing a ``vtype`` list to minimize().

    Args:
        ITERATIONS: Number of generations before termination.
        POPULATION: Population size.

    Returns:
        tuple: ``(score_best_individual, parameters_best_individual, res)``
        where the score is the objective row (negated F1) of the best
        individual in the final population, the parameters are its ``X``
        vector, and ``res`` is the pymoo result object.

    Raises:
        ValueError: If every objective value in the final population is NaN.
    """
    algorithm = GA(pop_size=POPULATION)

    term = get_termination("n_gen", ITERATIONS)

    # NOTE(review): `vtype` is passed through to minimize() as an extra
    # keyword; confirm that it actually influences the search.
    res = minimize(problemF1,
                   algorithm,
                   save_history=False,
                   vtype=['int', 'float', 'float', 'float', 'float', 'float', 'int', 'float', 'float', 'int', 'float'],
                   verbose=True,
                   output=MyOutput(),
                   termination=term)

    final_F = res.pop.get('F')
    final_X = res.pop.get('X')

    # np.nanargmin skips NaN objectives (produced when a CV fit fails).  The
    # former `np.where(F == np.min(F))[0][0]` raised IndexError in that case,
    # because np.min returns NaN and NaN equals nothing.  With a single
    # objective column the flat index is the row index.
    index_best_individual = np.nanargmin(final_F)
    score_best_individual = final_F[index_best_individual]
    parameters_best_individual = final_X[index_best_individual]

    return score_best_individual, parameters_best_individual, res

In [226]:
# Repeat the GA search 100 times and collect the best weighted-F1 of each run.
# run_f1_ga returns the minimised objective (negated F1), hence the sign flip.
with_vartype_list = [] 
for i in range(100):
    # 10 generations with a population of 10 per run.
    score_best_individual, parameters_best_individual, res = run_f1_ga(10, 10)
    with_vartype_list.append(-score_best_individual[0])

  term = get_termination("n_gen", ITERATIONS)


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6245778803 |                                   -
     2 |       20 |  0.6439697088 |                                   -
     3 |       30 |  0.6439697088 |                                   -
     4 |       40 |  0.6439697088 |                                   -
     5 |       50 |  0.6439697088 |                                   -
     6 |       60 |  0.6439697088 |                                   -


1 fits failed out of a total of 3.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qb

     7 |       70 |  0.6514054194 |                                   -
     8 |       80 |  0.6514054194 |                                   -
     9 |       90 |  0.6514054194 |                                   -
    10 |      100 |  0.6514054194 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6239266008 |                                   -
     2 |       20 |  0.6297685574 |                                   -
     3 |       30 |  0.6297685574 |                                   -
     4 |       40 |  0.6297685574 |                                   -
     5 |       50 |  0.6297685574 |                                   -
     6 |       60 |  0.6297685574 |                                   -
     7 |       70 |  0.6300654155 |                                   -
     8 |       80 |  0.6371089640 |                                   -
     9 |       90 |  0.6371089640 |                                   -
    10 |      100 |  0.6371089640 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6223561044 |                                   -
     2 |       20 |  0.6395659647 |                                   -
     3 |       30 |  0.6395659647 |                                   -
     4 |       40 |  0.6395659647 |                                   -
     5 |       50 |  0.6395659647 |                                   -
     6 |       60 |  0.6395659647 |                                   -
     7 |       70 |  0.6395659647 |                                   -
     8 |       80 |  0.6395659647 |                                   -
     9 |       90 |  0.6395659647 |                                   -
    10 |      100 |  0.6418073732 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6292249490 |                                   -
     2 |       20 |  0.6292249490 |                                   -
     3 |       30 |  0.6292249490 |                                   -
     4 |       40 |  0.6479013696 |                                   -
     5 |       50 |  0.6479013696 |                                   -
     6 |       60 |  0.6479013696 |                                   -
     7 |       70 |  0.6479013696 |                                   -
     8 |       80 |  0.6479013696 |                                   -
     9 |       90 |  0.6479013696 |                                   -
    10 |      100 |  0.6479013696 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6281572800 |                                   -
     2 |       20 |  0.6281572800 |                                   -
     3 |       30 |  0.6421925769 |                                   -
     4 |       40 |  0.6447503288 |                                   -
     5 |       50 |  0.6447503288 |                                   -
     6 |       60 |  0.6447503288 |                                   -
     7 |       70 |  0.6473951861 |                                   -
     8 |       80 |  0.6476955332 |                                   -
     9 |       90 |  0.6476955332 |                                   -
    10 |      100 |  0.6476955332 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6261409179 |                                   -
     2 |       20 |  0.6378867062 |                                   -
     3 |       30 |  0.6462835653 |                                   -
     4 |       40 |  0.6462835653 |                                   -
     5 |       50 |  0.6462835653 |                                   -
     6 |       60 |  0.6466579117 |                                   -
     7 |       70 |  0.6466579117 |                                   -
     8 |       80 |  0.6466579117 |                                   -
     9 |       90 |  0.6466579117 |                                   -
    10 |      100 |  0.6466579117 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6457663986 |                                   -
     2 |       20 |  0.6457663986 |                                   -
     3 |       30 |  0.6457663986 |                                   -
     4 |       40 |  0.6457663986 |                                   -
     5 |       50 |  0.6457663986 |                                   -
     6 |       60 |  0.6515297211 |                                   -
     7 |       70 |  0.6515297211 |                                   -
     8 |       80 |  0.6515297211 |                                   -
     9 |       90 |  0.6515297211 |                                   -
    10 |      100 |  0.6515297211 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6386330581 |                                   -
     2 |       20 |  0.6386330581 |                                   -
     3 |       30 |  0.6386330581 |                                   -
     4 |       40 |  0.6386330581 |                                   -
     5 |       50 |  0.6386330581 |                                   -
     6 |       60 |  0.6386330581 |                                   -
     7 |       70 |  0.6386330581 |                                   -
     8 |       80 |  0.6386330581 |                                   -
     9 |       90 |  0.6386330581 |                                   -
    10 |      100 |  0.6386330581 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6285528946 |                                   -
     2 |       20 |  0.6285528946 |                                   -
     3 |       30 |  0.6285528946 |                                   -
     4 |       40 |  0.6304677381 |                                   -
     5 |       50 |  0.6304677381 |                                   -
     6 |       60 |  0.6436422075 |                                   -
     7 |       70 |  0.6436422075 |                                   -
     8 |       80 |  0.6436422075 |                                   -
     9 |       90 |  0.6436422075 |                                   -
    10 |      100 |  0.6436422075 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6135530627 |                                   -
     2 |       20 |  0.6198580625 |                                   -
     3 |       30 |  0.6417110020 |                                   -
     4 |       40 |  0.6417110020 |                                   -
     5 |       50 |  0.6417110020 |                                   -
     6 |       60 |  0.6417110020 |                                   -
     7 |       70 |  0.6417110020 |                                   -
     8 |       80 |  0.6417110020 |                                   -
     9 |       90 |  0.6417110020 |                                   -
    10 |      100 |  0.6417110020 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6293063806 |                                   -
     2 |       20 |  0.6293063806 |                                   -
     3 |       30 |  0.6293063806 |                                   -
     4 |       40 |  0.6293063806 |                                   -
     5 |       50 |  0.6293063806 |                                   -
     6 |       60 |  0.6293063806 |                                   -
     7 |       70 |  0.6293063806 |                                   -
     8 |       80 |  0.6381924504 |                                   -
     9 |       90 |  0.6381924504 |                                   -
    10 |      100 |  0.6381924504 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6248919025 |                                   -
     2 |       20 |  0.6248919025 |                                   -
     3 |       30 |  0.6248919025 |                                   -
     4 |       40 |  0.6300066290 |                                   -
     5 |       50 |  0.6443226996 |                                   -
     6 |       60 |  0.6443226996 |                                   -
     7 |       70 |  0.6443226996 |                                   -
     8 |       80 |  0.6443226996 |                                   -
     9 |       90 |  0.6443226996 |                                   -
    10 |      100 |  0.6500034524 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6314123859 |                                   -
     2 |       20 |  0.6354224867 |                                   -
     3 |       30 |  0.6354224867 |                                   -
     4 |       40 |  0.6354224867 |                                   -
     5 |       50 |  0.6354224867 |                                   -
     6 |       60 |  0.6457450436 |                                   -
     7 |       70 |  0.6457450436 |                                   -
     8 |       80 |  0.6457450436 |                                   -
     9 |       90 |  0.6457450436 |                                   -
    10 |      100 |  0.6457450436 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6377416780 |                                   -
     2 |       20 |  0.6377416780 |                                   -
     3 |       30 |  0.6377416780 |                                   -
     4 |       40 |  0.6377416780 |                                   -
     5 |       50 |  0.6377416780 |                                   -
     6 |       60 |  0.6377416780 |                                   -
     7 |       70 |  0.6377416780 |                                   -
     8 |       80 |  0.6377416780 |                                   -
     9 |       90 |  0.6377416780 |                                   -
    10 |      100 |  0.6377416780 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6306876198 |                                   -
     2 |       20 |  0.6459821629 |                                   -
     3 |       30 |  0.6459821629 |                                   -
     4 |       40 |  0.6459821629 |                                   -
     5 |       50 |  0.6459821629 |                                   -
     6 |       60 |  0.6459821629 |                                   -
     7 |       70 |  0.6459821629 |                                   -
     8 |       80 |  0.6459821629 |                                   -
     9 |       90 |  0.6501937076 |                                   -
    10 |      100 |  0.6501937076 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6294539052 |                                   -
     2 |       20 |  0.6390947331 |                                   -
     3 |       30 |  0.6390947331 |                                   -
     4 |       40 |  0.6390947331 |                                   -
     5 |       50 |  0.6390947331 |                                   -
     6 |       60 |  0.6390947331 |                                   -
     7 |       70 |  0.6390947331 |                                   -
     8 |       80 |  0.6390947331 |                                   -
     9 |       90 |  0.6390947331 |                                   -
    10 |      100 |  0.6390947331 |                                   -


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6345603302 |                                   -
     2 |       20 |  0.6345603302 |                                   -
     3 |       30 |  0.6345603302 |                                   -
     4 |       40 |  0.6345603302 |                                   -
     5 |       50 |  0.6345603302 |                                   -


KeyboardInterrupt: 

In [None]:
from pymoo.algorithms.soo.nonconvex.ga import GA
def run_f1_ga(ITERATIONS = 32, POPULATION = 32):
    """Run a GA search over ``problemF1`` without passing ``vtype`` to minimize().

    This redefinition replaces any earlier ``run_f1_ga`` in the kernel.

    Args:
        ITERATIONS: Number of generations before termination.
        POPULATION: Population size.

    Returns:
        tuple: ``(score_best_individual, parameters_best_individual, res)``
        where the score is the objective row (negated F1) of the best
        individual in the final population, the parameters are its ``X``
        vector, and ``res`` is the pymoo result object.

    Raises:
        ValueError: If every objective value in the final population is NaN.
    """
    algorithm = GA(pop_size=POPULATION)

    term = get_termination("n_gen", ITERATIONS)

    res = minimize(problemF1,
                   algorithm,
                   save_history=False,
                   verbose=True,
                   output=MyOutput(),
                   termination=term)

    final_F = res.pop.get('F')
    final_X = res.pop.get('X')

    # np.nanargmin skips NaN objectives (produced when a CV fit fails).  The
    # former `np.where(F == np.min(F))[0][0]` raised IndexError in that case,
    # because np.min returns NaN and NaN equals nothing.  With a single
    # objective column the flat index is the row index.
    index_best_individual = np.nanargmin(final_F)
    score_best_individual = final_F[index_best_individual]
    parameters_best_individual = final_X[index_best_individual]

    return score_best_individual, parameters_best_individual, res

In [None]:
# Same experiment as the with-vtype run: 100 GA searches, collecting the best
# weighted-F1 of each.  The score returned by run_f1_ga is the minimised
# objective (negated F1), hence the sign flip.
without_vartype_list = [] 
for i in range(100):
    # 10 generations with a population of 10 per run.
    score_best_individual, parameters_best_individual, res = run_f1_ga(10, 10)
    without_vartype_list.append(-score_best_individual[0])

  term = get_termination("n_gen", ITERATIONS)


  0%|          | 0/32 [00:00<?, ?it/s]

n_gen  |  n_eval  |     score     |              Parameters            
     1 |       10 |  0.6324535750 |                                   -
     2 |       20 |  0.6324535750 |                                   -
     3 |       30 |  0.6324535750 |                                   -


1 fits failed out of a total of 3.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\Felps\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qb

     4 |       40 |  0.6324535750 |                                   -
     5 |       50 |  0.6324535750 |                                   -
     6 |       60 |  0.6324535750 |                                   -
     7 |       70 |  0.6324535750 |                                   -
     8 |       80 |  0.6329938246 |                                   -
     9 |       90 |  0.6435392336 |                                   -
    10 |      100 |  0.6473288788 |                                   -
