In [1]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../preprocessing/processed_data/wine.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [4]:
y_wine = df[['quality']]
x_wine = df.drop(columns='quality')
x_wine = x_wine.drop(columns='Unnamed: 0')

In [5]:
np.unique(y_wine.values)

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [6]:
from numpy import inf
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
from tqdm.notebook import tqdm as tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, KFold
import pickle
from pymoo.algorithms.soo.nonconvex.pso import PSO, PSOAnimation
from pymoo.optimize import minimize
from pymoo.factory import get_termination
from pymoo.core.callback import Callback

import numpy as np
def special_floor(x):
    x = int(np.round(x))
    if x == 0:
        x = 1
    return x

ITERATIONS = 32
POPULATION = 32

DIMENSIONS = 16


n_estimators_max = 1000
learning_rate_max = 0.6
subsample_max = 1.0
colsample_bytree_max = 1.0
gamma_max = 0.5
max_depth_max = 10
min_child_weight_max = 10
reg_alpha_max = 0.1
reg_lambda_max = 1
scale_pos_weight_max = 10
base_score_max = 1

n_estimators_min = 10
learning_rate_min = 0.0001
subsample_min = 0.6
colsample_bytree_min = 0.6
gamma_min = 0
max_depth_min = 3
min_child_weight_min = 1
reg_alpha_min = 0
reg_lambda_min = 0
scale_pos_weight_min = 1
base_score_min = 0

import numpy as np
from pymoo.core.problem import ElementwiseProblem

class OptimizeWithF1(ElementwiseProblem):

    def __init__(self):
        super().__init__(n_var= 11,
                         n_obj=1,
                         n_constr=11,
                         types = np.array([int, float, float, float, float, int, int, float, float, float, float]),
                         xl=np.array([
                          n_estimators_min,
                            learning_rate_min,
                            subsample_min,
                            colsample_bytree_min,
                            gamma_min,
                            max_depth_min,
                            min_child_weight_min,
                            reg_alpha_min,
                            reg_lambda_min,
                            scale_pos_weight_min,
                            base_score_min
                          ]),
                         xu=np.array([
                            n_estimators_max,
                            learning_rate_max,
                            subsample_max,
                            colsample_bytree_max,
                            gamma_max,
                            max_depth_max,
                            min_child_weight_max,
                            reg_alpha_max,
                            reg_lambda_max,
                            scale_pos_weight_max,
                            base_score_max,
                            ])
                        )

    def _evaluate(self, x, out, *args, **kwargs):
        #num_leaves, min_child_samples, n_estimators, learning_rate, subsample_for_bin, min_split_gain, min_child_weight, reg_alpha, reg_lambda
       
        model_xgboost = xgb.XGBClassifier(
                          n_estimators = int(np.round(x[0])),
                          learning_rate = x[1],
                          subsample = x[2],
                          colsample_bytree = x[3],
                          gamma = x[4],
                          max_depth = special_floor(x[5]),
                          min_child_weight = int(np.round(x[6])),
                          reg_alpha = x[7],
                          reg_lambda = x[8],
                          scale_pos_weight = int(x[9]),
                          base_score       = x[10],
                          n_jobs = -1
                                       )
        
        kfold = KFold(n_splits = 3, shuffle = True)

        scores = cross_val_score(model_xgboost, x_wine, y_wine, cv = kfold, scoring='f1_weighted', n_jobs=-1)  
        result = scores.mean()
        out['F'] = -1 * result

problemF1 = OptimizeWithF1()

from pymoo.util.display.column import Column
from pymoo.util.display.output import Output

class MyOutput(Output):

    def __init__(self):
        super().__init__()
        global pbar 
        pbar = tqdm(total=ITERATIONS)
        self.score = Column("score", width=13)
        self.Parameters = Column("Parameters", width=35)
        self.columns += [self.score, self.Parameters]

    def update(self, algorithm):
        super().update(algorithm)
        self.score.set(-np.min(algorithm.pop.get("F")))
        #self.Parameters.set(algorithm.pop.get("X")[0])
        pbar.update(1)
        if pbar.n == ITERATIONS: pbar.close()
        
import numpy as np

xl=np.array([n_estimators_min,
             learning_rate_min,
             subsample_min,
             colsample_bytree_min,
             gamma_min,
             max_depth_min,
             min_child_weight_min,
             reg_alpha_min,
             reg_lambda_min,
             scale_pos_weight_min,
             base_score_min])
xu=np.array([n_estimators_max,
             learning_rate_max,
             subsample_max,
             colsample_bytree_max,
             gamma_max,
             max_depth_max,
             min_child_weight_max,
             reg_alpha_max,
             reg_lambda_max,
             scale_pos_weight_max,
             base_score_max])

def PSO_Optimize_F1(values):
    x = values[0] 
    model_xgboost = xgb.XGBClassifier(
                          n_estimators = int(np.round(x[0])),
                          learning_rate = x[1],
                          subsample = x[2],
                          colsample_bytree = x[3],
                          gamma = x[4],
                          max_depth = special_floor(x[5]),
                          min_child_weight = int(np.round(x[6])),
                          reg_alpha = x[7],
                          reg_lambda = x[8],
                          scale_pos_weight = int(x[9]),
                          base_score       = x[10],
                          n_jobs = -1
                                    )
    
    kfold = KFold(n_splits = 10, shuffle = True)
    
    scores = cross_val_score(model_xgboost,  x_wine, y_wine, cv = kfold, n_jobs=-1, scoring='f1_weighted')  
    result = scores.mean()     
    if result == np.nan:
        result = 0
    print(result)
    return -result

from pymoo.algorithms.soo.nonconvex.pso import PSO

# Define the number of cores you want to use
n_cores = 16

def run_f1_pso(ITERATIONS=2, POPULATION=3):
    algorithm = PSO(pop_size = POPULATION, c1 = 1.5, c2 = 1.5, w = 0.5)

    term = get_termination("n_gen", ITERATIONS)

    res = minimize(problemF1,
                   algorithm,
                   save_history=False,
                   verbose=True,
                   vtype = ['int', 'float', 'float', 'float', 'float', 'float', 'int', 'float', 'float', 'int', 'float'],
                   output=MyOutput(),
                   termination=term)

    index_best_individual = np.where(res.pop.get('F') == np.min(res.pop.get('F')))[0][0]
    score_best_individual = res.pop.get('F')[index_best_individual]
    parameters_best_individual = res.pop.get('X')[index_best_individual]
    
    return score_best_individual, parameters_best_individual, res


In [7]:
#score_best_individual, parameters_best_individual, res = run_f1_pso()

In [8]:
from pymoo.algorithms.soo.nonconvex.ga import GA
def run_f1_ga(ITERATIONS = 32, POPULATION = 32):
    algorithm = GA(pop_size=POPULATION)

    term = get_termination("n_gen", ITERATIONS)

    res = minimize(problemF1,
                algorithm,
                save_history=False,
                vtype = ['int', 'float', 'float', 'float', 'float', 'float', 'int', 'float', 'float', 'int', 'float'],
                verbose=True,
                output=MyOutput(),
                termination = term)


    index_best_individual = np.where(res.pop.get('F') == np.min(res.pop.get('F')))[0][0]
    score_best_individual = res.pop.get('F')[index_best_individual]
    parameters_best_individual = res.pop.get('X')[index_best_individual]

    #print(f'Best F1 Score {-score_best_individual}')
    #print(f'Model parameters: \n {parameters_best_individual}')
    
    return score_best_individual, parameters_best_individual, res

In [9]:
#with_vartype_list = [] 
#for i in range(100):
#    score_best_individual, parameters_best_individual, res = run_f1_ga(10, 10)
#    with_vartype_list.append(-score_best_individual[0])

In [10]:
from pymoo.algorithms.soo.nonconvex.ga import GA
def run_f1_ga(ITERATIONS = 32, POPULATION = 32):
    algorithm = GA(pop_size=POPULATION)

    term = get_termination("n_gen", ITERATIONS)

    res = minimize(problemF1,
                algorithm,
                save_history=False,
                verbose=True,
                output=MyOutput(),
                termination = term)


    index_best_individual = np.where(res.pop.get('F') == np.min(res.pop.get('F')))[0][0]
    score_best_individual = res.pop.get('F')[index_best_individual]
    parameters_best_individual = res.pop.get('X')[index_best_individual]

    #print(f'Best F1 Score {-score_best_individual}')
    #print(f'Model parameters: \n {parameters_best_individual}')
    
    return score_best_individual, parameters_best_individual, res

In [11]:
#without_vartype_list = [] 
#for i in range(100):
#    score_best_individual, parameters_best_individual, res = run_f1_ga(10, 10)
#    without_vartype_list.append(-score_best_individual[0])

In [12]:
from pymoo.util.display.column import Column
from pymoo.util.display.output import Output

class MyOutput(Output):

    def __init__(self):
        super().__init__()
        global pbar 
        pbar = tqdm(total=ITERATIONS)
        self.score = Column("score", width=13)
        self.Parameters = Column("Parameters", width=35)
        self.columns += [self.score, self.Parameters]

    def update(self, algorithm):
        super().update(algorithm)
        self.score.set(-np.min(algorithm.pop.get("F")))
        #self.Parameters.set(algorithm.pop.get("X")[0])
        pbar.update(1)
        if pbar.n == ITERATIONS: pbar.close()

In [13]:
from pymoo.core.problem import ElementwiseProblem
from pymoo.core.variable import Real, Integer, Choice, Binary


class MixedVariableProblem(ElementwiseProblem):
    def __init__(self, **kwargs):
        #params reference 1
        xgb_params = {
        'n_estimators' : Integer(bounds=(3, 10000)), #high num
        'learning_rate' : Real(bounds=(0.001, 0.3)),
        'max_depth' : Integer(bounds=(3, 15)),
        'subsample' : Real(bounds=(0.5, 1.0)),
        'colsample_bytree' : Real(bounds=(0.5, 1.0)),
        'gamma'            : Real(bounds=(0, 10)),
        'min_child_weight' : Real(bounds=(0, 10)),
        'reg_lambda'       : Real(bounds=(0, 1)),
        'reg_alpha'        : Real(bounds=(0, 1)),
        }
        super().__init__(vars=xgb_params, n_obj=1, **kwargs)

    def _evaluate(self, X, out, *args, **kwargs):
        n_estimators = X['n_estimators']
        learning_rate = X['learning_rate']
        max_depth = X['max_depth']
        subsample = X['subsample']
        colsample_bytree = X['colsample_bytree']
        gamma = X['gamma']
        min_child_weight = X['min_child_weight']
        reg_lambda = X['reg_lambda']
        reg_alpha = X['reg_alpha']
        
        model_xgboost = xgb.XGBClassifier(
            n_estimators = n_estimators,
            learning_rate = learning_rate,
            max_depth = max_depth,
            subsample = subsample,
            colsample_bytree = colsample_bytree,
            gamma = gamma,
            min_child_weight = min_child_weight,
            reg_lambda = reg_lambda,
            reg_alpha = reg_alpha,
            n_jobs = -1
            )
        
        kfold = KFold(n_splits = 10, shuffle = True)
        
        scores = cross_val_score(model_xgboost,  x_wine, y_wine, cv = kfold, n_jobs=-1, scoring='f1_weighted')  
        result = scores.mean()     
        if result == np.nan:
            result = 0
        #print(result)
        #return result
        out["F"] = -result

In [14]:
from pymoo.core.mixed import MixedVariableGA
from pymoo.core.variable import Real, Integer
from pymoo.optimize import minimize

problem = MixedVariableProblem()

algorithm = MixedVariableGA(pop=10)

res = minimize(problem,
               algorithm,
               termination=('n_evals', 10),
               verbose=True,
               output=MyOutput(),
               #seed=1,
               )

print("Best solution found: \nX = %s\nF = %s" % (res.X, res.F))

  0%|          | 0/32 [00:00<?, ?it/s]

0.603764674977461
0.5585447156711691
0.5683563479016474
0.5783697479841305
0.6495280759354054
0.5775855831249915
0.611049150650071
0.5883142719508437
0.5983453894984121
0.557220320015371
0.5863797086506161
0.567275727681778
0.5639282573955346
0.5807761780173972
0.5875179703060676
0.5896034638879276
0.5676557242046202
0.5757135588918711
0.5615024831585984
0.5876653771950764
0.6396292033264956
0.5619429446723058
0.5501956810029345
0.5700358849942875
0.5909852259034543
0.5589014267904691
0.5704972894590368
0.5927076858095559
0.6297649101290276
0.6176886465534872
0.5606067625255953
0.5836848809392778
0.5764494636960229
0.567785312491312
0.5611847207278593
0.6007447088123821
0.5626280424891547
0.5917151314832225
0.5899845476284911
0.5912560999948177
0.5692565571005821
0.5879499168513378
0.5609912758992806
0.5818903521168626
0.5669519562382431
0.5993788819772825
0.5752444733716421
0.5640193985570635
0.5790994412262886
0.5592187052618721
n_gen  |  n_eval  |     score     |              Parame