# **FEATURE SELECTION**

In [1]:
# Mount Google Drive inside the Colab VM; the dataset CSV is read from a
# path under /content/drive in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

# Load the merged dataset from the mounted Drive folder
# (comma-separated file, decoded as ISO-8859-1).
df = pd.read_csv('/content/drive/MyDrive/machine learning/Datasets/merged dataset_FE_LM_GC_DC_class balancer.csv', sep=',', encoding='iso-8859-1')

In [3]:

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarning messages emitted by
# the libraries used below — consider filtering specific categories instead.
warnings.filterwarnings("ignore")

# **DATA PREPROCESSING**


## **Transforming nominal categorical variables into ordinal categorical variables**

In [4]:
# Encode the two categorical columns as numeric codes.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# obtained through chained indexing, which pandas deprecates and which leaves
# `df` unchanged once Copy-on-Write is active (the default from pandas 3.0).
modifier_type_codes = {'abstract': 0.0, 'final': 1.0, 'other': 2.0}
visibility_type_codes = {'public': 0.0, 'private': 1.0, 'protected': 2.0, 'package': 3.0}

# Values that are not keys of a mapping are left untouched, exactly as with
# the previous one-value-at-a-time replacements. infer_objects() converts a
# fully mapped object column to a float column (pandas no longer downcasts
# the result of replace() implicitly).
df['modifier_type'] = df['modifier_type'].replace(modifier_type_codes).infer_objects()
df['visibility_type'] = df['visibility_type'].replace(visibility_type_codes).infer_objects()

## **Predictor and Target Attributes**

In [5]:
# Full predictor matrix: df columns at positions 8 through 91 (slice 8:92 is end-exclusive).
predictors = df.iloc[:, 8:92].to_numpy()

# Fixed, hard-coded subset of df column positions stored as "chi_original".
predictors_chi_original = df.iloc[
    :,
    [10, 11, 13, 14, 15, 17, 19, 21, 22, 23, 25, 30, 31,
     33, 34, 35, 37, 40, 44, 54, 60, 61, 62, 63, 64],
].to_numpy()

In [6]:
# Target vector: the df column at position 7, i.e. the column immediately
# before the first predictor column (position 8).
target = df.iloc[:, 7].values


## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

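Normalization (uses maximum and minimum values as a reference). Note: the `Normalizer` class applied in the next cell rescales each sample (row) to unit norm; min–max feature scaling is what `MinMaxScaler` provides.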

In [7]:
from sklearn.preprocessing import (
    Normalizer, StandardScaler
)
# Two rescaled copies of the predictor matrix, each fitted on the whole matrix.
# NOTE(review): scikit-learn's Normalizer rescales every sample (row) to unit
# norm; it is not min-max feature scaling (that would be MinMaxScaler) —
# confirm which of the two is intended here.
predictors_norm = Normalizer().fit_transform(predictors)
predictors_stand = StandardScaler().fit_transform(predictors)

from sklearn.feature_selection import SelectKBest, chi2, f_classif

## **PREDICTORS WITHOUT DATA PREPROCESSING AND WITH CHI-SQUARE**

In [8]:
# Chi-square feature ranking on the raw (unscaled) predictor matrix.
X = predictors
y = target

# Position in df of the first predictor column (predictors = df.iloc[:, 8:92]).
# Adding it to a selector-relative position gives the df column position.
start_index_independent_variables = 8

# One SelectKBest (chi-square score) per feature budget.
selector10 = SelectKBest(chi2, k=9)   # 9 best features, approximately 10%
selector15 = SelectKBest(chi2, k=13)  # 13 best features, approximately 15%
selector20 = SelectKBest(chi2, k=17)  # 17 best features, approximately 20%
selector25 = SelectKBest(chi2, k=21)  # 21 best features, approximately 25%
selector30 = SelectKBest(chi2, k=25)  # 25 best features, approximately 30%

# Fit each selector and keep the reduced matrices.
X_10 = selector10.fit_transform(X, y)
X_15 = selector15.fit_transform(X, y)
X_20 = selector20.fit_transform(X, y)
X_25 = selector25.fit_transform(X, y)
X_30 = selector30.fit_transform(X, y)

# get_support(indices=True) returns a NumPy integer array, so the offset is
# added to all positions in one vectorised operation (no per-element loop).
selector10_indices = selector10.get_support(indices=True) + start_index_independent_variables
selector15_indices = selector15.get_support(indices=True) + start_index_independent_variables
selector20_indices = selector20.get_support(indices=True) + start_index_independent_variables
selector25_indices = selector25.get_support(indices=True) + start_index_independent_variables
selector30_indices = selector30.get_support(indices=True) + start_index_independent_variables

# Report the selected df column positions for every budget.
print("10% of selected features:", selector10_indices)
print("15% of selected features:", selector15_indices)
print("20% of selected features:", selector20_indices)
print("25% of selected features:", selector25_indices)
print("30% of selected features:", selector30_indices)

10% of selected features: [14 29 37 38 39 40 55 56 63]
15% of selected features: [14 29 30 37 38 39 40 52 53 55 56 62 63]
20% of selected features: [14 15 19 27 29 30 37 38 39 40 52 53 55 56 60 62 63]
25% of selected features: [14 15 17 19 27 29 30 31 33 36 37 38 39 40 52 53 55 56 60 62 63]
30% of selected features: [10 14 15 17 19 25 27 29 30 31 33 36 37 38 39 40 45 46 52 53 55 56 60 62
 63]


In [9]:
# Materialise the chi-square selections as arrays: pick the selected column
# positions (already expressed as df positions) straight from df.
predictors_chi_10 = df.iloc[:, selector10_indices].to_numpy()
predictors_chi_15 = df.iloc[:, selector15_indices].to_numpy()
predictors_chi_20 = df.iloc[:, selector20_indices].to_numpy()
predictors_chi_25 = df.iloc[:, selector25_indices].to_numpy()
predictors_chi_30 = df.iloc[:, selector30_indices].to_numpy()

## **NORMALIZATION AND CHI-SQUARE**

In [10]:
# Chi-square feature ranking on the Normalizer-transformed predictor matrix.
X = predictors_norm
y = target

# Position in df of the first predictor column (predictors = df.iloc[:, 8:92]).
# Adding it to a selector-relative position gives the df column position.
start_index_independent_variables = 8

# One SelectKBest (chi-square score) per feature budget.
selector10 = SelectKBest(chi2, k=9)   # 9 best features, approximately 10%
selector15 = SelectKBest(chi2, k=13)  # 13 best features, approximately 15%
selector20 = SelectKBest(chi2, k=17)  # 17 best features, approximately 20%
selector25 = SelectKBest(chi2, k=21)  # 21 best features, approximately 25%
selector30 = SelectKBest(chi2, k=25)  # 25 best features, approximately 30%

# Fit each selector and keep the reduced matrices.
X_10 = selector10.fit_transform(X, y)
X_15 = selector15.fit_transform(X, y)
X_20 = selector20.fit_transform(X, y)
X_25 = selector25.fit_transform(X, y)
X_30 = selector30.fit_transform(X, y)

# get_support(indices=True) returns a NumPy integer array, so the offset is
# added to all positions in one vectorised operation (no per-element loop).
selector10_indices = selector10.get_support(indices=True) + start_index_independent_variables
selector15_indices = selector15.get_support(indices=True) + start_index_independent_variables
selector20_indices = selector20.get_support(indices=True) + start_index_independent_variables
selector25_indices = selector25.get_support(indices=True) + start_index_independent_variables
selector30_indices = selector30.get_support(indices=True) + start_index_independent_variables

# Report the selected df column positions for every budget.
print("10% of selected features:", selector10_indices)
print("15% of selected features:", selector15_indices)
print("20% of selected features:", selector20_indices)
print("25% of selected features:", selector25_indices)
print("30% of selected features:", selector30_indices)


10% of selected features: [14 15 30 37 38 40 62 63 64]
15% of selected features: [14 15 19 29 30 36 37 38 40 56 62 63 64]
20% of selected features: [10 14 15 19 27 29 30 36 37 38 40 41 53 56 62 63 64]
25% of selected features: [10 14 15 19 27 29 30 31 36 37 38 40 41 52 53 55 56 60 62 63 64]
30% of selected features: [10 14 15 17 19 25 27 29 30 31 33 36 37 38 40 41 46 52 53 55 56 60 62 63
 64]


In [11]:
# Arrays for the columns chosen by chi-square on the normalised matrix.
# The values are read from df itself (not from predictors_norm): normalisation
# only influenced which columns were selected.
predictors_chi_norm10 = df.iloc[:, selector10_indices].to_numpy()
predictors_chi_norm15 = df.iloc[:, selector15_indices].to_numpy()
predictors_chi_norm20 = df.iloc[:, selector20_indices].to_numpy()
predictors_chi_norm25 = df.iloc[:, selector25_indices].to_numpy()
predictors_chi_norm30 = df.iloc[:, selector30_indices].to_numpy()

## **PREDICTORS WITHOUT DATA PREPROCESSING AND WITH ANOVA F-VALUE (analysis of variance)**

In [12]:
# ANOVA F-value feature ranking on the raw (unscaled) predictor matrix.
X = predictors
y = target

# Position in df of the first predictor column (predictors = df.iloc[:, 8:92]).
# Adding it to a selector-relative position gives the df column position.
start_index_independent_variables = 8

# One SelectKBest (f_classif score) per feature budget.
selector10 = SelectKBest(f_classif, k=9)   # 9 best features, approximately 10%
selector15 = SelectKBest(f_classif, k=13)  # 13 best features, approximately 15%
selector20 = SelectKBest(f_classif, k=17)  # 17 best features, approximately 20%
selector25 = SelectKBest(f_classif, k=21)  # 21 best features, approximately 25%
selector30 = SelectKBest(f_classif, k=25)  # 25 best features, approximately 30%

# Fit each selector and keep the reduced matrices.
X_10 = selector10.fit_transform(X, y)
X_15 = selector15.fit_transform(X, y)
X_20 = selector20.fit_transform(X, y)
X_25 = selector25.fit_transform(X, y)
X_30 = selector30.fit_transform(X, y)

# get_support(indices=True) returns a NumPy integer array, so the offset is
# added to all positions in one vectorised operation (no per-element loop).
selector10_indices = selector10.get_support(indices=True) + start_index_independent_variables
selector15_indices = selector15.get_support(indices=True) + start_index_independent_variables
selector20_indices = selector20.get_support(indices=True) + start_index_independent_variables
selector25_indices = selector25.get_support(indices=True) + start_index_independent_variables
selector30_indices = selector30.get_support(indices=True) + start_index_independent_variables

# Report the selected df column positions for every budget.
print("10% of selected features:", selector10_indices)
print("15% of selected features:", selector15_indices)
print("20% of selected features:", selector20_indices)
print("25% of selected features:", selector25_indices)
print("30% of selected features:", selector30_indices)

10% of selected features: [10 11 13 14 15 17 19 21 25]
15% of selected features: [10 11 13 14 15 17 19 21 22 25 33 35 40]
20% of selected features: [ 8 10 11 13 14 15 17 19 21 22 25 31 33 35 37 40 60]
25% of selected features: [ 8 10 11 13 14 15 17 19 21 22 23 24 25 31 33 35 37 40 44 60 84]
30% of selected features: [ 8 10 11 13 14 15 17 18 19 21 22 23 24 25 26 31 33 35 37 40 43 44 60 70
 84]


In [13]:
# Materialise the ANOVA selections as arrays: pick the selected column
# positions (already expressed as df positions) straight from df.
predictors_anova_10 = df.iloc[:, selector10_indices].to_numpy()
predictors_anova_15 = df.iloc[:, selector15_indices].to_numpy()
predictors_anova_20 = df.iloc[:, selector20_indices].to_numpy()
predictors_anova_25 = df.iloc[:, selector25_indices].to_numpy()
predictors_anova_30 = df.iloc[:, selector30_indices].to_numpy()

## **NORMALIZATION AND ANOVA F-VALUE**



In [14]:
# ANOVA F-value feature ranking on the Normalizer-transformed predictor matrix.
X = predictors_norm
y = target

# Position in df of the first predictor column (predictors = df.iloc[:, 8:92]).
# Adding it to a selector-relative position gives the df column position.
start_index_independent_variables = 8

# One SelectKBest (f_classif score) per feature budget.
selector10 = SelectKBest(f_classif, k=9)   # 9 best features, approximately 10%
selector15 = SelectKBest(f_classif, k=13)  # 13 best features, approximately 15%
selector20 = SelectKBest(f_classif, k=17)  # 17 best features, approximately 20%
selector25 = SelectKBest(f_classif, k=21)  # 21 best features, approximately 25%
selector30 = SelectKBest(f_classif, k=25)  # 25 best features, approximately 30%

# Fit each selector and keep the reduced matrices.
X_10 = selector10.fit_transform(X, y)
X_15 = selector15.fit_transform(X, y)
X_20 = selector20.fit_transform(X, y)
X_25 = selector25.fit_transform(X, y)
X_30 = selector30.fit_transform(X, y)

# get_support(indices=True) returns a NumPy integer array, so the offset is
# added to all positions in one vectorised operation (no per-element loop).
selector10_indices = selector10.get_support(indices=True) + start_index_independent_variables
selector15_indices = selector15.get_support(indices=True) + start_index_independent_variables
selector20_indices = selector20.get_support(indices=True) + start_index_independent_variables
selector25_indices = selector25.get_support(indices=True) + start_index_independent_variables
selector30_indices = selector30.get_support(indices=True) + start_index_independent_variables

# Report the selected df column positions for every budget.
print("10% of selected features:", selector10_indices)
print("15% of selected features:", selector15_indices)
print("20% of selected features:", selector20_indices)
print("25% of selected features:", selector25_indices)
print("30% of selected features:", selector30_indices)

10% of selected features: [11 14 15 17 19 21 22 25 37]
15% of selected features: [10 11 13 14 15 17 19 21 22 25 33 37 40]
20% of selected features: [10 11 13 14 15 16 17 19 21 22 24 25 33 37 40 62 63]
25% of selected features: [ 9 10 11 13 14 15 16 17 19 21 22 23 24 25 33 35 37 40 62 63 89]
30% of selected features: [ 9 10 11 13 14 15 16 17 18 19 21 22 23 24 25 26 33 34 35 37 40 54 62 63
 89]


In [15]:
# Arrays for the columns chosen by ANOVA on the normalised matrix.
# The values are read from df itself (not from predictors_norm): normalisation
# only influenced which columns were selected.
predictors_anova_norm10 = df.iloc[:, selector10_indices].to_numpy()
predictors_anova_norm15 = df.iloc[:, selector15_indices].to_numpy()
predictors_anova_norm20 = df.iloc[:, selector20_indices].to_numpy()
predictors_anova_norm25 = df.iloc[:, selector25_indices].to_numpy()
predictors_anova_norm30 = df.iloc[:, selector30_indices].to_numpy()

## **STANDARDIZATION AND ANOVA F-VALUE**

In [16]:
# ANOVA F-value feature ranking on the standardised predictor matrix.
X = predictors_stand
y = target

# Position in df of the first predictor column (predictors = df.iloc[:, 8:92]).
# Adding it to a selector-relative position gives the df column position.
start_index_independent_variables = 8

# One SelectKBest (f_classif score) per feature budget.
selector10 = SelectKBest(f_classif, k=9)   # 9 best features, approximately 10%
selector15 = SelectKBest(f_classif, k=13)  # 13 best features, approximately 15%
selector20 = SelectKBest(f_classif, k=17)  # 17 best features, approximately 20%
selector25 = SelectKBest(f_classif, k=21)  # 21 best features, approximately 25%
selector30 = SelectKBest(f_classif, k=25)  # 25 best features, approximately 30%

# Fit each selector and keep the reduced matrices.
X_10 = selector10.fit_transform(X, y)
X_15 = selector15.fit_transform(X, y)
X_20 = selector20.fit_transform(X, y)
X_25 = selector25.fit_transform(X, y)
X_30 = selector30.fit_transform(X, y)

# get_support(indices=True) returns a NumPy integer array, so the offset is
# added to all positions in one vectorised operation (no per-element loop).
selector10_indices = selector10.get_support(indices=True) + start_index_independent_variables
selector15_indices = selector15.get_support(indices=True) + start_index_independent_variables
selector20_indices = selector20.get_support(indices=True) + start_index_independent_variables
selector25_indices = selector25.get_support(indices=True) + start_index_independent_variables
selector30_indices = selector30.get_support(indices=True) + start_index_independent_variables

# Report the selected df column positions for every budget.
print("10% of selected features:", selector10_indices)
print("15% of selected features:", selector15_indices)
print("20% of selected features:", selector20_indices)
print("25% of selected features:", selector25_indices)
print("30% of selected features:", selector30_indices)

10% of selected features: [10 11 13 14 15 17 19 21 25]
15% of selected features: [10 11 13 14 15 17 19 21 22 25 33 35 40]
20% of selected features: [ 8 10 11 13 14 15 17 19 21 22 25 31 33 35 37 40 60]
25% of selected features: [ 8 10 11 13 14 15 17 19 21 22 23 24 25 31 33 35 37 40 44 60 84]
30% of selected features: [ 8 10 11 13 14 15 17 18 19 21 22 23 24 25 26 31 33 35 37 40 43 44 60 70
 84]


In [17]:
# Arrays for the columns chosen by ANOVA on the standardised matrix.
# The values are read from df itself (not from predictors_stand):
# standardisation only influenced which columns were selected.
predictors_anova_stand10 = df.iloc[:, selector10_indices].to_numpy()
predictors_anova_stand15 = df.iloc[:, selector15_indices].to_numpy()
predictors_anova_stand20 = df.iloc[:, selector20_indices].to_numpy()
predictors_anova_stand25 = df.iloc[:, selector25_indices].to_numpy()
predictors_anova_stand30 = df.iloc[:, selector30_indices].to_numpy()

In [18]:
# Registry of every predictor matrix to evaluate, keyed by its variable name.
# Insertion order is the order in which the model cells iterate over it.
predictors_dict = dict(
    # Baselines: all predictors and the fixed "chi_original" subset.
    predictors=predictors,
    predictors_chi_original=predictors_chi_original,
    # Chi-square selection on the raw matrix.
    predictors_chi_10=predictors_chi_10,
    predictors_chi_15=predictors_chi_15,
    predictors_chi_20=predictors_chi_20,
    predictors_chi_25=predictors_chi_25,
    predictors_chi_30=predictors_chi_30,
    # Chi-square selection on the normalised matrix.
    predictors_chi_norm10=predictors_chi_norm10,
    predictors_chi_norm15=predictors_chi_norm15,
    predictors_chi_norm20=predictors_chi_norm20,
    predictors_chi_norm25=predictors_chi_norm25,
    predictors_chi_norm30=predictors_chi_norm30,
    # ANOVA selection on the raw matrix.
    predictors_anova_10=predictors_anova_10,
    predictors_anova_15=predictors_anova_15,
    predictors_anova_20=predictors_anova_20,
    predictors_anova_25=predictors_anova_25,
    predictors_anova_30=predictors_anova_30,
    # ANOVA selection on the normalised matrix.
    predictors_anova_norm10=predictors_anova_norm10,
    predictors_anova_norm15=predictors_anova_norm15,
    predictors_anova_norm20=predictors_anova_norm20,
    predictors_anova_norm25=predictors_anova_norm25,
    predictors_anova_norm30=predictors_anova_norm30,
    # ANOVA selection on the standardised matrix.
    predictors_anova_stand10=predictors_anova_stand10,
    predictors_anova_stand15=predictors_anova_stand15,
    predictors_anova_stand20=predictors_anova_stand20,
    predictors_anova_stand25=predictors_anova_stand25,
    predictors_anova_stand30=predictors_anova_stand30,
)

# **scikit-optimize**

https://scikit-learn.org/stable/modules/tree.html

In [None]:
!pip install scikit-optimize



# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# **Normalization and Chi-Square**

# **RandomizedSearchCV**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid for the search
param_dist = {
    'n_estimators': np.arange(50, 1000, 2),      # 50, 52, ..., 998
    'criterion': ['entropy', 'gini', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': np.arange(1, 20, 1),            # 1..19
    'min_samples_split': np.arange(2, 20, 1),    # 2..19
    'min_samples_leaf': np.arange(1, 20, 1),     # 1..19
    'bootstrap': [True, False]
}

# Configure RandomizedSearchCV: 50 sampled configurations, each scored by F1
# with cv=5, candidates evaluated in parallel (n_jobs=-1).
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, scoring='f1', n_iter=50, cv=5, random_state=42, n_jobs=-1)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    predictors_array = predictors_dict[predictor]

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    random_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = random_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = random_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f}")

# **BayesSearchCV**

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter search space.
# NOTE(review): the numeric ranges are passed as explicit value arrays;
# presumably skopt treats each of them as a set of unordered categorical
# choices rather than an Integer range — confirm against the skopt docs.
param_dist = {
    'n_estimators': np.arange(50, 1000, 2),      # 50, 52, ..., 998
    'criterion': ['entropy', 'gini', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': np.arange(1, 20, 1),            # 1..19
    'min_samples_split': np.arange(2, 20, 1),    # 2..19
    'min_samples_leaf': np.arange(1, 20, 1),     # 1..19
    'bootstrap': [True, False]
}

# Configure BayesSearchCV: 50 iterations, each scored by F1 with cv=5,
# evaluated in parallel (n_jobs=-1).
bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_dist, scoring='f1', n_iter=50, cv=5, random_state=42, n_jobs=-1)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    predictors_array = predictors_dict[predictor]

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    bayes_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = bayes_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = bayes_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f}")

# **XGBOOST**

# **The first approach involving Standardization, Chi-square with XGBoost**

https://xgboost.readthedocs.io/en/stable/

In [None]:
pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# **RandomizedSearchCV**

> Adicionar aspas



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the XGBoost model
xg = XGBClassifier(random_state=42, verbosity=0)

# Define the hyperparameter grid for the search.
# NOTE(review): 'grow_policy' is sampled together with tree_method='exact';
# the XGBoost docs describe grow_policy as applying to 'hist'/'approx' only —
# confirm that such combinations behave as expected.
param_dist = {
    'n_estimators': np.arange(50, 1000, 2),      # 50, 52, ..., 998
    'max_depth': np.arange(1, 20, 1),            # 1..19
    'booster' : ['gbtree', 'dart'],
    'tree_method' : ['exact', 'approx', 'hist'],
    'grow_policy' : ['depthwise', 'lossguide'],
    'learning_rate' : np.linspace(0.01, 0.5, 30) # 30 evenly spaced values in [0.01, 0.5]
}

# Configure RandomizedSearchCV: 50 sampled configurations, each scored by F1
# with cv=5, candidates evaluated in parallel (n_jobs=-1).
random_search = RandomizedSearchCV(estimator=xg, param_distributions=param_dist, scoring='f1', n_iter=50, cv=5, random_state=42, n_jobs=-1)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    predictors_array = predictors_dict[predictor]

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    random_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = random_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = random_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f}")

# **BayesSearchCV**

In [None]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import cupy as cp
import time

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the XGBoost model (training requested on the CUDA device)
xg = XGBClassifier(random_state=42, device='cuda', verbosity=0)

# Define the hyperparameter search space ('exact' is not offered here, unlike
# the RandomizedSearchCV cell).
# NOTE(review): the numeric ranges are passed as explicit value arrays;
# presumably skopt treats each of them as a set of unordered categorical
# choices. `Real` and `Integer` are imported in this cell but never used —
# confirm whether range dimensions were intended.
param_dist = {
    'n_estimators': np.arange(50, 1000, 2),      # 50, 52, ..., 998
    'max_depth': np.arange(1, 20, 1),            # 1..19
    'booster' : ['gbtree', 'dart'],
    'tree_method' : ['approx', 'hist'],
    'grow_policy' : ['depthwise', 'lossguide'],
    'learning_rate' : np.linspace(0.01, 0.5, 30) # 30 evenly spaced values in [0.01, 0.5]
}

# Configure BayesSearchCV: 50 iterations, each scored by F1 with cv=5.
# NOTE(review): n_jobs=-1 runs fits in parallel while the estimator targets
# device='cuda'; if only one GPU is available the workers will share it —
# confirm memory/throughput is acceptable.
bayes_search = BayesSearchCV(estimator=xg, search_spaces=param_dist, scoring='f1', n_iter=50, cv=5, random_state=42, n_jobs=-1)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    # Wall-clock timer for this predictor set (reported in the last field below)
    start = time.time()

    predictors_array = predictors_dict[predictor]
    # Copies the array into a CuPy array and immediately fetches it back with
    # .get(), so what is passed on below is a host-side array again.
    # NOTE(review): this looks like a no-op round trip through the GPU —
    # confirm whether it is needed.
    predictors_array = cp.array(predictors_array).get()

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    bayes_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = bayes_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = bayes_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %; elapsed seconds for this set.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f};{(str(time.time() - start))}")

# **CATBOOST**

https://catboost.ai/en/docs/

In [None]:
#Instalação
!pip install catboost



# **RandomizedSearchCV**

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the CatBoost model (CPU training, thread_count=-1, silent)
catboost = CatBoostClassifier(task_type='CPU', thread_count=-1, random_state = 42, verbose=False)

# Define the hyperparameter grid for the search
param_dist = {
    'iterations': np.arange(50, 1000, 2),        # 50, 52, ..., 998
    'depth': np.arange(1, 16, 1),                # 1..15
    'learning_rate': np.linspace(0.01, 0.5, 30), # 30 evenly spaced values in [0.01, 0.5]
    'l2_leaf_reg': np.arange(1, 10, 1),          # 1..9
    'border_count': np.arange(32, 256, 16),      # 32, 48, ..., 240
    'feature_border_type' : ['Median', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum', 'MaxLogSum', 'MinEntropy'],
    'leaf_estimation_method' : ['Newton', 'Gradient'],
    'auto_class_weights' : ['Balanced', 'SqrtBalanced'],
    'grow_policy' : ['SymmetricTree', 'Lossguide', 'Depthwise'],
    'bootstrap_type' : ['Bayesian', 'Bernoulli', 'MVS', 'No']
}

# Configure RandomizedSearchCV: 50 sampled configurations, each scored by F1
# with cv=5, candidates evaluated in parallel (n_jobs=-1).
# NOTE(review): the estimator is also created with thread_count=-1, so every
# parallel worker may request all CPU threads — possible oversubscription;
# confirm.
random_search = RandomizedSearchCV(estimator=catboost, param_distributions=param_dist, scoring='f1', cv=5, n_iter=50, n_jobs=-1, random_state=42)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    predictors_array = predictors_dict[predictor]

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    random_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = random_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = random_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f}")

# **BayesSearchCV**

In [None]:
from catboost import CatBoostClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter; in this cell it is only passed to the final
# cross_val_score call (the search itself uses cv=5).
kfold = KFold(n_splits = 5, shuffle=True, random_state = 42)

# Define the CatBoost model (CPU training, thread_count=-1, silent)
catboost = CatBoostClassifier(task_type='CPU', thread_count=-1, random_state = 42, verbose=False)

# Define the hyperparameter search space.
# NOTE(review): the numeric ranges are passed as explicit value arrays;
# presumably skopt treats each of them as a set of unordered categorical
# choices. `Real` and `Integer` are imported in this cell but never used —
# confirm whether range dimensions were intended.
param_dist = {
    'iterations': np.arange(50, 1000, 2),        # 50, 52, ..., 998
    'depth': np.arange(1, 16, 1),                # 1..15
    'learning_rate': np.linspace(0.01, 0.5, 30), # 30 evenly spaced values in [0.01, 0.5]
    'l2_leaf_reg': np.arange(1, 10, 1),          # 1..9
    'border_count': np.arange(32, 256, 16),      # 32, 48, ..., 240
    'feature_border_type' : ['Median', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum', 'MaxLogSum', 'MinEntropy'],
    'leaf_estimation_method' : ['Newton', 'Gradient'],
    'auto_class_weights' : ['Balanced', 'SqrtBalanced'],
    'grow_policy' : ['SymmetricTree', 'Lossguide', 'Depthwise'],
    'bootstrap_type' : ['Bayesian', 'Bernoulli', 'MVS', 'No']
}

# Configure BayesSearchCV: 50 iterations, each scored by F1 with cv=5.
# Unlike the other search cells, no n_jobs argument is passed here.
bayes_search = BayesSearchCV(estimator=catboost, search_spaces=param_dist, scoring='f1', cv=5, n_iter=50, random_state=42)

predictors_keys = list(predictors_dict.keys())

# Run the same tune-and-evaluate protocol for every predictor set.
for predictor in predictors_keys:

    predictors_array = predictors_dict[predictor]

    # Split the dataset into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(predictors_array, target, test_size=0.3, random_state=42)

    # Run the hyperparameter search on the training part only
    bayes_search.fit(X_train, y_train)

    # Best hyperparameter combination
    best_params = bayes_search.best_params_

    # Evaluate the optimized model on the training and test sets (accuracy)
    best_estimator = bayes_search.best_estimator_
    y_pred_test = best_estimator.predict(X_test)
    y_pred_train = best_estimator.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred_test)

    # Cross-validate the tuned estimator on the complete data (training and
    # test rows together) with the shuffled 5-fold splitter. No `scoring` is
    # given, so the estimator's own default score is used.
    # NOTE(review): the search optimises F1 while the reported figures are
    # accuracy / default score — confirm this is intended.
    result = cross_val_score(best_estimator, predictors_array, target, cv = kfold)

    # One ';'-separated record: predictor set; best params; train accuracy %;
    # test accuracy %; CV mean %; CV std %.
    print(f"{predictor};{best_params};{accuracy_train*100:.2f};{accuracy_test*100:.2f};{result.mean()*100:.2f};{result.std()*100:.2f}")